pax_global_header 0000666 0000000 0000000 00000000064 14601504142 0014507 g ustar 00root root 0000000 0000000 52 comment=b90bdc876581e7e84db782cdffddc4782de1627c
raft-0.22.1/ 0000775 0000000 0000000 00000000000 14601504142 0012525 5 ustar 00root root 0000000 0000000 raft-0.22.1/.clang-format 0000664 0000000 0000000 00000000257 14601504142 0015104 0 ustar 00root root 0000000 0000000 BasedOnStyle: Chromium
BreakBeforeBraces: Custom
BraceWrapping:
AfterFunction: true
AfterStruct: true
IndentWidth: 4
PointerAlignment: Right
ForEachMacros: ['QUEUE_FOREACH'] raft-0.22.1/.dir-locals.el 0000664 0000000 0000000 00000000330 14601504142 0015152 0 ustar 00root root 0000000 0000000 ((nil . ((fill-column . 80)))
(c-mode . ((flycheck-clang-definitions . ("HAVE_LINUX_AIO_ABI_H" "HAVE_LINUX_IO_URING_H" "_GNU_SOURCE"))
(flycheck-clang-args . ("-Wpedantic" "-Wall" "-Wextra" "-Wconversion")))))
raft-0.22.1/.github/ 0000775 0000000 0000000 00000000000 14601504142 0014065 5 ustar 00root root 0000000 0000000 raft-0.22.1/.github/workflows/ 0000775 0000000 0000000 00000000000 14601504142 0016122 5 ustar 00root root 0000000 0000000 raft-0.22.1/.github/workflows/benchmark.yml 0000664 0000000 0000000 00000007130 14601504142 0020600 0 ustar 00root root 0000000 0000000 name: Benchmark
on:
workflow_dispatch:
inputs:
keep:
description: "Keep BMC server"
type: boolean
default: false
# schedule:
# - cron: '12 4 * * *'
jobs:
github:
name: On GitHub
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
with:
repository: cowsql/cowsql-ci
- name: Run checks
env:
BENCHER_API_TOKEN: ${{ secrets.BENCHER_API_TOKEN }}
run: |
./bin/check --testbed github
bmc-deploy:
name: On BMC - deploy
runs-on: ubuntu-22.04
outputs:
id: ${{ steps.create.outputs.id }}
address: ${{ steps.wait.outputs.address}}
steps:
- name: Create
uses: phoenixnap-github-actions/create-server-bmc@v1
id: create
with:
clientid: ${{secrets.BMC_CLIENT_ID}}
clientsecret: ${{secrets.BMC_CLIENT_SECRET}}
hostname: "bmc"
image: "ubuntu/jammy"
location: "ASH"
type: "s1.c1.medium"
- name: Wait
id: wait
env:
ID: ${{ steps.create.outputs.id }}
AUTH_URL: https://auth.phoenixnap.com/auth/realms/BMC/protocol/openid-connect/token
API_URL: https://api.phoenixnap.com/bmc/v1/servers
CLIENT_ID: ${{secrets.BMC_CLIENT_ID}}
CLIENT_SECRET: ${{secrets.BMC_CLIENT_SECRET}}
run: |
for i in $(seq 30); do
DATA="client_id=${CLIENT_ID}&client_secret=${CLIENT_SECRET}&grant_type=client_credentials"
ACCESS_TOKEN=$(curl -s -X POST -d "${DATA}" ${AUTH_URL} | jq -r .access_token)
HEADER="Authorization: Bearer ${ACCESS_TOKEN}"
STATUS=$(curl -s -H "${HEADER}" ${API_URL}/${ID}/ | jq -r .status)
echo status: $STATUS
if [ "$STATUS" = "powered-on" ]; then
break
fi
sleep 60
done
if [ "$STATUS" != "powered-on" ]; then
echo "Server still not ready: $STATUS"
exit 1
fi
ADDR=$(curl -s -H "${HEADER}" ${API_URL}/${ID}/ | jq -r .publicIpAddresses[0])
echo "address=$ADDR" >> $GITHUB_OUTPUT
- name: Update kernel
env:
SSH: "ssh -o StrictHostKeyChecking=no -i ~/.ssh/bmc ubuntu@${{ steps.wait.outputs.address}}"
run: |
mkdir -p ~/.ssh/
echo "${{secrets.BMC_SSH_KEY}}" > ~/.ssh/bmc
chmod 600 ~/.ssh/bmc
$SSH sudo apt-get install -y git
$SSH git clone --depth 1 https://github.com/cowsql/cowsql-ci.git ci
$SSH /home/ubuntu/ci/bin/install-kernel
$SSH sudo reboot || true
sleep 30
for i in $(seq 60); do
$SSH true 2>/dev/null && break
sleep 5
done
$SSH true
bmc-run:
name: On BMC - run
runs-on: ubuntu-22.04
needs: bmc-deploy
env:
SSH: "ssh -o StrictHostKeyChecking=no -i ~/.ssh/bmc ubuntu@${{ needs.bmc-deploy.outputs.address}}"
steps:
- name: Setup SSH key
run: |
mkdir -p ~/.ssh/
echo "${{secrets.BMC_SSH_KEY}}" > ~/.ssh/bmc
chmod 600 ~/.ssh/bmc
- name: Run checks
env:
BENCHER_API_TOKEN: ${{ secrets.BENCHER_API_TOKEN }}
run: |
$SSH "BENCHER_API_TOKEN=$BENCHER_API_TOKEN /home/ubuntu/ci/bin/check --testbed bmc"
bmc-delete:
name: On BMC - delete
runs-on: ubuntu-22.04
if: always() && (inputs.keep == '' || inputs.keep == 'false')
needs: [bmc-deploy, bmc-run]
steps:
- name: Delete
uses: phoenixnap-github-actions/delete-server-bmc@v1
with:
clientid: ${{secrets.BMC_CLIENT_ID}}
clientsecret: ${{secrets.BMC_CLIENT_SECRET}}
serverid: ${{ needs.bmc-deploy.outputs.id}}
raft-0.22.1/.github/workflows/coverity.yml 0000664 0000000 0000000 00000001234 14601504142 0020511 0 ustar 00root root 0000000 0000000 name: Coverity
on:
push:
branches:
- main
jobs:
coverity:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Install dependencies
run: |
sudo apt-get update -qq
sudo apt-get install -qq linux-libc-dev libuv1-dev
- name: Configure
run: |
autoreconf -i
mkdir build
cd build
../configure
- uses: vapier/coverity-scan-action@v1
with:
email: free@ekanayaka.io
token: ${{ secrets.COVERITY_SCAN_TOKEN }}
version: main
description: ${GITHUB_SHA}
command: make -C build -j$(nproc)
raft-0.22.1/.github/workflows/downstream.yml 0000664 0000000 0000000 00000020002 14601504142 0021022 0 ustar 00root root 0000000 0000000 name: Downstream
on:
push:
tags:
- '**'
pull_request:
types: [ labeled, opened, synchronize, reopened ]
jobs:
cowsql:
if: contains(github.event.pull_request.labels.*.name, 'downstream')
runs-on: ubuntu-22.04
steps:
- name: Install apt deps
run: |
sudo apt-get update -qq
sudo apt-get install -qq automake libtool gcc make libuv1-dev libsqlite3-dev
- name: Check out raft
uses: actions/checkout@v3
with:
ref: refs/pull/${{ github.event.number }}/head
path: raft
- name: Install raft
run: |
cd raft
autoreconf -i
./configure --prefix=/usr --enable-debug --enable-uv --enable-sanitize --enable-backtrace
sudo make -j$(nproc) install
- name: Check out cowsql
uses: actions/checkout@v3
with:
repository: cowsql/cowsql
path: cowsql
- name: Test and install cowsql
run: |
cd cowsql
autoreconf -i
./configure --prefix=/usr --enable-debug --enable-sanitize --enable-backtrace
sudo make -j$(nproc) check || (cat ./test-suite.log && false)
sudo make install
- name: Install Go
uses: actions/setup-go@v4
- name: Check out go-cowsql
uses: actions/checkout@v3
with:
repository: cowsql/go-cowsql
path: go-cowsql
- name: Test go-cowsql
env:
GO_DQLITE_MULTITHREAD: '1'
run: |
cd go-cowsql
go get -tags libsqlite3 -t ./...
go test -asan -v ./...
VERBOSE=1 ASAN=-asan ./test/cowsql-demo.sh
VERBOSE=1 ASAN=-asan ./test/roles.sh
VERBOSE=1 ASAN=-asan ./test/recover.sh
dqlite:
if: contains(github.event.pull_request.labels.*.name, 'downstream')
runs-on: ubuntu-22.04
steps:
- name: Install apt deps
run: |
sudo apt-get update -qq
sudo apt-get install -qq automake libtool gcc make libuv1-dev libsqlite3-dev
- name: Check out raft
uses: actions/checkout@v3
with:
ref: refs/pull/${{ github.event.number }}/head
path: raft
- name: Install raft
run: |
cd raft
autoreconf -i
./configure --prefix=/usr --enable-debug --enable-uv --enable-sanitize --enable-backtrace
sudo make -j$(nproc) install
- name: Check out dqlite
uses: actions/checkout@v3
with:
repository: canonical/dqlite
ref: v1.16.0
path: dqlite
- name: Test and install dqlite
run: |
cd dqlite
autoreconf -i
./configure --prefix=/usr --enable-debug --enable-sanitize --enable-backtrace
sudo make -j$(nproc) check || (cat ./test-suite.log && false)
sudo make install
- name: Install Go
uses: actions/setup-go@v4
- name: Check out go-dqlite
uses: actions/checkout@v3
with:
repository: canonical/go-dqlite
path: go-dqlite
- name: Test go-dqlite
env:
GO_DQLITE_MULTITHREAD: '1'
run: |
cd go-dqlite
go get -tags libsqlite3 -t ./...
go test -asan -v ./...
VERBOSE=1 ASAN=-asan ./test/dqlite-demo.sh
VERBOSE=1 ASAN=-asan ./test/roles.sh
VERBOSE=1 ASAN=-asan ./test/recover.sh
incus:
if: contains(github.event.pull_request.labels.*.name, 'downstream')
runs-on: ubuntu-22.04
env:
CGO_LDFLAGS_ALLOW: "(-Wl,-wrap,pthread_create)|(-Wl,-z,now)"
INCUS_SHIFTFS_DISABLE: "true"
INCUS_VERBOSE: "1"
INCUS_OFFLINE: "1"
INCUS_TMPFS: "1"
INCUS_REQUIRED_TESTS: "test_storage_buckets"
strategy:
fail-fast: false
matrix:
suite: ["cluster", "standalone"]
steps:
- name: Checkout
uses: actions/checkout@v4
with:
repository: lxc/incus
- name: Install Go
uses: actions/setup-go@v4
- name: Install dependencies
run: |
set -x
sudo add-apt-repository ppa:ubuntu-lxc/lxc-git-master -y --no-update
sudo apt-get update
sudo snap remove lxd --purge
sudo snap remove core20 --purge || true
sudo apt-get autopurge moby-containerd docker uidmap -y
sudo ip link delete docker0
sudo nft flush ruleset
sudo systemctl mask lxc.service
sudo systemctl mask lxc-net.service
sudo apt-get install --no-install-recommends -y \
curl \
git \
libacl1-dev \
libcap-dev \
libdbus-1-dev \
liblxc-dev \
libseccomp-dev \
libselinux-dev \
libsqlite3-dev \
libtool \
libudev-dev \
libuv1-dev \
automake \
make \
pkg-config\
acl \
attr \
bind9-dnsutils \
btrfs-progs \
busybox-static \
dnsmasq-base \
easy-rsa \
gettext \
jq \
lxc-utils \
lvm2 \
nftables \
quota \
rsync \
s3cmd \
socat \
sqlite3 \
squashfs-tools \
tar \
tcl \
thin-provisioning-tools \
uuid-runtime \
xfsprogs \
xz-utils \
zfsutils-linux
# reclaim some space
sudo apt-get clean
# Download minio.
curl -sSfL https://dl.min.io/server/minio/release/linux-amd64/archive/minio_20240116160738.0.0_amd64.deb --output /tmp/minio.deb
sudo apt-get install /tmp/minio.deb --yes
# Download latest release of openfga server.
mkdir -p "$(go env GOPATH)/bin"
curl -sSfL https://api.github.com/repos/openfga/openfga/releases/latest | jq -r '.assets | .[] | .browser_download_url | select(. | test("_linux_amd64.tar.gz$"))' | xargs -I {} curl -sSfL {} -o openfga.tar.gz
tar -xzf openfga.tar.gz -C "$(go env GOPATH)/bin/"
# Download latest release of openfga cli.
curl -sSfL https://api.github.com/repos/openfga/cli/releases/latest | jq -r '.assets | .[] | .browser_download_url | select(. | test("_linux_amd64.tar.gz$"))' | xargs -I {} curl -sSfL {} -o fga.tar.gz
tar -xzf fga.tar.gz -C "$(go env GOPATH)/bin/"
- name: Check out raft
uses: actions/checkout@v3
with:
ref: refs/pull/${{ github.event.number }}/head
path: raft
- name: Install raft
run: |
cd raft
autoreconf -i
./configure --prefix=/usr --enable-debug --enable-backtrace
sudo make -j$(nproc) install
- name: Check out cowsql
uses: actions/checkout@v3
with:
repository: cowsql/cowsql
path: cowsql
- name: Install cowsql
run: |
cd cowsql
autoreconf -i
./configure --prefix=/usr --enable-debug --enable-backtrace
sudo make -j$(nproc) install
- name: Download go dependencies
run: |
go mod download
- name: Run Incus build
run: |
make
- name: "Run system tests"
run: |
chmod +x ~
echo "root:1000000:1000000000" | sudo tee /etc/subuid /etc/subgid
cd test
sudo --preserve-env=PATH,GOPATH,GITHUB_ACTIONS,INCUS_VERBOSE,INCUS_BACKEND,INCUS_OFFLINE,INCUS_SKIP_TESTS,INCUS_REQUIRED_TESTS,INCUS_SHIFTFS_DISABLE INCUS_BACKEND=dir ./main.sh ${{ matrix.suite }}
jepsen:
if: contains(github.event.pull_request.labels.*.name, 'downstream')
uses: cowsql/jepsen.cowsql/.github/workflows/test-build-run.yml@main
with:
raft-ref: refs/pull/${{ github.event.number }}/head
workloads: >
['append', 'bank', 'set']
nemeses: >
['none', 'partition', 'kill', 'stop', 'disk', 'member',
'partition,stop', 'partition,kill', 'partition,member',
'packet,stop', 'pause']
raft-0.22.1/.github/workflows/tests.yml 0000664 0000000 0000000 00000010244 14601504142 0020010 0 ustar 00root root 0000000 0000000 name: Tests
on:
- push
- pull_request
jobs:
test:
name: Unit and integration tests
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
- name: Setup dependencies
run: |
sudo apt-get update -qq
sudo apt-get install -qq lcov linux-libc-dev libuv1-dev btrfs-progs xfsprogs zfsutils-linux
- name: Configure
run: |
autoreconf -i
./configure --enable-example \
--enable-debug \
--enable-code-coverage \
--enable-sanitize \
--enable-benchmark
- name: Build
run: |
make -j$(nproc --all)
- name: Amalgamation
run: |
git clone --depth 1 https://github.com/edlund/amalgamate.git
export PATH=$PATH:$PWD/amalgamate
amalgamate.py --config=amalgamation.json --source=$(pwd)
gcc raft.c -c -D_GNU_SOURCE -DHAVE_LINUX_AIO_ABI_H -Wall -Wextra -Wpedantic -fpic
- name: Test
run: |
export LIBRAFT_TRACE=1
./test/lib/fs.sh setup
make check CFLAGS=-O0 $(./test/lib/fs.sh detect) || (cat ./test-suite.log && false)
./test/lib/fs.sh teardown
- name: Coverage
run: |
make code-coverage-capture
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
with:
verbose: true
linting:
name: Linting
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
- uses: DoozyX/clang-format-lint-action@v0.14
with:
source: 'src test example'
exclude: 'test/lib/munit.*'
extensions: 'c,h'
clangFormatVersion: 14
style: file
configure:
name: Configuration flags
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
- name: Check that no optional dependency is installed
run: |
# Remove liblz4-dev which is installed by default on the runner
sudo apt-get remove liblz4-dev
# Check that there are no dependencies installed
! pkg-config --exists libuv
! pkg-config --exists liblz4
- name: Run autoreconf
run: |
autoreconf -i
- name: With no deps ./configure
run: |
# Succeed, since we are not explicitly requiring libuv.
./configure
- name: With no deps ./configure --enable-uv
run: |
# Fail, since libuv is not installed.
! ./configure --enable-uv 2>errors
tail -1 errors | grep -q "libuv required but not found" || (cat errors && false)
- name: With no deps ./configure --with-lz4
run: |
# Fail, since using lz4 makes sense only if libuv is used too.
! ./configure --with-lz4 2>errors
tail -1 errors | grep -q "liblz4 can be used only if libuv is used too" || (cat errors && false)
- name: Install libuv
run: |
sudo apt-get install -qq linux-libc-dev libuv1-dev
- name: With libuv only ./configure
run: |
# Succeed, since libuv is installed and automatically used.
./configure
- name: With libuv only ./configure --disable-uv
run: |
# Succeed, since libuv support can be disabled
./configure --disable-uv
- name: With libuv only ./configure --with-lz4
run: |
# Fail, since liblz4 is not installed.
! ./configure --with-lz4 2>errors
tail -1 errors | grep -q "liblz4 required but not found" || (cat errors && false)
- name: With libuv only ./configure --disable-uv --with-lz4
run: |
# Fail, since using lz4 makes sense only if libuv is used too.
! ./configure --disable-uv --with-lz4 2>errors
tail -1 errors | grep -q "liblz4 can be used only if libuv is used too" || (cat errors && false)
- name: Install liblz4
run: |
sudo apt-get install -qq liblz4-dev
- name: With libuv and liblz4 ./configure
run: |
# Succeed, since all optional dependencies are found and used.
./configure
- name: With libuv and liblz4 ./configure --without-lz4
run: |
# Succeed, since we support building without lz4 even if both libuv and
# liblz4 are found.
./configure --without-lz4
raft-0.22.1/.gitignore 0000664 0000000 0000000 00000000715 14601504142 0014520 0 ustar 00root root 0000000 0000000 *.o
*.gcno
*.gcda
*~
Makefile.in
aclocal.m4
aminclude_static.am
autom4te.cache/
config.h.in
configure
Makefile
config.h
config.log
config.status
libtool
raft.pc
stamp-h1
*.lo
*.la
.dirstamp
.deps/
.libs/
test/unit/core
test/unit/uv
test/integration/core
test/integration/uv
test/fuzzy/core
test/*/*.log
test/*/*.trs
os-test*
test-suite.log
coverage/
coverage.info
TAGS
example/server
example/cluster
tools/raft-benchmark
tmp
conftest*
docs/build
include/raft.h
raft-0.22.1/.readthedocs.yaml 0000664 0000000 0000000 00000000454 14601504142 0015757 0 ustar 00root root 0000000 0000000 # .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
version: 2
build:
os: ubuntu-22.04
tools:
python: "3.11"
python:
install:
- requirements: docs/requirements.txt
sphinx:
configuration: docs/conf.py
raft-0.22.1/.travis.yml 0000664 0000000 0000000 00000002035 14601504142 0014636 0 ustar 00root root 0000000 0000000 language: c
addons:
apt:
packages:
- lcov
- linux-libc-dev
- libuv1-dev
- btrfs-progs
- xfsprogs
- zfsutils-linux
jobs:
include:
- if: type != pull_request
compiler: gcc
dist: bionic
arch: s390x
- if: type == pull_request
compiler: gcc
dist: bionic
arch: arm64
- if: type != pull_request
compiler: clang
dist: bionic
arch: ppc64le
before_script:
- git clone --depth 1 https://github.com/edlund/amalgamate.git
- export PATH=$PATH:$PWD/amalgamate
script:
- autoreconf -i
- |
if [ $TRAVIS_CPU_ARCH = "s390x" ] || [ $TRAVIS_CPU_ARCH = "arm64" ]; then
./configure --enable-example --enable-debug
else
./configure --enable-example --enable-debug --enable-sanitize
fi
- amalgamate.py --config=amalgamation.json --source=$(pwd)
- $CC raft.c -c -D_GNU_SOURCE -DHAVE_LINUX_AIO_ABI_H -Wall -Wextra -Wpedantic -fpic
- ./test/lib/fs.sh setup
- make check $(./test/lib/fs.sh detect) || (cat ./test-suite.log && false)
- ./test/lib/fs.sh teardown
raft-0.22.1/AUTHORS 0000664 0000000 0000000 00000000360 14601504142 0013574 0 ustar 00root root 0000000 0000000 Unless mentioned otherwise in a specific file's header, all code in this
project is released under the LGPL v3 license.
The list of authors and contributors can be retrieved from the git
commit history and in some cases, the file headers.
raft-0.22.1/LICENSE 0000664 0000000 0000000 00000022011 14601504142 0013526 0 ustar 00root root 0000000 0000000 All files in this repository are licensed as follows. If you contribute
to this repository, it is assumed that you license your contribution
under the same license unless you state otherwise.
All files Copyright (C) 2019 Canonical Ltd. unless otherwise specified in the file.
All files modifications after the 9th of August 2023 Copyright (C) 2023 Free
Ekanayaka unless otherwise specified in the file.
This software is licensed under the LGPLv3, included below.
As a special exception to the GNU Lesser General Public License version 3
("LGPL3"), the copyright holders of this Library give you permission to
convey to a third party a Combined Work that links statically or dynamically
to this Library without providing any Minimal Corresponding Source or
Minimal Application Code as set out in 4d or providing the installation
information set out in section 4e, provided that you comply with the other
provisions of LGPL3 and provided that you meet, for the Application the
terms and conditions of the license(s) which apply to the Application.
Except as stated in this special exception, the provisions of LGPL3 will
continue to comply in full to this Library. If you modify this Library, you
may apply this exception to your version of this Library, but you are not
obliged to do so. If you do not wish to do so, delete this exception
statement from your version. This exception does not (and cannot) modify any
license terms which apply to the Application, with which you must still
comply.
SPDX-License-Identifier: LGPL-3.0-only WITH LGPL-3.0-linking-exception
GNU LESSER GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc.
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
This version of the GNU Lesser General Public License incorporates
the terms and conditions of version 3 of the GNU General Public
License, supplemented by the additional permissions listed below.
0. Additional Definitions.
As used herein, "this License" refers to version 3 of the GNU Lesser
General Public License, and the "GNU GPL" refers to version 3 of the GNU
General Public License.
"The Library" refers to a covered work governed by this License,
other than an Application or a Combined Work as defined below.
An "Application" is any work that makes use of an interface provided
by the Library, but which is not otherwise based on the Library.
Defining a subclass of a class defined by the Library is deemed a mode
of using an interface provided by the Library.
A "Combined Work" is a work produced by combining or linking an
Application with the Library. The particular version of the Library
with which the Combined Work was made is also called the "Linked
Version".
The "Minimal Corresponding Source" for a Combined Work means the
Corresponding Source for the Combined Work, excluding any source code
for portions of the Combined Work that, considered in isolation, are
based on the Application, and not on the Linked Version.
The "Corresponding Application Code" for a Combined Work means the
object code and/or source code for the Application, including any data
and utility programs needed for reproducing the Combined Work from the
Application, but excluding the System Libraries of the Combined Work.
1. Exception to Section 3 of the GNU GPL.
You may convey a covered work under sections 3 and 4 of this License
without being bound by section 3 of the GNU GPL.
2. Conveying Modified Versions.
If you modify a copy of the Library, and, in your modifications, a
facility refers to a function or data to be supplied by an Application
that uses the facility (other than as an argument passed when the
facility is invoked), then you may convey a copy of the modified
version:
a) under this License, provided that you make a good faith effort to
ensure that, in the event an Application does not supply the
function or data, the facility still operates, and performs
whatever part of its purpose remains meaningful, or
b) under the GNU GPL, with none of the additional permissions of
this License applicable to that copy.
3. Object Code Incorporating Material from Library Header Files.
The object code form of an Application may incorporate material from
a header file that is part of the Library. You may convey such object
code under terms of your choice, provided that, if the incorporated
material is not limited to numerical parameters, data structure
layouts and accessors, or small macros, inline functions and templates
(ten or fewer lines in length), you do both of the following:
a) Give prominent notice with each copy of the object code that the
Library is used in it and that the Library and its use are
covered by this License.
b) Accompany the object code with a copy of the GNU GPL and this license
document.
4. Combined Works.
You may convey a Combined Work under terms of your choice that,
taken together, effectively do not restrict modification of the
portions of the Library contained in the Combined Work and reverse
engineering for debugging such modifications, if you also do each of
the following:
a) Give prominent notice with each copy of the Combined Work that
the Library is used in it and that the Library and its use are
covered by this License.
b) Accompany the Combined Work with a copy of the GNU GPL and this license
document.
c) For a Combined Work that displays copyright notices during
execution, include the copyright notice for the Library among
these notices, as well as a reference directing the user to the
copies of the GNU GPL and this license document.
d) Do one of the following:
0) Convey the Minimal Corresponding Source under the terms of this
License, and the Corresponding Application Code in a form
suitable for, and under terms that permit, the user to
recombine or relink the Application with a modified version of
the Linked Version to produce a modified Combined Work, in the
manner specified by section 6 of the GNU GPL for conveying
Corresponding Source.
1) Use a suitable shared library mechanism for linking with the
Library. A suitable mechanism is one that (a) uses at run time
a copy of the Library already present on the user's computer
system, and (b) will operate properly with a modified version
of the Library that is interface-compatible with the Linked
Version.
e) Provide Installation Information, but only if you would otherwise
be required to provide such information under section 6 of the
GNU GPL, and only to the extent that such information is
necessary to install and execute a modified version of the
Combined Work produced by recombining or relinking the
Application with a modified version of the Linked Version. (If
you use option 4d0, the Installation Information must accompany
the Minimal Corresponding Source and Corresponding Application
Code. If you use option 4d1, you must provide the Installation
Information in the manner specified by section 6 of the GNU GPL
for conveying Corresponding Source.)
5. Combined Libraries.
You may place library facilities that are a work based on the
Library side by side in a single library together with other library
facilities that are not Applications and are not covered by this
License, and convey such a combined library under terms of your
choice, if you do both of the following:
a) Accompany the combined library with a copy of the same work based
on the Library, uncombined with any other library facilities,
conveyed under the terms of this License.
b) Give prominent notice with the combined library that part of it
is a work based on the Library, and explaining where to find the
accompanying uncombined form of the same work.
6. Revised Versions of the GNU Lesser General Public License.
The Free Software Foundation may publish revised and/or new versions
of the GNU Lesser General Public License from time to time. Such new
versions will be similar in spirit to the present version, but may
differ in detail to address new problems or concerns.
Each version is given a distinguishing version number. If the
Library as you received it specifies that a certain numbered version
of the GNU Lesser General Public License "or any later version"
applies to it, you have the option of following the terms and
conditions either of that published version or of any later version
published by the Free Software Foundation. If the Library as you
received it does not specify a version number of the GNU Lesser
General Public License, you may choose any version of the GNU Lesser
General Public License ever published by the Free Software Foundation.
If the Library as you received it specifies that a proxy can decide
whether future versions of the GNU Lesser General Public License shall
apply, that proxy's public statement of acceptance of any version is
permanent authorization for you to choose that version for the
Library.
raft-0.22.1/Makefile.am 0000664 0000000 0000000 00000016153 14601504142 0014567 0 ustar 00root root 0000000 0000000 ACLOCAL_AMFLAGS = -I m4
# Not clobbering the base pointer helps bpftrace construct backtraces
AM_CFLAGS += -fno-omit-frame-pointer
include_HEADERS = include/raft.h
raftincludedir = $(includedir)/raft
raftinclude_HEADERS =
lib_LTLIBRARIES = libraft.la
libraft_la_CFLAGS = $(AM_CFLAGS) $(CODE_COVERAGE_CFLAGS) -fvisibility=hidden
libraft_la_LDFLAGS = -version-info 0:0:0
libraft_la_SOURCES = \
src/byte.c \
src/client.c \
src/compress.c \
src/configuration.c \
src/convert.c \
src/election.c \
src/entry.c \
src/err.c \
src/heap.c \
src/membership.c \
src/message.c \
src/progress.c \
src/random.c \
src/raft.c \
src/recv.c \
src/recv_append_entries.c \
src/recv_append_entries_result.c \
src/recv_request_vote.c \
src/recv_request_vote_result.c \
src/recv_install_snapshot.c \
src/recv_timeout_now.c \
src/restore.c \
src/replication.c \
src/state.c \
src/syscall.c \
src/timeout.c \
src/tracing.c \
src/trail.c
bin_PROGRAMS =
check_PROGRAMS = \
test/unit/core \
test/integration/core
TESTS = $(check_PROGRAMS)
check_LTLIBRARIES = libtest.la
libtest_la_CFLAGS = $(AM_CFLAGS) -DMUNIT_TEST_NAME_LEN=60 -Wno-unused-result -Wno-conversion
libtest_la_SOURCES = \
test/lib/addrinfo.c \
test/lib/cluster.c \
test/lib/fault.c \
test/lib/heap.c \
test/lib/munit.c \
test/lib/tracer.c \
test/lib/tcp.c
test_unit_core_SOURCES = \
src/byte.c \
src/compress.c \
src/configuration.c \
src/err.c \
src/heap.c \
src/log.c \
src/random.c \
src/trail.c \
test/unit/main_core.c \
test/unit/test_byte.c \
test/unit/test_compress.c \
test/unit/test_configuration.c \
test/unit/test_err.c \
test/unit/test_log.c \
test/unit/test_queue.c \
test/unit/test_random.c \
test/unit/test_trail.c
test_unit_core_CFLAGS = $(AM_CFLAGS) $(CODE_COVERAGE_CFLAGS) -Wno-conversion
test_unit_core_LDADD = libtest.la
test_integration_core_SOURCES = \
test/integration/main_core.c \
test/integration/test_catch_up.c \
test/integration/test_digest.c \
test/integration/test_election.c \
test/integration/test_heap.c \
test/integration/test_init.c \
test/integration/test_membership.c \
test/integration/test_replication.c \
test/integration/test_snapshot.c \
test/integration/test_start.c \
test/integration/test_strerror.c \
test/integration/test_submit.c \
test/integration/test_tick.c \
test/integration/test_transfer.c
test_integration_core_CFLAGS = $(AM_CFLAGS) -Wno-conversion
test_integration_core_LDFLAGS = -no-install
test_integration_core_LDADD = libtest.la libraft.la
if LZ4_AVAILABLE
test_unit_core_CFLAGS += -DLZ4_AVAILABLE $(LZ4_CFLAGS)
test_unit_core_LDFLAGS = $(LZ4_LIBS)
libraft_la_CFLAGS += -DLZ4_AVAILABLE $(LZ4_CFLAGS)
libraft_la_LDFLAGS += $(LZ4_LIBS)
endif # LZ4_AVAILABLE
if V0_ENABLED
libraft_la_SOURCES += \
src/log.c \
src/legacy.c \
src/snapshot.c
libraft_la_CFLAGS += -DV0_ENABLED
libtest_la_SOURCES += \
test/lib/fsm.c \
test/lib/legacy.c
test_integration_core_SOURCES += \
test/integration/test_legacy.c
endif # V0_ENABLED
if FIXTURE_ENABLED
libraft_la_SOURCES += src/fixture.c
raftinclude_HEADERS += include/raft/fixture.h
check_PROGRAMS += \
test/fuzzy/core
test_integration_core_SOURCES += \
test/integration/test_fixture.c
test_fuzzy_core_SOURCES = \
test/fuzzy/main_core.c \
test/fuzzy/test_election.c \
test/fuzzy/test_liveness.c \
test/fuzzy/test_membership.c \
test/fuzzy/test_replication.c
test_fuzzy_core_CFLAGS = $(AM_CFLAGS) -Wno-conversion
test_fuzzy_core_LDFLAGS = -no-install
test_fuzzy_core_LDADD = libtest.la libraft.la
endif # FIXTURE_ENABLED
if UV_ENABLED
libraft_la_SOURCES += \
src/uv.c \
src/uv_append.c \
src/uv_encoding.c \
src/uv_finalize.c \
src/uv_fs.c \
src/uv_ip.c \
src/uv_list.c \
src/uv_metadata.c \
src/uv_os.c \
src/uv_prepare.c \
src/uv_recv.c \
src/uv_segment.c \
src/uv_send.c \
src/uv_snapshot.c \
src/uv_tcp.c \
src/uv_tcp_listen.c \
src/uv_tcp_connect.c \
src/uv_truncate.c \
src/uv_writer.c
libraft_la_LDFLAGS += $(UV_LIBS)
raftinclude_HEADERS += include/raft/uv.h
check_PROGRAMS += \
test/unit/uv \
test/integration/uv
libtest_la_SOURCES += \
test/lib/aio.c \
test/lib/dir.c \
test/lib/tcp.c \
test/lib/loop.c
test_unit_uv_SOURCES = \
src/err.c \
src/heap.c \
src/syscall.c \
src/tracing.c \
src/uv_fs.c \
src/uv_os.c \
src/uv_writer.c \
test/unit/main_uv.c \
test/unit/test_uv_fs.c \
test/unit/test_uv_os.c \
test/unit/test_uv_writer.c
test_unit_uv_CFLAGS = $(AM_CFLAGS) -Wno-conversion
test_unit_uv_LDADD = libtest.la $(UV_LIBS)
test_integration_uv_SOURCES = \
test/integration/main_uv.c \
test/integration/test_uv_init.c \
test/integration/test_uv_append.c \
test/integration/test_uv_bootstrap.c \
test/integration/test_uv_load.c \
test/integration/test_uv_recover.c \
test/integration/test_uv_recv.c \
test/integration/test_uv_send.c \
test/integration/test_uv_set_term.c \
test/integration/test_uv_tcp_connect.c \
test/integration/test_uv_tcp_listen.c \
test/integration/test_uv_snapshot_put.c \
test/integration/test_uv_truncate.c \
test/integration/test_uv_truncate_snapshot.c
test_integration_uv_CFLAGS = $(AM_CFLAGS) -Wno-type-limits -Wno-conversion
test_integration_uv_LDFLAGS = -no-install
test_integration_uv_LDADD = libtest.la libraft.la $(UV_LIBS)
AM_CFLAGS += $(UV_CFLAGS)
if LZ4_AVAILABLE
test_integration_uv_CFLAGS += -DLZ4_AVAILABLE
test_integration_uv_LDFLAGS += $(LZ4_LIBS)
endif # LZ4_AVAILABLE
endif # UV_ENABLED
if BACKTRACE_ENABLED
libraft_la_CFLAGS += -DRAFT_ASSERT_WITH_BACKTRACE
libraft_la_LDFLAGS += -lbacktrace
endif # BACKTRACE_ENABLED
if EXAMPLE_ENABLED
bin_PROGRAMS += \
example/server \
example/cluster
example_server_SOURCES = example/server.c
example_server_LDFLAGS = -no-install
example_server_LDADD = libraft.la $(UV_LIBS)
example_cluster_SOURCES = example/cluster.c
endif # EXAMPLE_ENABLED
if BENCHMARK_ENABLED
bin_PROGRAMS += \
tools/raft-benchmark
tools_raft_benchmark_SOURCES = \
tools/benchmark/disk.c \
tools/benchmark/disk_parse.c \
tools/benchmark/disk_uring.c \
tools/benchmark/fs.c \
tools/benchmark/main.c \
tools/benchmark/report.c \
tools/benchmark/submit_parse.c \
tools/benchmark/submit.c \
tools/benchmark/profiler.c \
tools/benchmark/timer.c
tools_raft_benchmark_LDFLAGS =
tools_raft_benchmark_LDADD = libraft.la $(UV_LIBS)
endif # BENCHMARK_ENABLED
if DEBUG_ENABLED
AM_CFLAGS += -Werror -Wall
else
AM_CFLAGS += -DNDEBUG
endif
if SANITIZE_ENABLED
AM_CFLAGS += -fsanitize=address
endif
if CODE_COVERAGE_ENABLED
include $(top_srcdir)/aminclude_static.am
CODE_COVERAGE_DIRECTORY=./src
CODE_COVERAGE_OUTPUT_DIRECTORY=coverage
CODE_COVERAGE_OUTPUT_FILE=coverage.info
if LCOV_VERSION_2
CODE_COVERAGE_LCOV_SHOPTS_DEFAULT=--ignore-errors unused,unused --ignore-errors gcov,gcov --rc branch_coverage=1
endif
CODE_COVERAGE_IGNORE_PATTERN="/usr/include/*"
CODE_COVERAGE_BRANCH_COVERAGE=1
CODE_COVERAGE_LCOV_OPTIONS=$(CODE_COVERAGE_LCOV_OPTIONS_DEFAULT) --rc lcov_excl_br_line="assert\("
clean-local: code-coverage-clean
distclean-local: code-coverage-dist-clean
endif # CODE_COVERAGE_ENABLED
pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = @PACKAGE_NAME@.pc
raft-0.22.1/README.md 0000664 0000000 0000000 00000003665 14601504142 0014016 0 ustar 00root root 0000000 0000000 [](https://github.com/cowsql/raft/actions/workflows/tests.yml) [](https://codecov.io/gh/cowsql/raft) [](https://raft.readthedocs.io/en/latest/?badge=latest) [](https://scan.coverity.com/projects/cowsql-raft)
Production grade asynchronous C implementation of the Raft consensus protocol.
Documentation
-------------
See [readthedocs](https://raft.readthedocs.io/) for the full documentation.
Building
---------
```bash
autoreconf -i
./configure
make
```
History
-------
This library is a fork of [Canonical's](https://github.com/canonical/raft) Raft
implementation, which was originally written by this library's author
[himself](https://github.com/canonical/raft/commits?author=freeekanayaka) while
working at Canonical.
It is a **fully compatible drop-in replacement** of Canonical's version, at
least up to v0.18.0.
License
-------
This raft C library is released under a slightly modified version of LGPLv3,
that includes a copyright exception letting users statically link the library
code in their project and release the final work under their own terms. See the
full [license](./LICENSE) text.
Notable users
-------------
- [cowsql](https://github.com/cowsql/cowsql)
Credits
-------
Of course the biggest thanks goes to Diego Ongaro :) (the original author of the
Raft dissertation).
A lot of ideas and inspiration was taken from other Raft implementations such
as:
- CoreOS' Go implementation for [etcd](https://github.com/etcd-io/etcd/tree/master/raft)
- Hashicorp's Go [raft](https://github.com/hashicorp/raft)
- Willem's [C implementation](https://github.com/willemt/raft)
- LogCabin's [C++ implementation](https://github.com/logcabin/logcabin)
raft-0.22.1/ac/ 0000775 0000000 0000000 00000000000 14601504142 0013110 5 ustar 00root root 0000000 0000000 raft-0.22.1/ac/.gitignore 0000664 0000000 0000000 00000000016 14601504142 0015075 0 ustar 00root root 0000000 0000000 *
!.gitignore
raft-0.22.1/amalgamation.json 0000664 0000000 0000000 00000002031 14601504142 0016046 0 ustar 00root root 0000000 0000000 {
"target": "raft.c",
"sources": [
"src/byte.c",
"src/client.c",
"src/compress.c",
"src/configuration.c",
"src/convert.c",
"src/election.c",
"src/entry.c",
"src/err.c",
"src/fixture.c",
"src/heap.c",
"src/legacy.c",
"src/log.c",
"src/membership.c",
"src/progress.c",
"src/raft.c",
"src/recv.c",
"src/recv_append_entries.c",
"src/recv_append_entries_result.c",
"src/recv_install_snapshot.c",
"src/recv_request_vote.c",
"src/recv_request_vote_result.c",
"src/replication.c",
"src/snapshot.c",
"src/restore.c",
"src/state.c",
"src/syscall.c",
"src/message.c",
"src/timeout.c",
"src/tracing.c",
"src/uv.c",
"src/uv_append.c",
"src/uv_encoding.c",
"src/uv_finalize.c",
"src/uv_fs.c",
"src/uv_ip.c",
"src/uv_list.c",
"src/uv_metadata.c",
"src/uv_os.c",
"src/uv_prepare.c",
"src/uv_recv.c",
"src/uv_segment.c",
"src/uv_send.c",
"src/uv_snapshot.c",
"src/uv_tcp.c",
"src/uv_tcp_connect.c",
"src/uv_tcp_listen.c",
"src/uv_truncate.c",
"src/uv_writer.c"
],
"include_paths": [
"include"
]
}
raft-0.22.1/configure.ac 0000664 0000000 0000000 00000014417 14601504142 0015022 0 ustar 00root root 0000000 0000000 AC_PREREQ(2.60)
AC_INIT([raft], [0.22.1])
AC_LANG([C])
AC_CONFIG_MACRO_DIR([m4])
AC_CONFIG_AUX_DIR([ac])
AM_INIT_AUTOMAKE([subdir-objects -Wall -Wno-portability foreign])
AM_SILENT_RULES([yes])
AC_USE_SYSTEM_EXTENSIONS # Defines _GNU_SOURCE and similar
LT_INIT
# The libuv raft_io implementation is built by default if libuv is found, unless
# explicitly disabled.
AC_ARG_ENABLE(uv, AS_HELP_STRING([--disable-uv], [do not build the libuv-based raft_io implementation]))
AS_IF([test "x$enable_uv" != "xno"],
[PKG_CHECK_MODULES(UV, [libuv >= 1.18.0], [have_uv=yes], [have_uv=no])],
[have_uv=no])
AS_IF([test "x$enable_uv" = "xyes" -a "x$have_uv" = "xno"], [AC_MSG_ERROR([libuv required but not found])], [])
AM_CONDITIONAL(UV_ENABLED, test "x$have_uv" = "xyes")
# Automatically link to liblz4 if it's present.
#
# We try to detect if lz4 is installed only if the libuv raft_io implementation
# is enabled, since that's the only place where we use lz4.
AC_ARG_WITH([lz4], AS_HELP_STRING([--without-lz4], [never link to liblz4]))
AS_IF([test "x$have_uv" = "xyes"],
# libuv is used
[AS_IF([test "x$with_lz4" != "xno"],
[PKG_CHECK_MODULES(LZ4, [liblz4 >= 1.7.1], [have_lz4=yes], [have_lz4=no])],
[have_lz4=no])
AS_IF([test "x$with_lz4" = "xyes" -a "x$have_lz4" = "xno"],
[AC_MSG_ERROR([liblz4 required but not found])],
[])],
# libuv is not used
[AS_IF([test "x$with_lz4" = "xyes"],
[AC_MSG_ERROR([liblz4 can be used only if libuv is used too])],
[])
have_lz4=no])
AM_CONDITIONAL(LZ4_AVAILABLE, test "x$have_lz4" = "xyes")
AC_ARG_ENABLE(backtrace, AS_HELP_STRING([--enable-backtrace[=ARG]], [print backtrace on assertion failure [default=no]]))
AM_CONDITIONAL(BACKTRACE_ENABLED, test "x$enable_backtrace" = "xyes")
# The fake I/O implementation and associated fixture is built by default, unless
# explicitly disabled.
AC_ARG_ENABLE(fixture, AS_HELP_STRING([--disable-fixture], [do not build the raft_fixture test helper]))
AM_CONDITIONAL(FIXTURE_ENABLED, test "x$enable_fixture" != "xno")
# The example program is optional.
AC_ARG_ENABLE(example, AS_HELP_STRING([--enable-example[=ARG]], [build the example program [default=no]]))
AS_IF([test "x$enable_example" = "xyes" -a "x$have_uv" = "xno"], [AC_MSG_ERROR([example program requires libuv])], [])
AM_CONDITIONAL(EXAMPLE_ENABLED, test "x$enable_example" = "xyes")
# The benchmark programs are optional.
AC_ARG_ENABLE(benchmark, AS_HELP_STRING([--enable-benchmark[=ARG]], [build the benchmark programs [default=no]]))
AM_CONDITIONAL(BENCHMARK_ENABLED, test "x$enable_benchmark" = "xyes")
# Whether to enable debugging code.
AC_ARG_ENABLE(debug, AS_HELP_STRING([--enable-debug[=ARG]], [enable debugging [default=no]]))
AM_CONDITIONAL(DEBUG_ENABLED, test "x$enable_debug" = "xyes")
# Whether to enable compile-time hardening options.
AC_ARG_ENABLE(hardening, AS_HELP_STRING([--disable-hardening], [disable compile-time hardening options]))
# Whether to enable memory sanitizer.
AC_ARG_ENABLE(sanitize, AS_HELP_STRING([--enable-sanitize[=ARG]], [enable code sanitizers [default=no]]))
AM_CONDITIONAL(SANITIZE_ENABLED, test x"$enable_sanitize" = x"yes")
# Whether to enable compatibility with the legacy v0 API.
AC_ARG_ENABLE(v0, AS_HELP_STRING([--enable-v0[=ARG]], [enable compatibility with v0 API [default=yes]]))
AM_CONDITIONAL(V0_ENABLED, test "x$enable_v0" != "xno")
# Whether to enable code coverage.
AX_CODE_COVERAGE
# Check if lcov >= 2.0
AS_IF([test x"$enable_code_coverage" = xyes],
[AX_COMPARE_VERSION($(lcov --version|cut -f 4 -d " " | cut -f 1 -d -), [ge], [2.0],
[have_lcov_2=yes], [have_lcov_2=no])
],
[have_lcov_2=no])
AM_CONDITIONAL(LCOV_VERSION_2, test x"$have_lcov_2" = x"yes")
# Checks for header files.
AC_CHECK_HEADERS([stdlib.h string.h stdio.h assert.h unistd.h linux/io_uring.h linux/aio_abi.h])
# Checks for library functions and definitions.
# RWF_NOWAIT is declared in <linux/fs.h>; the header name was missing from the
# include list, which would make this configure-time check always fail.
AC_CHECK_DECLS(RWF_NOWAIT, [], [AC_MSG_ERROR(Linux kernel >= 4.14 required.)], [#include <linux/fs.h>])
# Check if zfs >= 0.8.0 is available (for direct I/O support).
AC_CHECK_PROG(have_zfs, zfs, yes)
AS_IF([test x"$have_zfs" = x"yes"],
[AX_COMPARE_VERSION($(cat /sys/module/zfs/version | cut -f 1 -d -), [ge], [0.8.0],
[AC_DEFINE(RAFT_HAVE_ZFS_WITH_DIRECT_IO)], [])
],
[])
# Checks for typedefs, structures, and compiler characteristics.
AC_TYPE_SIZE_T
AC_TYPE_SSIZE_T
AC_TYPE_UINT8_T
AC_TYPE_UINT16_T
AC_TYPE_UINT32_T
AC_TYPE_UINT64_T
# Enable large file support. This is mandatory in order to interoperate with
# libuv, which enables large file support by default, making the size of 'off_t'
# on 32-bit architecture be 8 bytes instead of the normal 4.
AC_SYS_LARGEFILE
CC_CHECK_FLAGS_APPEND([AM_CFLAGS],[CFLAGS],[ \
-std=c11 \
-g \
-pipe \
])
AS_IF([test "x$enable_hardening" != "xno"],
[CC_CHECK_FLAGS_APPEND([AM_CFLAGS],[CFLAGS],[ \
-fcf-protection \
--param=ssp-buffer-size=4 \
-fdiagnostics-color \
-fexceptions \
-fstack-clash-protection \
-fstack-protector-strong \
-fasynchronous-unwind-tables \
-fdiagnostics-show-option \
-Wall \
-Wextra \
-Wpedantic \
-Wimplicit-fallthrough=5 \
-Wcast-align \
-Wstrict-prototypes \
-Wlogical-op \
-Wmissing-include-dirs \
-Wold-style-definition \
-Winit-self \
-Wfloat-equal \
-Wsuggest-attribute=noreturn \
-Wformat=2 \
-Wendif-labels \
-Wdate-time \
-Wnested-externs \
-Wconversion \
-Werror=implicit-function-declaration \
-Wunused-but-set-variable \
-Werror=return-type \
-Werror=incompatible-pointer-types \
-Wshadow \
-Werror=overflow \
-Werror=shift-count-overflow \
-Werror=shift-overflow=2 \
-Warray-bounds \
-Wrestrict \
-Wreturn-local-addr \
-Wstringop-overflow \
])],
[])
AC_SUBST(AM_CFLAGS)
AS_IF([test "x$enable_hardening" != "xno"],
      [CC_CHECK_FLAGS_APPEND([AM_LDFLAGS],[LDFLAGS],[ \
        -z relro \
        -z now \
        -fstack-protector-strong \
        --param=ssp-buffer-size=4 \
      ])],
      [])
# Fix: was AC_SUBST(AM_LDLAGS), a typo that left @AM_LDFLAGS@ unsubstituted,
# so the hardening linker flags collected above were never propagated to the
# generated Makefiles.
AC_SUBST(AM_LDFLAGS)
AC_SUBST(enable_v0)
AC_CONFIG_FILES([include/raft.h raft.pc Makefile])
AC_OUTPUT
raft-0.22.1/docs/ 0000775 0000000 0000000 00000000000 14601504142 0013455 5 ustar 00root root 0000000 0000000 raft-0.22.1/docs/_static/ 0000775 0000000 0000000 00000000000 14601504142 0015103 5 ustar 00root root 0000000 0000000 raft-0.22.1/docs/_static/css/ 0000775 0000000 0000000 00000000000 14601504142 0015673 5 ustar 00root root 0000000 0000000 raft-0.22.1/docs/_static/css/custom.css 0000664 0000000 0000000 00000000425 14601504142 0017720 0 ustar 00root root 0000000 0000000 html {
font-size: 100%;
}
h1 {
font-size: 1.5em;
}
h2 {
font-size: 1.2em;
}
h3 {
font-size: 1.0em;
}
/* Hide ads. TODO: is there a way to disable them altogether? */
#furo-sidebar-ad-placement {
display: none;
}
.sidebar-container {
width: 20em;
}
raft-0.22.1/docs/algorithm.rst 0000664 0000000 0000000 00000006425 14601504142 0016204 0 ustar 00root root 0000000 0000000 .. _algorithm:
:c:struct:`raft` --- Algorithm state
====================================
The :c:struct:`raft` struct is the central part of C-Raft. It holds and drives
the state of a single Raft server in a cluster.
It is purely a finite state machine, and it doesn't perform any I/O or system
calls.
The :c:func:`raft_step()` function is used to advance the state of a
:c:struct:`raft` state machine, and is designed to be integrated in some
external event loop or I/O layer that is in charge of receiving users requests,
implementing network communication with other Raft servers, persisting data to
disk.
For example:
.. code-block:: C
/* A RequestVote RPC message has been received from the network. We inform
* struct raft about that by passing a struct raft_event to raft_step(). */
struct raft raft;
struct raft_event event;
struct raft_update update;
event.type = RAFT_RECEIVE;
event.receive.message = ...; /* Fill with the content of the message */
raft_step(&raft, &event, &update);
/* The struct raft_update object contains information about the next actions
* the I/O layer should perform, for example it might contain new messages to
* be sent. */
if (update.flags & RAFT_UPDATE_MESSAGES) {
for (unsigned i = 0; i < update.messages.n; i++) {
/* Send the message contained in update.messages.batch[i] */
}
}
Basically whenever an event occurs in the I/O layer, the :c:func:`raft_step()`
function must be called and the resulting state updates should be performed.
See the `External events`_ section for details about what events to pass to the
step function in order to drive the state machine forward, and `State updates`_
for details about how state updates should be processed after calling the step
function.
.. _External events: ./events.html
.. _State updates: ./updates.html
Data types
----------
.. c:enum:: raft_state
Type code for the possible states a :c:struct:`raft` struct can be in.
.. code-block:: C
enum raft_state {
RAFT_FOLLOWER = 1,
RAFT_CANDIDATE,
RAFT_LEADER
};
.. c:struct:: raft
A single raft server in a cluster.
Public members
^^^^^^^^^^^^^^
.. c:member:: raft_id raft.id
Server ID. Readonly.
API
---
.. c:function:: int raft_init(struct raft *r, raft_id id, const char *address)
Initialize a raft state machine.
.. c:function:: int raft_close(struct raft* r)
Close a raft state machine, releasing all memory it uses.
.. c:function:: int raft_step(struct raft* r, struct raft_event *event, struct raft_update *update)
Advance the state of the given raft state machine.
.. c:function:: raft_term raft_current_term(const struct raft *r)
Return the current term of this server.
.. c:function:: raft_id raft_voted_for(const struct raft *r)
Return the ID of the server that this server has voted for, or :c:expr:`0` if
it did not vote.
.. c:function:: enum raft_state raft_state(struct raft *r)
Return the code of the current Raft state (follower/candidate/leader).
.. c:function:: raft_index raft_commit_index(const struct raft *r);
Return the commit index of this server.
.. c:function:: raft_time raft_timeout(const struct raft *r)
Return the time at which the next :c:macro:`RAFT_TIMEOUT` event should be
fired.
raft-0.22.1/docs/conf.py 0000664 0000000 0000000 00000004271 14601504142 0014760 0 ustar 00root root 0000000 0000000 # The master toctree document.
master_doc = 'index'

# General information about the project.
project = u'C-Raft'
copyright = u'2023-present, Free Ekanayaka'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version. Kept in sync with AC_INIT in configure.ac (0.22.1);
# this was stale at 0.22.0.
version = u'0.22.1'
# The full version, including alpha/beta/rc tags.
release = version

# The name of the Pygments (syntax highlighting) style to use.
# pygments_style = 'sphinx'

# -- Options for HTML output ----------------------------------------------

# The theme to use for HTML and HTML Help pages: prefer "furo" when it is
# installed, fall back to "alabaster", and finally to the builtin "classic".
try:
    import furo
    html_theme = 'furo'
except ImportError:
    try:
        import alabaster
        html_theme = 'alabaster'
    except ImportError:
        # Fix: this branch used to assign the misspelled name "html_them",
        # so the fallback never actually configured a theme.
        html_theme = 'classic'
html_theme_path = ['_themes']

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}

# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
html_title = 'C-Raft'

# A shorter title for the navigation bar. Default is the same as html_title.
html_short_title = 'C-Raft'

# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = 'static/logo.png'

# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = 'static/favicon.ico'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_css_files = [
    'css/custom.css',
]

# Output file base name for HTML help builder.
htmlhelp_basename = 'raft'
raft-0.22.1/docs/core.rst 0000664 0000000 0000000 00000000143 14601504142 0015135 0 ustar 00root root 0000000 0000000 .. _core:
Core API
========
.. toctree::
algorithm
events
updates
messages
types
raft-0.22.1/docs/disk-format.rst 0000664 0000000 0000000 00000006333 14601504142 0016434 0 ustar 00root root 0000000 0000000 Disk format
===========
The implementation of metadata and log persistency is virtually the same as the
one found in `LogCabin`_.
The disk files consist of metadata files, closed segments, and open
segments. Metadata files are used to track Raft metadata, such as the server's
current term, vote, and log's start index. Segments contain contiguous entries
that are part of the log. Closed segments are never written to again (but may be
renamed and truncated if a suffix of the log is truncated). Open segments are
where newly appended entries go. Once an open segment reaches the maximum
allowed size, it is closed and a new one is used. There are usually about 3 open
segments at any given time, the one with the lower index is the one actively
being written, and the other ones have been fallocate'd and are ready to be used
as soon as the active one gets closed.
Metadata files are named "metadata1" and "metadata2". The code alternates
between these so that there is always at least one readable metadata file.
On startup, the readable metadata file with the higher version number is used.
The format of a metadata file is:
* [8 bytes] Format (currently 1).
* [8 bytes] Incremental version number.
* [8 bytes] Current term.
* [8 bytes] ID of server we voted for.
All values are in little endian encoding.
Closed segments are named by the format string "%lu-%lu" with their start and
end indexes, both inclusive. Closed segments always contain at least one entry,
and the end index is always at least as large as the start index. Closed segment
files may occasionally include data past their filename's end index (these are
ignored but a warning is logged). This can happen if the suffix of the segment
is truncated and a crash occurs at an inopportune time (the segment file is
first renamed, then truncated, and a crash occurs in between).
Open segments are named by the format string "open-%lu" with a unique
number. These should not exist when the server shuts down cleanly, but they
exist while the server is running and may be left around during a crash. Open
segments either contain entries which come after the last closed segment or are
full of zeros. When the server crashes while appending to an open segment, the
end of that file may be corrupt. We can't distinguish between a corrupt file and
a partially written entry. The code assumes it's a partially written entry, logs
a warning, and ignores it.
Truncating a suffix of the log will remove all entries that are no longer part
of the log. Truncating a prefix of the log will only remove complete segments
that are before the new log start index. For example, if a segment has entries
10 through 20 and the prefix of the log is truncated to start at entry 15, that
entire segment will be retained.
Each segment file starts with a segment header, which currently contains just an
8-byte version number for the format of that segment. The current format
(version 1) is just a concatenation of serialized entry batches.
Each batch has the following format:
* [4 bytes] CRC32 checksum of the batch header, little endian.
* [4 bytes] CRC32 checksum of the batch data, little endian.
* [ ... ] Batch of one or more entries.
.. _LogCabin: https://github.com/logcabin/logcabin/blob/master/Storage/SegmentedLog.h
raft-0.22.1/docs/events.rst 0000664 0000000 0000000 00000015142 14601504142 0015516 0 ustar 00root root 0000000 0000000 .. _events:
:c:struct:`raft_event` --- External events
==========================================
Information about new events that should be passed to :c:func:`raft_step()`.
Data types
----------
.. c:enum:: raft_event_type
Event type codes.
.. code-block:: C
enum raft_event_type {
RAFT_START = 1, /* Initial event starting loading persisted data */
RAFT_RECEIVE, /* A message has been received from another server */
RAFT_PERSISTED_ENTRIES, /* Some entries have been persisted to disk */
RAFT_PERSISTED_SNAPSHOT, /* A snapshot has been persisted */
RAFT_CONFIGURATION, /* A new committed configuration must be applied */
RAFT_SNAPSHOT, /* A snapshot has been taken */
RAFT_TIMEOUT, /* The timeout has expired */
RAFT_SUBMIT, /* New entries have been submitted */
RAFT_CATCH_UP, /* Start catching-up a server */
RAFT_TRANSFER /* Start transferring leadership to another server */
};
.. c:struct:: raft_event
The :c:struct:`raft_event` struct holds information about events such as:
- a new message has been received from another server
- disk I/O has been completed for persisting data
- new entries have been submitted for replication
Users of the core :c:struct:`raft` struct are responsible for implementing
an I/O layer that watches for the above events, filling :c:struct:`raft_event`
objects as appropriate and passing them to the :c:func:`raft_step()`
function.
Each :c:enum:`raft_event_type` has an associated sub-struct, whose fields
are described separately in the sections below.
.. code-block:: C
struct raft_event
{
raft_time time; /* Must be filled with the current time */
enum raft_event_type type; /* Must be filled with the type code of the event */
unsigned char unused;
unsigned short capacity; /* Disk capacity that has been reserved */
unsigned char reserved[4];
union { /* Additional data about a specific event type */
struct { ... } start;
struct { ... } receive;
struct { ... } persisted_entries;
struct { ... } persisted_snapshot;
struct { ... } configuration;
struct { ... } snapshot;
struct { ... } submit;
struct { ... } catch_up;
struct { ... } transfer;
};
};
Common fields
^^^^^^^^^^^^^
.. c:member:: raft_time raft_event.time
Event timestamp. Must always be filled with the current time.
.. c:member:: enum raft_event_type raft_event.type
Event type. Must be filled with the type code of the event.
.. c:member:: unsigned short raft_event.capacity
Disk capacity that has been reserved and is guaranteed to be available.
Start
^^^^^
.. c:member:: struct @0 raft_event.start
To be filled when :c:struct:`raft_event.type` is :c:enum:`RAFT_START`.
It contains all state persisted on disk by the server.
.. code-block:: C
struct
{
raft_term term; /* Current term */
raft_id voted_for; /* Current vote */
struct raft_snapshot_metadata *metadata; /* Last snapshot, if any */
raft_index start_index; /* Index of first entry */
struct raft_entry *entries; /* Array of persisted entries */
unsigned n_entries; /* Length of entries array */
} start;
Receive
^^^^^^^
.. c:member:: struct @0 raft_event.receive
To be filled when :c:struct:`raft_event.type` is :c:enum:`RAFT_RECEIVE`.
It contains the :c:struct:`raft_message` being received.
.. code-block:: C
struct
{
struct raft_message *message; /* Message being received */
} receive;
Persisted entries
^^^^^^^^^^^^^^^^^
.. c:member:: struct @0 raft_event.persisted_entries
To be filled when :c:struct:`raft_event.type` is :c:enum:`RAFT_PERSISTED_ENTRIES`.
It contains the latest log index that has been successfully persisted.
.. code-block:: C
struct
{
raft_index index; /* Highest index persisted */
} persisted_entries;
Persisted snapshot
^^^^^^^^^^^^^^^^^^
.. c:member:: struct @0 raft_event.persisted_snapshot
To be filled when :c:struct:`raft_event.type` is :c:enum:`RAFT_PERSISTED_SNAPSHOT`.
It contains metadata about the latest snapshot that has been successfully
persisted.
.. code-block:: C
struct
{
struct raft_snapshot_metadata metadata;
size_t offset;
bool last;
} persisted_snapshot;
Configuration
^^^^^^^^^^^^^
.. c:member:: struct @0 raft_event.configuration
To be filled when :c:struct:`raft_event.type` is :c:enum:`RAFT_CONFIGURATION`.
It contains the last committed configuration that has been processed.
.. code-block:: C
struct
{
raft_index index;
struct raft_configuration conf;
} configuration;
Snapshot taken
^^^^^^^^^^^^^^
.. c:member:: struct @0 raft_event.snapshot
To be filled when :c:struct:`raft_event.type` is :c:enum:`RAFT_SNAPSHOT`.
It contains metadata about the last snapshot that has been taken.
.. code-block:: C
struct
{
struct raft_snapshot_metadata metadata; /* Snapshot metadata */
unsigned trailing; /* Trailing entries kept */
} snapshot;
Submit
^^^^^^
.. c:member:: struct @0 raft_event.submit
To be filled when :c:struct:`raft_event.type` is :c:enum:`RAFT_SUBMIT`.
It contains new entries that have been submitted for replication.
.. code-block:: C
struct
{
struct raft_entry *entries;
unsigned n;
} submit;
Catch-up server
^^^^^^^^^^^^^^^
.. c:member:: struct @0 raft_event.catch_up
To be filled when :c:struct:`raft_event.type` is :c:enum:`RAFT_CATCH_UP`.
It contains the ID of a server that should be caught-up with the leader log.
.. code-block:: C
struct
{
raft_id server_id;
} catch_up;
Transfer leadership
^^^^^^^^^^^^^^^^^^^
.. c:member:: struct @0 raft_event.transfer
To be filled when :c:struct:`raft_event.type` is :c:enum:`RAFT_TRANSFER`.
It contains the ID of a server that leadership should be transferred to.
.. code-block:: C
struct
{
raft_id server_id;
} transfer;
raft-0.22.1/docs/index.rst 0000664 0000000 0000000 00000003067 14601504142 0015324 0 ustar 00root root 0000000 0000000 C-Raft
======
Production grade asynchronous C implementation of the Raft consensus protocol.
Design
------
The library has a modular design: its core part implements only the core Raft
algorithm logic (no I/O and no system calls). On top of that, various drivers
are provided that implement actual network communication and persistent data
storage.
The core part of the library is designed to work well with asynchronous or
non-blocking I/O engines (such as `libuv`_ and `io_uring`_), although it can be
used in threaded or blocking contexts as well.
.. _libuv: http://libuv.org
.. _io_uring: https://en.wikipedia.org/wiki/Io_uring
Features
--------
C-Raft implements all the basic features described in the Raft dissertation:
* Leader election
* Log replication
* Log compaction
* Membership changes
It also includes a few optional enhancements:
* Optimistic pipelining to reduce log replication latency
* Writing to the leader's disk in parallel
* Automatic stepping down when the leader loses quorum
* Leadership transfer extension
* Non-voting servers
Source
------
The source tree is available on `github`_.
.. _github: https://github.com/cowsql/raft
Licence
-------
This raft C library is released under a slightly modified version of LGPLv3,
that includes a copyright exception letting users statically link the library
code in their project and release the final work under their own terms. See the
full `license`_ text.
.. _license: https://github.com/cowsql/raft/blob/main/LICENSE
.. toctree::
:hidden:
:maxdepth: 1
quick-start
core
disk-format
raft-0.22.1/docs/messages.rst 0000664 0000000 0000000 00000012621 14601504142 0016020 0 ustar 00root root 0000000 0000000 .. _messages:
:c:struct:`raft_message` --- RPC messages
=========================================
The :c:struct:`raft_message` struct holds information about a single RPC message
being received or sent over the network.
Data types
----------
.. c:enum:: raft_message_type
RPC message type codes.
.. code-block:: C
enum raft_message_type {
RAFT_APPEND_ENTRIES = 1,
RAFT_APPEND_ENTRIES_RESULT,
RAFT_REQUEST_VOTE,
RAFT_REQUEST_VOTE_RESULT,
RAFT_INSTALL_SNAPSHOT,
RAFT_TIMEOUT_NOW
};
.. c:struct:: raft_message
Union of all RPC message structs, plus information about the sender or the
receiver (depending on whether the message is being sent or received).
.. code-block:: C
struct raft_message
{
enum raft_message_type type; /* RPC type code */
raft_id server_id; /* ID of sending or destination server */
const char *server_address; /* Address of sending or destination server */
union { /* Type-specific data */
struct raft_request_vote request_vote;
struct raft_request_vote_result request_vote_result;
struct raft_append_entries append_entries;
struct raft_append_entries_result append_entries_result;
struct raft_install_snapshot install_snapshot;
struct raft_timeout_now timeout_now;
};
};
RequestVote
^^^^^^^^^^^
.. c:struct:: raft_request_vote
Holds the parameters of a `RequestVote` RPC message.
.. code-block:: C
struct raft_request_vote
{
unsigned char version; /* Message format version. */
raft_term term; /* Candidate's term */
raft_id candidate_id; /* ID of the server requesting the vote */
raft_index last_log_index; /* Index of candidate's last log entry */
raft_index last_log_term; /* Term of log entry at last_log_index */
bool disrupt_leader; /* True if current leader should be discarded */
bool pre_vote; /* True if this is a pre-vote request */
};
RequestVote result
^^^^^^^^^^^^^^^^^^
.. c:struct:: raft_request_vote_result
Holds the parameters of a `RequestVote` RPC result message.
.. code-block:: C
struct raft_request_vote_result
{
unsigned char version; /* Message format version */
raft_term term; /* Receiver's current term */
bool vote_granted; /* True means the candidate received a vote */
bool pre_vote; /* True if this is the result of a pre-vote request */
unsigned short features; /* Feature flags (since version 1) */
unsigned short capacity; /* Reserved disk capacity for log entries */
};
AppendEntries
^^^^^^^^^^^^^
.. c:struct:: raft_append_entries
Holds the parameters of an `AppendEntries` RPC request message.
.. code-block:: C
struct raft_append_entries
{
unsigned char version; /* Message format version */
raft_term term; /* Leader's term */
raft_index prev_log_index; /* Index of log entry preceeding new ones */
raft_term prev_log_term; /* Term of entry at prev_log_index */
raft_index leader_commit; /* Leader's commit index */
struct raft_entry *entries; /* Log entries to append */
unsigned n_entries; /* Size of the log entries array */
};
AppendEntries result
^^^^^^^^^^^^^^^^^^^^
.. c:struct:: raft_append_entries_result
Holds the parameters of an `AppendEntries` RPC result message.
.. code-block:: C
struct raft_append_entries_result
{
unsigned char version; /* Message format version */
raft_term term; /* Receiver's current_term */
raft_index rejected; /* If non-zero, the index that was rejected */
raft_index last_log_index; /* Receiver's last log entry index, as hint */
unsigned short features; /* Feature flags (since version 1) */
unsigned short capacity; /* Reserved disk capacity for log entries */
};
InstallSnapshot
^^^^^^^^^^^^^^^
.. c:struct:: raft_install_snapshot
Holds the parameters of an `InstallSnapshot` RPC request message.
.. code-block:: C
struct raft_install_snapshot
{
unsigned char version; /* Message format version */
raft_term term; /* Leader's term */
raft_index last_index; /* Index of last entry in the snapshot */
raft_term last_term; /* Term of last_index */
struct raft_configuration conf; /* Config as of last_index */
raft_index conf_index; /* Commit index of conf */
struct raft_buffer data; /* Raw snapshot data */
};
TimeoutNow
^^^^^^^^^^
.. c:struct:: raft_timeout_now
Holds the parameters of a `TimeoutNow` RPC request message.
.. code-block:: C
struct raft_timeout_now
{
unsigned char version; /* Message format version */
raft_term term; /* Leader's term */
raft_index last_log_index; /* Index of leader's last log entry */
raft_index last_log_term; /* Term of log entry at last_log_index */
};
raft-0.22.1/docs/quick-start.rst 0000664 0000000 0000000 00000004112 14601504142 0016454 0 ustar 00root root 0000000 0000000 Quick start
===========
Make sure that `autotools`_, `libtool`_, `pkg-config`_ and `libuv`_ are
installed on your system.
On Debian (or derivative) systems you can do that with:
.. code-block:: bash
sudo apt-get install build-essential libtool pkg-config libuv1-dev
Then run:
.. code-block:: bash
autoreconf -i
./configure
make
sudo make install
Then create a :file:`main.c` file with this simple test program that just runs a
single raft server and implements a basic state machine for incrementing a
counter:
.. code-block:: C
#include <raft.h>
#include <uv.h>
static raft_id id = 12345;
static const char *address = "127.0.0.1:8080";
static const char *dir = "/tmp/raft-quick-start";
static struct raft_configuration conf;
static struct uv_loop_s loop;
static struct uv_raft_s raft;
static struct uv_timer_s timer;
static unsigned counter = 0;
static uint64_t command;
static void timerCb(uv_timer_t *timer) {
struct raft_buffer buf;
command = uv_now(timer->loop) % 10;
buf.len = sizeof command;
buf.base = &command;
uv_raft_submit(&raft, RAFT_COMMAND, &buf);
}
static void commitCb(struct uv_raft_s *raft, int type, const struct uv_buf_s *buf) {
counter += *(uint64_t *)buf->base;
printf("counter: %u\n", counter);
}
int main() {
mkdir(dir, 0755);
raft_configuration_init(&conf);
raft_configuration_add(&conf, id, address, RAFT_VOTER);
uv_loop_init(&loop);
uv_raft_init(&loop, &raft, id, address, dir);
uv_timer_init(&loop, &timer);
uv_raft_bootstrap(&raft, &conf);
uv_raft_start(&raft, commitCb, NULL, NULL);
uv_timer_start(&timer, timerCb, 0, 1000);
uv_run(&loop, UV_RUN_DEFAULT);
}
You can compile and run it with:
.. code-block:: bash
cc main.c -o main -lraft -luv && ./main
.. _autotools: https://en.wikipedia.org/wiki/GNU_Autotools
.. _libtool: https://www.gnu.org/software/libtool/
.. _pkg-config: https://www.freedesktop.org/wiki/Software/pkg-config/
.. _libuv: http://libuv.org
raft-0.22.1/docs/requirements.txt 0000664 0000000 0000000 00000000005 14601504142 0016734 0 ustar 00root root 0000000 0000000 furo
raft-0.22.1/docs/types.rst 0000664 0000000 0000000 00000002662 14601504142 0015361 0 ustar 00root root 0000000 0000000 .. _types:
Basic types
===========
Basic types and utilities.
Data types
----------
.. c:type:: raft_id
Hold the value of a Raft server ID. Guaranteed to be at least 64-bit long.
.. c:type:: raft_term
Hold the value of a Raft term. Guaranteed to be at least 64-bit long.
.. c:type:: raft_index
Hold the value of a raft entry index. Guaranteed to be at least 64-bit long.
.. c:type:: raft_time
Hold a time value expressed in milliseconds since the epoch.
.. c:struct:: raft_buffer
A data buffer.
.. code-block:: C
struct raft_buffer
{
void *base; /* Pointer to the buffer data */
size_t len; /* Length of the buffer */
};
.. c:enum:: raft_entry_type
Log entry type codes.
.. code-block:: C
enum raft_entry_type {
RAFT_COMMAND = 1, /* Command for the application FSM. */
RAFT_BARRIER, /* Wait for all previous commands to be applied. */
RAFT_CHANGE /* Raft configuration change. */
};
.. c:struct:: raft_entry
A single entry in the raft log.
.. code-block:: C
struct raft_entry
{
raft_term term; /* Term in which the entry was created */
unsigned short type; /* Type (FSM command, barrier, config change) */
struct raft_buffer buf; /* Entry data */
void *batch; /* Batch that buf's memory points to, if any */
};
raft-0.22.1/docs/updates.rst 0000664 0000000 0000000 00000011132 14601504142 0015652 0 ustar 00root root 0000000 0000000 .. _updates:
:c:struct:`raft_update` --- State updates
=========================================
State changes or actions to handle after calling :c:func:`raft_step()`.
Data types
----------
.. c:struct:: raft_update
The :c:struct:`raft_update` struct holds information about new state changes
or actions that a user should handle after a call to :c:func:`raft_step()`,
such as:
- New data to persist on disk (e.g. new entries or snapshot)
- New messages to send to other Raft servers
- New term, vote, commit index, etc
Users of the core :c:struct:`raft` struct are responsible for implementing
I/O and application code that manages the above state updates.
.. code-block:: C
struct raft_update
{
unsigned flags;
struct { ... } entries;
struct { ... } snapshot;
struct { ... } messages;
};
Public members
^^^^^^^^^^^^^^
.. c:member:: unsigned raft_update.flags
Bit flags that indicate which particular state change or action should be
processed:
.. code-block:: C
#define RAFT_UPDATE_CURRENT_TERM 1 << 0
#define RAFT_UPDATE_VOTED_FOR 1 << 1
#define RAFT_UPDATE_ENTRIES 1 << 2
#define RAFT_UPDATE_SNAPSHOT 1 << 3
#define RAFT_UPDATE_MESSAGES 1 << 4
#define RAFT_UPDATE_STATE 1 << 5
#define RAFT_UPDATE_COMMIT_INDEX 1 << 6
#define RAFT_UPDATE_TIMEOUT 1 << 7
Current Term
^^^^^^^^^^^^
.. c:macro:: RAFT_UPDATE_CURRENT_TERM
If this bit flag is on, the current term of :c:struct:`raft` struct has
changed and must be durably persisted to disk. This has to be done before
processing any other change or action (i.e. no messages must be sent until
the new term has been persisted).
User code can use :c:func:`raft_current_term()` to get the new term that
should be persisted.
Voted for
^^^^^^^^^
.. c:macro:: RAFT_UPDATE_VOTED_FOR
If this bit flag is on, the server that the :c:struct:`raft` struct has
voted for has changed and must be durably persisted to disk. This has to be
done before processing any other change or action (i.e. no messages must be
sent until the new vote has been persisted).
User code can use :c:func:`raft_voted_for()` to get the server ID that
should be persisted as new vote.
Entries
^^^^^^^
.. c:macro:: RAFT_UPDATE_ENTRIES
If this bit flag is on, a new batch of log entries should be persisted to
disk, as described by the :c:struct:`raft_update.entries` field.
.. c:member:: struct @0 raft_update.entries
Details about new entries to persist.
.. code-block:: C
struct
{
raft_index index; /* Index of first entry in the batch */
struct raft_entry *batch; /* Array of entries to persist */
unsigned n; /* Number of entries in the array */
} entries;
Snapshot
^^^^^^^^
.. c:macro:: RAFT_UPDATE_SNAPSHOT
If this bit flag is on, a new snapshot chunk should be persisted to disk, as
described by the :c:struct:`raft_update.snapshot` field.
.. c:member:: struct @0 raft_update.snapshot
Details about new entries to persist.
.. code-block:: C
struct
{
struct raft_snapshot_metadata metadata; /* Snapshot metadata */
size_t offset; /* Chunk offset */
struct raft_buffer chunk; /* Data chunk */
bool last; /* True if last chunk */
} snapshot;
Messages
^^^^^^^^
.. c:macro:: RAFT_UPDATE_MESSAGES
If this bit flag is on, new messages should be sent, as described by the
:c:struct:`raft_update.messages` field.
.. c:member:: struct @0 raft_update.messages
Details about the new messages to send.
.. code-block:: C
struct
{
struct raft_message *batch; /* Array of messages to send */
unsigned n; /* Number of messages in the array */
} messages;
State
^^^^^
.. c:macro:: RAFT_UPDATE_STATE
If this bit flag is on, the :c:enum:`raft_state` of the :c:struct:`raft`
struct has changed. The new state can be obtained with
:c:func:`raft_state()`.
Commit index
^^^^^^^^^^^^
.. c:macro:: RAFT_UPDATE_COMMIT_INDEX
If this bit flag is on, the commit index has changed. The new commit index
can be obtained with :c:func:`raft_commit_index()`.
Timeout
^^^^^^^
.. c:macro:: RAFT_UPDATE_TIMEOUT
If this bit flag is on, the time at which the next :c:macro:`RAFT_TIMEOUT`
event should be fired has changed. The new time can be obtained with
:c:func:`raft_timeout()`.
raft-0.22.1/example/ 0000775 0000000 0000000 00000000000 14601504142 0014160 5 ustar 00root root 0000000 0000000 raft-0.22.1/example/cluster.c 0000664 0000000 0000000 00000005544 14601504142 0016015 0 ustar 00root root 0000000 0000000 #include
#include
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>
#define N_SERVERS 3 /* Number of servers in the example cluster */
/* Ensure that @dir exists and is a directory, creating it with mode 0700 if
 * it is missing.
 *
 * Returns 0 on success, 1 on error (after printing a diagnostic).
 *
 * Fix: the error messages were missing a trailing newline, so consecutive
 * diagnostics ran together on one line. */
static int ensureDir(const char *dir)
{
    int rv;
    struct stat sb;
    rv = stat(dir, &sb);
    if (rv == -1) {
        if (errno == ENOENT) {
            /* Path doesn't exist yet: create it. */
            rv = mkdir(dir, 0700);
            if (rv != 0) {
                printf("error: create directory '%s': %s\n", dir,
                       strerror(errno));
                return 1;
            }
        } else {
            printf("error: stat directory '%s': %s\n", dir, strerror(errno));
            return 1;
        }
    } else {
        /* Path exists: make sure it's actually a directory. */
        if ((sb.st_mode & S_IFMT) != S_IFDIR) {
            printf("error: path '%s' is not a directory\n", dir);
            return 1;
        }
    }
    return 0;
}
{
*pid = fork();
if (*pid == 0) {
char *dir = malloc(strlen(topLevelDir) + strlen("/D") + 1);
char *id = malloc(N_SERVERS / 10 + 2);
char *argv[] = {"./example/server", dir, id, NULL};
char *envp[] = {NULL};
int rv;
sprintf(dir, "%s/%u", topLevelDir, i + 1);
rv = ensureDir(dir);
if (rv != 0) {
abort();
}
sprintf(id, "%u", i + 1);
execve("./example/server", argv, envp);
}
}
int main(int argc, char *argv[])
{
const char *topLevelDir = "/tmp/raft";
struct timespec now;
pid_t pids[N_SERVERS];
unsigned i;
int rv;
if (argc > 2) {
printf("usage: example-cluster []\n");
return 1;
}
if (argc == 2) {
topLevelDir = argv[1];
}
/* Make sure the top level directory exists. */
rv = ensureDir(topLevelDir);
if (rv != 0) {
return rv;
}
/* Spawn the cluster nodes */
for (i = 0; i < N_SERVERS; i++) {
forkServer(topLevelDir, i, &pids[i]);
}
/* Seed the random generator */
timespec_get(&now, TIME_UTC);
srandom((unsigned)(now.tv_nsec ^ now.tv_sec));
while (1) {
struct timespec interval;
int status;
/* Sleep a little bit. */
interval.tv_sec = 1 + random() % 15;
interval.tv_nsec = 0;
rv = nanosleep(&interval, NULL);
if (rv != 0) {
printf("error: sleep: %s", strerror(errno));
}
/* Kill a random server. */
i = (unsigned)(random() % N_SERVERS);
rv = kill(pids[i], SIGINT);
if (rv != 0) {
printf("error: kill server %d: %s", i, strerror(errno));
}
waitpid(pids[i], &status, 0);
rv = nanosleep(&interval, NULL);
if (rv != 0) {
printf("error: sleep: %s", strerror(errno));
}
forkServer(topLevelDir, i, &pids[i]);
}
return 0;
}
raft-0.22.1/example/server.c 0000664 0000000 0000000 00000027243 14601504142 0015642 0 ustar 00root root 0000000 0000000 #include
#include
#include
#include
#include "../include/raft.h"
#include "../include/raft/uv.h"
#define N_SERVERS 3 /* Number of servers in the example cluster */
#define APPLY_RATE 125 /* Apply a new entry every 125 milliseconds */
#define Log(SERVER_ID, FORMAT) printf("%d: " FORMAT "\n", SERVER_ID)
#define Logf(SERVER_ID, FORMAT, ...) \
printf("%d: " FORMAT "\n", SERVER_ID, __VA_ARGS__)
/********************************************************************
*
* Sample application FSM that just increases a counter.
*
********************************************************************/
struct Fsm
{
unsigned long long count;
};
/* Apply a RAFT_COMMAND entry to the FSM: the entry payload is an 8-byte
 * little-endian-as-stored uint64_t increment for the counter. On success
 * *result points at the updated counter. */
static int FsmApply(struct raft_fsm *fsm,
                    const struct raft_buffer *buf,
                    void **result)
{
    struct Fsm *state = fsm->data;
    uint64_t delta;

    /* Reject payloads that aren't exactly one uint64_t. */
    if (buf->len != 8) {
        return RAFT_MALFORMED;
    }

    delta = *(uint64_t *)buf->base;
    state->count += delta;
    *result = &state->count;
    return 0;
}
/* Snapshot the FSM state: emit a single buffer holding the counter value.
 *
 * On success *bufs is a one-element array owned by the caller and *n_bufs
 * is 1.
 *
 * Fix: if allocating the buffer payload failed, the buffer array itself was
 * leaked; also only report a buffer count once allocation has succeeded. */
static int FsmSnapshot(struct raft_fsm *fsm,
                       struct raft_buffer *bufs[],
                       unsigned *n_bufs)
{
    struct Fsm *f = fsm->data;
    *bufs = raft_malloc(sizeof **bufs);
    if (*bufs == NULL) {
        return RAFT_NOMEM;
    }
    (*bufs)[0].len = sizeof(uint64_t);
    (*bufs)[0].base = raft_malloc((*bufs)[0].len);
    if ((*bufs)[0].base == NULL) {
        raft_free(*bufs); /* Don't leak the array on partial failure. */
        *bufs = NULL;
        return RAFT_NOMEM;
    }
    *(uint64_t *)(*bufs)[0].base = f->count;
    *n_bufs = 1;
    return 0;
}
/* Restore FSM state from a snapshot buffer produced by FsmSnapshot(). Takes
 * ownership of buf->base and releases it once the value has been copied. */
static int FsmRestore(struct raft_fsm *fsm, struct raft_buffer *buf)
{
    struct Fsm *state = fsm->data;

    if (buf->len != sizeof(uint64_t)) {
        return RAFT_MALFORMED;
    }

    state->count = *(uint64_t *)buf->base;

    /* The snapshot payload is ours to free after restoring. */
    raft_free(buf->base);

    return 0;
}
/* Allocate the counter state and wire up the raft_fsm callbacks. Returns
 * RAFT_NOMEM if the state allocation fails. */
static int FsmInit(struct raft_fsm *fsm)
{
    struct Fsm *state = raft_malloc(sizeof *state);

    if (state == NULL) {
        return RAFT_NOMEM;
    }
    state->count = 0;

    fsm->version = 2;
    fsm->data = state;
    fsm->apply = FsmApply;
    fsm->snapshot = FsmSnapshot;
    fsm->snapshot_finalize = NULL;
    fsm->restore = FsmRestore;

    return 0;
}
/* Release the FSM state allocated by FsmInit(), if any. Safe to call when
 * FsmInit() was never run (f->data is then still NULL). */
static void FsmClose(struct raft_fsm *f)
{
    if (f->data != NULL) {
        raft_free(f->data);
    }
}
/********************************************************************
*
* Example struct holding a single raft server instance and all its
* dependencies.
*
********************************************************************/
struct Server;
typedef void (*ServerCloseCb)(struct Server *server);
struct Server
{
void *data; /* User data context. */
struct uv_loop_s *loop; /* UV loop. */
struct uv_timer_s timer; /* To periodically apply a new entry. */
const char *dir; /* Data dir of UV I/O backend. */
struct raft_uv_transport transport; /* UV I/O backend transport. */
struct raft_io io; /* UV I/O backend. */
struct raft_fsm fsm; /* Sample application FSM. */
unsigned id; /* Raft instance ID. */
char address[64]; /* Raft instance address. */
struct raft raft; /* Raft instance. */
struct raft_transfer transfer; /* Transfer leadership request. */
ServerCloseCb close_cb; /* Optional close callback. */
};
/* Invoked by raft_close() once the raft instance has fully shut down. Tears
 * down the I/O backend, then the transport, then the FSM, and finally fires
 * the user-provided close callback, if one was set by ServerClose(). */
static void serverRaftCloseCb(struct raft *raft)
{
    struct Server *s = raft->data;
    raft_uv_close(&s->io);
    raft_uv_tcp_close(&s->transport);
    FsmClose(&s->fsm);
    if (s->close_cb != NULL) {
        s->close_cb(s);
    }
}
/* Invoked when the leadership transfer started during shutdown has completed
 * (whether or not it succeeded): proceed with closing the raft instance. */
static void serverTransferCb(struct raft_transfer *req)
{
    struct Server *s = req->data;
    raft_id id;
    const char *address;
    /* NOTE(review): the returned leader id/address are unused here; the call
     * appears to only query the post-transfer leader — confirm intent. */
    raft_leader(&s->raft, &id, &address);
    raft_close(&s->raft, serverRaftCloseCb);
}
/* Final callback in the shutdown sequence, invoked after the timer handle has
 * been closed. If this server is currently the leader it first tries to hand
 * leadership to another server; raft_close() is then issued either from
 * serverTransferCb() or directly here. */
static void serverTimerCloseCb(struct uv_handle_s *handle)
{
    struct Server *s = handle->data;
    /* raft.data is only set after raft_init() succeeds in ServerInit(), so it
     * doubles as an "is raft initialized" flag on partial-init shutdown. */
    if (s->raft.data != NULL) {
        if (s->raft.state == RAFT_LEADER) {
            int rv;
            rv = raft_transfer(&s->raft, &s->transfer, 0, serverTransferCb);
            if (rv == 0) {
                return;
            }
        }
        raft_close(&s->raft, serverRaftCloseCb);
    }
}
/* Initialize the example server struct, without starting it yet.
 *
 * Sets up, in order: the apply timer, the TCP transport, the libuv I/O
 * backend, the FSM, the raft instance, and (if this is a fresh data dir) the
 * bootstrap configuration with N_SERVERS voters at 127.0.0.1:900<id>.
 *
 * Returns 0 on success; on failure everything initialized so far is unwound
 * via the err_after_* labels and the error code is returned. */
static int ServerInit(struct Server *s,
                      struct uv_loop_s *loop,
                      const char *dir,
                      unsigned id)
{
    struct raft_configuration configuration;
    struct timespec now;
    unsigned i;
    int rv;
    memset(s, 0, sizeof *s);
    /* Seed the random generator */
    timespec_get(&now, TIME_UTC);
    srandom((unsigned)(now.tv_nsec ^ now.tv_sec));
    s->loop = loop;
    /* Add a timer to periodically try to propose a new entry. */
    rv = uv_timer_init(s->loop, &s->timer);
    if (rv != 0) {
        /* NOTE(review): s->id is still 0 here (it's assigned further down),
         * so early failures log as server "0". */
        Logf(s->id, "uv_timer_init(): %s", uv_strerror(rv));
        goto err;
    }
    /* timer.data doubles as an "is timer initialized" flag in ServerClose. */
    s->timer.data = s;
    /* Initialize the TCP-based RPC transport. */
    s->transport.version = 1;
    s->transport.data = NULL;
    rv = raft_uv_tcp_init(&s->transport, s->loop);
    if (rv != 0) {
        goto err;
    }
    /* Initialize the libuv-based I/O backend. */
    rv = raft_uv_init(&s->io, s->loop, dir, &s->transport);
    if (rv != 0) {
        Logf(s->id, "raft_uv_init(): %s", s->io.errmsg);
        goto err_after_uv_tcp_init;
    }
    /* Initialize the finite state machine. */
    rv = FsmInit(&s->fsm);
    if (rv != 0) {
        Logf(s->id, "FsmInit(): %s", raft_strerror(rv));
        goto err_after_uv_init;
    }
    /* Save the server ID. */
    s->id = id;
    /* Render the address. */
    sprintf(s->address, "127.0.0.1:900%d", id);
    /* Initialize and start the engine, using the libuv-based I/O backend. */
    rv = raft_init(&s->raft, &s->io, &s->fsm, id, s->address);
    if (rv != 0) {
        Logf(s->id, "raft_init(): %s", raft_errmsg(&s->raft));
        goto err_after_fsm_init;
    }
    s->raft.data = s;
    /* Bootstrap the initial configuration if needed. */
    raft_configuration_init(&configuration);
    for (i = 0; i < N_SERVERS; i++) {
        char address[64];
        unsigned server_id = i + 1;
        sprintf(address, "127.0.0.1:900%d", server_id);
        rv = raft_configuration_add(&configuration, server_id, address,
                                    RAFT_VOTER);
        if (rv != 0) {
            Logf(s->id, "raft_configuration_add(): %s", raft_strerror(rv));
            goto err_after_configuration_init;
        }
    }
    /* RAFT_CANTBOOTSTRAP just means the data dir is already bootstrapped. */
    rv = raft_bootstrap(&s->raft, &configuration);
    if (rv != 0 && rv != RAFT_CANTBOOTSTRAP) {
        goto err_after_configuration_init;
    }
    raft_configuration_close(&configuration);
    raft_set_snapshot_threshold(&s->raft, 64);
    raft_set_snapshot_trailing(&s->raft, 16);
    raft_set_pre_vote(&s->raft, true);
    s->transfer.data = s;
    return 0;
err_after_configuration_init:
    raft_configuration_close(&configuration);
err_after_fsm_init:
    FsmClose(&s->fsm);
err_after_uv_init:
    raft_uv_close(&s->io);
err_after_uv_tcp_init:
    raft_uv_tcp_close(&s->transport);
err:
    return rv;
}
/* Called after a request to apply a new command to the FSM has been
 * completed. Frees the request and logs the counter every 100 increments.
 *
 * Fix: FsmApply() publishes *result as a pointer to the FSM's
 * unsigned long long counter, but this callback read it through (int *),
 * truncating the value; read it with the matching type and format. */
static void serverApplyCb(struct raft_apply *req, int status, void *result)
{
    struct Server *s = req->data;
    unsigned long long count;
    raft_free(req);
    if (status != 0) {
        /* Leadership loss is expected churn in this example; stay quiet. */
        if (status != RAFT_LEADERSHIPLOST) {
            Logf(s->id, "raft_apply() callback: %s (%d)", raft_errmsg(&s->raft),
                 status);
        }
        return;
    }
    count = *(unsigned long long *)result;
    if (count % 100 == 0) {
        Logf(s->id, "count %llu", count);
    }
}
/* Called periodically every APPLY_RATE milliseconds: when this server is the
 * leader, propose incrementing the counter by 1.
 *
 * Fixes: the entry buffer was leaked when the request allocation failed, and
 * both buffer and request were leaked when raft_apply() itself failed. */
static void serverTimerCb(uv_timer_t *timer)
{
    struct Server *s = timer->data;
    struct raft_buffer buf;
    struct raft_apply *req;
    int rv;
    if (s->raft.state != RAFT_LEADER) {
        return;
    }
    buf.len = sizeof(uint64_t);
    buf.base = raft_malloc(buf.len);
    if (buf.base == NULL) {
        Log(s->id, "serverTimerCb(): out of memory");
        return;
    }
    *(uint64_t *)buf.base = 1;
    req = raft_malloc(sizeof *req);
    if (req == NULL) {
        raft_free(buf.base); /* Don't leak the entry payload. */
        Log(s->id, "serverTimerCb(): out of memory");
        return;
    }
    req->data = s;
    rv = raft_apply(&s->raft, req, &buf, 1, serverApplyCb);
    if (rv != 0) {
        /* NOTE(review): raft_apply() takes ownership of buf/req only on
         * success, so they must be released here — confirm against docs. */
        raft_free(buf.base);
        raft_free(req);
        Logf(s->id, "raft_apply(): %s", raft_errmsg(&s->raft));
        return;
    }
}
/* Start the example server. */
static int ServerStart(struct Server *s)
{
int rv;
Log(s->id, "starting");
rv = raft_start(&s->raft);
if (rv != 0) {
Logf(s->id, "raft_start(): %s", raft_errmsg(&s->raft));
goto err;
}
rv = uv_timer_start(&s->timer, serverTimerCb, 0, 125);
if (rv != 0) {
Logf(s->id, "uv_timer_start(): %s", uv_strerror(rv));
goto err;
}
return 0;
err:
return rv;
}
/* Release all resources used by the example server. @cb, if non-NULL, is
 * invoked once the asynchronous shutdown sequence completes.
 *
 * Fix: when the timer was never initialized, a NULL @cb was invoked directly
 * (main()'s error path passes NULL), crashing instead of returning. */
static void ServerClose(struct Server *s, ServerCloseCb cb)
{
    s->close_cb = cb;
    Log(s->id, "stopping");
    /* Close the timer asynchronously if it was successfully
     * initialized. Otherwise invoke the callback immediately. */
    if (s->timer.data != NULL) {
        uv_close((struct uv_handle_s *)&s->timer, serverTimerCloseCb);
    } else if (s->close_cb != NULL) {
        s->close_cb(s);
    }
}
/********************************************************************
*
* Top-level main loop.
*
********************************************************************/
/* Invoked once the example server has completed its shutdown sequence: close
 * the SIGINT signal handle too (stashed in server->data by mainSigintCb), so
 * no live handles remain and the loop can exit. */
static void mainServerCloseCb(struct Server *server)
{
    struct uv_signal_s *sigint = server->data;
    uv_close((struct uv_handle_s *)sigint, NULL);
}
/* Handler triggered by SIGINT. It will initiate the shutdown sequence. */
static void mainSigintCb(struct uv_signal_s *handle, int signum)
{
    (void)signum; /* Only referenced by the assert below (NDEBUG builds). */
    struct Server *server = handle->data;
    assert(signum == SIGINT);
    uv_signal_stop(handle);
    /* Stash the signal handle so mainServerCloseCb() can close it. */
    server->data = handle;
    ServerClose(server, mainServerCloseCb);
}
/* Run one example raft server: parse <dir> and <id> from the command line,
 * set up the loop, server and SIGINT handler, then run until interrupted.
 *
 * Fixes: the usage string had its argument placeholders stripped, and the
 * uv_run() error message named a nonexistent "uv_run_start()". */
int main(int argc, char *argv[])
{
    struct uv_loop_s loop;
    struct uv_signal_s sigint; /* To catch SIGINT and exit. */
    struct Server server;
    const char *dir;
    unsigned id;
    int rv;
    if (argc != 3) {
        printf("usage: example-server <dir> <id>\n");
        return 1;
    }
    dir = argv[1];
    id = (unsigned)atoi(argv[2]);
    /* Ignore SIGPIPE, see https://github.com/joyent/libuv/issues/1254 */
    signal(SIGPIPE, SIG_IGN);
    /* Initialize the libuv loop. */
    rv = uv_loop_init(&loop);
    if (rv != 0) {
        Logf(id, "uv_loop_init(): %s", uv_strerror(rv));
        goto err;
    }
    /* Initialize the example server. */
    rv = ServerInit(&server, &loop, dir, id);
    if (rv != 0) {
        goto err_after_server_init;
    }
    /* Add a signal handler to stop the example server upon SIGINT. */
    rv = uv_signal_init(&loop, &sigint);
    if (rv != 0) {
        Logf(id, "uv_signal_init(): %s", uv_strerror(rv));
        goto err_after_server_init;
    }
    sigint.data = &server;
    rv = uv_signal_start(&sigint, mainSigintCb, SIGINT);
    if (rv != 0) {
        Logf(id, "uv_signal_start(): %s", uv_strerror(rv));
        goto err_after_signal_init;
    }
    /* Start the server. */
    rv = ServerStart(&server);
    if (rv != 0) {
        goto err_after_signal_init;
    }
    /* Run the event loop until we receive SIGINT. */
    rv = uv_run(&loop, UV_RUN_DEFAULT);
    if (rv != 0) {
        Logf(id, "uv_run(): %s", uv_strerror(rv));
    }
    uv_loop_close(&loop);
    return rv;
err_after_signal_init:
    uv_close((struct uv_handle_s *)&sigint, NULL);
err_after_server_init:
    /* Drain pending close callbacks before tearing down the loop. */
    ServerClose(&server, NULL);
    uv_run(&loop, UV_RUN_DEFAULT);
    uv_loop_close(&loop);
err:
    return rv;
}
raft-0.22.1/include/ 0000775 0000000 0000000 00000000000 14601504142 0014150 5 ustar 00root root 0000000 0000000 raft-0.22.1/include/raft.h.in 0000664 0000000 0000000 00000156724 14601504142 0015701 0 ustar 00root root 0000000 0000000 #ifndef RAFT_H
#define RAFT_H
#include
#include
#include
#include
#ifndef RAFT_API
#define RAFT_API __attribute__((visibility("default")))
#endif
/* clang-format off */
#define RAFT__LEGACY_@enable_v0@
/* clang-format on */
/* Handle C/C++ differences for static asserts */
#if defined(__cpp_static_assert)
#define RAFT__STATIC_ASSERT static_assert
#else
#define RAFT__STATIC_ASSERT _Static_assert
#endif
/* Helper for statically checking ABI compatibility when changing or adding
* struct fields. */
#if defined(RAFT__LEGACY_no)
#define RAFT__ASSERT_COMPATIBILITY(OLD_FIELDS, NEW_FIELDS) \
RAFT__STATIC_ASSERT(1 == 1, "no-op")
#else
#define RAFT__ASSERT_COMPATIBILITY(OLD_FIELDS, NEW_FIELDS) \
RAFT__STATIC_ASSERT(sizeof(NEW_FIELDS) <= sizeof(OLD_FIELDS), \
"ABI breakage")
#endif
/**
* Version.
*/
#define RAFT_VERSION_MAJOR 0
#define RAFT_VERSION_MINOR 22
#define RAFT_VERSION_RELEASE 1
#define RAFT_VERSION_NUMBER \
(RAFT_VERSION_MAJOR * 100 * 100 + RAFT_VERSION_MINOR * 100 + \
RAFT_VERSION_RELEASE)
int raft_version_number(void);
/**
* Error codes.
*/
enum {
RAFT_NOMEM = 1, /* Out of memory */
RAFT_BADID, /* Server ID is not valid */
RAFT_DUPLICATEID, /* Server ID already in use */
RAFT_DUPLICATEADDRESS, /* Server address already in use */
RAFT_BADROLE, /* Server role is not valid */
RAFT_MALFORMED,
RAFT_NOTLEADER,
RAFT_LEADERSHIPLOST,
RAFT_SHUTDOWN,
RAFT_CANTBOOTSTRAP,
RAFT_CANTCHANGE,
RAFT_CORRUPT,
RAFT_CANCELED,
RAFT_NAMETOOLONG,
RAFT_TOOBIG,
RAFT_NOCONNECTION,
RAFT_BUSY,
RAFT_IOERR, /* File system or storage error */
RAFT_NOTFOUND, /* Resource not found */
RAFT_INVALID, /* Invalid parameter */
RAFT_UNAUTHORIZED, /* No access to a resource */
RAFT_NOSPACE, /* Not enough space on disk */
RAFT_TOOMANY /* Some system or raft limit was hit */
};
/**
* Size of human-readable error message buffers.
*/
#if defined(RAFT__LEGACY_no)
#define RAFT_ERRMSG_BUF_SIZE 64
#else
#define RAFT_ERRMSG_BUF_SIZE 256
#endif
/**
* Return the error message describing the given error code.
*/
RAFT_API const char *raft_strerror(int errnum);
/**
* Hold the value of a raft server ID. Guaranteed to be at least 64-bit long.
*/
typedef unsigned long long raft_id;
/**
* Hold the value of a raft term. Guaranteed to be at least 64-bit long.
*/
typedef unsigned long long raft_term;
/**
* Hold the value of a raft entry index. Guaranteed to be at least 64-bit long.
*/
typedef unsigned long long raft_index;
/**
* Hold a time value expressed in milliseconds since the epoch.
*/
typedef unsigned long long raft_time;
/**
* A data buffer.
*/
struct raft_buffer
{
void *base; /* Pointer to the buffer data. */
size_t len; /* Length of the buffer. */
};
/**
* Customizable tracer for debugging, logging and metrics purposes.
*/
struct raft_tracer
{
/**
* Implementation-defined state object.
*/
void *impl;
/**
* Version of the raft_tracer structure. Must be at least 2.
*/
int version;
/**
* Emit an event of the given @type code. The @info object contains
* details about the event and its format depends on the event @type.
*
* Type codes from #1 to #255 are reserved for traces defined by the
* core library.
*
* Type codes from #256 to #65535 are reserved for events defined by
* specific #raft_io backends.
*
* Type codes from #65535 onwards can be used by user applications.
*/
void (*emit)(struct raft_tracer *t, int type, const void *info);
};
#define RAFT_TRACER_DIAGNOSTIC 1
/**
* Information passed as @info argument to #raft_tracer->trace() for
* traces generated by #raft objects.
*/
struct raft_tracer_info
{
/**
* Version of the #raft_tracer_info structure. Must be at least 1.
*/
int version;
union {
struct
{
int level; /* 1 Error, 2 Warning, 3 Info, 4 Debug, 5 Trace */
const char *message;
const char *file;
int line;
} diagnostic; /* @type RAFT_TRACER_DIAGNOSTIC */
};
};
/**
* Server role codes.
*/
enum {
RAFT_STANDBY, /* Replicate log, does not participate in quorum. */
RAFT_VOTER, /* Replicate log, does participate in quorum. */
RAFT_SPARE /* Does not replicate log, or participate in quorum. */
};
/**
* Hold information about a single server in the cluster configuration.
* WARNING: This struct is encoded/decoded, be careful when adapting it.
*/
struct raft_server
{
raft_id id; /* Server ID, must be greater than zero. */
char *address; /* Server address. User defined. */
int role; /* Server role. */
};
/**
* Hold information about all servers currently part of the cluster.
* WARNING: This struct is encoded/decoded, be careful when adapting it.
*/
struct raft_configuration
{
struct raft_server *servers; /* Array of servers member of the cluster. */
unsigned n; /* Number of servers in the array. */
};
/**
* Initialize an empty raft configuration.
*/
RAFT_API void raft_configuration_init(struct raft_configuration *c);
/**
* Release all memory used by the given configuration object.
*/
RAFT_API void raft_configuration_close(struct raft_configuration *c);
/**
* Add a server to a raft configuration.
*
* The @id must be greater than zero and @address point to a valid string.
*
* The @role must be either #RAFT_VOTER, #RAFT_STANDBY, #RAFT_SPARE.
*
* If @id or @address are already in use by another server in the configuration,
* an error is returned.
*
* The @address string will be copied and can be released after this function
* returns.
*/
RAFT_API int raft_configuration_add(struct raft_configuration *c,
raft_id id,
const char *address,
int role);
/**
* Encode the given configuration object.
*
* The memory of the returned buffer is allocated using raft_malloc(), and
* client code is responsible for releasing it when no longer needed.
*/
RAFT_API int raft_configuration_encode(const struct raft_configuration *c,
struct raft_buffer *buf);
/**
* Decode the given configuration object. */
RAFT_API int raft_configuration_decode(const struct raft_buffer *buf,
struct raft_configuration *c);
/**
* Hash function which outputs a 64-bit value based on a text and a number.
*
* This can be used to generate a unique ID for a new server being added, for
* example based on its address and on the current time in milliseconds since
* the Epoch.
*
* It's internally implemented as a SHA1 where only the last 8 bytes of the hash
* value are kept.
*/
RAFT_API unsigned long long raft_digest(const char *text, unsigned long long n);
/**
* Log entry types.
*/
enum raft_entry_type {
RAFT_COMMAND = 1, /* Command for the application FSM. */
RAFT_BARRIER, /* Wait for all previous commands to be applied. */
RAFT_CHANGE /* Raft configuration change. */
};
/**
* A single entry in the raft log.
*
* An entry that originated from this raft instance while it was the leader
* (typically via client calls to raft_apply()) should normally have a @buf
* attribute referencing directly the memory that was originally allocated by
* the client itself to contain the entry data, and the @batch attribute set to
* #NULL.
*
* An entry that was received from the network as part of an AppendEntries RPC
* or that was loaded from disk at startup should normally have a @batch
* attribute that points to a contiguous chunk of memory that contains the data
* of the entry itself plus possibly the data for other entries that were
* received or loaded with it at the same time. In this case the @buf pointer
* will be equal to the @batch pointer plus an offset, that locates the position
* of the entry's data within the batch.
*
* When the @batch attribute is not #NULL the raft library will take care of
* releasing that memory only once there are no more references to the
* associated entries.
*
* This arrangement makes it possible to minimize the amount of memory-copying
* when performing I/O.
*/
struct raft_entry
{
raft_term term; /* Term in which the entry was created. */
enum raft_entry_type type; /* Type (FSM command, barrier, config change). */
struct raft_buffer buf; /* Entry data. */
void *batch; /* Batch that buf's memory points to, if any. */
};
/**
* Hold the arguments of a RequestVote RPC.
*
* The RequestVote RPC is invoked by candidates to gather votes.
*/
struct raft_request_vote
{
unsigned char version;
raft_term term; /* Candidate's term. */
raft_id candidate_id; /* ID of the server requesting the vote. */
raft_index last_log_index; /* Index of candidate's last log entry. */
raft_index last_log_term; /* Term of log entry at last_log_index. */
bool disrupt_leader; /* True if current leader should be discarded. */
bool pre_vote; /* True if this is a pre-vote request. */
};
/**
* Hold the result of a RequestVote RPC.
*/
struct raft_request_vote_result
{
unsigned char version;
raft_term term; /* Receiver's current term (candidate updates itself). */
bool vote_granted; /* True means candidate received vote. */
bool pre_vote; /* The response to a pre-vote RequestVote or not. */
unsigned short features; /* Server capabilities */
unsigned short capacity; /* Disk capacity reserved to store entries. */
};
/**
* Hold the arguments of an AppendEntries RPC.
*
* The AppendEntries RPC is invoked by the leader to replicate log entries. It's
* also used as heartbeat (figure 3.1).
*/
struct raft_append_entries
{
    unsigned char version; /* Message format version. */
    raft_term term; /* Leader's term. */
    raft_index prev_log_index; /* Index of log entry preceding new ones. */
    raft_term prev_log_term; /* Term of entry at prev_log_index. */
    raft_index leader_commit; /* Leader's commit index. */
    struct raft_entry *entries; /* Log entries to append. */
    unsigned n_entries; /* Size of the log entries array. */
};
/**
* Hold the result of an AppendEntries RPC (figure 3.1).
*/
struct raft_append_entries_result
{
unsigned char version;
raft_term term; /* Receiver's current_term. */
raft_index rejected; /* If non-zero, the index that was rejected. */
raft_index last_log_index; /* Receiver's last log entry index, as hint. */
unsigned short features; /* Feature flags (since version 1). */
unsigned short capacity; /* Reserved disk capacity for log entries. */
};
/**
* Hold the arguments of an InstallSnapshot RPC (figure 5.3).
*/
struct raft_install_snapshot
{
unsigned char version;
raft_term term; /* Leader's term. */
raft_index last_index; /* Index of last entry in the snapshot. */
raft_term last_term; /* Term of last_index. */
struct raft_configuration conf; /* Config as of last_index. */
raft_index conf_index; /* Commit index of conf. */
struct raft_buffer data; /* Raw snapshot data. */
};
/**
* Hold the arguments of a TimeoutNow RPC.
*
* The TimeoutNow RPC is invoked by leaders to transfer leadership to a
* follower.
*/
struct raft_timeout_now
{
unsigned char version;
raft_term term; /* Leader's term. */
raft_index last_log_index; /* Index of leader's last log entry. */
raft_index last_log_term; /* Term of log entry at last_log_index. */
};
/**
* Type codes for RPC messages.
*/
enum raft_message_type {
RAFT_APPEND_ENTRIES = 1,
RAFT_APPEND_ENTRIES_RESULT,
RAFT_REQUEST_VOTE,
RAFT_REQUEST_VOTE_RESULT,
RAFT_INSTALL_SNAPSHOT,
RAFT_TIMEOUT_NOW
};
/**
* A single RPC message that can be sent or received over the network.
*
* The RPC message types all have a `version` field.
* In the libuv io implementation, `version` is filled out during decoding
* and is based on the size of the message on the wire, see e.g.
* `sizeofRequestVoteV1`. The version number in the RAFT_MESSAGE_XXX_VERSION
* macro needs to be bumped every time the message is updated.
*
* Notes when adding a new message type to raft:
* raft_io implementations compiled against old versions of raft don't know the
* new message type and possibly have not allocated enough space for it. When
* such an application receives a new message over the wire, the raft_io
* implementation will err out or drop the message, because it doesn't know how
* to decode it based on its type.
* raft_io implementations compiled against versions of raft that know the new
* message type but at runtime are linked against an older raft lib, will pass
* the message to raft, where raft will drop it.
* When raft receives a message and accesses a field of a new message type,
* the raft_io implementation must have known about the new message type,
* so it was compiled against a modern enough version of raft, and memory
* accesses should be safe.
*
* Sending a new message type with a raft_io implementation that doesn't know
* the type is safe, the implementation should drop the message based on its
* type and will not try to access fields it doesn't know the existence of.
*/
struct raft_message
{
enum raft_message_type type; /* RPC type code. */
raft_id server_id; /* ID of sending or destination server. */
const char *server_address; /* Address of sending or destination server. */
union { /* Type-specific data */
struct raft_request_vote request_vote;
struct raft_request_vote_result request_vote_result;
struct raft_append_entries append_entries;
struct raft_append_entries_result append_entries_result;
struct raft_install_snapshot install_snapshot;
struct raft_timeout_now timeout_now;
};
};
/**
* Hold metadata associated with a snapshot.
*/
struct raft_snapshot_metadata
{
/* Index and term of last entry included in the snapshot. */
raft_index index;
raft_term term;
/* Last committed configuration included in the snapshot, along with the
* index it was committed at. */
struct raft_configuration configuration;
raft_index configuration_index;
};
/**
* Type codes of events to be passed to raft_step().
*/
enum raft_event_type {
RAFT_START = 1, /* Initial event starting loading persisted data. */
RAFT_RECEIVE, /* A message has been received from another server. */
RAFT_PERSISTED_ENTRIES, /* Some entries have been persisted to disk. */
RAFT_PERSISTED_SNAPSHOT, /* A snapshot has been persisted. */
RAFT_CONFIGURATION, /* A new committed configuration must be applied. */
RAFT_SNAPSHOT, /* A snapshot has been taken. */
RAFT_TIMEOUT, /* The timeout has expired. */
RAFT_SUBMIT, /* New entries have been submitted. */
RAFT_CATCH_UP, /* Start catching-up a server. */
RAFT_TRANSFER /* Start transferring leadership to another server. */
};
/**
* Represents an external event that drives the raft engine forward (for example
 * receiving a message or completing a task).
*/
struct raft_event
{
raft_time time;
enum raft_event_type type;
unsigned char unused;
unsigned short capacity;
unsigned char reserved[4];
union {
struct
{
raft_term term;
raft_id voted_for;
struct raft_snapshot_metadata *metadata;
raft_index start_index;
struct raft_entry *entries;
unsigned n_entries;
} start;
struct
{
struct raft_message *message;
} receive;
struct
{
raft_index index; /* Highest index persisted */
} persisted_entries;
struct
{
struct raft_snapshot_metadata metadata;
size_t offset;
bool last;
} persisted_snapshot;
struct
{
raft_index index;
struct raft_configuration conf;
} configuration;
struct
{
struct raft_snapshot_metadata metadata; /* Snapshot metadata */
unsigned trailing; /* Trailing entries kept */
} snapshot;
struct
{
struct raft_entry *entries;
unsigned n;
} submit;
struct
{
raft_id server_id;
} catch_up;
struct
{
raft_id server_id;
} transfer;
};
};
/**
* Hold information about changes that user code must perform after a call to
* raft_step() returns (e.g. new entries that must be persisted, new messages
* that must be sent, etc.).
*/
struct raft_update
{
unsigned flags;
struct
{
raft_index index; /* 0 if no change */
struct raft_entry *batch;
unsigned n;
} entries;
struct
{
struct raft_snapshot_metadata metadata;
size_t offset;
struct raft_buffer chunk;
bool last;
} snapshot;
struct
{
struct raft_message *batch;
unsigned n;
} messages;
};
/* Bits set in the `flags` field of struct raft_update, indicating which of
 * its fields carry changes that user code must act upon after raft_step()
 * returns. Each value is parenthesized so the macro expands safely inside
 * larger expressions (e.g. `RAFT_UPDATE_ENTRIES + 1` would otherwise expand
 * to `1 << 2 + 1`, i.e. `1 << 3`, because `+` binds tighter than `<<`). */
#define RAFT_UPDATE_CURRENT_TERM (1 << 0)
#define RAFT_UPDATE_VOTED_FOR (1 << 1)
#define RAFT_UPDATE_ENTRIES (1 << 2)
#define RAFT_UPDATE_SNAPSHOT (1 << 3)
#define RAFT_UPDATE_MESSAGES (1 << 4)
#define RAFT_UPDATE_STATE (1 << 5)
#define RAFT_UPDATE_COMMIT_INDEX (1 << 6)
#define RAFT_UPDATE_TIMEOUT (1 << 7)
/**
* State codes.
*/
enum raft_state {
RAFT_UNAVAILABLE,
RAFT_FOLLOWER,
RAFT_CANDIDATE,
RAFT_LEADER
};
/**
* Data structure for efficiently keeping track of the indexes of all entries in
* the log and of their terms.
*/
struct raft_trail
{
struct
{
raft_index index; /* Index of the last entry at a certain term. */
raft_term term; /* Term this record is tracking. */
} *records; /* Circular buffer of index/term records. */
unsigned size; /* Number of available slots in the buffer. */
unsigned front, back; /* Indexes of used slots [front, back). */
raft_index offset; /* Index of first entry in the log is offset+1. */
struct /* Information about last snapshot, or zero. */
{
raft_index index; /* Snapshot contains all entries up to here. */
raft_term term; /* Term of last index. */
} snapshot;
};
struct raft; /* Forward declaration. */
#if defined(RAFT__LEGACY_no)
#define RAFT__RESERVED \
struct \
{ \
void *dummy; \
}
#define RAFT__EXTENSIONS_LEGACY
#else
/* Unused uint64_t slots that are reserved for v0.x extensions.*/
#define RAFT__RESERVED \
struct \
{ \
uint64_t reserved[32]; \
}
#define RAFT__EXTENSIONS_LEGACY \
/* Fields used by the v0 compatibility code */ \
struct \
{ \
void *requests[2]; /* Completed client requests */ \
void (*step_cb)(struct raft *); /* Invoked after raft_step() */ \
unsigned short prev_state; /* Used to detect lost leadership */ \
bool closing; /* True when shutting down */ \
void *pending[2]; /* Pending client requests */ \
struct raft_change *change; /* Pending membership change */ \
raft_index snapshot_index; /* Last persisted snapshot */ \
struct raft_buffer snapshot_chunk; /* Cache of snapshot data */ \
bool snapshot_taking; /* True when taking a snapshot */ \
bool snapshot_install; /* True if installing a snapshot */ \
void *snapshot_pending; /* Pending install snapshot */ \
struct raft_log *log; /* Cache on-disk log */ \
unsigned snapshot_threshold; /* N. of entries before snapshot */ \
unsigned snapshot_trailing; /* N. of entries to retain */ \
} legacy;
#endif /* RAFT__LEGACY_no */
/* Extended struct raft fields added after the v0.x ABI freeze. */
#define RAFT__EXTENSIONS \
struct \
{ \
raft_time now; /* Current time, updated via raft_step() */ \
unsigned random; /* Pseudo-random number generator state */ \
struct raft_message *messages; /* Pre-allocated message queue */ \
unsigned n_messages_cap; /* Capacity of the message queue */ \
unsigned max_inflight_entries; /* Pending entries limit */ \
/* Index of the last snapshot that was taken */ \
raft_index configuration_last_snapshot_index; \
RAFT__EXTENSIONS_LEGACY \
struct raft_update *update; /* Pointer passed to raft_step() */ \
struct raft_trail trail; \
struct raft_entry barrier; \
}
RAFT__ASSERT_COMPATIBILITY(RAFT__RESERVED, RAFT__EXTENSIONS);
#if defined(RAFT__LEGACY_no)
#define RAFT__SNAPSHOT_FIELDS_V0 \
struct \
{ \
void *dummy; \
}
#else
/**
* Hold the details of a snapshot.
* The user-provided raft_buffer structs should provide the user with enough
* flexibility to adapt/evolve snapshot formats.
* If this struct would NEED to be adapted in the future, raft can always move
* to a new struct with a new name and a new raft_io version.
*/
struct raft_snapshot
{
/* Index and term of last entry included in the snapshot. */
raft_index index;
raft_term term;
/* Last committed configuration included in the snapshot, along with the
* index it was committed at. */
struct raft_configuration configuration;
raft_index configuration_index;
/* Content of the snapshot. When a snapshot is taken, the user FSM can fill
* the bufs array with more than one buffer. When a snapshot is restored,
* there will always be a single buffer. */
struct raft_buffer *bufs;
unsigned n_bufs;
};
/**
* Asynchronous request to store a new snapshot.
*/
struct raft_io_snapshot_put;
typedef void (*raft_io_snapshot_put_cb)(struct raft_io_snapshot_put *req,
int status);
struct raft_io_snapshot_put
{
void *data; /* User data */
raft_io_snapshot_put_cb cb; /* Request callback */
};
#define RAFT__SNAPSHOT_FIELDS_V0 \
struct \
{ \
unsigned _threshold; /* N. of entries before snapshot */ \
unsigned _trailing; /* N. of trailing entries to retain */ \
struct raft_snapshot _pending; \
struct raft_io_snapshot_put _put; \
}
#endif /* RAFT__LEGACY_no */
/* Snapshot bookkeeping fields used by the newer (raft_step-based) API. This
 * union arm overlays RAFT__SNAPSHOT_FIELDS_V0 inside struct raft, and the
 * RAFT__ASSERT_COMPATIBILITY check below guards that the two layouts stay
 * compatible. */
#define RAFT__SNAPSHOT_FIELDS_V1 \
    struct \
    { \
        bool unused; /* Placeholder — presumably never read; confirm. */ \
        bool installing; /* A RAFT_UPDATE_SNAPSHOT request is in flight */ \
    }
RAFT__ASSERT_COMPATIBILITY(RAFT__SNAPSHOT_FIELDS_V0, RAFT__SNAPSHOT_FIELDS_V1);
typedef void (*raft_close_cb)(struct raft *raft);
struct raft_progress;
#if !defined(RAFT__LEGACY_no)
struct raft_log;
#endif
/**
* Hold and drive the state of a single raft server in a cluster.
* When replacing reserved fields in the middle of this struct, you MUST use a
* type with the same size and alignment requirements as the original type.
*/
struct raft
{
#if !defined(RAFT__LEGACY_no)
void *data; /* Custom user data. */
#endif
struct raft_tracer *tracer; /* Tracer implementation. */
#if !defined(RAFT__LEGACY_no)
struct raft_io *io; /* Disk and network I/O implementation. */
struct raft_fsm *fsm; /* User-defined FSM to apply commands to. */
#endif
raft_id id; /* Server ID of this raft instance. */
char *address; /* Server address of this raft instance. */
/*
* Cache of the server's persistent state, updated on stable storage before
* responding to RPCs (Figure 3.1).
*/
raft_term current_term; /* Latest term server has seen. */
raft_id voted_for; /* Candidate that received vote in current term. */
union {
#if !defined(RAFT__LEGACY_no)
struct raft_log *unused; /* XXX: Legacy field. */
#endif
struct
{
unsigned short capacity; /* Guaranteed disk capacity */
unsigned short capacity_threshold;
};
};
/*
* Current membership configuration (Chapter 4).
*
* At any given moment the current configuration can be committed or
* uncommitted.
*
* If a server is voting, the log entry with index 1 must always contain the
* first committed configuration.
*
* At all times #configuration_committed_index is either zero or is the
* index of the most recent log entry of type #RAFT_CHANGE that we know to
* be committed. That means #configuration_committed_index is always equal
* or lower than #commit_index.
*
* At all times #configuration_uncommitted_index is either zero or is the
* index of an uncommitted log entry of type #RAFT_CHANGE. There can be at
* most one uncommitted entry of type #RAFT_CHANGE because we allow only one
* configuration change at a time.
*
* At all times #configuration_committed is a copy of the last committed
* configuration, if any.
*
* The possible scenarios are:
*
* 1. #configuration_committed_index and #configuration_uncommitted_index
* are both zero. This should only happen when a brand new server starts
* joining a cluster and is waiting to receive log entries from the
* current leader. In this case #configuration and
* #configuration_committed must be empty and have no servers.
*
* 2. #configuration_committed_index is non-zero and
* #configuration_uncommitted_index is zero. This means that
* #configuration is committed and there is no pending configuration
* change. The content of #configuration must match the one of the log
* entry at #configuration_committed_index.
*
* 3. #configuration_committed_index and #configuration_uncommitted_index
* are both non-zero, with the latter being greater than the former. This
* means that #configuration is uncommitted and represents a pending
* configuration change. The content of #configuration must match the one
* of the log entry at #configuration_uncommitted_index.
*
* When a new configuration is committed, a copy of it is saved in
* #configuration_committed, so it can be easily retrieved in case the
* log gets truncated because of compaction and does not contain the entry
* at #configuration_committed_index anymore. Likewise, if a snapshot is
* restored its associated configuration is saved in
* #configuration_committed.
*/
struct raft_configuration configuration;
struct raft_configuration configuration_committed;
raft_index configuration_committed_index;
raft_index configuration_uncommitted_index;
/*
* Election timeout in milliseconds (default 1000).
*
* From 3.4:
*
* Raft uses a heartbeat mechanism to trigger leader election. When
* servers start up, they begin as followers. A server remains in follower
* state as long as it receives valid RPCs from a leader or
* candidate. Leaders send periodic heartbeats (AppendEntries RPCs that
* carry no log entries) to all followers in order to maintain their
* authority. If a follower receives no communication over a period of
* time called the election timeout, then it assumes there is no viable
* leader and begins an election to choose a new leader.
*
* This is the baseline value and will be randomized between 1x and 2x.
*
* See raft_change_election_timeout() to customize the value of this
* attribute.
*/
unsigned election_timeout;
/*
* Heartbeat timeout in milliseconds (default 100). This is relevant only
* for when the raft instance is in leader state: empty AppendEntries RPCs
* will be sent if this amount of milliseconds elapses without any
* user-triggered AppendEntries RCPs being sent.
*
* From Figure 3.1:
*
* [Leaders] Send empty AppendEntries RPC during idle periods to prevent
* election timeouts.
*/
unsigned heartbeat_timeout;
/*
* When the leader sends an InstallSnapshot RPC to a follower it will
* consider the RPC as failed after this timeout and retry.
*/
unsigned install_snapshot_timeout;
/*
* The fields below hold the part of the server's volatile state which is
* always applicable regardless of the whether the server is follower,
* candidate or leader (Figure 3.1). This state is rebuilt automatically
* after a server restart.
*/
raft_index commit_index; /* Highest log entry known to be committed */
#if !defined(RAFT__LEGACY_no)
raft_index last_applied; /* Highest log entry applied to the FSM */
#endif
raft_index last_stored; /* Highest log entry persisted on disk */
/*
* Current server state of this raft instance, along with a union defining
* state-specific values.
*/
unsigned short state;
union {
struct /* Follower */
{
unsigned randomized_election_timeout; /* Timer expiration. */
struct /* Current leader info. */
{
raft_id id;
char *address;
} current_leader;
union {
raft_index match; /* Highest local index that matches leader. */
#if !defined(RAFT__LEGACY_no)
uint64_t reserved[8]; /* Future use */
#endif
};
} follower_state;
struct
{
unsigned randomized_election_timeout; /* Timer expiration. */
struct
{
bool grant; /* True if the server voted for us */
unsigned short features; /* What the server is capable of. */
unsigned short capacity; /* Guaranteed capacity. */
} *votes; /* Vote results. */
bool disrupt_leader; /* For leadership transfer */
bool in_pre_vote; /* True in pre-vote phase. */
#if !defined(RAFT__LEGACY_no)
uint64_t reserved[8]; /* Future use */
#endif
} candidate_state;
struct
{
struct raft_progress *progress; /* Per-server replication state. */
#if !defined(RAFT__LEGACY_no)
struct raft_change *unused1; /* XXX: unused, for ABI compat. */
#endif
raft_id promotee_id; /* ID of server being promoted. */
unsigned short round_number; /* Current sync round. */
raft_index round_index; /* Target of the current round. */
raft_time round_start; /* Start of current round. */
#if !defined(RAFT__LEGACY_no)
void *unused2[2]; /* XXX: unused, for ABI compat. */
#endif
union {
#if !defined(RAFT__LEGACY_no)
uint64_t reserved[8]; /* Future use */
#endif
struct
{
                raft_id transferee; /* Server ID of a leadership transfer */
raft_time transfer_start;
bool transferring; /* True if after sending TimeoutNow */
};
};
} leader_state;
};
/* Election timer start.
*
* This timer has different purposes depending on the state. Followers
* convert to candidate after the randomized election timeout has elapsed
* without leader contact. Candidates start a new election after the
* randomized election timeout has elapsed without a winner. Leaders step
* down after the election timeout has elapsed without contacting a majority
* of voting servers. */
raft_time election_timer_start;
#if !defined(RAFT__LEGACY_no)
struct raft_transfer *transfer; /* Used by the legacy compatibility layer */
#endif
/*
* Information about the last snapshot that was taken (if any).
*/
struct
{
union {
RAFT__SNAPSHOT_FIELDS_V0;
RAFT__SNAPSHOT_FIELDS_V1;
};
#if !defined(RAFT__LEGACY_no)
uint64_t reserved[8]; /* Future use */
#endif
} snapshot;
#if !defined(RAFT__LEGACY_no)
/*
* Callback to invoke once a close request has completed.
*/
raft_close_cb close_cb;
#endif
/*
* Human-readable message providing diagnostic information about the last
* error occurred.
*/
char errmsg[RAFT_ERRMSG_BUF_SIZE];
/* Whether to use pre-vote to avoid disconnected servers disrupting the
* current leader, as described in 4.2.3 and 9.6. */
bool pre_vote;
    /* Limit how long to wait for a stand-by to catch up with the log when it
     * is being promoted to voter. */
unsigned max_catch_up_rounds;
unsigned max_catch_up_round_duration;
/* Fields added after the v0.x ABI freeze, packed in the unused space. */
union {
RAFT__RESERVED;
RAFT__EXTENSIONS;
};
};
/* The helper macros above are internal to this header; remove them from the
 * namespace now that struct raft has been defined.
 *
 * Fix: the snapshot-field helpers defined above are RAFT__SNAPSHOT_FIELDS_V0
 * and RAFT__SNAPSHOT_FIELDS_V1 — there is no V2, so the previous
 * `#undef RAFT__SNAPSHOT_FIELDS_V2` was a no-op and V0 (along with
 * RAFT__EXTENSIONS_LEGACY) leaked out of the header. */
#undef RAFT__RESERVED
#undef RAFT__EXTENSIONS
#undef RAFT__EXTENSIONS_LEGACY
#undef RAFT__SNAPSHOT_FIELDS_V0
#undef RAFT__SNAPSHOT_FIELDS_V1
struct raft_io;
struct raft_fsm;
RAFT_API int raft_init(struct raft *r,
struct raft_io *io,
struct raft_fsm *fsm,
raft_id id,
const char *address);
RAFT_API void raft_close(struct raft *r, raft_close_cb cb);
/**
* Seed the state of the pseudo random number generator.
*
* This should be called only once, before calling raft_start().
*/
RAFT_API void raft_seed(struct raft *r, unsigned random);
/**
 * Notify the raft engine of the given @event and advance its state machine.
 *
 * On success, @update is filled with the changes that user code must perform
 * as a consequence of the event (entries to persist, messages to send, state
 * or commit-index changes, etc.). Check the RAFT_UPDATE_* bits in
 * update->flags to see which of its fields are relevant.
 *
 * When the RAFT_UPDATE_TIMEOUT bit is set, the next timeout event should be
 * (re)scheduled at the time returned by raft_timeout(), cancelling any
 * previously scheduled timeout that has not fired yet.
 *
 * NOTE(review): memory referenced by @update (e.g. the messages batch) is
 * presumably valid only until the next call to raft_step(), as with the
 * previous task-based API this comment used to describe — confirm. Likewise,
 * persisting a new term/vote (RAFT_UPDATE_CURRENT_TERM/RAFT_UPDATE_VOTED_FOR)
 * should presumably complete before any returned message is sent — confirm.
 */
RAFT_API int raft_step(struct raft *r,
                       struct raft_event *event,
                       struct raft_update *update);
/**
* Return the current term of this server.
*/
RAFT_API raft_term raft_current_term(const struct raft *r);
/**
 * Return the ID of the server that this server has voted for, or #0 if it did
 * not vote.
*/
RAFT_API raft_id raft_voted_for(const struct raft *r);
/**
* Return the commit index of this server.
*/
RAFT_API raft_index raft_commit_index(const struct raft *r);
/**
* Return the time at which the next RAFT_TIMEOUT event should be fired.
*/
RAFT_API raft_time raft_timeout(const struct raft *r);
/**
* Return the current match index of the server with the given ID.
*
* It must be called when in leader state.
*/
RAFT_API int raft_match_index(const struct raft *r,
raft_id id,
raft_index *index);
/**
* Return information about the progress of a server that is catching up with
* logs after a #RAFT_CATCH_UP event was fired.
*/
enum {
RAFT_CATCH_UP_NONE,
RAFT_CATCH_UP_RUNNING,
RAFT_CATCH_UP_ABORTED,
RAFT_CATCH_UP_FINISHED
};
RAFT_API int raft_catch_up(const struct raft *r, raft_id id, int *status);
/**
 * Return the ID of the server that leadership is being transferred to, or #0 if
* no leadership transfer is in progress.
*/
RAFT_API raft_id raft_transferee(const struct raft *r);
/**
* Set the election timeout.
*
* Every raft instance is initialized with a default election timeout of 1000
* milliseconds. If you wish to tweak it, call this function before starting
* your event loop.
*
* From Chapter 9:
*
* We recommend a range that is 10-20 times the one-way network latency, which
* keeps split votes rates under 40% in all cases for reasonably sized
* clusters, and typically results in much lower rates.
*
* Note that the current random election timer will be reset and a new one
* will be generated.
*/
RAFT_API void raft_set_election_timeout(struct raft *r, unsigned msecs);
/**
* Set the heartbeat timeout.
*/
RAFT_API void raft_set_heartbeat_timeout(struct raft *r, unsigned msecs);
/**
* Set the snapshot install timeout.
*/
RAFT_API void raft_set_install_snapshot_timeout(struct raft *r, unsigned msecs);
/**
* Enable or disable pre-vote support. Pre-vote is turned off by default.
*/
RAFT_API void raft_set_pre_vote(struct raft *r, bool enabled);
/**
* Set the maximum number of a catch-up rounds to try when replicating entries
* to a stand-by server that is being promoted to voter, before giving up and
* failing the configuration change. The default is 10.
*/
RAFT_API void raft_set_max_catch_up_rounds(struct raft *r, unsigned n);
/**
* Set the maximum duration of a catch-up round when replicating entries to a
* stand-by server that is being promoted to voter. The default is 5 seconds.
*/
RAFT_API void raft_set_max_catch_up_round_duration(struct raft *r,
unsigned msecs);
/**
* Set the maximum number of in-flight append messages that will be
* optimistically sent to peers without waiting for acknowledgment. The engine
* will stop sending more messages if this limit is reached. The default is 32.
*
* This limit also applies to entries being persisted locally and that haven't
* been acknowledged yet.
*/
RAFT_API void raft_set_max_inflight_entries(struct raft *r, unsigned n);
/**
* If a non-zero capacity threshold is set using this function, then the
* #capacity field of struct raft_event must be always set when calling
* raft_step(), and its value should contain the current amount of disk space
* that the server is guaranteed to have and can use when storing new entries.
*
* If the leader notices that a majority of voting server have their capacity
* below #min, then it will refuse to accept new entries. Calling raft_step()
* with a #RAFT_SUBMIT event will return #RAFT_NOSPACE.
*/
RAFT_API void raft_set_capacity_threshold(struct raft *r, unsigned short min);
/**
* Return a human-readable description of the last error occurred.
*/
RAFT_API const char *raft_errmsg(struct raft *r);
/**
* Return the code of the current raft state (follower/candidate/leader).
*/
RAFT_API enum raft_state raft_state(struct raft *r);
/**
* Return the code of the current raft role (spare/standby/voter),
* or -1 if this server is not in the current configuration.
*/
RAFT_API int raft_role(struct raft *r);
/**
* Return the ID and address of the current known leader, if any.
*/
RAFT_API void raft_leader(struct raft *r, raft_id *id, const char **address);
/**
* Return the index of the last entry that was appended to the local log.
*/
RAFT_API raft_index raft_last_index(struct raft *r);
/**
* Generate a pseudo-random number between @min and @max, using @state as
* generator state.
*/
RAFT_API unsigned raft_random(unsigned *state, unsigned min, unsigned max);
/**
* Return the name of state with the given code.
*/
RAFT_API const char *raft_state_name(int state);
/**
* Return the name of role with the given code.
*/
RAFT_API const char *raft_role_name(int state);
/**
* User-definable dynamic memory allocation functions.
*
* The @data field will be passed as first argument to all functions.
*/
struct raft_heap
{
void *data; /* User data */
void *(*malloc)(void *data, size_t size);
void (*free)(void *data, void *ptr);
void *(*calloc)(void *data, size_t nmemb, size_t size);
void *(*realloc)(void *data, void *ptr, size_t size);
void *(*aligned_alloc)(void *data, size_t alignment, size_t size);
void (*aligned_free)(void *data, size_t alignment, void *ptr);
};
RAFT_API void *raft_malloc(size_t size);
RAFT_API void raft_free(void *ptr);
RAFT_API void *raft_calloc(size_t nmemb, size_t size);
RAFT_API void *raft_realloc(void *ptr, size_t size);
RAFT_API void *raft_aligned_alloc(size_t alignment, size_t size);
RAFT_API void raft_aligned_free(size_t alignment, void *ptr);
/**
* Use a custom dynamic memory allocator.
*/
RAFT_API void raft_heap_set(struct raft_heap *heap);
/**
* Use the default dynamic memory allocator (from the stdlib). This clears any
* custom allocator specified with @raft_heap_set.
*/
RAFT_API void raft_heap_set_default(void);
/**
* Return a reference to the current dynamic memory allocator.
*
* This is intended for use by applications that want to temporarily replace
* and then restore the original allocator, or that want to defer to the
* original allocator in some circumstances.
*
* The behavior of attempting to mutate the default allocator through the
* pointer returned by this function, including attempting to deallocate
* the backing memory, is undefined.
*/
RAFT_API const struct raft_heap *raft_heap_get(void);
#if !defined(RAFT__LEGACY_no)
/**
* Asynchronous request to send an RPC message.
*/
struct raft_io_send;
typedef void (*raft_io_send_cb)(struct raft_io_send *req, int status);
struct raft_io_send
{
void *data; /* User data */
raft_io_send_cb cb; /* Request callback */
};
/**
* Asynchronous request to store new log entries.
*/
struct raft_io_append;
typedef void (*raft_io_append_cb)(struct raft_io_append *req, int status);
struct raft_io_append
{
void *data; /* User data */
raft_io_append_cb cb; /* Request callback */
};
/**
* Asynchronous request to load the most recent snapshot available.
*/
struct raft_io_snapshot_get;
typedef void (*raft_io_snapshot_get_cb)(struct raft_io_snapshot_get *req,
struct raft_snapshot *snapshot,
int status);
struct raft_io_snapshot_get
{
void *data; /* User data */
raft_io_snapshot_get_cb cb; /* Request callback */
};
struct raft_io; /* Forward declaration. */
/**
* Callback invoked by the I/O implementation at regular intervals.
*/
typedef void (*raft_io_tick_cb)(struct raft_io *io);
/**
* Callback invoked by the I/O implementation when an RPC message is received.
*/
typedef void (*raft_io_recv_cb)(struct raft_io *io, struct raft_message *msg);
typedef void (*raft_io_close_cb)(struct raft_io *io);
/**
* version field MUST be filled out by user.
* When moving to a new version, the user MUST implement the newly added
* methods.
*/
struct raft_io
{
short version; /* 1 or 2 */
unsigned short capacity; /* Reserved disk capacity */
void *data;
void *impl;
char errmsg[RAFT_ERRMSG_BUF_SIZE];
int (*init)(struct raft_io *io, raft_id id, const char *address);
void (*close)(struct raft_io *io, raft_io_close_cb cb);
int (*load)(struct raft_io *io,
raft_term *term,
raft_id *voted_for,
struct raft_snapshot **snapshot,
raft_index *start_index,
struct raft_entry *entries[],
size_t *n_entries);
int (*start)(struct raft_io *io,
unsigned msecs,
raft_io_tick_cb tick,
raft_io_recv_cb recv);
int (*bootstrap)(struct raft_io *io, const struct raft_configuration *conf);
int (*recover)(struct raft_io *io, const struct raft_configuration *conf);
int (*set_term)(struct raft_io *io, raft_term term);
int (*set_vote)(struct raft_io *io, raft_id server_id);
int (*send)(struct raft_io *io,
struct raft_io_send *req,
const struct raft_message *message,
raft_io_send_cb cb);
int (*append)(struct raft_io *io,
struct raft_io_append *req,
const struct raft_entry entries[],
unsigned n,
raft_io_append_cb cb);
int (*truncate)(struct raft_io *io, raft_index index);
int (*snapshot_put)(struct raft_io *io,
unsigned trailing,
struct raft_io_snapshot_put *req,
const struct raft_snapshot *snapshot,
raft_io_snapshot_put_cb cb);
int (*snapshot_get)(struct raft_io *io,
struct raft_io_snapshot_get *req,
raft_io_snapshot_get_cb cb);
raft_time (*time)(struct raft_io *io);
int (*random)(struct raft_io *io, int min, int max);
};
/**
* version field MUST be filled out by user.
* When moving to a new version, the user MUST initialize the new methods,
* either with an implementation or with NULL.
*
* version 2:
* introduces `snapshot_finalize`, when this method is not NULL, it will
* always run after a successful call to `snapshot`, whether the snapshot has
* been successfully written to disk or not. If it is set, raft will
* assume no ownership of any of the `raft_buffer`s and the responsibility to
* clean up lies with the user of raft.
* `snapshot_finalize` can be used to e.g. release a lock that was taken during
* a call to `snapshot`. Until `snapshot_finalize` is called, raft can access
* the data contained in the `raft_buffer`s.
*/
struct raft_fsm
{
int version; /* 1, 2 or 3 */
void *data;
int (*apply)(struct raft_fsm *fsm,
const struct raft_buffer *buf,
void **result);
int (*snapshot)(struct raft_fsm *fsm,
struct raft_buffer *bufs[],
unsigned *n_bufs);
int (*restore)(struct raft_fsm *fsm, struct raft_buffer *buf);
/* Fields below added since version 2. */
int (*snapshot_finalize)(struct raft_fsm *fsm,
struct raft_buffer *bufs[],
unsigned *n_bufs);
/* Fields below added since version 3. */
int (*snapshot_async)(struct raft_fsm *fsm,
struct raft_buffer *bufs[],
unsigned *n_bufs);
};
/**
* Close callback.
*
* It's safe to release the memory of a raft instance only after this callback
* has fired.
*/
struct raft_change; /* Forward declaration */
struct raft_transfer; /* Forward declaration */
/**
* Bootstrap this raft instance using the given configuration. The instance
* must not have been started yet and must be completely pristine, otherwise
* #RAFT_CANTBOOTSTRAP will be returned.
*/
RAFT_API int raft_bootstrap(struct raft *r,
const struct raft_configuration *conf);
/**
* Force a new configuration in order to recover from a loss of quorum where the
* current configuration cannot be restored, such as when a majority of servers
* die at the same time.
*
* This works by appending the new configuration directly to the log stored on
* disk.
*
* In order for this operation to be safe you must follow these steps:
*
* 1. Make sure that no servers in the cluster are running, either because they
* died or because you manually stopped them.
*
* 2. Run @raft_recover exactly one time, on the non-dead server which has
* the highest term and the longest log.
*
* 3. Copy the data directory of the server you ran @raft_recover on to all
* other non-dead servers in the cluster, replacing their current data
* directory.
*
* 4. Restart all servers.
*/
RAFT_API int raft_recover(struct raft *r,
const struct raft_configuration *conf);
RAFT_API int raft_start(struct raft *r);
/* Unused uint64_t slots that are reserved for v0.x extensions.*/
#define RAFT__REQUEST_RESERVED \
struct \
{ \
uint64_t reserved[4]; \
}
/* Extended RAFT__REQUEST fields added after the v0.x ABI freeze. */
#define RAFT__REQUEST_EXTENSIONS \
struct \
{ \
int status; /* Store the request status code, for delayed firing */ \
union { \
void *result; /* For raft_apply, store the request result */ \
raft_id catch_up_id; /* For raft_change, the catching up server */ \
}; \
}
RAFT__ASSERT_COMPATIBILITY(RAFT__REQUEST_RESERVED, RAFT__REQUEST_EXTENSIONS);
/**
* Common fields across client request types.
* `req_id`, `client_id` and `unique_id` are currently unused.
* `reserved` fields should be replaced by new members with the same size
* and alignment requirements as `uint64_t`.
*/
#define RAFT__REQUEST \
void *data; \
int type; \
raft_index index; \
void *queue[2]; \
uint8_t req_id[16]; \
uint8_t client_id[16]; \
uint8_t unique_id[16]; \
/* Fields added after the v0.x ABI freeze, packed in the unused space. */ \
union { \
RAFT__REQUEST_RESERVED; \
RAFT__REQUEST_EXTENSIONS; \
}
/**
* Asynchronous request to append a new command entry to the log and apply it to
* the FSM when a quorum is reached.
*/
struct raft_apply;
typedef void (*raft_apply_cb)(struct raft_apply *req, int status, void *result);
struct raft_apply
{
RAFT__REQUEST;
raft_apply_cb cb;
};
/**
* Propose to append commands to the log and apply them to the FSM once
* committed.
*
* If this server is the leader, it will create @n new log entries of type
* #RAFT_COMMAND using the given buffers as their payloads, append them to its
* own log and attempt to replicate them on other servers by sending
* AppendEntries RPCs.
*
* The memory pointed at by the @base attribute of each #raft_buffer in the
* given array must have been allocated with raft_malloc() or a compatible
* allocator. If this function returns 0, the ownership of this memory is
* implicitly transferred to the raft library, which will take care of releasing
* it when appropriate. Any further client access to such memory leads to
* undefined behavior.
*
* The ownership of the memory of the @bufs array itself is not transferred to
* the raft library, and, if allocated dynamically, must be deallocated by the
* caller.
*
* If the command was successfully applied, r->last_applied will be equal to
* the log entry index of the applied command when the cb is invoked.
*/
RAFT_API int raft_apply(struct raft *r,
struct raft_apply *req,
const struct raft_buffer bufs[],
const unsigned n,
raft_apply_cb cb);
/**
* Asynchronous request to append a barrier entry.
*/
struct raft_barrier;
typedef void (*raft_barrier_cb)(struct raft_barrier *req, int status);
struct raft_barrier
{
RAFT__REQUEST;
raft_barrier_cb cb;
};
/**
* Propose to append a log entry of type #RAFT_BARRIER.
*
* This can be used to ensure that there are no unapplied commands.
*/
RAFT_API int raft_barrier(struct raft *r,
struct raft_barrier *req,
raft_barrier_cb cb);
/**
* Asynchronous request to change the raft configuration.
*/
typedef void (*raft_change_cb)(struct raft_change *req, int status);
struct raft_change
{
RAFT__REQUEST;
raft_change_cb cb;
};
/**
* Add a new server to the cluster configuration. Its initial role will be
* #RAFT_SPARE.
*/
RAFT_API int raft_add(struct raft *r,
struct raft_change *req,
raft_id id,
const char *address,
raft_change_cb cb);
/**
* Assign a new role to the given server.
*
* If the server has already the given role, or if the given role is unknown,
* #RAFT_BADROLE is returned.
*/
RAFT_API int raft_assign(struct raft *r,
struct raft_change *req,
raft_id id,
int role,
raft_change_cb cb);
/**
* Remove the given server from the cluster configuration.
*/
RAFT_API int raft_remove(struct raft *r,
struct raft_change *req,
raft_id id,
raft_change_cb cb);
/**
* Asynchronous request to transfer leadership.
*/
typedef void (*raft_transfer_cb)(struct raft_transfer *req);
struct raft_transfer
{
RAFT__REQUEST;
raft_id id; /* ID of target server. */
raft_time start; /* Start of leadership transfer. */
struct raft_io_send send; /* For sending TimeoutNow */
raft_transfer_cb cb; /* User callback */
};
/**
* Transfer leadership to the server with the given ID.
*
* If the target server is not part of the configuration, or it's the leader
* itself, or it's not a #RAFT_VOTER, then #RAFT_BADID is returned.
*
* The special value #0 means to automatically select a voting follower to
* transfer leadership to. If there are no voting followers, return
* #RAFT_NOTFOUND.
*
* When this server detects that the target server has become the leader, or
* when @election_timeout milliseconds have elapsed, the given callback will be
* invoked.
*
* After the callback files, clients can check whether the operation was
* successful or not by calling @raft_leader() and checking if it returns the
* target server.
*/
RAFT_API int raft_transfer(struct raft *r,
struct raft_transfer *req,
raft_id id,
raft_transfer_cb cb);
/**
* Return the index of the last entry that was applied to the local FSM.
*/
RAFT_API raft_index raft_last_applied(struct raft *r);
/**
* Number of outstanding log entries before starting a new snapshot. The default
* is 1024.
*/
RAFT_API void raft_set_snapshot_threshold(struct raft *r, unsigned n);
/**
* Number of outstanding log entries to keep in the log after a snapshot has
* been taken. This avoids sending snapshots when a follower is behind by just a
* few entries. The default is 128.
*/
RAFT_API void raft_set_snapshot_trailing(struct raft *r, unsigned n);
#endif
#undef RAFT__REQUEST
#undef RAFT__ASSERT_COMPATIBILITY
#undef RAFT__STATIC_ASSERT
#endif /* RAFT_H */
raft-0.22.1/include/raft/ 0000775 0000000 0000000 00000000000 14601504142 0015104 5 ustar 00root root 0000000 0000000 raft-0.22.1/include/raft/fixture.h 0000664 0000000 0000000 00000041173 14601504142 0016751 0 ustar 00root root 0000000 0000000 /**
* Raft cluster test fixture, using an in-memory @raft_io implementation. This
* is meant to be used in unit tests.
*/
#ifndef RAFT_FIXTURE_H
#define RAFT_FIXTURE_H
#include "../raft.h"
#include
#define RAFT_FIXTURE_MAX_SERVERS 8
/**
* Fixture step event types.
*/
enum {
RAFT_FIXTURE_TICK = 1, /* The tick callback has been invoked */
RAFT_FIXTURE_NETWORK, /* A network request has been sent or received */
RAFT_FIXTURE_DISK, /* An I/O request has been submitted */
RAFT_FIXTURE_WORK /* A large, CPU and/or memory intensive task */
};
/**
* State of a single server in a cluster fixture.
*/
struct raft_fixture_server;
/**
* Information about a test cluster event triggered by the fixture.
*/
struct raft_fixture_event;
/**
* Returns the type of the event.
*/
int raft_fixture_event_type(struct raft_fixture_event *event);
/**
* Returns the server index of the event.
*/
unsigned raft_fixture_event_server_index(struct raft_fixture_event *event);
/**
* Event callback. See raft_fixture_hook().
*/
struct raft_fixture;
typedef void (*raft_fixture_event_cb)(struct raft_fixture *f,
struct raft_fixture_event *event);
/**
* Test implementation of a cluster of @n servers, each having a user-provided
* FSM.
*
* The cluster can simulate network latency and time elapsed on individual
* servers.
*
* Servers can be alive or dead. Network messages sent to dead servers are
* dropped. Dead servers do not have their @raft_io_tick_cb callback invoked.
*
* Any two servers can be connected or disconnected. Network messages sent
* between disconnected servers are dropped.
*/
struct raft_fixture
{
raft_time time; /* Global time, common to all servers. */
unsigned n; /* Number of servers. */
raft_id leader_id; /* ID of current leader, or 0 if none. */
struct raft_log *log; /* Copy of current leader's log. */
raft_index commit_index; /* Current commit index on leader. */
struct raft_fixture_event *event; /* Last event occurred. */
raft_fixture_event_cb hook; /* Event callback. */
struct raft_fixture_server *servers[RAFT_FIXTURE_MAX_SERVERS];
uint64_t reserved[16]; /* For future expansion of struct. */
};
/**
* Initialize a raft cluster fixture. Servers can be added by using
* `raft_fixture_grow`.
*/
RAFT_API int raft_fixture_init(struct raft_fixture *f);
RAFT_API int raft_fixture_initialize(struct raft_fixture *f);
/**
* Release all memory used by the fixture.
*/
RAFT_API void raft_fixture_close(struct raft_fixture *f);
/**
* Convenience to generate a configuration object containing all servers in the
* cluster. The first @n_voting servers will be voting ones.
*/
RAFT_API int raft_fixture_configuration(struct raft_fixture *f,
unsigned n_voting,
struct raft_configuration *conf);
/**
* Convenience to bootstrap all servers in the cluster using the given
* configuration.
*/
RAFT_API int raft_fixture_bootstrap(struct raft_fixture *f,
struct raft_configuration *conf);
/**
* Convenience to start all servers in the fixture.
*/
RAFT_API int raft_fixture_start(struct raft_fixture *f);
/**
* Return the number of servers in the fixture.
*/
RAFT_API unsigned raft_fixture_n(struct raft_fixture *f);
/**
* Return the current cluster global time. All raft instances see the same time.
*/
RAFT_API raft_time raft_fixture_time(struct raft_fixture *f);
/**
* Return the raft instance associated with the @i'th server of the fixture.
*/
RAFT_API struct raft *raft_fixture_get(struct raft_fixture *f, unsigned i);
/**
* Return @true if the @i'th server hasn't been killed.
*/
RAFT_API bool raft_fixture_alive(struct raft_fixture *f, unsigned i);
/**
* Return the index of the current leader, or the current number of servers if
* there's no leader.
*/
RAFT_API unsigned raft_fixture_leader_index(struct raft_fixture *f);
/**
* Return the ID of the server the @i'th server has voted for, or zero .
*/
RAFT_API raft_id raft_fixture_voted_for(struct raft_fixture *f, unsigned i);
/**
* Drive the cluster so the @i'th server starts an election but doesn't
* necessarily win it.
*
* This is achieved by bumping the randomized election timeout of all other
* servers to a very high value, letting the one of the @i'th server expire.
*
* There must currently be no leader and no candidate and the given server must
* be a voting one. Also, the @i'th server must be connected to a majority of
* voting servers.
*/
RAFT_API void raft_fixture_start_elect(struct raft_fixture *f, unsigned i);
/**
* Calls raft_fixture_start_elect, but waits and asserts that the @i'th server
* has become the leader.
*/
RAFT_API void raft_fixture_elect(struct raft_fixture *f, unsigned i);
/**
* Drive the cluster so the current leader gets deposed.
*
* This is achieved by dropping all AppendEntries result messages sent by
* followers to the leader, until the leader decides to step down because it has
* lost connectivity to a majority of followers.
*/
RAFT_API void raft_fixture_depose(struct raft_fixture *f);
/**
* Step through the cluster state advancing the time to the minimum value needed
* for it to make progress (i.e. for a message to be delivered, for an I/O
* operation to complete or for a single time tick to occur).
*
* In particular, the following happens:
*
* 1. If there are pending #raft_io_send requests, that have been submitted
* using #raft_io->send() and not yet sent, the oldest one is picked and the
* relevant callback fired. This simulates completion of a socket write,
* which means that the send request has been completed. The receiver does
* not immediately receives the message, as the message is propagating
* through the network. However any memory associated with the #raft_io_send
* request can be released (e.g. log entries). The in-memory I/O
* implementation assigns a latency to each RPC message, which will get
* delivered to the receiver only after that amount of time elapses. If the
* sender and the receiver are currently disconnected, the RPC message is
* simply dropped. If a callback was fired, jump directly to 3. and skip 2.
*
* 2. All pending #raft_io_append disk writes across all servers, that have been
* submitted using #raft_io->append() but not yet completed, are scanned and
* the one with the lowest completion time is picked. All in-flight network
* messages waiting to be delivered are scanned and the one with the lowest
* delivery time is picked. All servers are scanned, and the one with the
* lowest tick expiration time is picked. The three times are compared and
* the lowest one is picked. If a #raft_io_append disk write has completed,
* the relevant callback will be invoked, if there's a network message to be
* delivered, the receiver's @raft_io_recv_cb callback gets fired, if a tick
* timer has expired the relevant #raft_io->tick() callback will be
* invoked. Only one event will be fired. If there is more than one event to
* fire, one of them is picked according to the following rules: events for
* servers with lower index are fired first, tick events take precedence over
* disk events, and disk events take precedence over network events.
*
* 3. The current cluster leader is detected (if any). When detecting the leader
* the Election Safety property is checked: no servers can be in leader state
* for the same term. The server in leader state with the highest term is
* considered the current cluster leader, as long as it's "stable", i.e. it
* has been acknowledged by all servers connected to it, and those servers
* form a majority (this means that no further leader change can happen,
* unless the network gets disrupted). If there is a stable leader and it has
* not changed with respect to the previous call to @raft_fixture_step(),
* then the Leader Append-Only property is checked, by comparing its log with
* a copy of it that was taken during the previous iteration.
*
* 4. If there is a stable leader, its current log is copied, in order to be
* able to check the Leader Append-Only property at the next call.
*
* 5. If there is a stable leader, its commit index gets copied.
*
* The function returns information about which particular event occurred
* (either in step 1 or 2).
*/
RAFT_API struct raft_fixture_event *raft_fixture_step(struct raft_fixture *f);
/**
* Call raft_fixture_step() exactly @n times, and return the last event fired.
*/
RAFT_API struct raft_fixture_event *raft_fixture_step_n(struct raft_fixture *f,
unsigned n);
/**
* Step the cluster until the given @stop function returns #true, or @max_msecs
* have elapsed.
*
* Return #true if the @stop function has returned #true within @max_msecs.
*/
RAFT_API bool raft_fixture_step_until(struct raft_fixture *f,
bool (*stop)(struct raft_fixture *f,
void *arg),
void *arg,
unsigned max_msecs);
/**
* Step the cluster until @msecs have elapsed.
*/
RAFT_API void raft_fixture_step_until_elapsed(struct raft_fixture *f,
unsigned msecs);
/**
* Step the cluster until a leader is elected, or @max_msecs have elapsed.
*/
RAFT_API bool raft_fixture_step_until_has_leader(struct raft_fixture *f,
unsigned max_msecs);
/**
* Step the cluster until the current leader gets deposed, or @max_msecs have
* elapsed.
*/
RAFT_API bool raft_fixture_step_until_has_no_leader(struct raft_fixture *f,
unsigned max_msecs);
/**
* Step the cluster until the @i'th server has applied the entry at the given
* index, or @max_msecs have elapsed. If @i equals the number of servers, then
* step until all servers have applied the given entry.
*/
RAFT_API bool raft_fixture_step_until_applied(struct raft_fixture *f,
unsigned i,
raft_index index,
unsigned max_msecs);
/**
* Step the cluster until the state of the @i'th server matches the given one,
* or @max_msecs have elapsed.
*/
RAFT_API bool raft_fixture_step_until_state_is(struct raft_fixture *f,
unsigned i,
int state,
unsigned max_msecs);
/**
* Step the cluster until the term of the @i'th server matches the given one,
* or @max_msecs have elapsed.
*/
RAFT_API bool raft_fixture_step_until_term_is(struct raft_fixture *f,
unsigned i,
raft_term term,
unsigned max_msecs);
/**
* Step the cluster until the @i'th server has voted for the @j'th one, or
* @max_msecs have elapsed.
*/
RAFT_API bool raft_fixture_step_until_voted_for(struct raft_fixture *f,
unsigned i,
unsigned j,
unsigned max_msecs);
/**
* Step the cluster until all pending network messages from the @i'th server to
* the @j'th server have been delivered, or @max_msecs have elapsed.
*/
RAFT_API bool raft_fixture_step_until_delivered(struct raft_fixture *f,
unsigned i,
unsigned j,
unsigned max_msecs);
/**
* Set a function to be called after every time a fixture event occurs as
* consequence of a step.
*/
RAFT_API void raft_fixture_hook(struct raft_fixture *f,
raft_fixture_event_cb hook);
/**
* Disconnect the @i'th and the @j'th servers, so attempts to send a message
* from @i to @j will fail with #RAFT_NOCONNECTION.
*/
RAFT_API void raft_fixture_disconnect(struct raft_fixture *f,
unsigned i,
unsigned j);
/**
* Reconnect the @i'th and the @j'th servers, so attempts to send a message
* from @i to @j will succeed again.
*/
RAFT_API void raft_fixture_reconnect(struct raft_fixture *f,
unsigned i,
unsigned j);
/**
* Saturate the connection between the @i'th and the @j'th servers, so messages
* sent by @i to @j will be silently dropped.
*/
RAFT_API void raft_fixture_saturate(struct raft_fixture *f,
unsigned i,
unsigned j);
/**
* Return true if the connection from the @i'th to the @j'th server has been set
* as saturated.
*/
RAFT_API bool raft_fixture_saturated(struct raft_fixture *f,
unsigned i,
unsigned j);
/**
* Desaturate the connection between the @i'th and the @j'th servers, so
* messages sent by @i to @j will start being delivered again.
*/
RAFT_API void raft_fixture_desaturate(struct raft_fixture *f,
unsigned i,
unsigned j);
/**
* Kill the server with the given index. The server won't receive any message
* and its tick callback won't be invoked.
*/
RAFT_API void raft_fixture_kill(struct raft_fixture *f, unsigned i);
/**
* Revive a killed server with the given index.
*/
RAFT_API void raft_fixture_revive(struct raft_fixture *f, unsigned i);
/**
* Add a new empty server to the cluster and connect it to all others.
*/
RAFT_API int raft_fixture_grow(struct raft_fixture *f, struct raft_fsm *fsm);
/**
* Set the value that will be returned to the @i'th raft instance when it asks
* the underlying #raft_io implementation for a randomized election timeout
* value. The default value is 1000 + @i * 100, meaning that the election timer
* of server 0 will expire first.
*/
RAFT_API void raft_fixture_set_randomized_election_timeout(
struct raft_fixture *f,
unsigned i,
unsigned msecs);
/**
* Set the network latency in milliseconds. Each RPC message sent by the @i'th
* server from now on will take @msecs milliseconds to be delivered. The default
* value is 15.
*/
RAFT_API void raft_fixture_set_network_latency(struct raft_fixture *f,
unsigned i,
unsigned msecs);
/**
* Set the disk I/O latency in milliseconds. Each append request will take this
* amount of milliseconds to complete. The default value is 10.
*/
RAFT_API void raft_fixture_set_disk_latency(struct raft_fixture *f,
unsigned i,
unsigned msecs);
/**
* Set the persisted term of the @i'th server.
*/
RAFT_API void raft_fixture_set_term(struct raft_fixture *f,
unsigned i,
raft_term term);
/**
* Set the most recent persisted snapshot on the @i'th server.
*/
RAFT_API void raft_fixture_set_snapshot(struct raft_fixture *f,
unsigned i,
struct raft_snapshot *snapshot);
/**
* Add an entry to the persisted entries of the @i'th server.
*/
RAFT_API void raft_fixture_add_entry(struct raft_fixture *f,
unsigned i,
struct raft_entry *entry);
/**
* Inject an I/O failure that will be triggered on the @i'th server after @delay
* I/O requests and occur @repeat times.
*/
RAFT_API void raft_fixture_io_fault(struct raft_fixture *f,
unsigned i,
int delay,
int repeat);
/**
* Return the number of messages of the given type that the @i'th server has
* successfully sent so far.
*/
RAFT_API unsigned raft_fixture_n_send(struct raft_fixture *f,
unsigned i,
int type);
/**
* Return the number of messages of the given type that the @i'th server has
* received so far.
*/
RAFT_API unsigned raft_fixture_n_recv(struct raft_fixture *f,
unsigned i,
int type);
#endif /* RAFT_FIXTURE_H */
raft-0.22.1/include/raft/uv.h 0000664 0000000 0000000 00000024033 14601504142 0015711 0 ustar 00root root 0000000 0000000 #ifndef RAFT_UV_H
#define RAFT_UV_H
#include
#include "../raft.h"
/**
* Trace events fired when a disk write gets submitted and completed.
*/
#define RAFT_UV_TRACER_WRITE_SUBMIT (1 << 8)
#define RAFT_UV_TRACER_WRITE_COMPLETE (2 << 8)
struct raft_uv_transport;
/**
* Configure the given @raft_io instance to use a libuv-based I/O
* implementation.
*
* The @dir path will be copied, and its memory can possibly be released once
* this function returns.
*
* Return #RAFT_NAMETOOLONG if @dir exceeds the size of the internal buffer
* that should hold it
*
* Return #RAFT_NOTFOUND if @dir does not exist.
*
* Return #RAFT_INVALID if @dir exists but it's not a directory.
*
* The implementation of metadata and log persistency is virtually the same as
* the one found in LogCabin [0].
*
* The disk files consist of metadata files, closed segments, and open
* segments. Metadata files are used to track Raft metadata, such as the
* server's current term, vote, and log's start index. Segments contain
* contiguous entries that are part of the log. Closed segments are never
* written to again (but may be renamed and truncated if a suffix of the log is
* truncated). Open segments are where newly appended entries go. Once an open
* segment reaches the maximum allowed size, it is closed and a new one is used.
*
* Metadata files are named "metadata1" and "metadata2". The code alternates
* between these so that there is always at least one readable metadata file.
* On boot, the readable metadata file with the higher version number is used.
*
* The format of a metadata file is:
*
* [8 bytes] Format (currently 1).
* [8 bytes] Incremental version number.
* [8 bytes] Current term.
* [8 bytes] ID of server we voted for.
*
* Closed segments are named by the format string "%lu-%lu" with their
* start and end indexes, both inclusive. Closed segments always contain at
* least one entry; the end index is always at least as large as the start
* index. Closed segment files may occasionally include data past their
* filename's end index (these are ignored but a warning is logged). This can
* happen if the suffix of the segment is truncated and a crash occurs at an
* inopportune time (the segment file is first renamed, then truncated, and a
* crash occurs in between).
*
* Open segments are named by the format string "open-%lu" with a unique
* number. These should not exist when the server shuts down cleanly, but they
* exist while the server is running and may be left around during a crash.
* Open segments either contain entries which come after the last closed
* segment or are full of zeros. When the server crashes while appending to an
* open segment, the end of that file may be corrupt. We can't distinguish
* between a corrupt file and a partially written entry. The code assumes it's
* a partially written entry, logs a warning, and ignores it.
*
* Truncating a suffix of the log will remove all entries that are no longer
* part of the log. Truncating a prefix of the log will only remove complete
* segments that are before the new log start index. For example, if a
* segment has entries 10 through 20 and the prefix of the log is truncated to
* start at entry 15, that entire segment will be retained.
*
* Each segment file starts with a segment header, which currently contains
* just an 8-byte version number for the format of that segment. The current
* format (version 1) is just a concatenation of serialized entry batches.
*
* Each batch has the following format:
*
* [4 bytes] CRC32 checksum of the batch header, little endian.
* [4 bytes] CRC32 checksum of the batch data, little endian.
* [ ... ] Batch (as described in @raft_decode_entries_batch).
*
* [0] https://github.com/logcabin/logcabin/blob/master/Storage/SegmentedLog.h
*/
RAFT_API int raft_uv_init(struct raft_io *io,
struct uv_loop_s *loop,
const char *dir,
struct raft_uv_transport *transport);
/**
* Release any memory allocated internally.
*/
RAFT_API void raft_uv_close(struct raft_io *io);
/**
* Set the block size that will be used for direct I/O.
*
* The default is to automatically detect the appropriate block size.
*/
RAFT_API void raft_uv_set_block_size(struct raft_io *io, size_t size);
/**
* Set the maximum initial size of newly created open segments.
*
* If the given size is not a multiple of the block size, the actual size will
* be reduced to the closest multiple.
*
* The default is 8 megabytes.
*/
RAFT_API void raft_uv_set_segment_size(struct raft_io *io, size_t size);
/**
* Set the retry frequency for failed disk operations, such as segment
* allocations and append writes.
*
* When a segment fails to be allocated or data write fails to complete due to
* insufficient disk space, the implementation will keep retrying every @msecs
* milliseconds.
*/
RAFT_API void raft_uv_set_disk_retry(struct raft_io *io, unsigned msecs);
/**
* DEPRECATED: This API is a no-op and is provided only for backoward ABI
* compatibility.
*/
RAFT_API int raft_uv_set_snapshot_compression(struct raft_io *io,
bool compressed);
/**
* Set how many milliseconds to wait between subsequent retries when
* establishing a connection with another server. The default is 1000
* milliseconds.
*/
RAFT_API void raft_uv_set_connect_retry_delay(struct raft_io *io,
unsigned msecs);
/**
* Emit low-level debug messages using the given tracer.
*/
RAFT_API void raft_uv_set_tracer(struct raft_io *io,
struct raft_tracer *tracer);
/**
* Enable or disable auto-recovery on startup. Default enabled.
*/
RAFT_API void raft_uv_set_auto_recovery(struct raft_io *io, bool flag);
/**
* Callback invoked by the transport implementation when a new incoming
* connection has been established.
*
* No references to @address must be kept after this function returns.
*
* Ownership of @stream is transferred to user code, which is responsible of
* uv_close()'ing it and then releasing its memory.
*/
typedef void (*raft_uv_accept_cb)(struct raft_uv_transport *t,
raft_id id,
const char *address,
struct uv_stream_s *stream);
/**
* Callback invoked by the transport implementation after a connect request has
* completed. If status is #0, then @stream will point to a valid handle, which
* user code is then responsible to uv_close() and then release.
*/
struct raft_uv_connect;
typedef void (*raft_uv_connect_cb)(struct raft_uv_connect *req,
struct uv_stream_s *stream,
int status);
/**
* Handle to a connect request.
*/
struct raft_uv_connect
{
void *data; /* User data */
raft_uv_connect_cb cb; /* Callback */
};
/**
* Callback invoked by the transport implementation after a close request is
* completed.
*/
typedef void (*raft_uv_transport_close_cb)(struct raft_uv_transport *t);
/**
* Interface to establish outgoing connections to other Raft servers and to
* accept incoming connections from them.
*/
struct raft_uv_transport
{
/**
* Keep track of struct version, MUST be filled out by user.
* When moving to a new version, the user MUST implement the newly added
* methods.
* Latest version is 1.
*/
int version;
/**
* User defined data.
*/
void *data;
/**
* Implementation-defined state.
*/
void *impl;
/**
* Human-readable message providing diagnostic information about the last
* error occurred.
*/
char errmsg[RAFT_ERRMSG_BUF_SIZE];
/**
* Initialize the transport with the given server's identity.
*/
int (*init)(struct raft_uv_transport *t, raft_id id, const char *address);
/**
* Start listening for incoming connections.
*
* Once a new connection is accepted, the @cb callback passed in the
* initializer must be invoked with the relevant details of the connecting
* Raft server.
*/
int (*listen)(struct raft_uv_transport *t, raft_uv_accept_cb cb);
/**
* Connect to the server with the given ID and address.
*
* The @cb callback must be invoked when the connection has been established
* or the connection attempt has failed. The memory pointed by @req can be
* released only after @cb has fired.
*/
int (*connect)(struct raft_uv_transport *t,
struct raft_uv_connect *req,
raft_id id,
const char *address,
raft_uv_connect_cb cb);
/**
* Close the transport.
*
* The implementation must:
*
* - Stop accepting incoming connections. The @cb callback passed to @listen
* must not be invoked anymore.
*
* - Cancel all pending @connect requests.
*
* - Invoke the @cb callback passed to this method once it's safe to release
* the memory of the transport object.
*/
void (*close)(struct raft_uv_transport *t, raft_uv_transport_close_cb cb);
};
/**
* Init a transport interface that uses TCP sockets.
*/
RAFT_API int raft_uv_tcp_init(struct raft_uv_transport *t,
struct uv_loop_s *loop);
/**
* Release any memory allocated internally.
*/
RAFT_API void raft_uv_tcp_close(struct raft_uv_transport *t);
/**
* Set the IP address and port that the listening socket will bind to.
*
* By default the socket will bind to the address provided in
* raft_init(), which may be inconvenient if running your application in a
* container, for example.
*
* The @address argument must be an IPv4 dotted quad IP address and port, e.g.
* "0.0.0.0:8080". If you do not provide a port, the default of 8080 will be
* used. The port given here *must* match the port given to raft_init().
*
* Must be called before raft_init().
*/
RAFT_API int raft_uv_tcp_set_bind_address(struct raft_uv_transport *t,
const char *address);
#endif /* RAFT_UV_H */
raft-0.22.1/m4/ 0000775 0000000 0000000 00000000000 14601504142 0013045 5 ustar 00root root 0000000 0000000 raft-0.22.1/m4/.gitignore 0000664 0000000 0000000 00000000326 14601504142 0015036 0 ustar 00root root 0000000 0000000 *.m4
!attributes.m4
!ax_ac_append_to_file.m4
!ax_ac_print_to_file.m4
!ax_add_am_macro_static.m4
!ax_am_macros_static.m4
!ax_check_gnu_make.m4
!ax_code_coverage.m4
!ax_compare_version.m4
!ax_file_escapes.m4
!pkg.m4
raft-0.22.1/m4/attributes.m4 0000664 0000000 0000000 00000024021 14601504142 0015474 0 ustar 00root root 0000000 0000000 dnl Macros to check the presence of generic (non-typed) symbols.
dnl Copyright (c) 2006-2008 Diego Pettenò
dnl Copyright (c) 2006-2008 xine project
dnl Copyright (c) 2012 Lucas De Marchi
dnl
dnl This program is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU General Public License as published by
dnl the Free Software Foundation; either version 2, or (at your option)
dnl any later version.
dnl
dnl This program is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
dnl GNU General Public License for more details.
dnl
dnl You should have received a copy of the GNU General Public License
dnl along with this program; if not, write to the Free Software
dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
dnl 02110-1301, USA.
dnl
dnl As a special exception, the copyright owners of the
dnl macro gives unlimited permission to copy, distribute and modify the
dnl configure scripts that are the output of Autoconf when processing the
dnl Macro. You need not follow the terms of the GNU General Public
dnl License when using or distributing such scripts, even though portions
dnl of the text of the Macro appear in them. The GNU General Public
dnl License (GPL) does govern all other use of the material that
dnl constitutes the Autoconf Macro.
dnl
dnl This special exception to the GPL applies to versions of the
dnl Autoconf Macro released by this project. When you make and
dnl distribute a modified version of the Autoconf Macro, you may extend
dnl this special exception to the GPL to apply to your modified version as
dnl well.
dnl Check if FLAG in ENV-VAR is supported by compiler and append it
dnl to WHERE-TO-APPEND variable. Note that we invert -Wno-* checks to
dnl -W* as gcc cannot test for negated warnings. If a C snippet is passed,
dnl use it, otherwise use a simple main() definition that just returns 0.
dnl CC_CHECK_FLAG_APPEND([WHERE-TO-APPEND], [ENV-VAR], [FLAG], [C-SNIPPET])
AC_DEFUN([CC_CHECK_FLAG_APPEND], [
AC_CACHE_CHECK([if $CC supports flag $3 in envvar $2],
AS_TR_SH([cc_cv_$2_$3]),
[eval "AS_TR_SH([cc_save_$2])='${$2}'"
eval "AS_TR_SH([$2])='${cc_save_$2} -Werror `echo "$3" | sed 's/^-Wno-/-W/'`'"
AC_LINK_IFELSE([AC_LANG_SOURCE(ifelse([$4], [],
[int main(void) { return 0; } ],
[$4]))],
[eval "AS_TR_SH([cc_cv_$2_$3])='yes'"],
[eval "AS_TR_SH([cc_cv_$2_$3])='no'"])
eval "AS_TR_SH([$2])='$cc_save_$2'"])
AS_IF([eval test x$]AS_TR_SH([cc_cv_$2_$3])[ = xyes],
[eval "$1='${$1} $3'"])
])
dnl CC_CHECK_FLAGS_APPEND([WHERE-TO-APPEND], [ENV-VAR], [FLAG1 FLAG2], [C-SNIPPET])
AC_DEFUN([CC_CHECK_FLAGS_APPEND], [
for flag in [$3]; do
CC_CHECK_FLAG_APPEND([$1], [$2], $flag, [$4])
done
])
dnl Check if the flag is supported by linker (cacheable)
dnl CC_CHECK_LDFLAGS([FLAG], [ACTION-IF-FOUND],[ACTION-IF-NOT-FOUND])
AC_DEFUN([CC_CHECK_LDFLAGS], [
AC_CACHE_CHECK([if $CC supports $1 flag],
AS_TR_SH([cc_cv_ldflags_$1]),
[ac_save_LDFLAGS="$LDFLAGS"
LDFLAGS="$LDFLAGS $1"
AC_LINK_IFELSE([int main() { return 1; }],
[eval "AS_TR_SH([cc_cv_ldflags_$1])='yes'"],
[eval "AS_TR_SH([cc_cv_ldflags_$1])="])
LDFLAGS="$ac_save_LDFLAGS"
])
AS_IF([eval test x$]AS_TR_SH([cc_cv_ldflags_$1])[ = xyes],
[$2], [$3])
])
dnl define the LDFLAGS_NOUNDEFINED variable with the correct value for
dnl the current linker to avoid undefined references in a shared object.
AC_DEFUN([CC_NOUNDEFINED], [
dnl We check $host for which systems to enable this for.
AC_REQUIRE([AC_CANONICAL_HOST])
case $host in
dnl FreeBSD (et al.) does not complete linking for shared objects when pthreads
dnl are requested, as different implementations are present; to avoid problems
dnl use -Wl,-z,defs only for those platform not behaving this way.
*-freebsd* | *-openbsd*) ;;
*)
dnl First of all check for the --no-undefined variant of GNU ld. This allows
dnl for a much more readable command line, so that people can understand what
dnl it does without going to look for what the heck -z defs does.
for possible_flags in "-Wl,--no-undefined" "-Wl,-z,defs"; do
CC_CHECK_LDFLAGS([$possible_flags], [LDFLAGS_NOUNDEFINED="$possible_flags"])
break
done
;;
esac
AC_SUBST([LDFLAGS_NOUNDEFINED])
])
dnl Check for a -Werror flag or equivalent. -Werror is the GCC
dnl and ICC flag that tells the compiler to treat all the warnings
dnl as fatal. We usually need this option to make sure that some
dnl constructs (like attributes) are not simply ignored.
dnl
dnl Other compilers don't support -Werror per se, but they support
dnl an equivalent flag:
dnl - Sun Studio compiler supports -errwarn=%all
AC_DEFUN([CC_CHECK_WERROR], [
AC_CACHE_CHECK(
[for $CC way to treat warnings as errors],
[cc_cv_werror],
[CC_CHECK_CFLAGS_SILENT([-Werror], [cc_cv_werror=-Werror],
[CC_CHECK_CFLAGS_SILENT([-errwarn=%all], [cc_cv_werror=-errwarn=%all])])
])
])
AC_DEFUN([CC_CHECK_ATTRIBUTE], [
AC_REQUIRE([CC_CHECK_WERROR])
AC_CACHE_CHECK([if $CC supports __attribute__(( ifelse([$2], , [$1], [$2]) ))],
AS_TR_SH([cc_cv_attribute_$1]),
[ac_save_CFLAGS="$CFLAGS"
CFLAGS="$CFLAGS $cc_cv_werror"
AC_COMPILE_IFELSE([AC_LANG_SOURCE([$3])],
[eval "AS_TR_SH([cc_cv_attribute_$1])='yes'"],
[eval "AS_TR_SH([cc_cv_attribute_$1])='no'"])
CFLAGS="$ac_save_CFLAGS"
])
AS_IF([eval test x$]AS_TR_SH([cc_cv_attribute_$1])[ = xyes],
[AC_DEFINE(
AS_TR_CPP([SUPPORT_ATTRIBUTE_$1]), 1,
[Define this if the compiler supports __attribute__(( ifelse([$2], , [$1], [$2]) ))]
)
$4],
[$5])
])
AC_DEFUN([CC_ATTRIBUTE_CONSTRUCTOR], [
CC_CHECK_ATTRIBUTE(
[constructor],,
[void __attribute__((constructor)) ctor() { int a; }],
[$1], [$2])
])
AC_DEFUN([CC_ATTRIBUTE_FORMAT], [
CC_CHECK_ATTRIBUTE(
[format], [format(printf, n, n)],
[void __attribute__((format(printf, 1, 2))) printflike(const char *fmt, ...) { fmt = (void *)0; }],
[$1], [$2])
])
AC_DEFUN([CC_ATTRIBUTE_FORMAT_ARG], [
CC_CHECK_ATTRIBUTE(
[format_arg], [format_arg(printf)],
[char *__attribute__((format_arg(1))) gettextlike(const char *fmt) { fmt = (void *)0; }],
[$1], [$2])
])
AC_DEFUN([CC_ATTRIBUTE_VISIBILITY], [
CC_CHECK_ATTRIBUTE(
[visibility_$1], [visibility("$1")],
[void __attribute__((visibility("$1"))) $1_function() { }],
[$2], [$3])
])
AC_DEFUN([CC_ATTRIBUTE_NONNULL], [
CC_CHECK_ATTRIBUTE(
[nonnull], [nonnull()],
[void __attribute__((nonnull())) some_function(void *foo, void *bar) { foo = (void*)0; bar = (void*)0; }],
[$1], [$2])
])
AC_DEFUN([CC_ATTRIBUTE_UNUSED], [
CC_CHECK_ATTRIBUTE(
[unused], ,
[void some_function(void *foo, __attribute__((unused)) void *bar);],
[$1], [$2])
])
AC_DEFUN([CC_ATTRIBUTE_SENTINEL], [
CC_CHECK_ATTRIBUTE(
[sentinel], ,
[void some_function(void *foo, ...) __attribute__((sentinel));],
[$1], [$2])
])
AC_DEFUN([CC_ATTRIBUTE_DEPRECATED], [
CC_CHECK_ATTRIBUTE(
[deprecated], ,
[void some_function(void *foo, ...) __attribute__((deprecated));],
[$1], [$2])
])
AC_DEFUN([CC_ATTRIBUTE_ALIAS], [
CC_CHECK_ATTRIBUTE(
[alias], [weak, alias],
[void other_function(void *foo) { }
void some_function(void *foo) __attribute__((weak, alias("other_function")));],
[$1], [$2])
])
AC_DEFUN([CC_ATTRIBUTE_MALLOC], [
CC_CHECK_ATTRIBUTE(
[malloc], ,
[void * __attribute__((malloc)) my_alloc(int n);],
[$1], [$2])
])
AC_DEFUN([CC_ATTRIBUTE_PACKED], [
CC_CHECK_ATTRIBUTE(
[packed], ,
[struct astructure { char a; int b; long c; void *d; } __attribute__((packed));],
[$1], [$2])
])
AC_DEFUN([CC_ATTRIBUTE_CONST], [
CC_CHECK_ATTRIBUTE(
[const], ,
[int __attribute__((const)) twopow(int n) { return 1 << n; } ],
[$1], [$2])
])
AC_DEFUN([CC_FLAG_VISIBILITY], [
AC_REQUIRE([CC_CHECK_WERROR])
AC_CACHE_CHECK([if $CC supports -fvisibility=hidden],
[cc_cv_flag_visibility],
[cc_flag_visibility_save_CFLAGS="$CFLAGS"
CFLAGS="$CFLAGS $cc_cv_werror"
CC_CHECK_CFLAGS_SILENT([-fvisibility=hidden],
cc_cv_flag_visibility='yes',
cc_cv_flag_visibility='no')
CFLAGS="$cc_flag_visibility_save_CFLAGS"])
AS_IF([test "x$cc_cv_flag_visibility" = "xyes"],
[AC_DEFINE([SUPPORT_FLAG_VISIBILITY], 1,
[Define this if the compiler supports the -fvisibility flag])
$1],
[$2])
])
AC_DEFUN([CC_FUNC_EXPECT], [
AC_REQUIRE([CC_CHECK_WERROR])
AC_CACHE_CHECK([if compiler has __builtin_expect function],
[cc_cv_func_expect],
[ac_save_CFLAGS="$CFLAGS"
CFLAGS="$CFLAGS $cc_cv_werror"
AC_COMPILE_IFELSE([AC_LANG_SOURCE(
[int some_function() {
int a = 3;
return (int)__builtin_expect(a, 3);
}])],
[cc_cv_func_expect=yes],
[cc_cv_func_expect=no])
CFLAGS="$ac_save_CFLAGS"
])
AS_IF([test "x$cc_cv_func_expect" = "xyes"],
[AC_DEFINE([SUPPORT__BUILTIN_EXPECT], 1,
[Define this if the compiler supports __builtin_expect() function])
$1],
[$2])
])
AC_DEFUN([CC_ATTRIBUTE_ALIGNED], [
AC_REQUIRE([CC_CHECK_WERROR])
AC_CACHE_CHECK([highest __attribute__ ((aligned ())) supported],
[cc_cv_attribute_aligned],
[ac_save_CFLAGS="$CFLAGS"
CFLAGS="$CFLAGS $cc_cv_werror"
for cc_attribute_align_try in 64 32 16 8 4 2; do
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
int main() {
static char c __attribute__ ((aligned($cc_attribute_align_try))) = 0;
return c;
}])], [cc_cv_attribute_aligned=$cc_attribute_align_try; break])
done
CFLAGS="$ac_save_CFLAGS"
])
if test "x$cc_cv_attribute_aligned" != "x"; then
AC_DEFINE_UNQUOTED([ATTRIBUTE_ALIGNED_MAX], [$cc_cv_attribute_aligned],
[Define the highest alignment supported])
fi
])
raft-0.22.1/m4/ax_ac_append_to_file.m4 0000664 0000000 0000000 00000001622 14601504142 0017413 0 ustar 00root root 0000000 0000000 # ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_ac_append_to_file.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_AC_APPEND_TO_FILE([FILE],[DATA])
#
# DESCRIPTION
#
# Appends the specified data to the specified Autoconf is run. If you want
# to append to a file when configure is run use AX_APPEND_TO_FILE instead.
#
# LICENSE
#
# Copyright (c) 2009 Allan Caffee
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.
#serial 10
AC_DEFUN([AX_AC_APPEND_TO_FILE],[
AC_REQUIRE([AX_FILE_ESCAPES])
m4_esyscmd(
AX_FILE_ESCAPES
[
printf "%s" "$2" >> "$1"
])
])
raft-0.22.1/m4/ax_ac_print_to_file.m4 0000664 0000000 0000000 00000001611 14601504142 0017276 0 ustar 00root root 0000000 0000000 # ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_ac_print_to_file.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_AC_PRINT_TO_FILE([FILE],[DATA])
#
# DESCRIPTION
#
# Writes the specified data to the specified file when Autoconf is run. If
# you want to print to a file when configure is run use AX_PRINT_TO_FILE
# instead.
#
# LICENSE
#
# Copyright (c) 2009 Allan Caffee
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.
#serial 10
AC_DEFUN([AX_AC_PRINT_TO_FILE],[
m4_esyscmd(
AC_REQUIRE([AX_FILE_ESCAPES])
[
printf "%s" "$2" > "$1"
])
])
raft-0.22.1/m4/ax_add_am_macro_static.m4 0000664 0000000 0000000 00000001525 14601504142 0017737 0 ustar 00root root 0000000 0000000 # ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_add_am_macro_static.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_ADD_AM_MACRO_STATIC([RULE])
#
# DESCRIPTION
#
# Adds the specified rule to $AMINCLUDE.
#
# LICENSE
#
# Copyright (c) 2009 Tom Howard
# Copyright (c) 2009 Allan Caffee
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.
#serial 8
AC_DEFUN([AX_ADD_AM_MACRO_STATIC],[
AC_REQUIRE([AX_AM_MACROS_STATIC])
AX_AC_APPEND_TO_FILE(AMINCLUDE_STATIC,[$1])
])
raft-0.22.1/m4/ax_am_macros_static.m4 0000664 0000000 0000000 00000002125 14601504142 0017307 0 ustar 00root root 0000000 0000000 # ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_am_macros_static.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_AM_MACROS_STATIC
#
# DESCRIPTION
#
# Adds support for macros that create Automake rules. You must manually
# add the following line
#
# include $(top_srcdir)/aminclude_static.am
#
# to your Makefile.am files.
#
# LICENSE
#
# Copyright (c) 2009 Tom Howard
# Copyright (c) 2009 Allan Caffee
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.
#serial 11
AC_DEFUN([AMINCLUDE_STATIC],[aminclude_static.am])
AC_DEFUN([AX_AM_MACROS_STATIC],
[
AX_AC_PRINT_TO_FILE(AMINCLUDE_STATIC,[
# ]AMINCLUDE_STATIC[ generated automatically by Autoconf
# from AX_AM_MACROS_STATIC on ]m4_esyscmd([LC_ALL=C date])[
])
])
raft-0.22.1/m4/ax_check_gnu_make.m4 0000664 0000000 0000000 00000007726 14601504142 0016736 0 ustar 00root root 0000000 0000000 # ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_check_gnu_make.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_CHECK_GNU_MAKE([run-if-true],[run-if-false])
#
# DESCRIPTION
#
# This macro searches for a GNU version of make. If a match is found:
#
# * The makefile variable `ifGNUmake' is set to the empty string, otherwise
# it is set to "#". This is useful for including a special features in a
# Makefile, which cannot be handled by other versions of make.
# * The makefile variable `ifnGNUmake' is set to #, otherwise
# it is set to the empty string. This is useful for including a special
# features in a Makefile, which can be handled
# by other versions of make or to specify else like clause.
# * The variable `_cv_gnu_make_command` is set to the command to invoke
# GNU make if it exists, the empty string otherwise.
# * The variable `ax_cv_gnu_make_command` is set to the command to invoke
# GNU make by copying `_cv_gnu_make_command`, otherwise it is unset.
# * If GNU Make is found, its version is extracted from the output of
# `make --version` as the last field of a record of space-separated
# columns and saved into the variable `ax_check_gnu_make_version`.
# * Additionally if GNU Make is found, run shell code run-if-true
# else run shell code run-if-false.
#
# Here is an example of its use:
#
# Makefile.in might contain:
#
# # A failsafe way of putting a dependency rule into a makefile
# $(DEPEND):
# $(CC) -MM $(srcdir)/*.c > $(DEPEND)
#
# @ifGNUmake@ ifeq ($(DEPEND),$(wildcard $(DEPEND)))
# @ifGNUmake@ include $(DEPEND)
# @ifGNUmake@ else
# fallback code
# @ifGNUmake@ endif
#
# Then configure.in would normally contain:
#
# AX_CHECK_GNU_MAKE()
# AC_OUTPUT(Makefile)
#
# Then perhaps to cause gnu make to override any other make, we could do
# something like this (note that GNU make always looks for GNUmakefile
# first):
#
# if ! test x$_cv_gnu_make_command = x ; then
# mv Makefile GNUmakefile
# echo .DEFAULT: > Makefile ;
# echo \ $_cv_gnu_make_command \$@ >> Makefile;
# fi
#
# Then, if any (well almost any) other make is called, and GNU make also
# exists, then the other make wraps the GNU make.
#
# LICENSE
#
# Copyright (c) 2008 John Darrington
# Copyright (c) 2015 Enrico M. Crisostomo
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.
#serial 11
AC_DEFUN([AX_CHECK_GNU_MAKE],dnl
[AC_PROG_AWK
AC_CACHE_CHECK([for GNU make],[_cv_gnu_make_command],[dnl
_cv_gnu_make_command="" ;
dnl Search all the common names for GNU make
for a in "$MAKE" make gmake gnumake ; do
if test -z "$a" ; then continue ; fi ;
if "$a" --version 2> /dev/null | grep GNU 2>&1 > /dev/null ; then
_cv_gnu_make_command=$a ;
AX_CHECK_GNU_MAKE_HEADLINE=$("$a" --version 2> /dev/null | grep "GNU Make")
ax_check_gnu_make_version=$(echo ${AX_CHECK_GNU_MAKE_HEADLINE} | ${AWK} -F " " '{ print $(NF); }')
break ;
fi
done ;])
dnl If there was a GNU version, then set @ifGNUmake@ to the empty string, '#' otherwise
AS_VAR_IF([_cv_gnu_make_command], [""], [AS_VAR_SET([ifGNUmake], ["#"])], [AS_VAR_SET([ifGNUmake], [""])])
AS_VAR_IF([_cv_gnu_make_command], [""], [AS_VAR_SET([ifnGNUmake], [""])], [AS_VAR_SET([ifGNUmake], ["#"])])
AS_VAR_IF([_cv_gnu_make_command], [""], [AS_UNSET(ax_cv_gnu_make_command)], [AS_VAR_SET([ax_cv_gnu_make_command], [${_cv_gnu_make_command}])])
AS_VAR_IF([_cv_gnu_make_command], [""],[$2],[$1])
AC_SUBST([ifGNUmake])
AC_SUBST([ifnGNUmake])
])
raft-0.22.1/m4/ax_code_coverage.m4 0000664 0000000 0000000 00000027614 14601504142 0016576 0 ustar 00root root 0000000 0000000 # ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_code_coverage.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_CODE_COVERAGE()
#
# DESCRIPTION
#
# Defines CODE_COVERAGE_CPPFLAGS, CODE_COVERAGE_CFLAGS,
# CODE_COVERAGE_CXXFLAGS and CODE_COVERAGE_LIBS which should be included
# in the CPPFLAGS, CFLAGS CXXFLAGS and LIBS/LIBADD variables of every
# build target (program or library) which should be built with code
# coverage support. Also add rules using AX_ADD_AM_MACRO_STATIC; and
# $enable_code_coverage which can be used in subsequent configure output.
# CODE_COVERAGE_ENABLED is defined and substituted, and corresponds to the
# value of the --enable-code-coverage option, which defaults to being
# disabled.
#
# Test also for gcov program and create GCOV variable that could be
# substituted.
#
# Note that all optimization flags in CFLAGS must be disabled when code
# coverage is enabled.
#
# Usage example:
#
# configure.ac:
#
# AX_CODE_COVERAGE
#
# Makefile.am:
#
# include $(top_srcdir)/aminclude_static.am
#
# my_program_LIBS = ... $(CODE_COVERAGE_LIBS) ...
# my_program_CPPFLAGS = ... $(CODE_COVERAGE_CPPFLAGS) ...
# my_program_CFLAGS = ... $(CODE_COVERAGE_CFLAGS) ...
# my_program_CXXFLAGS = ... $(CODE_COVERAGE_CXXFLAGS) ...
#
# clean-local: code-coverage-clean
# distclean-local: code-coverage-dist-clean
#
# This results in a "check-code-coverage" rule being added to any
# Makefile.am which do "include $(top_srcdir)/aminclude_static.am"
# (assuming the module has been configured with --enable-code-coverage).
# Running `make check-code-coverage` in that directory will run the
# module's test suite (`make check`) and build a code coverage report
# detailing the code which was touched, then print the URI for the report.
#
# This code was derived from Makefile.decl in GLib, originally licensed
# under LGPLv2.1+.
#
# LICENSE
#
# Copyright (c) 2012, 2016 Philip Withnall
# Copyright (c) 2012 Xan Lopez
# Copyright (c) 2012 Christian Persch
# Copyright (c) 2012 Paolo Borelli
# Copyright (c) 2012 Dan Winship
# Copyright (c) 2015,2018 Bastien ROUCARIES
#
# This library is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or (at
# your option) any later version.
#
# This library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
# General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see .
#serial 32
m4_define(_AX_CODE_COVERAGE_RULES,[
AX_ADD_AM_MACRO_STATIC([
# Code coverage
#
# Optional:
# - CODE_COVERAGE_DIRECTORY: Top-level directory for code coverage reporting.
# Multiple directories may be specified, separated by whitespace.
# (Default: \$(top_builddir))
# - CODE_COVERAGE_OUTPUT_FILE: Filename and path for the .info file generated
# by lcov for code coverage. (Default:
# \$(PACKAGE_NAME)-\$(PACKAGE_VERSION)-coverage.info)
# - CODE_COVERAGE_OUTPUT_DIRECTORY: Directory for generated code coverage
# reports to be created. (Default:
# \$(PACKAGE_NAME)-\$(PACKAGE_VERSION)-coverage)
# - CODE_COVERAGE_BRANCH_COVERAGE: Set to 1 to enforce branch coverage,
# set to 0 to disable it and leave empty to stay with the default.
# (Default: empty)
# - CODE_COVERAGE_LCOV_SHOPTS_DEFAULT: Extra options shared between both lcov
# instances. (Default: based on $CODE_COVERAGE_BRANCH_COVERAGE)
# - CODE_COVERAGE_LCOV_SHOPTS: Extra options to shared between both lcov
# instances. (Default: $CODE_COVERAGE_LCOV_SHOPTS_DEFAULT)
# - CODE_COVERAGE_LCOV_OPTIONS_GCOVPATH: --gcov-tool pathtogcov
# - CODE_COVERAGE_LCOV_OPTIONS_DEFAULT: Extra options to pass to the
# collecting lcov instance. (Default: $CODE_COVERAGE_LCOV_OPTIONS_GCOVPATH)
# - CODE_COVERAGE_LCOV_OPTIONS: Extra options to pass to the collecting lcov
# instance. (Default: $CODE_COVERAGE_LCOV_OPTIONS_DEFAULT)
# - CODE_COVERAGE_LCOV_RMOPTS_DEFAULT: Extra options to pass to the filtering
# lcov instance. (Default: empty)
# - CODE_COVERAGE_LCOV_RMOPTS: Extra options to pass to the filtering lcov
# instance. (Default: $CODE_COVERAGE_LCOV_RMOPTS_DEFAULT)
# - CODE_COVERAGE_GENHTML_OPTIONS_DEFAULT: Extra options to pass to the
# genhtml instance. (Default: based on $CODE_COVERAGE_BRANCH_COVERAGE)
# - CODE_COVERAGE_GENHTML_OPTIONS: Extra options to pass to the genhtml
# instance. (Default: $CODE_COVERAGE_GENHTML_OPTIONS_DEFAULT)
# - CODE_COVERAGE_IGNORE_PATTERN: Extra glob pattern of files to ignore
#
# The generated report will be titled using the \$(PACKAGE_NAME) and
# \$(PACKAGE_VERSION). In order to add the current git hash to the title,
# use the git-version-gen script, available online.
# Optional variables
# run only on top dir
if CODE_COVERAGE_ENABLED
ifeq (\$(abs_builddir), \$(abs_top_builddir))
CODE_COVERAGE_DIRECTORY ?= \$(top_builddir)
CODE_COVERAGE_OUTPUT_FILE ?= \$(PACKAGE_NAME)-\$(PACKAGE_VERSION)-coverage.info
CODE_COVERAGE_OUTPUT_DIRECTORY ?= \$(PACKAGE_NAME)-\$(PACKAGE_VERSION)-coverage
CODE_COVERAGE_BRANCH_COVERAGE ?=
CODE_COVERAGE_LCOV_SHOPTS_DEFAULT ?= \$(if \$(CODE_COVERAGE_BRANCH_COVERAGE),\
--rc lcov_branch_coverage=\$(CODE_COVERAGE_BRANCH_COVERAGE))
CODE_COVERAGE_LCOV_SHOPTS ?= \$(CODE_COVERAGE_LCOV_SHOPTS_DEFAULT)
CODE_COVERAGE_LCOV_OPTIONS_GCOVPATH ?= --gcov-tool \"\$(GCOV)\"
CODE_COVERAGE_LCOV_OPTIONS_DEFAULT ?= \$(CODE_COVERAGE_LCOV_OPTIONS_GCOVPATH)
CODE_COVERAGE_LCOV_OPTIONS ?= \$(CODE_COVERAGE_LCOV_OPTIONS_DEFAULT)
CODE_COVERAGE_LCOV_RMOPTS_DEFAULT ?=
CODE_COVERAGE_LCOV_RMOPTS ?= \$(CODE_COVERAGE_LCOV_RMOPTS_DEFAULT)
CODE_COVERAGE_GENHTML_OPTIONS_DEFAULT ?=\
\$(if \$(CODE_COVERAGE_BRANCH_COVERAGE),\
--rc genhtml_branch_coverage=\$(CODE_COVERAGE_BRANCH_COVERAGE))
CODE_COVERAGE_GENHTML_OPTIONS ?= \$(CODE_COVERAGE_GENHTML_OPTIONS_DEFAULT)
CODE_COVERAGE_IGNORE_PATTERN ?=
GITIGNOREFILES = \$(GITIGNOREFILES) \$(CODE_COVERAGE_OUTPUT_FILE) \$(CODE_COVERAGE_OUTPUT_DIRECTORY)
code_coverage_v_lcov_cap = \$(code_coverage_v_lcov_cap_\$(V))
code_coverage_v_lcov_cap_ = \$(code_coverage_v_lcov_cap_\$(AM_DEFAULT_VERBOSITY))
code_coverage_v_lcov_cap_0 = @echo \" LCOV --capture\" \$(CODE_COVERAGE_OUTPUT_FILE);
code_coverage_v_lcov_ign = \$(code_coverage_v_lcov_ign_\$(V))
code_coverage_v_lcov_ign_ = \$(code_coverage_v_lcov_ign_\$(AM_DEFAULT_VERBOSITY))
code_coverage_v_lcov_ign_0 = @echo \" LCOV --remove /tmp/*\" \$(CODE_COVERAGE_IGNORE_PATTERN);
code_coverage_v_genhtml = \$(code_coverage_v_genhtml_\$(V))
code_coverage_v_genhtml_ = \$(code_coverage_v_genhtml_\$(AM_DEFAULT_VERBOSITY))
code_coverage_v_genhtml_0 = @echo \" GEN \" \"\$(CODE_COVERAGE_OUTPUT_DIRECTORY)\";
code_coverage_quiet = \$(code_coverage_quiet_\$(V))
code_coverage_quiet_ = \$(code_coverage_quiet_\$(AM_DEFAULT_VERBOSITY))
code_coverage_quiet_0 = --quiet
# sanitizes the test-name: replaces with underscores: dashes and dots
code_coverage_sanitize = \$(subst -,_,\$(subst .,_,\$(1)))
# Use recursive makes in order to ignore errors during check
check-code-coverage:
-\$(AM_V_at)\$(MAKE) \$(AM_MAKEFLAGS) -k check
\$(AM_V_at)\$(MAKE) \$(AM_MAKEFLAGS) code-coverage-capture
# Capture code coverage data
code-coverage-capture: code-coverage-capture-hook
\$(code_coverage_v_lcov_cap)\$(LCOV) \$(code_coverage_quiet) \$(addprefix --directory ,\$(CODE_COVERAGE_DIRECTORY)) --capture --output-file \"\$(CODE_COVERAGE_OUTPUT_FILE).tmp\" --test-name \"\$(call code_coverage_sanitize,\$(PACKAGE_NAME)-\$(PACKAGE_VERSION))\" --no-checksum --compat-libtool \$(CODE_COVERAGE_LCOV_SHOPTS) \$(CODE_COVERAGE_LCOV_OPTIONS)
\$(code_coverage_v_lcov_ign)\$(LCOV) \$(code_coverage_quiet) \$(addprefix --directory ,\$(CODE_COVERAGE_DIRECTORY)) --remove \"\$(CODE_COVERAGE_OUTPUT_FILE).tmp\" \"/tmp/*\" \$(CODE_COVERAGE_IGNORE_PATTERN) --output-file \"\$(CODE_COVERAGE_OUTPUT_FILE)\" \$(CODE_COVERAGE_LCOV_SHOPTS) \$(CODE_COVERAGE_LCOV_RMOPTS)
-@rm -f \"\$(CODE_COVERAGE_OUTPUT_FILE).tmp\"
\$(code_coverage_v_genhtml)LANG=C \$(GENHTML) \$(code_coverage_quiet) \$(addprefix --prefix ,\$(CODE_COVERAGE_DIRECTORY)) --output-directory \"\$(CODE_COVERAGE_OUTPUT_DIRECTORY)\" --title \"\$(PACKAGE_NAME)-\$(PACKAGE_VERSION) Code Coverage\" --legend --show-details \"\$(CODE_COVERAGE_OUTPUT_FILE)\" \$(CODE_COVERAGE_GENHTML_OPTIONS)
@echo \"file://\$(abs_builddir)/\$(CODE_COVERAGE_OUTPUT_DIRECTORY)/index.html\"
code-coverage-clean:
-\$(LCOV) --directory \$(top_builddir) -z
-rm -rf \"\$(CODE_COVERAGE_OUTPUT_FILE)\" \"\$(CODE_COVERAGE_OUTPUT_FILE).tmp\" \"\$(CODE_COVERAGE_OUTPUT_DIRECTORY)\"
-find . \\( -name \"*.gcda\" -o -name \"*.gcno\" -o -name \"*.gcov\" \\) -delete
code-coverage-dist-clean:
A][M_DISTCHECK_CONFIGURE_FLAGS = \$(A][M_DISTCHECK_CONFIGURE_FLAGS) --disable-code-coverage
else # ifneq (\$(abs_builddir), \$(abs_top_builddir))
check-code-coverage:
code-coverage-capture: code-coverage-capture-hook
code-coverage-clean:
code-coverage-dist-clean:
endif # ifeq (\$(abs_builddir), \$(abs_top_builddir))
else #! CODE_COVERAGE_ENABLED
# Use recursive makes in order to ignore errors during check
check-code-coverage:
@echo \"Need to reconfigure with --enable-code-coverage\"
# Capture code coverage data
code-coverage-capture: code-coverage-capture-hook
@echo \"Need to reconfigure with --enable-code-coverage\"
code-coverage-clean:
code-coverage-dist-clean:
endif #CODE_COVERAGE_ENABLED
# Hook rule executed before code-coverage-capture, overridable by the user
code-coverage-capture-hook:
.PHONY: check-code-coverage code-coverage-capture code-coverage-dist-clean code-coverage-clean code-coverage-capture-hook
])
])
AC_DEFUN([_AX_CODE_COVERAGE_ENABLED],[
AX_CHECK_GNU_MAKE([],[AC_MSG_ERROR([not using GNU make that is needed for coverage])])
AC_REQUIRE([AX_ADD_AM_MACRO_STATIC])
# check for gcov
AC_CHECK_TOOL([GCOV],
[$_AX_CODE_COVERAGE_GCOV_PROG_WITH],
[:])
AS_IF([test "X$GCOV" = "X:"],
[AC_MSG_ERROR([gcov is needed to do coverage])])
AC_SUBST([GCOV])
dnl Check if gcc is being used
AS_IF([ test "$GCC" = "no" ], [
AC_MSG_ERROR([not compiling with gcc, which is required for gcov code coverage])
])
AC_CHECK_PROG([LCOV], [lcov], [lcov])
AC_CHECK_PROG([GENHTML], [genhtml], [genhtml])
AS_IF([ test x"$LCOV" = x ], [
AC_MSG_ERROR([To enable code coverage reporting you must have lcov installed])
])
AS_IF([ test x"$GENHTML" = x ], [
AC_MSG_ERROR([Could not find genhtml from the lcov package])
])
dnl Build the code coverage flags
dnl Define CODE_COVERAGE_LDFLAGS for backwards compatibility
CODE_COVERAGE_CPPFLAGS="-DNDEBUG"
CODE_COVERAGE_CFLAGS="-O0 -g -fprofile-arcs -ftest-coverage"
CODE_COVERAGE_CXXFLAGS="-O0 -g -fprofile-arcs -ftest-coverage"
CODE_COVERAGE_LIBS="-lgcov"
AC_SUBST([CODE_COVERAGE_CPPFLAGS])
AC_SUBST([CODE_COVERAGE_CFLAGS])
AC_SUBST([CODE_COVERAGE_CXXFLAGS])
AC_SUBST([CODE_COVERAGE_LIBS])
])
AC_DEFUN([AX_CODE_COVERAGE],[
dnl Check for --enable-code-coverage
# allow to override gcov location
AC_ARG_WITH([gcov],
[AS_HELP_STRING([--with-gcov[=GCOV]], [use given GCOV for coverage (GCOV=gcov).])],
[_AX_CODE_COVERAGE_GCOV_PROG_WITH=$with_gcov],
[_AX_CODE_COVERAGE_GCOV_PROG_WITH=gcov])
AC_MSG_CHECKING([whether to build with code coverage support])
AC_ARG_ENABLE([code-coverage],
AS_HELP_STRING([--enable-code-coverage],
[Whether to enable code coverage support]),,
enable_code_coverage=no)
AM_CONDITIONAL([CODE_COVERAGE_ENABLED], [test "x$enable_code_coverage" = xyes])
AC_SUBST([CODE_COVERAGE_ENABLED], [$enable_code_coverage])
AC_MSG_RESULT($enable_code_coverage)
AS_IF([ test "x$enable_code_coverage" = xyes ], [
_AX_CODE_COVERAGE_ENABLED
])
_AX_CODE_COVERAGE_RULES
])
raft-0.22.1/m4/ax_compare_version.m4 0000664 0000000 0000000 00000014653 14601504142 0017203 0 ustar 00root root 0000000 0000000 # ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_compare_version.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_COMPARE_VERSION(VERSION_A, OP, VERSION_B, [ACTION-IF-TRUE], [ACTION-IF-FALSE])
#
# DESCRIPTION
#
# This macro compares two version strings. Due to the various number of
# minor-version numbers that can exist, and the fact that string
# comparisons are not compatible with numeric comparisons, this is not
# necessarily trivial to do in a autoconf script. This macro makes doing
# these comparisons easy.
#
# The six basic comparisons are available, as well as checking equality
# limited to a certain number of minor-version levels.
#
# The operator OP determines what type of comparison to do, and can be one
# of:
#
# eq - equal (test A == B)
# ne - not equal (test A != B)
# le - less than or equal (test A <= B)
# ge - greater than or equal (test A >= B)
# lt - less than (test A < B)
# gt - greater than (test A > B)
#
# Additionally, the eq and ne operator can have a number after it to limit
# the test to that number of minor versions.
#
# eq0 - equal up to the length of the shorter version
# ne0 - not equal up to the length of the shorter version
# eqN - equal up to N sub-version levels
# neN - not equal up to N sub-version levels
#
# When the condition is true, shell commands ACTION-IF-TRUE are run,
# otherwise shell commands ACTION-IF-FALSE are run. The environment
# variable 'ax_compare_version' is always set to either 'true' or 'false'
# as well.
#
# Examples:
#
# AX_COMPARE_VERSION([3.15.7],[lt],[3.15.8])
# AX_COMPARE_VERSION([3.15],[lt],[3.15.8])
#
# would both be true.
#
# AX_COMPARE_VERSION([3.15.7],[eq],[3.15.8])
# AX_COMPARE_VERSION([3.15],[gt],[3.15.8])
#
# would both be false.
#
# AX_COMPARE_VERSION([3.15.7],[eq2],[3.15.8])
#
# would be true because it is only comparing two minor versions.
#
# AX_COMPARE_VERSION([3.15.7],[eq0],[3.15])
#
# would be true because it is only comparing the lesser number of minor
# versions of the two values.
#
# Note: The characters that separate the version numbers do not matter. An
# empty string is the same as version 0. OP is evaluated by autoconf, not
# configure, so must be a string, not a variable.
#
# The author would like to acknowledge Guido Draheim whose advice about
# the m4_case and m4_ifvaln functions make this macro only include the
# portions necessary to perform the specific comparison specified by the
# OP argument in the final configure script.
#
# LICENSE
#
# Copyright (c) 2008 Tim Toolan
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.
#serial 13
dnl #########################################################################
AC_DEFUN([AX_COMPARE_VERSION], [
AC_REQUIRE([AC_PROG_AWK])
# Used to indicate true or false condition
ax_compare_version=false
# Convert the two version strings to be compared into a format that
# allows a simple string comparison. The end result is that a version
# string of the form 1.12.5-r617 will be converted to the form
# 0001001200050617. In other words, each number is zero padded to four
# digits, and non digits are removed.
AS_VAR_PUSHDEF([A],[ax_compare_version_A])
A=`echo "$1" | sed -e 's/\([[0-9]]*\)/Z\1Z/g' \
-e 's/Z\([[0-9]]\)Z/Z0\1Z/g' \
-e 's/Z\([[0-9]][[0-9]]\)Z/Z0\1Z/g' \
-e 's/Z\([[0-9]][[0-9]][[0-9]]\)Z/Z0\1Z/g' \
-e 's/[[^0-9]]//g'`
AS_VAR_PUSHDEF([B],[ax_compare_version_B])
B=`echo "$3" | sed -e 's/\([[0-9]]*\)/Z\1Z/g' \
-e 's/Z\([[0-9]]\)Z/Z0\1Z/g' \
-e 's/Z\([[0-9]][[0-9]]\)Z/Z0\1Z/g' \
-e 's/Z\([[0-9]][[0-9]][[0-9]]\)Z/Z0\1Z/g' \
-e 's/[[^0-9]]//g'`
dnl # In the case of le, ge, lt, and gt, the strings are sorted as necessary
dnl # then the first line is used to determine if the condition is true.
dnl # The sed right after the echo is to remove any indented white space.
m4_case(m4_tolower($2),
[lt],[
ax_compare_version=`echo "x$A
x$B" | sed 's/^ *//' | sort -r | sed "s/x${A}/false/;s/x${B}/true/;1q"`
],
[gt],[
ax_compare_version=`echo "x$A
x$B" | sed 's/^ *//' | sort | sed "s/x${A}/false/;s/x${B}/true/;1q"`
],
[le],[
ax_compare_version=`echo "x$A
x$B" | sed 's/^ *//' | sort | sed "s/x${A}/true/;s/x${B}/false/;1q"`
],
[ge],[
ax_compare_version=`echo "x$A
x$B" | sed 's/^ *//' | sort -r | sed "s/x${A}/true/;s/x${B}/false/;1q"`
],[
dnl Split the operator from the subversion count if present.
m4_bmatch(m4_substr($2,2),
[0],[
# A count of zero means use the length of the shorter version.
# Determine the number of characters in A and B.
ax_compare_version_len_A=`echo "$A" | $AWK '{print(length)}'`
ax_compare_version_len_B=`echo "$B" | $AWK '{print(length)}'`
# Set A to no more than B's length and B to no more than A's length.
A=`echo "$A" | sed "s/\(.\{$ax_compare_version_len_B\}\).*/\1/"`
B=`echo "$B" | sed "s/\(.\{$ax_compare_version_len_A\}\).*/\1/"`
],
[[0-9]+],[
# A count greater than zero means use only that many subversions
A=`echo "$A" | sed "s/\(\([[0-9]]\{4\}\)\{m4_substr($2,2)\}\).*/\1/"`
B=`echo "$B" | sed "s/\(\([[0-9]]\{4\}\)\{m4_substr($2,2)\}\).*/\1/"`
],
[.+],[
AC_WARNING(
[invalid OP numeric parameter: $2])
],[])
# Pad zeros at end of numbers to make same length.
ax_compare_version_tmp_A="$A`echo $B | sed 's/./0/g'`"
B="$B`echo $A | sed 's/./0/g'`"
A="$ax_compare_version_tmp_A"
# Check for equality or inequality as necessary.
m4_case(m4_tolower(m4_substr($2,0,2)),
[eq],[
test "x$A" = "x$B" && ax_compare_version=true
],
[ne],[
test "x$A" != "x$B" && ax_compare_version=true
],[
AC_WARNING([invalid OP parameter: $2])
])
])
AS_VAR_POPDEF([A])dnl
AS_VAR_POPDEF([B])dnl
dnl # Execute ACTION-IF-TRUE / ACTION-IF-FALSE.
if test "$ax_compare_version" = "true" ; then
m4_ifvaln([$4],[$4],[:])dnl
m4_ifvaln([$5],[else $5])dnl
fi
]) dnl AX_COMPARE_VERSION
raft-0.22.1/m4/ax_file_escapes.m4 0000664 0000000 0000000 00000001373 14601504142 0016425 0 ustar 00root root 0000000 0000000 # ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_file_escapes.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_FILE_ESCAPES
#
# DESCRIPTION
#
# Writes the specified data to the specified file.
#
# LICENSE
#
# Copyright (c) 2008 Tom Howard
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.
#serial 8
AC_DEFUN([AX_FILE_ESCAPES],[
dnl Define shell variables holding escaped characters for use when writing
dnl literal data to files from configure: dollar sign, right/left square
dnl bracket (octal 135/133, to dodge m4 quoting), backslash and double quote.
AX_DOLLAR="\$"
AX_SRB="\\135"
AX_SLB="\\133"
AX_BS="\\\\"
AX_DQ="\""
])
raft-0.22.1/m4/pkg.m4 0000664 0000000 0000000 00000024011 14601504142 0014066 0 ustar 00root root 0000000 0000000 dnl pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*-
dnl serial 11 (pkg-config-0.29.1)
dnl
dnl Copyright © 2004 Scott James Remnant <scott@netsplit.com>.
dnl Copyright © 2012-2015 Dan Nicholson <dbn.lists@gmail.com>
dnl
dnl This program is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU General Public License as published by
dnl the Free Software Foundation; either version 2 of the License, or
dnl (at your option) any later version.
dnl
dnl This program is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl General Public License for more details.
dnl
dnl You should have received a copy of the GNU General Public License
dnl along with this program; if not, write to the Free Software
dnl Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
dnl 02111-1307, USA.
dnl
dnl As a special exception to the GNU General Public License, if you
dnl distribute this file as part of a program that contains a
dnl configuration script generated by Autoconf, you may include it under
dnl the same distribution terms that you use for the rest of that
dnl program.
dnl PKG_PREREQ(MIN-VERSION)
dnl -----------------------
dnl Since: 0.29
dnl
dnl Verify that the version of the pkg-config macros are at least
dnl MIN-VERSION. Unlike PKG_PROG_PKG_CONFIG, which checks the user's
dnl installed version of pkg-config, this checks the developer's version
dnl of pkg.m4 when generating configure.
dnl
dnl To ensure that this macro is defined, also add:
dnl m4_ifndef([PKG_PREREQ],
dnl [m4_fatal([must install pkg-config 0.29 or later before running autoconf/autogen])])
dnl
dnl See the "Since" comment for each macro you use to see what version
dnl of the macros you require.
m4_defun([PKG_PREREQ],
[m4_define([PKG_MACROS_VERSION], [0.29.1])
dnl Fail at autoconf time if this bundled pkg.m4 is older than the minimum
dnl version the caller requested in $1.
m4_if(m4_version_compare(PKG_MACROS_VERSION, [$1]), -1,
[m4_fatal([pkg.m4 version $1 or higher is required but ]PKG_MACROS_VERSION[ found])])
])dnl PKG_PREREQ
dnl PKG_PROG_PKG_CONFIG([MIN-VERSION])
dnl ----------------------------------
dnl Since: 0.16
dnl
dnl Search for the pkg-config tool and set the PKG_CONFIG variable to
dnl first found in the path. Checks that the version of pkg-config found
dnl is at least MIN-VERSION. If MIN-VERSION is not specified, 0.9.0 is
dnl used since that's the first version where most current features of
dnl pkg-config existed.
AC_DEFUN([PKG_PROG_PKG_CONFIG],
[m4_pattern_forbid([^_?PKG_[A-Z_]+$])
m4_pattern_allow([^PKG_CONFIG(_(PATH|LIBDIR|SYSROOT_DIR|ALLOW_SYSTEM_(CFLAGS|LIBS)))?$])
m4_pattern_allow([^PKG_CONFIG_(DISABLE_UNINSTALLED|TOP_BUILD_DIR|DEBUG_SPEW)$])
AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility])
AC_ARG_VAR([PKG_CONFIG_PATH], [directories to add to pkg-config's search path])
AC_ARG_VAR([PKG_CONFIG_LIBDIR], [path overriding pkg-config's built-in search path])
dnl Only search PATH for pkg-config when the user did not already supply it
dnl through the environment.
if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then
AC_PATH_TOOL([PKG_CONFIG], [pkg-config])
fi
if test -n "$PKG_CONFIG"; then
_pkg_min_version=m4_default([$1], [0.9.0])
AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version])
if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then
AC_MSG_RESULT([yes])
else
AC_MSG_RESULT([no])
dnl Too old: behave exactly as if pkg-config had not been found at all.
PKG_CONFIG=""
fi
fi[]dnl
])dnl PKG_PROG_PKG_CONFIG
dnl PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
dnl -------------------------------------------------------------------
dnl Since: 0.18
dnl
dnl Check to see whether a particular set of modules exists. Similar to
dnl PKG_CHECK_MODULES(), but does not set variables or print errors.
dnl
dnl Please remember that m4 expands AC_REQUIRE([PKG_PROG_PKG_CONFIG])
dnl only at the first occurence in configure.ac, so if the first place
dnl it's called might be skipped (such as if it is within an "if", you
dnl have to call PKG_CHECK_EXISTS manually
AC_DEFUN([PKG_CHECK_EXISTS],
[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
dnl Silently probe whether MODULES ($1) exist; run $2 on success and $3 on
dnl failure. Nothing is cached and no variables are set.
if test -n "$PKG_CONFIG" && \
AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then
m4_default([$2], [:])
m4_ifvaln([$3], [else
$3])dnl
fi])
dnl _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES])
dnl ---------------------------------------------
dnl Internal wrapper calling pkg-config via PKG_CONFIG and setting
dnl pkg_failed based on the result.
m4_define([_PKG_CONFIG],
[if test -n "$$1"; then
dnl A user override from the environment wins over pkg-config.
pkg_cv_[]$1="$$1"
elif test -n "$PKG_CONFIG"; then
PKG_CHECK_EXISTS([$3],
[pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null`
test "x$?" != "x0" && pkg_failed=yes ],
[pkg_failed=yes])
else
dnl No pkg-config available at all: distinct state so callers can report
dnl a more helpful error than a plain module-not-found.
pkg_failed=untried
fi[]dnl
])dnl _PKG_CONFIG
dnl _PKG_SHORT_ERRORS_SUPPORTED
dnl ---------------------------
dnl Internal check to see if pkg-config supports short errors.
AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED],
[AC_REQUIRE([PKG_PROG_PKG_CONFIG])
dnl --short-errors was introduced in pkg-config 0.20.
if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
_pkg_short_errors_supported=yes
else
_pkg_short_errors_supported=no
fi[]dnl
])dnl _PKG_SHORT_ERRORS_SUPPORTED
dnl PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND],
dnl [ACTION-IF-NOT-FOUND])
dnl --------------------------------------------------------------
dnl Since: 0.4.0
dnl
dnl Note that if there is a possibility the first call to
dnl PKG_CHECK_MODULES might not happen, you should be sure to include an
dnl explicit call to PKG_PROG_PKG_CONFIG in your configure.ac
AC_DEFUN([PKG_CHECK_MODULES],
[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl
AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl
dnl Gather compile and link flags for MODULES ($2) into $1_CFLAGS/$1_LIBS,
dnl honouring any values already provided through the environment.
pkg_failed=no
AC_MSG_CHECKING([for $1])
_PKG_CONFIG([$1][_CFLAGS], [cflags], [$2])
_PKG_CONFIG([$1][_LIBS], [libs], [$2])
m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS
and $1[]_LIBS to avoid the need to call pkg-config.
See the pkg-config man page for more details.])
if test $pkg_failed = yes; then
AC_MSG_RESULT([no])
_PKG_SHORT_ERRORS_SUPPORTED
if test $_pkg_short_errors_supported = yes; then
$1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "$2" 2>&1`
else
$1[]_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "$2" 2>&1`
fi
# Put the nasty error message in config.log where it belongs
echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD
m4_default([$4], [AC_MSG_ERROR(
[Package requirements ($2) were not met:
$$1_PKG_ERRORS
Consider adjusting the PKG_CONFIG_PATH environment variable if you
installed software in a non-standard prefix.
_PKG_TEXT])[]dnl
])
elif test $pkg_failed = untried; then
AC_MSG_RESULT([no])
m4_default([$4], [AC_MSG_FAILURE(
[The pkg-config script could not be found or is too old.  Make sure it
is in your PATH or set the PKG_CONFIG environment variable to the full
path to pkg-config.
_PKG_TEXT
To get pkg-config, see <http://pkg-config.freedesktop.org/>.])[]dnl
])
else
$1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS
$1[]_LIBS=$pkg_cv_[]$1[]_LIBS
AC_MSG_RESULT([yes])
$3
fi[]dnl
])dnl PKG_CHECK_MODULES
dnl PKG_CHECK_MODULES_STATIC(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND],
dnl [ACTION-IF-NOT-FOUND])
dnl ---------------------------------------------------------------------
dnl Since: 0.29
dnl
dnl Checks for existence of MODULES and gathers its build flags with
dnl static libraries enabled. Sets VARIABLE-PREFIX_CFLAGS from --cflags
dnl and VARIABLE-PREFIX_LIBS from --libs.
dnl
dnl Note that if there is a possibility the first call to
dnl PKG_CHECK_MODULES_STATIC might not happen, you should be sure to
dnl include an explicit call to PKG_PROG_PKG_CONFIG in your
dnl configure.ac.
AC_DEFUN([PKG_CHECK_MODULES_STATIC],
[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
dnl Same as PKG_CHECK_MODULES, but with --static temporarily appended to
dnl PKG_CONFIG so that Libs.private dependencies are included.
_save_PKG_CONFIG=$PKG_CONFIG
PKG_CONFIG="$PKG_CONFIG --static"
PKG_CHECK_MODULES($@)
PKG_CONFIG=$_save_PKG_CONFIG[]dnl
])dnl PKG_CHECK_MODULES_STATIC
dnl PKG_INSTALLDIR([DIRECTORY])
dnl -------------------------
dnl Since: 0.27
dnl
dnl Substitutes the variable pkgconfigdir as the location where a module
dnl should install pkg-config .pc files. By default the directory is
dnl $libdir/pkgconfig, but the default can be changed by passing
dnl DIRECTORY. The user can override through the --with-pkgconfigdir
dnl parameter.
AC_DEFUN([PKG_INSTALLDIR],
[m4_pushdef([pkg_default], [m4_default([$1], ['${libdir}/pkgconfig'])])
m4_pushdef([pkg_description],
[pkg-config installation directory @<:@]pkg_default[@:>@])
dnl Substitute pkgconfigdir, overridable via --with-pkgconfigdir.
AC_ARG_WITH([pkgconfigdir],
[AS_HELP_STRING([--with-pkgconfigdir], pkg_description)],,
[with_pkgconfigdir=]pkg_default)
AC_SUBST([pkgconfigdir], [$with_pkgconfigdir])
m4_popdef([pkg_default])
m4_popdef([pkg_description])
])dnl PKG_INSTALLDIR
dnl PKG_NOARCH_INSTALLDIR([DIRECTORY])
dnl --------------------------------
dnl Since: 0.27
dnl
dnl Substitutes the variable noarch_pkgconfigdir as the location where a
dnl module should install arch-independent pkg-config .pc files. By
dnl default the directory is $datadir/pkgconfig, but the default can be
dnl changed by passing DIRECTORY. The user can override through the
dnl --with-noarch-pkgconfigdir parameter.
AC_DEFUN([PKG_NOARCH_INSTALLDIR],
[m4_pushdef([pkg_default], [m4_default([$1], ['${datadir}/pkgconfig'])])
m4_pushdef([pkg_description],
[pkg-config arch-independent installation directory @<:@]pkg_default[@:>@])
dnl Substitute noarch_pkgconfigdir, overridable via --with-noarch-pkgconfigdir.
AC_ARG_WITH([noarch-pkgconfigdir],
[AS_HELP_STRING([--with-noarch-pkgconfigdir], pkg_description)],,
[with_noarch_pkgconfigdir=]pkg_default)
AC_SUBST([noarch_pkgconfigdir], [$with_noarch_pkgconfigdir])
m4_popdef([pkg_default])
m4_popdef([pkg_description])
])dnl PKG_NOARCH_INSTALLDIR
dnl PKG_CHECK_VAR(VARIABLE, MODULE, CONFIG-VARIABLE,
dnl [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
dnl -------------------------------------------
dnl Since: 0.28
dnl
dnl Retrieves the value of the pkg-config variable for the given module.
AC_DEFUN([PKG_CHECK_VAR],
[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
AC_ARG_VAR([$1], [value of $3 for $2, overriding pkg-config])dnl
dnl Query pkg-config variable $3 of module $2 into shell variable $1;
dnl $4/$5 run depending on whether a non-empty value was obtained.
_PKG_CONFIG([$1], [variable="][$3]["], [$2])
AS_VAR_COPY([$1], [pkg_cv_][$1])
AS_VAR_IF([$1], [""], [$5], [$4])dnl
])dnl PKG_CHECK_VAR
raft-0.22.1/raft.pc.in 0000664 0000000 0000000 00000000350 14601504142 0014410 0 ustar 00root root 0000000 0000000 prefix=@prefix@
exec_prefix=@exec_prefix@
libdir=@libdir@
includedir=@includedir@
Name: raft
Description: C implementation of the Raft Consensus protocol
Version: @PACKAGE_VERSION@
Libs: -L${libdir} -lraft
Cflags: -I${includedir}
raft-0.22.1/src/ 0000775 0000000 0000000 00000000000 14601504142 0013314 5 ustar 00root root 0000000 0000000 raft-0.22.1/src/array.h 0000664 0000000 0000000 00000002015 14601504142 0014601 0 ustar 00root root 0000000 0000000 /* Macros to manipulate contiguous arrays. */
#ifndef ARRAY_H_
#define ARRAY_H_
#include "../include/raft.h"
/* Append item I of type T to array A which currently has N items.
 *
 * A and N must both be pointers. Set RV to -1 in case of failure.
 *
 * On success the array is grown by one slot via raft_realloc(), I is stored
 * in the new last slot, *N is incremented, and RV is set to 0. On allocation
 * failure the array and the count are left untouched and RV is set to -1.
 *
 * NOTE(review): the body is a bare brace block rather than do { ... } while
 * (0), so invoking the macro followed by `;` inside an unbraced if/else is
 * fragile — confirm call sites before tightening this. */
#define ARRAY__APPEND(T, I, A, N, RV)                        \
    {                                                        \
        T *tmp_array;                                        \
        tmp_array = raft_realloc(*A, (*N + 1) * sizeof **A); \
        if (tmp_array != NULL) {                             \
            (*N)++;                                          \
            *A = tmp_array;                                  \
            (*A)[(*N) - 1] = I;                              \
            RV = 0;                                          \
        } else {                                             \
            RV = -1;                                         \
        }                                                    \
    }
#endif /* ARRAY_H_ */
raft-0.22.1/src/assert.h 0000664 0000000 0000000 00000003102 14601504142 0014762 0 ustar 00root root 0000000 0000000 /* Define the assert() macro, either as the standard one or the test one. */
#ifndef ASSERT_H_
#define ASSERT_H_
#if defined(RAFT_TEST)
/* Under the test suite, route assertion failures to munit so that they are
 * reported with file and line information instead of aborting the process. */
extern void munit_errorf_ex(const char *filename,
                            int line,
                            const char *format,
                            ...);
/* Fixes vs. the previous version: the expression is parenthesised — with
 * plain `!expr`, assert(a == b) expanded to the wrong `!a == b` — and the
 * format string gains the %s that actually prints the stringified
 * expression (it was previously passed but never formatted). */
#define assert(expr)                                                    \
    do {                                                                \
        if (!(expr)) {                                                  \
            munit_errorf_ex(__FILE__, __LINE__, "assertion failed: %s", \
                            #expr);                                     \
        }                                                               \
    } while (0)
#elif defined(NDEBUG)
/* Release builds: evaluate nothing, but keep the expression referenced so
 * that variables used only in asserts don't trigger unused warnings. */
#define assert(x)        \
    do {                 \
        (void)sizeof(x); \
    } while (0)
#elif defined(RAFT_ASSERT_WITH_BACKTRACE)
#include <assert.h> /* for __assert_fail */
#include <backtrace.h>
#include <stdio.h>
#undef assert
#define assert(x)                                                 \
    do {                                                          \
        struct backtrace_state *state_;                           \
        if (!(x)) {                                               \
            state_ = backtrace_create_state(NULL, 0, NULL, NULL); \
            backtrace_print(state_, 0, stderr);                   \
            __assert_fail(#x, __FILE__, __LINE__, __func__);      \
        }                                                         \
    } while (0)
#else
#include <assert.h>
#endif
#endif /* ASSERT_H_ */
raft-0.22.1/src/byte.c 0000664 0000000 0000000 00000031577 14601504142 0014440 0 ustar 00root root 0000000 0000000 #include "byte.h"
/* Taken from https://github.com/gcc-mirror/gcc/blob/master/libiberty/crc32.c */
static const unsigned byteCrcTable[] = {
    0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9, 0x130476dc, 0x17c56b6b,
    0x1a864db2, 0x1e475005, 0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61,
    0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd, 0x4c11db70, 0x48d0c6c7,
    0x4593e01e, 0x4152fda9, 0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75,
    0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011, 0x791d4014, 0x7ddc5da3,
    0x709f7b7a, 0x745e66cd, 0x9823b6e0, 0x9ce2ab57, 0x91a18d8e, 0x95609039,
    0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5, 0xbe2b5b58, 0xbaea46ef,
    0xb7a96036, 0xb3687d81, 0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d,
    0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49, 0xc7361b4c, 0xc3f706fb,
    0xceb42022, 0xca753d95, 0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1,
    0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d, 0x34867077, 0x30476dc0,
    0x3d044b19, 0x39c556ae, 0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072,
    0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16, 0x018aeb13, 0x054bf6a4,
    0x0808d07d, 0x0cc9cdca, 0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde,
    0x6b93dddb, 0x6f52c06c, 0x6211e6b5, 0x66d0fb02, 0x5e9f46bf, 0x5a5e5b08,
    0x571d7dd1, 0x53dc6066, 0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba,
    0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e, 0xbfa1b04b, 0xbb60adfc,
    0xb6238b25, 0xb2e29692, 0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6,
    0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a, 0xe0b41de7, 0xe4750050,
    0xe9362689, 0xedf73b3e, 0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2,
    0xc6bcf05f, 0xc27dede8, 0xcf3ecb31, 0xcbffd686, 0xd5b88683, 0xd1799b34,
    0xdc3abded, 0xd8fba05a, 0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637,
    0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb, 0x4f040d56, 0x4bc510e1,
    0x46863638, 0x42472b8f, 0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53,
    0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47, 0x36194d42, 0x32d850f5,
    0x3f9b762c, 0x3b5a6b9b, 0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff,
    0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623, 0xf12f560e, 0xf5ee4bb9,
    0xf8ad6d60, 0xfc6c70d7, 0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b,
    0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f, 0xc423cd6a, 0xc0e2d0dd,
    0xcda1f604, 0xc960ebb3, 0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7,
    0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b, 0x9b3660c6, 0x9ff77d71,
    0x92b45ba8, 0x9675461f, 0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3,
    0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640, 0x4e8ee645, 0x4a4ffbf2,
    0x470cdd2b, 0x43cdc09c, 0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8,
    0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24, 0x119b4be9, 0x155a565e,
    0x18197087, 0x1cd86d30, 0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec,
    0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088, 0x2497d08d, 0x2056cd3a,
    0x2d15ebe3, 0x29d4f654, 0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0,
    0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c, 0xe3a1cbc1, 0xe760d676,
    0xea23f0af, 0xeee2ed18, 0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4,
    0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0, 0x9abc8bd5, 0x9e7d9662,
    0x933eb0bb, 0x97ffad0c, 0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668,
    0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4};

/* Compute the CRC-32 of the SIZE bytes at BUF, seeding the shift register
 * with INIT (pass the previous return value to checksum data in chunks).
 * Table entry 1 is the generator polynomial 0x04c11db7; bytes are folded in
 * most-significant-bit first, with no reflection and no final XOR. */
unsigned byteCrc32(const void *buf, const size_t size, const unsigned init)
{
    const uint8_t *data = buf;
    unsigned checksum = init;
    size_t i;
    for (i = 0; i < size; i++) {
        checksum =
            (checksum << 8) ^ byteCrcTable[((checksum >> 24) ^ data[i]) & 255];
    }
    return checksum;
}
/* ================ sha1.c ================ */
/*
SHA-1 in C
By Steve Reid
100% Public Domain
Test Vectors (from FIPS PUB 180-1)
"abc"
A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D
"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1
A million repetitions of "a"
34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F
*/
/* #define LITTLE_ENDIAN * This should be #define'd already, if true. */
/* #define SHA1HANDSOFF * Copies data before messing with it. */
#define SHA1HANDSOFF
/* Restored headers: the angle-bracketed file names were lost in transit.
 * The SHA-1 code below needs memcpy/memset (string.h); stdio.h and
 * sys/types.h match the upstream sha1.c this was vendored from. */
#include <stdio.h>
#include <string.h>
#include <sys/types.h> /* for u_int*_t */
#if defined(__sun)
#include "solarisfixes.h"
#endif
/* Determine BYTE_ORDER at compile time. The two #include lines lost their
 * angle-bracketed file names in transit and are restored here: BSD systems
 * get it from <machine/endian.h>, Linux from <endian.h>, and anything else
 * falls back to a hard-coded list of known architectures. */
#ifndef BYTE_ORDER
#if (BSD >= 199103)
#include <machine/endian.h>
#else
#if defined(linux) || defined(__linux__)
#include <endian.h>
#else
#define LITTLE_ENDIAN 1234 /* least-significant byte first (vax, pc) */
#define BIG_ENDIAN 4321    /* most-significant byte first (IBM, net) */
#define PDP_ENDIAN 3412    /* LSB first in word, MSW first in long (pdp)*/
#if defined(vax) || defined(ns32000) || defined(sun386) ||       \
    defined(__i386__) || defined(MIPSEL) || defined(_MIPSEL) ||  \
    defined(BIT_ZERO_ON_RIGHT) || defined(__alpha__) || defined(__alpha)
#define BYTE_ORDER LITTLE_ENDIAN
#endif
#if defined(sel) || defined(pyr) || defined(mc68000) || defined(sparc) ||     \
    defined(is68k) || defined(tahoe) || defined(ibm032) || defined(ibm370) || \
    defined(MIPSEB) || defined(_MIPSEB) || defined(_IBMR2) || defined(DGUX) || \
    defined(apollo) || defined(__convex__) || defined(_CRAY) ||               \
    defined(__hppa) || defined(__hp9000) || defined(__hp9000s300) ||          \
    defined(__hp9000s700) || defined(BIT_ZERO_ON_LEFT) || defined(m68k) ||    \
    defined(__sparc)
#define BYTE_ORDER BIG_ENDIAN
#endif
#endif /* linux */
#endif /* BSD */
#endif /* BYTE_ORDER */
/* Some compilers define __BYTE_ORDER without the un-prefixed BYTE_ORDER. */
#if defined(__BYTE_ORDER) && !defined(BYTE_ORDER)
#if (__BYTE_ORDER == __LITTLE_ENDIAN)
#define BYTE_ORDER LITTLE_ENDIAN
#else
#define BYTE_ORDER BIG_ENDIAN
#endif
#endif
#if !defined(BYTE_ORDER) ||                                    \
    (BYTE_ORDER != BIG_ENDIAN && BYTE_ORDER != LITTLE_ENDIAN && \
     BYTE_ORDER != PDP_ENDIAN)
/* you must determine what the correct bit order is for
 * your compiler - the next line is an intentional error
 * which will force your compiles to bomb until you fix
 * the above macros.
 */
#error "Undefined or invalid BYTE_ORDER"
#endif
/* 32-bit rotate-left, the basic SHA-1 building block. */
#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
/* blk0() and blk() perform the initial expand. */
/* I got the idea of expanding during the round function from SSLeay */
#if BYTE_ORDER == LITTLE_ENDIAN
/* On little-endian hosts the input words must be byte-swapped into the
 * big-endian order SHA-1 operates on. */
#define blk0(i)                                          \
    (block->l[i] = (rol(block->l[i], 24) & 0xFF00FF00) | \
                   (rol(block->l[i], 8) & 0x00FF00FF))
#elif BYTE_ORDER == BIG_ENDIAN
#define blk0(i) block->l[i]
#else
#error "Endianness not defined!"
#endif
/* Message-schedule expansion, computed in place over a 16-word circular
 * buffer instead of the 80-word array of the reference description. */
#define blk(i)                                                                \
    (block->l[i & 15] = rol(block->l[(i + 13) & 15] ^ block->l[(i + 8) & 15] ^ \
                                block->l[(i + 2) & 15] ^ block->l[i & 15],     \
                            1))
/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
#define R0(v, w, x, y, z, i)                                     \
    z += ((w & (x ^ y)) ^ y) + blk0(i) + 0x5A827999 + rol(v, 5); \
    w = rol(w, 30);
#define R1(v, w, x, y, z, i)                                    \
    z += ((w & (x ^ y)) ^ y) + blk(i) + 0x5A827999 + rol(v, 5); \
    w = rol(w, 30);
#define R2(v, w, x, y, z, i)                            \
    z += (w ^ x ^ y) + blk(i) + 0x6ED9EBA1 + rol(v, 5); \
    w = rol(w, 30);
#define R3(v, w, x, y, z, i)                                          \
    z += (((w | x) & y) | (w & x)) + blk(i) + 0x8F1BBCDC + rol(v, 5); \
    w = rol(w, 30);
#define R4(v, w, x, y, z, i)                            \
    z += (w ^ x ^ y) + blk(i) + 0xCA62C1D6 + rol(v, 5); \
    w = rol(w, 30);
/* Hash a single 64-byte input block into the five-word digest state. */
static void byteSha1Transform(uint32_t state[5], const uint8_t buffer[64])
{
    uint32_t a, b, c, d, e;
    typedef union {
        uint8_t c[64];
        uint32_t l[16];
    } CHAR64LONG16;
#ifdef SHA1HANDSOFF
    CHAR64LONG16 block[1]; /* use array to appear as a pointer */
    memcpy(block, buffer, 64);
#else
    /* The following had better never be used because it causes the
     * pointer-to-const buffer to be cast into a pointer to non-const.
     * And the result is written through. I threw a "const" in, hoping
     * this will cause a diagnostic.
     */
    CHAR64LONG16 *block = (const CHAR64LONG16 *)buffer;
#endif
    /* Copy context->state[] to working vars */
    a = state[0];
    b = state[1];
    c = state[2];
    d = state[3];
    e = state[4];
    /* 4 rounds of 20 operations each. Loop unrolled. */
    R0(a, b, c, d, e, 0);
    R0(e, a, b, c, d, 1);
    R0(d, e, a, b, c, 2);
    R0(c, d, e, a, b, 3);
    R0(b, c, d, e, a, 4);
    R0(a, b, c, d, e, 5);
    R0(e, a, b, c, d, 6);
    R0(d, e, a, b, c, 7);
    R0(c, d, e, a, b, 8);
    R0(b, c, d, e, a, 9);
    R0(a, b, c, d, e, 10);
    R0(e, a, b, c, d, 11);
    R0(d, e, a, b, c, 12);
    R0(c, d, e, a, b, 13);
    R0(b, c, d, e, a, 14);
    R0(a, b, c, d, e, 15);
    R1(e, a, b, c, d, 16);
    R1(d, e, a, b, c, 17);
    R1(c, d, e, a, b, 18);
    R1(b, c, d, e, a, 19);
    R2(a, b, c, d, e, 20);
    R2(e, a, b, c, d, 21);
    R2(d, e, a, b, c, 22);
    R2(c, d, e, a, b, 23);
    R2(b, c, d, e, a, 24);
    R2(a, b, c, d, e, 25);
    R2(e, a, b, c, d, 26);
    R2(d, e, a, b, c, 27);
    R2(c, d, e, a, b, 28);
    R2(b, c, d, e, a, 29);
    R2(a, b, c, d, e, 30);
    R2(e, a, b, c, d, 31);
    R2(d, e, a, b, c, 32);
    R2(c, d, e, a, b, 33);
    R2(b, c, d, e, a, 34);
    R2(a, b, c, d, e, 35);
    R2(e, a, b, c, d, 36);
    R2(d, e, a, b, c, 37);
    R2(c, d, e, a, b, 38);
    R2(b, c, d, e, a, 39);
    R3(a, b, c, d, e, 40);
    R3(e, a, b, c, d, 41);
    R3(d, e, a, b, c, 42);
    R3(c, d, e, a, b, 43);
    R3(b, c, d, e, a, 44);
    R3(a, b, c, d, e, 45);
    R3(e, a, b, c, d, 46);
    R3(d, e, a, b, c, 47);
    R3(c, d, e, a, b, 48);
    R3(b, c, d, e, a, 49);
    R3(a, b, c, d, e, 50);
    R3(e, a, b, c, d, 51);
    R3(d, e, a, b, c, 52);
    R3(c, d, e, a, b, 53);
    R3(b, c, d, e, a, 54);
    R3(a, b, c, d, e, 55);
    R3(e, a, b, c, d, 56);
    R3(d, e, a, b, c, 57);
    R3(c, d, e, a, b, 58);
    R3(b, c, d, e, a, 59);
    R4(a, b, c, d, e, 60);
    R4(e, a, b, c, d, 61);
    R4(d, e, a, b, c, 62);
    R4(c, d, e, a, b, 63);
    R4(b, c, d, e, a, 64);
    R4(a, b, c, d, e, 65);
    R4(e, a, b, c, d, 66);
    R4(d, e, a, b, c, 67);
    R4(c, d, e, a, b, 68);
    R4(b, c, d, e, a, 69);
    R4(a, b, c, d, e, 70);
    R4(e, a, b, c, d, 71);
    R4(d, e, a, b, c, 72);
    R4(c, d, e, a, b, 73);
    R4(b, c, d, e, a, 74);
    R4(a, b, c, d, e, 75);
    R4(e, a, b, c, d, 76);
    R4(d, e, a, b, c, 77);
    R4(c, d, e, a, b, 78);
    R4(b, c, d, e, a, 79);
    /* Add the working vars back into context.state[] */
    state[0] += a;
    state[1] += b;
    state[2] += c;
    state[3] += d;
    state[4] += e;
    /* Wipe variables */
    a = b = c = d = e = 0;
#ifdef SHA1HANDSOFF
    memset(block, '\0', sizeof(block));
#endif
}
/* Reset the context to the initial SHA-1 state, ready to absorb data. */
void byteSha1Init(struct byteSha1 *s)
{
    /* Initial hash values from FIPS PUB 180-1. */
    static const uint32_t sha1_iv[5] = {0x67452301, 0xEFCDAB89, 0x98BADCFE,
                                        0x10325476, 0xC3D2E1F0};
    unsigned i;
    for (i = 0; i < 5; i++) {
        s->state[i] = sha1_iv[i];
    }
    s->count[0] = 0;
    s->count[1] = 0;
}
/* Run your data through this. */
/* Absorb LEN bytes from DATA into the running hash: buffer partial input,
 * hash every full 64-byte block, and keep a 64-bit bit-length counter in
 * s->count. Marked noinline — reason not visible here, keep as-is. */
void __attribute__((noinline))
byteSha1Update(struct byteSha1 *s, const uint8_t *data, uint32_t len)
{
    uint32_t i;
    uint32_t j;
    /* Add len*8 bits to the 64-bit counter, carrying into count[1] when the
     * low word overflows. */
    j = s->count[0];
    if ((s->count[0] += len << 3) < j)
        s->count[1]++;
    s->count[1] += (len >> 29);
    /* j: number of bytes already buffered (0..63). */
    j = (j >> 3) & 63;
    if ((j + len) > 63) {
        /* Complete the buffered block, then hash full blocks from DATA. */
        memcpy(&s->buffer[j], data, (i = 64 - j));
        byteSha1Transform(s->state, s->buffer);
        for (; i + 63 < len; i += 64) {
            byteSha1Transform(s->state, &data[i]);
        }
        j = 0;
    } else
        i = 0;
    /* Stash whatever is left for the next update/digest call. */
    memcpy(&s->buffer[j], &data[i], len - i);
}
/* Add padding and return the message digest. */
/* Finalize the hash: append the 0x80 pad byte and zeros until the bit
 * length is congruent to 448 (mod 512), append the 64-bit big-endian bit
 * count, then serialize the 160-bit state into VALUE and wipe the context
 * (so it must be re-initialized before reuse). */
void byteSha1Digest(struct byteSha1 *s, uint8_t value[20])
{
    unsigned i;
    uint8_t finalcount[8];
    uint8_t c;
#if 0 /* untested "improvement" by DHR */
    /* Convert context->count to a sequence of bytes
     * in finalcount. Second element first, but
     * big-endian order within element.
     * But we do it all backwards.
     */
    uint8_t *fcp = &finalcount[8];
    for (i = 0; i < 2; i++)
    {
        u_int32_t t = context->count[i];
        int j;
        for (j = 0; j < 4; t >>= 8, j++)
            *--fcp = (uint8_t) t
    }
#else
    /* Serialize the bit counter big-endian: high word first, then low. */
    for (i = 0; i < 8; i++) {
        finalcount[i] =
            (uint8_t)((s->count[(i >= 4 ? 0 : 1)] >> ((3 - (i & 3)) * 8)) &
                      255); /* Endian independent */
    }
#endif
    /* Mandatory pad byte: 0200 octal == 0x80. */
    c = 0200;
    byteSha1Update(s, &c, 1);
    /* Pad with zeros until bit length mod 512 == 448; the mask 504 keeps
     * the count mod 512 (the low three bits are always zero because input
     * is absorbed in whole bytes). */
    while ((s->count[0] & 504) != 448) {
        c = 0000;
        byteSha1Update(s, &c, 1);
    }
    byteSha1Update(s, finalcount, 8); /* Should cause a SHA1Transform() */
    /* Emit the state words big-endian into the 20-byte digest. */
    for (i = 0; i < 20; i++) {
        value[i] = (uint8_t)((s->state[i >> 2] >> ((3 - (i & 3)) * 8)) & 255);
    }
    /* Wipe variables */
    memset(s, '\0', sizeof(*s));
    memset(&finalcount, '\0', sizeof(finalcount));
}
raft-0.22.1/src/byte.h 0000664 0000000 0000000 00000011522 14601504142 0014431 0 ustar 00root root 0000000 0000000 /* Byte-level utilities. */
#ifndef BYTE_H_
#define BYTE_H_
/* Restored headers (file names were lost in transit): this header uses
 * size_t (stddef.h), the fixed-width integer types (stdint.h) and
 * strcpy/strlen (string.h). */
#include <stddef.h>
#include <stdint.h>
#include <string.h>
/* BYTE__INLINE selects the right "inline" flavour for the compilation mode:
 * plain inline for C++, static inline for C. Under clang the helpers are
 * additionally marked unused, since not every translation unit including
 * this header calls all of them. */
#if defined(__cplusplus)
#define BYTE__INLINE inline
#else
#if defined(__clang__)
#define BYTE__INLINE static inline __attribute__((unused))
#else
#define BYTE__INLINE static inline
#endif
#endif
/* Compile-time endianess detection (best effort). */
#if (defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \
    (defined(__ARMEL__) && (__ARMEL__ == 1))
#define BYTE__LITTLE_ENDIAN
#elif defined(__BYTE_ORDER) && (__BYTE_ORDER == __BIG_ENDIAN) && \
    defined(__GNUC__) &&                                         \
    (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
/* The big-endian fast path uses __builtin_bswap16(), which GCC provides
 * only since 4.8 — hence the version gate. The previous check required
 * __GNUC_MINOR__ >= 8 in *every* major series, wrongly rejecting e.g.
 * GCC 5.2; compare (major, minor) as a pair instead. */
#define RAFT__BIG_ENDIAN
#endif
/* Flip a 16-bit number to network byte order (little endian) */
BYTE__INLINE uint16_t byteFlip16(uint16_t v)
{
#if defined(BYTE__LITTLE_ENDIAN)
    /* Host order already matches the wire order: nothing to do. */
    return v;
#elif defined(RAFT__BIG_ENDIAN)
    return __builtin_bswap16(v);
#else /* Unknown endianess */
    /* Portable fallback: write the little-endian byte sequence into memory
     * and read it back as a native integer. */
    union {
        uint16_t u;
        uint8_t v[2];
    } s;
    s.v[0] = (uint8_t)v;
    s.v[1] = (uint8_t)(v >> 8);
    return s.u;
#endif
}
/* Flip a 32-bit number to network byte order (little endian) */
BYTE__INLINE uint32_t byteFlip32(uint32_t v)
{
#if defined(BYTE__LITTLE_ENDIAN)
    /* Host order already matches the wire order: nothing to do. */
    return v;
#elif defined(RAFT__BIG_ENDIAN)
    return __builtin_bswap32(v);
#else /* Unknown endianess */
    /* Portable fallback: write the little-endian byte sequence into memory
     * and read it back as a native integer. */
    union {
        uint32_t u;
        uint8_t v[4];
    } s;
    s.v[0] = (uint8_t)v;
    s.v[1] = (uint8_t)(v >> 8);
    s.v[2] = (uint8_t)(v >> 16);
    s.v[3] = (uint8_t)(v >> 24);
    return s.u;
#endif
}
/* Flip a 64-bit number to network byte order (little endian) */
BYTE__INLINE uint64_t byteFlip64(uint64_t v)
{
#if defined(BYTE__LITTLE_ENDIAN)
    /* Host order already matches the wire order: nothing to do. */
    return v;
#elif defined(RAFT__BIG_ENDIAN)
    return __builtin_bswap64(v);
#else
    /* Portable fallback: write the little-endian byte sequence into memory
     * and read it back as a native integer. */
    union {
        uint64_t u;
        uint8_t v[8];
    } s;
    s.v[0] = (uint8_t)v;
    s.v[1] = (uint8_t)(v >> 8);
    s.v[2] = (uint8_t)(v >> 16);
    s.v[3] = (uint8_t)(v >> 24);
    s.v[4] = (uint8_t)(v >> 32);
    s.v[5] = (uint8_t)(v >> 40);
    s.v[6] = (uint8_t)(v >> 48);
    s.v[7] = (uint8_t)(v >> 56);
    return s.u;
#endif
}
/* Write one byte at the cursor and advance it. */
BYTE__INLINE void bytePut8(uint8_t **cursor, uint8_t value)
{
    uint8_t *p = *cursor;
    *p = value;
    *cursor = p + 1;
}
/* Serialize a 16-bit value in wire order (little endian) at the cursor and
 * advance it by two bytes. */
BYTE__INLINE void bytePut16(uint8_t **cursor, uint16_t value)
{
    const uint16_t wire = byteFlip16(value);
    const uint8_t *src = (const uint8_t *)&wire;
    unsigned k;
    for (k = 0; k < sizeof(uint16_t); k++) {
        bytePut8(cursor, src[k]);
    }
}
/* Serialize a 32-bit value in wire order (little endian) at the cursor and
 * advance it by four bytes. */
BYTE__INLINE void bytePut32(uint8_t **cursor, uint32_t value)
{
    const uint32_t wire = byteFlip32(value);
    const uint8_t *src = (const uint8_t *)&wire;
    unsigned k;
    for (k = 0; k < sizeof(uint32_t); k++) {
        bytePut8(cursor, src[k]);
    }
}
/* Serialize a 64-bit value in wire order (little endian) at the cursor and
 * advance it by eight bytes. */
BYTE__INLINE void bytePut64(uint8_t **cursor, uint64_t value)
{
    const uint64_t wire = byteFlip64(value);
    const uint8_t *src = (const uint8_t *)&wire;
    unsigned k;
    for (k = 0; k < sizeof(uint64_t); k++) {
        bytePut8(cursor, src[k]);
    }
}
/* Copy a NUL-terminated string (including the terminator) at the cursor and
 * advance it past the terminator. */
BYTE__INLINE void bytePutString(uint8_t **cursor, const char *value)
{
    char *dst = (char *)*cursor;
    strcpy(dst, value);
    *cursor += strlen(value) + 1;
}
/* Read one byte at the cursor and advance it. */
BYTE__INLINE uint8_t byteGet8(const uint8_t **cursor)
{
    const uint8_t *p = *cursor;
    *cursor = p + 1;
    return *p;
}
/* Deserialize a little-endian 16-bit value at the cursor, advancing it by
 * two bytes. */
BYTE__INLINE uint16_t byteGet16(const uint8_t **cursor)
{
    uint16_t raw = 0;
    uint8_t *dst = (uint8_t *)&raw;
    unsigned k;
    for (k = 0; k < sizeof(uint16_t); k++) {
        dst[k] = byteGet8(cursor);
    }
    return byteFlip16(raw);
}
/* Deserialize a little-endian 32-bit value at the cursor, advancing it by
 * four bytes. */
BYTE__INLINE uint32_t byteGet32(const uint8_t **cursor)
{
    uint32_t raw = 0;
    uint8_t *dst = (uint8_t *)&raw;
    unsigned k;
    for (k = 0; k < sizeof(uint32_t); k++) {
        dst[k] = byteGet8(cursor);
    }
    return byteFlip32(raw);
}
/* Deserialize a little-endian 64-bit value at the cursor, advancing it by
 * eight bytes. */
BYTE__INLINE uint64_t byteGet64(const uint8_t **cursor)
{
    uint64_t raw = 0;
    uint8_t *dst = (uint8_t *)&raw;
    unsigned k;
    for (k = 0; k < sizeof(uint64_t); k++) {
        dst[k] = byteGet8(cursor);
    }
    return byteFlip64(raw);
}
/* Read a NUL-terminated string at the cursor, scanning at most MAX_LEN
 * bytes. On success return a pointer into the buffer and advance the cursor
 * past the terminator; return NULL (leaving the cursor untouched) when no
 * terminator is found within MAX_LEN bytes. */
BYTE__INLINE const char *byteGetString(const uint8_t **cursor, size_t max_len)
{
    const uint8_t *start = *cursor;
    size_t i;
    for (i = 0; i < max_len; i++) {
        if (start[i] == 0) {
            *cursor = start + i + 1;
            return (const char *)start;
        }
    }
    return NULL;
}
/* Add padding to size if it's not a multiple of 8: round SIZE up to the
 * next multiple of sizeof(uint64_t), leaving exact multiples unchanged. */
BYTE__INLINE size_t bytePad64(size_t size)
{
    const size_t mask = sizeof(uint64_t) - 1;
    return (size + mask) & ~mask;
}
/* Calculate the CRC32 checksum of the given data buffer. */
unsigned byteCrc32(const void *buf, size_t size, unsigned init);
/* Running SHA-1 computation context (see byteSha1Init/Update/Digest). */
struct byteSha1
{
    uint32_t state[5];  /* Intermediate digest state (five 32-bit words). */
    uint32_t count[2];  /* Message length in bits: low word, high word. */
    uint8_t buffer[64]; /* Buffered partial input block. */
    uint8_t value[20];  /* NOTE(review): not written by the functions in
                         * byte.c, which emit the digest into a caller
                         * buffer — presumably scratch space; confirm. */
};
void byteSha1Init(struct byteSha1 *s);
void byteSha1Update(struct byteSha1 *s, const uint8_t *data, uint32_t len);
void byteSha1Digest(struct byteSha1 *s, uint8_t value[20]);
#endif /* BYTE_H_ */
raft-0.22.1/src/client.c 0000664 0000000 0000000 00000016412 14601504142 0014742 0 ustar 00root root 0000000 0000000 #include "../include/raft.h"
#include "assert.h"
#include "configuration.h"
#include "err.h"
#include "membership.h"
#include "message.h"
#include "progress.h"
#include "queue.h"
#include "replication.h"
#include "request.h"
#include "tracing.h"
#include "trail.h"
#define infof(...) Infof(r->tracer, " " __VA_ARGS__)
#define tracef(...) Tracef(r->tracer, __VA_ARGS__)
/* Emit a trace info message summarizing the entries being submitted: a
 * typed single-entry message for n == 1, a range message otherwise. */
static void clientEmitSubmissionMessage(const struct raft *r,
                                        const raft_index index,
                                        const struct raft_entry *entries,
                                        const unsigned n)
{
    const char *type;
    if (n != 1) {
        infof("replicate %u new entries (%llu^%llu..%llu^%llu)", n, index,
              entries[0].term, index + n - 1, entries[n - 1].term);
        return;
    }
    if (entries[0].type == RAFT_COMMAND) {
        type = "command";
    } else if (entries[0].type == RAFT_BARRIER) {
        type = "barrier";
    } else if (entries[0].type == RAFT_CHANGE) {
        type = "configuration";
    } else {
        type = "unknown";
    }
    infof("replicate 1 new %s entry (%llu^%llu)", type, index,
          entries[0].term);
}
/* Return true if the capacity of the majority of voter servers is within
 * the configured threshold. Voters that don't advertise the capacity
 * feature are ignored, and the check passes trivially when no threshold is
 * configured or when too few voters report capacity at all. */
static bool clientCapacityIsWithinThreshold(const struct raft *r)
{
    unsigned n_reporting = 0; /* Voters reporting capacity. */
    unsigned n_healthy = 0;   /* Voters with capacity above threshold. */
    unsigned half_voters;
    unsigned i;
    /* If a capacity threshold is not set, don't perform any check. */
    if (r->capacity_threshold == 0) {
        return true;
    }
    for (i = 0; i < r->configuration.n; i++) {
        struct raft_server *server = &r->configuration.servers[i];
        unsigned features = progressGetFeatures(r, i);
        if (server->role != RAFT_VOTER) {
            continue;
        }
        if (!(features & MESSAGE__FEATURE_CAPACITY)) {
            continue;
        }
        n_reporting += 1;
        if (progressGetCapacity(r, i) >= r->capacity_threshold) {
            n_healthy += 1;
        }
    }
    half_voters = configurationVoterCount(&r->configuration) / 2;
    /* If not enough nodes are actually reporting capacity, don't draw any
     * bad conclusion. */
    if (n_reporting <= half_voters) {
        return true;
    }
    return n_healthy > half_voters;
}
/* Submit N new entries: append them to the local trail and trigger
 * replication to the other servers. Returns 0 on success, or RAFT_NOTLEADER,
 * RAFT_NOSPACE, RAFT_CANTCHANGE, RAFT_NOMEM or RAFT_MALFORMED on failure. */
int ClientSubmit(struct raft *r, struct raft_entry *entries, unsigned n)
{
    const raft_index index = TrailLastIndex(&r->trail) + 1; /* 1st new entry */
    unsigned i;
    int rv;
    assert(r != NULL);
    assert(entries != NULL);
    assert(n > 0);
    assert(entries[0].batch != NULL);
    /* Only an established leader that is not currently transferring
     * leadership may accept new entries. */
    if (r->state != RAFT_LEADER || r->leader_state.transferee != 0) {
        rv = RAFT_NOTLEADER;
        ErrMsgFromCode(r->errmsg, rv);
        goto err;
    }
    /* Refuse new entries when a majority of reporting voters are below the
     * configured capacity threshold. */
    if (!clientCapacityIsWithinThreshold(r)) {
        rv = RAFT_NOSPACE;
        ErrMsgFromCode(r->errmsg, rv);
        goto err;
    }
    clientEmitSubmissionMessage(r, index, entries, n);
    for (i = 0; i < n; i++) {
        const struct raft_entry *entry = &entries[i];
        /* Configuration changes need an extra eligibility check... */
        if (entry->type == RAFT_CHANGE) {
            rv = membershipCanChangeConfiguration(r);
            if (rv != 0) {
                assert(rv == RAFT_CANTCHANGE);
                /* NOTE(review): at i > 0 this jumps to err without
                 * truncating entries appended by earlier iterations —
                 * confirm whether that is intentional. */
                goto err;
            }
        }
        rv = TrailAppend(&r->trail, entry->term);
        if (rv != 0) {
            assert(rv == RAFT_NOMEM);
            goto err;
        }
        /* ...and bookkeeping of the uncommitted configuration. */
        if (entry->type == RAFT_CHANGE) {
            rv = membershipUncommittedChange(r, index + i, entry);
            if (rv != 0) {
                goto err_after_trail_append;
            }
        }
    }
    rv = replicationTrigger(r, index, entries, n);
    if (rv != 0) {
        /* TODO: assert the possible error values */
        goto err_after_trail_append;
    }
    return 0;
err_after_trail_append:
    /* Roll back every index appended to the trail by this call. */
    TrailTruncate(&r->trail, index);
err:
    assert(rv == RAFT_NOTLEADER || rv == RAFT_MALFORMED || rv == RAFT_NOMEM ||
           rv == RAFT_NOSPACE || rv == RAFT_CANTCHANGE);
    return rv;
}
/* Start catching up the given server: record it as the current promotee,
 * initialize the first catch-up round against the current last log index,
 * and immediately kick off replication towards it. The server must be part
 * of the current configuration. */
void ClientCatchUp(struct raft *r, raft_id server_id)
{
    const struct raft_server *server;
    unsigned server_index;
    raft_index last_index;
    int rv;
    server = configurationGet(&r->configuration, server_id);
    assert(server != NULL);
    server_index = configurationIndexOf(&r->configuration, server_id);
    last_index = TrailLastIndex(&r->trail);
    r->leader_state.promotee_id = server->id;
    /* Initialize the first catch-up round. */
    r->leader_state.round_number = 1;
    r->leader_state.round_index = last_index;
    r->leader_state.round_start = r->now;
    progressCatchUpStart(r, server_index);
    /* Immediately initiate an AppendEntries request. */
    rv = replicationProgress(r, server_index);
    if (rv != 0 && rv != RAFT_NOCONNECTION) {
        /* This error is not fatal. */
        tracef("failed to send append entries to server %llu: %s (%d)",
               server->id, raft_strerror(rv), rv);
    }
}
/* Find a suitable voting follower to hand leadership to.
 *
 * Prefers the first voter (other than ourselves) whose match index equals
 * our last log index, i.e. one that is fully caught up; otherwise the last
 * eligible voter scanned is used. Returns 0 when no other voter exists. */
static raft_id clientSelectTransferee(const struct raft *r)
{
    const struct raft_server *candidate = NULL;
    unsigned i;
    for (i = 0; i < r->configuration.n; i++) {
        const struct raft_server *s = &r->configuration.servers[i];
        if (s->id == r->id || s->role != RAFT_VOTER) {
            continue;
        }
        candidate = s;
        /* Stop scanning at the first fully up-to-date voter. */
        if (progressMatchIndex(r, i) == TrailLastIndex(&r->trail)) {
            break;
        }
    }
    return candidate != NULL ? candidate->id : 0;
}
/* Start transferring leadership to the given server, or to an automatically
 * selected voter when server_id is 0.
 *
 * If the transferee is already up-to-date the transfer is started right
 * away, otherwise it proceeds once the transferee has caught up.
 *
 * Errors: RAFT_NOTLEADER, RAFT_NOTFOUND, RAFT_BADID, RAFT_NOMEM. */
int ClientTransfer(struct raft *r, raft_id server_id)
{
    const struct raft_server *server;
    unsigned i;
    int rv;
    /* Refuse unless we are leader and no transfer is already in progress. */
    if (r->state != RAFT_LEADER || r->leader_state.transferee != 0) {
        rv = RAFT_NOTLEADER;
        ErrMsgFromCode(r->errmsg, rv);
        goto err;
    }
    /* A zero ID means: pick a suitable voter ourselves. */
    if (server_id == 0) {
        server_id = clientSelectTransferee(r);
        if (server_id == 0) {
            rv = RAFT_NOTFOUND;
            ErrMsgPrintf(r->errmsg, "there's no other voting server");
            goto err;
        }
    }
    /* The target must exist, must not be ourselves, and must be a voter. */
    server = configurationGet(&r->configuration, server_id);
    if (server == NULL || server->id == r->id || server->role != RAFT_VOTER) {
        rv = RAFT_BADID;
        ErrMsgFromCode(r->errmsg, rv);
        goto err;
    }
    /* If this follower is up-to-date, we can send it the TimeoutNow message
     * right away. */
    i = configurationIndexOf(&r->configuration, server->id);
    assert(i < r->configuration.n);
    r->leader_state.transferee = server_id;
    r->leader_state.transfer_start = r->now;
    if (progressMatchIndex(r, i) == TrailLastIndex(&r->trail)) {
        rv = membershipLeadershipTransferStart(r);
        if (rv != 0) {
            assert(rv == RAFT_NOMEM);
            /* Roll back so that a new transfer can be attempted later. */
            r->leader_state.transferee = 0;
            goto err;
        }
    } else {
        infof("wait for transferee to catch up");
    }
    return 0;
err:
    assert(rv == RAFT_NOTLEADER || rv == RAFT_NOTFOUND || rv == RAFT_BADID ||
           rv == RAFT_NOMEM);
    return rv;
}
#undef infof
#undef tracef
raft-0.22.1/src/client.h 0000664 0000000 0000000 00000002620 14601504142 0014743 0 ustar 00root root 0000000 0000000 #ifndef CLIENT_H_
#define CLIENT_H_
#include "../include/raft.h"
/* Submit the given entries and start replicating them.
*
* Errors:
*
* RAFT_NOTLEADER
* The server is not leader, or a leadership transfer is in progress.
*
* RAFT_CANTCHANGE
* A configuration entry is being submitted, but a configuration change
* is already in progress.
*
 * RAFT_NOSPACE
* Not enough servers to form a majority are reporting to have remaining
* capacity over the configured threshold.
*
* RAFT_MALFORMED
* The submitted entry is of type RAFT_CHANGE, but the encoded configuration
* is invalid.
*
* RAFT_NOMEM
* Memory could not be allocated to store the new entry.
*/
int ClientSubmit(struct raft *r, struct raft_entry *entries, unsigned n);
/* Start catching-up the given server. */
void ClientCatchUp(struct raft *r, raft_id server_id);
/* Start transferring leadership to the given server.
*
* Errors:
*
* RAFT_NOTLEADER
* The server is not leader, or a leadership transfer is in progress.
*
* RAFT_NOTFOUND
 * The server_id parameter is zero and no suitable server could be
* found to transfer leadership to.
*
* RAFT_BADID
* The provided server ID is not part of the configuration.
*
* RAFT_NOMEM
* Memory could not be allocated to enqueue a TimeoutNow message.
*/
int ClientTransfer(struct raft *r, raft_id server_id);
#endif /* CLIENT_H_ */
raft-0.22.1/src/compress.c 0000664 0000000 0000000 00000006517 14601504142 0015324 0 ustar 00root root 0000000 0000000 #include "compress.h"
#ifdef LZ4_AVAILABLE
#include
#endif
#include
#include
#include "assert.h"
#include "byte.h"
#include "err.h"
#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))
/* Decompress the LZ4-framed content of @buf into a newly allocated buffer
 * returned through @decompressed.
 *
 * Returns 0 on success, RAFT_NOMEM on allocation failure, RAFT_IOERR on LZ4
 * decoding errors, or RAFT_INVALID when compiled without LZ4 support.
 *
 * NOTE(review): the output buffer is sized from the frame header's
 * contentSize field; presumably all frames handled here were produced with a
 * non-zero embedded content size — confirm, since a frame lacking it would
 * lead to a zero-byte allocation. */
int Decompress(struct raft_buffer buf,
               struct raft_buffer *decompressed,
               char *errmsg)
{
#ifndef LZ4_AVAILABLE
    (void)buf;
    (void)decompressed;
    ErrMsgPrintf(errmsg, "LZ4 not available");
    return RAFT_INVALID;
#else
    assert(decompressed != NULL);
    int rv = RAFT_IOERR;
    size_t src_offset = 0;
    size_t dst_offset = 0;
    size_t src_size = 0;
    size_t dst_size = 0;
    size_t ret = 0;
    LZ4F_decompressionContext_t ctx;
    if (LZ4F_isError(LZ4F_createDecompressionContext(&ctx, LZ4F_VERSION))) {
        ErrMsgPrintf(errmsg, "LZ4F_createDecompressionContext");
        rv = RAFT_NOMEM;
        goto err;
    }
    src_size = buf.len;
    LZ4F_frameInfo_t frameInfo = {0};
    /* `src_size` will contain the size of the LZ4 Frame Header after the call,
     * decompression must resume at that offset. */
    ret = LZ4F_getFrameInfo(ctx, &frameInfo, buf.base, &src_size);
    if (LZ4F_isError(ret)) {
        ErrMsgPrintf(errmsg, "LZ4F_getFrameInfo %s", LZ4F_getErrorName(ret));
        rv = RAFT_IOERR;
        goto err_after_ctx_alloc;
    }
    src_offset = src_size;
    /* Allocate the output buffer using the size advertised by the frame. */
    decompressed->base = raft_malloc((size_t)frameInfo.contentSize);
    decompressed->len = (size_t)frameInfo.contentSize;
    if (decompressed->base == NULL) {
        rv = RAFT_NOMEM;
        goto err_after_ctx_alloc;
    }
    /* LZ4F_decompress returns 0 once the frame is fully decoded, or a size
     * hint when more input is expected; loop until done. */
    ret = 1;
    while (ret != 0) {
        src_size = buf.len - src_offset;
        /* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
         * The next line works around a bug in an older lz4 lib where the
         * `size_t` dst_size parameter would overflow an `int`.
         * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */
        dst_size = min(decompressed->len - dst_offset, (size_t)INT_MAX);
        /* `dst_size` will contain the number of bytes written to
         * decompressed->base, while `src_size` will contain the number of bytes
         * consumed from buf.base */
        ret = LZ4F_decompress(ctx, (char *)decompressed->base + dst_offset,
                              &dst_size, (char *)buf.base + src_offset,
                              &src_size, NULL);
        if (LZ4F_isError(ret)) {
            ErrMsgPrintf(errmsg, "LZ4F_decompress %s", LZ4F_getErrorName(ret));
            rv = RAFT_IOERR;
            goto err_after_buff_alloc;
        }
        src_offset += src_size;
        dst_offset += dst_size;
    }
    if (LZ4F_freeDecompressionContext(ctx) != 0) {
        raft_free(decompressed->base);
        decompressed->base = NULL;
        return RAFT_IOERR;
    }
    return 0;
err_after_buff_alloc:
    raft_free(decompressed->base);
    decompressed->base = NULL;
err_after_ctx_alloc:
    LZ4F_freeDecompressionContext(ctx);
err:
    return rv;
#endif /* LZ4_AVAILABLE */
}
/* Return true when @data begins with the LZ4 frame magic number. */
bool IsCompressed(const void *data, size_t sz)
{
#ifdef LZ4F_MAGICNUMBER
#define RAFT_LZ4F_MAGICNUMBER LZ4F_MAGICNUMBER
#else
#define RAFT_LZ4F_MAGICNUMBER 0x184D2204U
#endif
    const uint8_t *cursor;
    /* The magic number itself occupies the first 4 bytes of a frame. */
    if (data == NULL || sz < 4) {
        return false;
    }
    cursor = data;
    return byteGet32(&cursor) == RAFT_LZ4F_MAGICNUMBER;
}
raft-0.22.1/src/compress.h 0000664 0000000 0000000 00000000764 14601504142 0015327 0 ustar 00root root 0000000 0000000 #ifndef COMPRESS_H_
#define COMPRESS_H_
#include "../include/raft.h"
/*
* Decompresses the content of `buf` into a newly allocated buffer that is
* returned to the caller through `decompressed`. Returns a non-0 value upon
* failure.
*/
int Decompress(struct raft_buffer buf,
struct raft_buffer *decompressed,
char *errmsg);
/* Returns `true` if `data` is compressed, `false` otherwise. */
bool IsCompressed(const void *data, size_t sz);
#endif /* COMPRESS_H_ */
raft-0.22.1/src/configuration.c 0000664 0000000 0000000 00000022265 14601504142 0016336 0 ustar 00root root 0000000 0000000 #include "configuration.h"
#include "assert.h"
#include "byte.h"
/* Current encoding format version. */
#define ENCODING_FORMAT 1
/* Initialize the given configuration as empty: no servers, no storage. */
void configurationInit(struct raft_configuration *c)
{
    c->n = 0;
    c->servers = NULL;
}
/* Release all memory owned by the configuration: each server's address copy
 * and the servers array itself. */
void configurationClose(struct raft_configuration *c)
{
    size_t i = 0;
    assert(c != NULL);
    assert(c->n == 0 || c->servers != NULL);
    while (i < c->n) {
        raft_free(c->servers[i].address);
        i++;
    }
    if (c->servers != NULL) {
        raft_free(c->servers);
    }
}
/* Return the position of the server with the given ID inside c->servers,
 * or c->n when no server matches. */
unsigned configurationIndexOf(const struct raft_configuration *c,
                              const raft_id id)
{
    unsigned i = 0;
    assert(c != NULL);
    while (i < c->n && c->servers[i].id != id) {
        i++;
    }
    return i;
}
unsigned configurationIndexOfVoter(const struct raft_configuration *c,
const raft_id id)
{
unsigned i;
unsigned j = 0;
assert(c != NULL);
assert(id > 0);
for (i = 0; i < c->n; i++) {
if (c->servers[i].id == id) {
if (c->servers[i].role == RAFT_VOTER) {
return j;
}
return c->n;
}
if (c->servers[i].role == RAFT_VOTER) {
j++;
}
}
return c->n;
}
/* Translate the relative voter index @i (position within the sub-array of
 * voting servers) into the absolute position in c->servers. Return c->n if
 * there are fewer than i + 1 voters. */
unsigned configurationActualIndexOfVoter(const struct raft_configuration *c,
                                         unsigned i)
{
    unsigned position;
    unsigned voters_seen = 0;
    assert(c != NULL);
    for (position = 0; position < c->n; position++) {
        if (c->servers[position].role != RAFT_VOTER) {
            continue;
        }
        if (voters_seen == i) {
            return position;
        }
        voters_seen++;
    }
    return c->n;
}
/* Look up the server with the given ID, returning NULL when absent. */
const struct raft_server *configurationGet(const struct raft_configuration *c,
                                           const raft_id id)
{
    size_t i;
    assert(c != NULL);
    assert(id > 0);
    /* configurationIndexOf returns c->n when the ID is unknown. */
    i = configurationIndexOf(c, id);
    assert(i <= c->n);
    return i == c->n ? NULL : &c->servers[i];
}
/* Count how many servers in the configuration have the RAFT_VOTER role. */
unsigned configurationVoterCount(const struct raft_configuration *c)
{
    unsigned count = 0;
    unsigned i;
    assert(c != NULL);
    for (i = 0; i < c->n; i++) {
        count += (c->servers[i].role == RAFT_VOTER) ? 1 : 0;
    }
    return count;
}
int configurationCopy(const struct raft_configuration *src,
struct raft_configuration *dst)
{
size_t i;
int rv;
configurationInit(dst);
for (i = 0; i < src->n; i++) {
struct raft_server *server = &src->servers[i];
rv = configurationAdd(dst, server->id, server->address, server->role);
if (rv != 0) {
goto err;
}
}
return 0;
err:
configurationClose(dst);
assert(rv == RAFT_NOMEM);
return rv;
}
/* Append a new server to the configuration.
 *
 * A private copy of @address is made; on any failure @c is left unchanged.
 *
 * Errors: RAFT_BADROLE, RAFT_DUPLICATEID, RAFT_DUPLICATEADDRESS,
 * RAFT_NOMEM. */
int configurationAdd(struct raft_configuration *c,
                     const raft_id id,
                     const char *address,
                     const int role)
{
    struct raft_server *servers;
    struct raft_server *server;
    char *address_copy;
    size_t i;
    int rv;
    assert(c != NULL);
    assert(id != 0);
    if (role != RAFT_STANDBY && role != RAFT_VOTER && role != RAFT_SPARE) {
        rv = RAFT_BADROLE;
        goto err;
    }
    /* Check that neither the given id or address is already in use */
    for (i = 0; i < c->n; i++) {
        server = &c->servers[i];
        if (server->id == id) {
            rv = RAFT_DUPLICATEID;
            goto err;
        }
        if (strcmp(server->address, address) == 0) {
            rv = RAFT_DUPLICATEADDRESS;
            goto err;
        }
    }
    /* Make a copy of the given address */
    address_copy = raft_malloc(strlen(address) + 1);
    if (address_copy == NULL) {
        rv = RAFT_NOMEM;
        goto err;
    }
    strcpy(address_copy, address);
    /* Grow the servers array.. */
    servers = raft_realloc(c->servers, (c->n + 1) * sizeof *server);
    if (servers == NULL) {
        rv = RAFT_NOMEM;
        goto err_after_address_copy;
    }
    c->servers = servers;
    /* Fill the newly allocated slot (the last one) with the given details. */
    server = &servers[c->n];
    server->id = id;
    server->address = address_copy;
    server->role = role;
    c->n++;
    return 0;
err_after_address_copy:
    raft_free(address_copy);
err:
    assert(rv == RAFT_BADROLE || rv == RAFT_DUPLICATEID ||
           rv == RAFT_DUPLICATEADDRESS || rv == RAFT_NOMEM);
    return rv;
}
/* Remove the server with the given ID from the configuration, shrinking the
 * servers array.
 *
 * On failure @c is left unchanged.
 *
 * Errors: RAFT_BADID, RAFT_NOMEM. */
int configurationRemove(struct raft_configuration *c, const raft_id id)
{
    unsigned i;
    unsigned j;
    struct raft_server *servers;
    int rv;
    assert(c != NULL);
    i = configurationIndexOf(c, id);
    if (i == c->n) {
        rv = RAFT_BADID;
        goto err;
    }
    assert(i < c->n);
    /* If this is the last server in the configuration, reset everything. */
    if (c->n - 1 == 0) {
        assert(i == 0);
        servers = NULL;
        goto out;
    }
    /* Create a new servers array. */
    servers = raft_calloc(c->n - 1, sizeof *servers);
    if (servers == NULL) {
        rv = RAFT_NOMEM;
        goto err;
    }
    /* Copy the first part of the servers array into a new array, excluding the
     * i'th server. */
    for (j = 0; j < i; j++) {
        servers[j] = c->servers[j];
    }
    /* Copy the second part of the servers array into a new array. */
    for (j = i + 1; j < c->n; j++) {
        servers[j - 1] = c->servers[j];
    }
out:
    /* Release the address of the server that was deleted. Note that this
     * must happen before releasing the old array below, since the address
     * pointer is read through it. */
    raft_free(c->servers[i].address);
    /* Release the old servers array */
    raft_free(c->servers);
    c->servers = servers;
    c->n--;
    return 0;
err:
    assert(rv == RAFT_BADID || rv == RAFT_NOMEM);
    return rv;
}
size_t configurationEncodedSize(const struct raft_configuration *c)
{
size_t n = 0;
unsigned i;
/* We need one byte for the encoding format version */
n++;
/* Then 8 bytes for number of servers. */
n += sizeof(uint64_t);
/* Then some space for each server. */
for (i = 0; i < c->n; i++) {
struct raft_server *server = &c->servers[i];
assert(server->address != NULL);
n += sizeof(uint64_t); /* Server ID */
n += strlen(server->address) + 1; /* Address */
n++; /* Voting flag */
};
return bytePad64(n);
}
void configurationEncodeToBuf(const struct raft_configuration *c,
void *buf,
size_t buf_len)
{
uint8_t *cursor = buf;
uint8_t *end = cursor + buf_len;
unsigned i;
/* Encoding format version */
bytePut8(&cursor, ENCODING_FORMAT);
/* Number of servers. */
bytePut64(&cursor, c->n);
for (i = 0; i < c->n; i++) {
struct raft_server *server = &c->servers[i];
assert(server->address != NULL);
bytePut64(&cursor, server->id);
bytePutString(&cursor, server->address);
assert(server->role < 255);
bytePut8(&cursor, (uint8_t)server->role);
};
assert(cursor <= end);
memset(cursor, 0, (size_t)(end - cursor));
}
/* Encode the configuration into a freshly allocated buffer. The caller owns
 * buf->base and must release it with raft_free().
 *
 * Errors: RAFT_NOMEM. */
int configurationEncode(const struct raft_configuration *c,
                        struct raft_buffer *buf)
{
    assert(c != NULL);
    assert(buf != NULL);
    assert(c->n > 0); /* The configuration can't be empty. */
    buf->len = configurationEncodedSize(c);
    buf->base = raft_malloc(buf->len);
    if (buf->base == NULL) {
        return RAFT_NOMEM;
    }
    configurationEncodeToBuf(c, buf->base, buf->len);
    return 0;
}
/* Populate @c by decoding the serialized configuration in @buf.
 *
 * @c must be uninitialized or empty; on failure it is left empty.
 *
 * Errors: RAFT_MALFORMED, RAFT_NOMEM.
 *
 * NOTE(review): as the TODO below says, several reads (version byte, server
 * count, ID, role) advance the cursor without checking the buffer boundary;
 * only the address string read is bounds-checked. Presumably buffers come
 * from trusted local storage — confirm before exposing to untrusted data. */
int configurationDecode(const struct raft_buffer *buf,
                        struct raft_configuration *c)
{
    const uint8_t *cursor;
    size_t i;
    size_t n;
    int rv;
    assert(c != NULL);
    assert(buf != NULL);
    /* TODO: use 'if' instead of assert for checking buffer boundaries */
    assert(buf->len > 0);
    configurationInit(c);
    cursor = buf->base;
    /* Check the encoding format version */
    if (byteGet8(&cursor) != ENCODING_FORMAT) {
        rv = RAFT_MALFORMED;
        goto err;
    }
    /* Read the number of servers. */
    n = (size_t)byteGet64(&cursor);
    /* Decode the individual servers. */
    for (i = 0; i < n; i++) {
        raft_id id;
        const char *address;
        int role;
        /* Server ID. */
        id = byteGet64(&cursor);
        /* Server Address. */
        address = byteGetString(
            &cursor,
            buf->len - (size_t)((uint8_t *)cursor - (uint8_t *)buf->base));
        if (address == NULL) {
            rv = RAFT_MALFORMED;
            goto err;
        }
        /* Role code. */
        role = byteGet8(&cursor);
        rv = configurationAdd(c, id, address, role);
        if (rv != 0) {
            /* Only valid configurations should be ever be encoded, so in case
             * configurationAdd() fails because of invalid data we return
             * RAFT_MALFORMED. */
            if (rv != RAFT_NOMEM) {
                rv = RAFT_MALFORMED;
            }
            goto err;
        }
    }
    return 0;
err:
    assert(rv == RAFT_MALFORMED || rv == RAFT_NOMEM);
    configurationClose(c);
    return rv;
}
raft-0.22.1/src/configuration.h 0000664 0000000 0000000 00000010710 14601504142 0016333 0 ustar 00root root 0000000 0000000 /* Modify and inspect @raft_configuration objects. */
#ifndef CONFIGURATION_H_
#define CONFIGURATION_H_
#include "../include/raft.h"
/* Initialize an empty configuration. */
void configurationInit(struct raft_configuration *c);
/* Release all memory used by the given configuration. */
void configurationClose(struct raft_configuration *c);
/* Add a server to the given configuration.
*
* The given @address is copied and no reference to it is kept. In case of
* error, @c is left unchanged.
*
* Errors:
*
* RAFT_DUPLICATEID
* @c already has a server with the given id.
*
* RAFT_DUPLICATEADDRESS
* @c already has a server with the given @address.
*
* RAFT_BADROLE
* @role is not one of ROLE_STANDBY, ROLE_VOTER or ROLE_SPARE.
*
* RAFT_NOMEM
* A copy of @address could not be made or the @c->servers could not
* be extended
*/
int configurationAdd(struct raft_configuration *c,
raft_id id,
const char *address,
int role);
/* Return the number of servers with the RAFT_VOTER role. */
unsigned configurationVoterCount(const struct raft_configuration *c);
/* Return the index of the server with the given ID (relative to the c->servers
* array). If there's no server with the given ID, return the number of
* servers. */
unsigned configurationIndexOf(const struct raft_configuration *c, raft_id id);
/* Return the index of the RAFT_VOTER server with the given ID (relative to the
* sub array of c->servers that has only voting servers). If there's no server
* with the given ID, or if it's not flagged as voting, return the number of
* servers. */
unsigned configurationIndexOfVoter(const struct raft_configuration *c,
raft_id id);
/* Return the actual index of the RAFT_VOTER server having the relative index
* with respect to the sub array of c->servers that has only voting servers. */
unsigned configurationActualIndexOfVoter(const struct raft_configuration *c,
unsigned i);
/* Get the server with the given ID, or #NULL if no matching server is found. */
const struct raft_server *configurationGet(const struct raft_configuration *c,
raft_id id);
/* Remove a server from a raft configuration. The given ID must match the one of
* an existing server in the configuration.
*
* In case of error @c is left unchanged.
*
* Errors:
*
* RAFT_BADID
* @c does not contain any server with the given @id
*
* RAFT_NOMEM
* Memory to hold the new set of servers could not be allocated.
*/
int configurationRemove(struct raft_configuration *c, raft_id id);
/* Deep copy @src to @dst.
*
* The configuration @src is assumed to be valid (i.e. each of its servers has a
* valid ID, address and role).
*
* The @dst configuration object must be uninitialized or empty.
*
* In case of error, both @src and @dst are left unchanged.
*
* Errors:
*
* RAFT_NOMEM
* Memory to copy all the servers could not be allocated.
*/
int configurationCopy(const struct raft_configuration *src,
struct raft_configuration *dst);
/* Number of bytes needed to encode the given configuration object. */
size_t configurationEncodedSize(const struct raft_configuration *c);
/* Encode the given configuration object to the given pre-allocated buffer,
* which must be at least configurationEncodedSize(c) bytes. */
void configurationEncodeToBuf(const struct raft_configuration *c,
void *buf,
size_t buf_len);
/* Encode the given configuration object. The memory of the returned buffer is
* allocated using raft_malloc(), and client code is responsible for releasing
* it when no longer needed.
*
* Errors:
*
* RAFT_NOMEM
* Memory for the encoded buffer could not be allocated.
*/
int configurationEncode(const struct raft_configuration *c,
struct raft_buffer *buf);
/* Populate a configuration object by decoding the given serialized payload.
*
* The @c configuration object must be uninitialized or empty.
*
* In case of error, @c will be left empty.
*
* Errors:
*
* RAFT_MALFORMED
* The given buffer does not contain a valid encoded configuration.
*
* RAFT_NOMEM
* Memory to populate the given configuration could not be allocated.
*/
int configurationDecode(const struct raft_buffer *buf,
struct raft_configuration *c);
#endif /* CONFIGURATION_H_ */
raft-0.22.1/src/convert.c 0000664 0000000 0000000 00000017154 14601504142 0015150 0 ustar 00root root 0000000 0000000 #include "convert.h"
#include "assert.h"
#include "client.h"
#include "configuration.h"
#include "election.h"
#include "membership.h"
#include "progress.h"
#include "queue.h"
#include "replication.h"
#include "request.h"
#include "string.h"
#include "tracing.h"
#include "trail.h"
#define infof(...) Infof(r->tracer, " " __VA_ARGS__)
#define tracef(...) Tracef(r->tracer, __VA_ARGS__)
/* Convenience for setting a new state value and asserting that the transition
* is valid. */
static void convertSetState(struct raft *r, unsigned short new_state)
{
    /* Record the new state and flag the change so that the caller of the
     * step function sees a state transition in the returned update. */
    r->state = new_state;
    r->update->flags |= RAFT_UPDATE_STATE;
}
/* Clear follower state. */
/* Clear follower state, releasing the cached leader address if any. */
static void convertClearFollower(struct raft *r)
{
    /* The address is a heap-allocated copy owned by the follower state. */
    if (r->follower_state.current_leader.address != NULL) {
        raft_free(r->follower_state.current_leader.address);
    }
    r->follower_state.current_leader.address = NULL;
    r->follower_state.current_leader.id = 0;
}
/* Clear candidate state. */
/* Clear candidate state, releasing the votes array if allocated. */
static void convertClearCandidate(struct raft *r)
{
    /* The votes array only exists while campaigning. */
    if (r->candidate_state.votes == NULL) {
        return;
    }
    raft_free(r->candidate_state.votes);
    r->candidate_state.votes = NULL;
}
/* Clear leader state. */
/* Clear leader state, releasing the progress array if allocated. */
static void convertClearLeader(struct raft *r)
{
    /* The progress array only exists while leading. */
    if (r->leader_state.progress == NULL) {
        return;
    }
    raft_free(r->leader_state.progress);
    r->leader_state.progress = NULL;
}
/* Release whatever resources are associated with the current state. */
void convertClear(struct raft *r)
{
    assert(r->state == RAFT_FOLLOWER || r->state == RAFT_CANDIDATE ||
           r->state == RAFT_LEADER);
    if (r->state == RAFT_FOLLOWER) {
        convertClearFollower(r);
    } else if (r->state == RAFT_CANDIDATE) {
        convertClearCandidate(r);
    } else if (r->state == RAFT_LEADER) {
        convertClearLeader(r);
    }
}
/* Convert from candidate or leader to follower, releasing the resources of
 * the state being left and re-initializing follower state. */
void convertToFollower(struct raft *r)
{
    assert(r->state == RAFT_CANDIDATE || r->state == RAFT_LEADER);
    switch (r->state) {
        case RAFT_CANDIDATE:
            convertClearCandidate(r);
            break;
        case RAFT_LEADER:
            convertClearLeader(r);
            break;
    }
    convertSetState(r, RAFT_FOLLOWER);
    /* Reset election timer. Note that this must run after the state switch
     * above, since the randomized timeout is stored in follower_state. */
    electionResetTimer(r);
    /* No known leader yet; the address pointer starts out unowned. */
    r->follower_state.current_leader.id = 0;
    r->follower_state.current_leader.address = NULL;
    /* The follower's match index tracks the highest index in the local log that
     * is known to match the same index in the leader log, because the leader
     * has sent an AppendEntries request containing that index.
     *
     * This is necessary in order to avoid sending AppendEntries results that
     * contain indexes that were never checked against the log matching
     * property. */
    r->follower_state.match = 0;
}
/* Convert from follower to candidate and start a new election (or convert
 * straight to leader when we are the only voter).
 *
 * Errors: RAFT_NOMEM. */
int convertToCandidate(struct raft *r, const bool disrupt_leader)
{
    const struct raft_server *server;
    size_t n_voters = configurationVoterCount(&r->configuration);
    assert(r->state == RAFT_FOLLOWER);
    (void)server; /* Only used for assertions. */
    /* Check that we're a voter in the current configuration. */
    server = configurationGet(&r->configuration, r->id);
    assert(server != NULL);
    assert(server->role == RAFT_VOTER);
    convertClearFollower(r);
    convertSetState(r, RAFT_CANDIDATE);
    /* Allocate the votes array. */
    r->candidate_state.votes =
        raft_calloc(n_voters, sizeof *r->candidate_state.votes);
    if (r->candidate_state.votes == NULL) {
        return RAFT_NOMEM;
    }
    /* Disrupting an existing leader skips the pre-vote phase. */
    r->candidate_state.disrupt_leader = disrupt_leader;
    r->candidate_state.in_pre_vote = disrupt_leader ? false : r->pre_vote;
    /* Fast-forward to leader if we're the only voting server in the
     * configuration. */
    if (n_voters == 1) {
        infof("self elect and convert to leader");
        return convertToLeader(r);
    }
    /* Start a new election round */
    electionStart(r);
    return 0;
}
extern char *__progname;
/* Detect if we're being run as dqlite unit test.
*
* Those tests assume that a barrier is *always* issued when converting to
* leader, and since we can't change those tests we maintain that behavior for
 * compatibility. */
/* Return true when the running executable is the dqlite unit-test binary. */
static bool isDqliteUnitTest(void)
{
    static const char expected[] = "unit-test";
    return strcmp(__progname, expected) == 0;
}
/* Convert from candidate to leader: build the progress array, reset leader
 * state, and — unless we're the only voter — submit a no-op barrier entry to
 * commit an entry of the new term.
 *
 * Errors: RAFT_NOMEM. */
int convertToLeader(struct raft *r)
{
    struct raft_progress *progress;
    size_t n_voters;
    unsigned i;
    int rv;
    assert(r->state == RAFT_CANDIDATE);
    /* Allocate and initialize the progress array. */
    progress = progressBuildArray(r);
    if (progress == NULL) {
        rv = RAFT_NOMEM;
        goto err;
    }
    n_voters = configurationVoterCount(&r->configuration);
    assert(n_voters > 0);
    /* Copy features and capacity information gathered from the votes into
     * the progress slots of the corresponding voters. */
    for (i = 0; i < n_voters; i++) {
        unsigned j;
        j = configurationActualIndexOfVoter(&r->configuration, i);
        progress[j].features = r->candidate_state.votes[i].features;
        progress[j].capacity = r->candidate_state.votes[i].capacity;
    }
    /* Note: the votes array must be read above before it is freed here. */
    convertClearCandidate(r);
    convertSetState(r, RAFT_LEADER);
    r->leader_state.progress = progress;
    /* Reset timers */
    r->election_timer_start = r->now;
    r->update->flags |= RAFT_UPDATE_TIMEOUT;
    /* Reset promotion state. */
    r->leader_state.promotee_id = 0;
    r->leader_state.round_number = 0;
    r->leader_state.round_index = 0;
    r->leader_state.round_start = 0;
    /* Reset leadership transfer. */
    r->leader_state.transferee = 0;
    r->leader_state.transferring = false;
    /* If there is only one voter, by definition all entries until the
     * last_stored can be considered committed (and the voter must be us, since
     * no one else could have become leader).
     *
     * Otherwise, if we have some entries in the log that are past our current
     * commit index, they must be from previous terms and we immediately append
     * a barrier entry, in order to finalize any pending transaction in the user
     * state machine or any pending configuration change. */
    if (n_voters == 1) {
        assert(configurationIndexOfVoter(&r->configuration, r->id) == 0);
        if (r->last_stored > r->commit_index) {
            r->commit_index = r->last_stored;
            r->update->flags |= RAFT_UPDATE_COMMIT_INDEX;
        }
    } else if (TrailLastIndex(&r->trail) > r->commit_index ||
               isDqliteUnitTest()) {
        /* Raft Dissertation, paragraph 6.4:
         *
         * The Leader Completeness Property guarantees that a leader has all
         * committed entries, but at the start of its term, it may not know
         * which those are. To find out, it needs to commit an entry from its
         * term. Raft handles this by having each leader commit a blank no-op
         * entry into the log at the start of its term. */
        r->barrier.type = RAFT_BARRIER;
        r->barrier.term = r->current_term;
        r->barrier.buf.len = 8;
        r->barrier.buf.base = raft_malloc(r->barrier.buf.len);
        if (r->barrier.buf.base == NULL) {
            rv = RAFT_NOMEM;
            goto err;
        }
        *(uint64_t *)r->barrier.buf.base = 0;
        r->barrier.batch = r->barrier.buf.base;
        rv = ClientSubmit(r, &r->barrier, 1);
        if (rv != 0) {
            /* This call to ClientSubmit can only fail with RAFT_NOMEM, because
             * it's not a RAFT_CHANGE entry (RAFT_MALFORMED can't be returned)
             * and we're leader (RAFT_NOTLEADER can't be returned) */
            assert(rv == RAFT_NOMEM);
            infof("can't submit no-op after converting to leader: %s",
                  raft_strerror(rv));
            raft_free(r->barrier.buf.base);
            goto err;
        }
    }
    return 0;
err:
    assert(rv == RAFT_NOMEM);
    return rv;
}
#undef infof
#undef tracef
raft-0.22.1/src/convert.h 0000664 0000000 0000000 00000003451 14601504142 0015150 0 ustar 00root root 0000000 0000000 /* Convert from one state to another. */
#ifndef CONVERT_H_
#define CONVERT_H_
#include "../include/raft.h"
/* Convert from unavailable, or candidate or leader to follower.
*
* From Figure 3.1:
*
* If election timeout elapses without receiving AppendEntries RPC from
* current leader or granting vote to candidate: convert to candidate.
*
* The above implies that we need to reset the election timer when converting to
* follower. */
void convertToFollower(struct raft *r);
/* Convert from follower to candidate, starting a new election.
*
* From Figure 3.1:
*
* On conversion to candidate, start election
*
* If the disrupt_leader flag is true, the server will set the disrupt leader
* flag of the RequestVote messages it sends.
*
* Errors:
*
* RAFT_NOMEM
* Memory for the votes array could not be allocated.
*/
int convertToCandidate(struct raft *r, bool disrupt_leader);
/* Convert from candidate to leader.
*
* From Figure 3.1:
*
* Upon election: send initial empty AppendEntries RPC (heartbeat) to each
* server.
*
* From Section 3.4:
*
* Once a candidate wins an election, it becomes leader. It then sends
* heartbeat messages to all of the other servers to establish its authority
* and prevent new elections.
*
* From Section 3.3:
*
* The leader maintains a nextIndex for each follower, which is the index
* of the next log entry the leader will send to that follower. When a
* leader first comes to power, it initializes all nextIndex values to the
* index just after the last one in its log.
* Errors:
*
* RAFT_NOMEM
* Memory for the progress array or for the initial no-op entry could
* not be allocated.
*/
int convertToLeader(struct raft *r);
/* Clear the current state */
void convertClear(struct raft *r);
#endif /* CONVERT_H_ */
raft-0.22.1/src/election.c 0000664 0000000 0000000 00000025635 14601504142 0015275 0 ustar 00root root 0000000 0000000 #include "election.h"
#include "assert.h"
#include "configuration.h"
#include "heap.h"
#include "message.h"
#include "random.h"
#include "tracing.h"
#include "trail.h"
#define infof(...) Infof(r->tracer, " " __VA_ARGS__)
/* Common fields between follower and candidate state.
*
* The follower_state and candidate_state structs in raft.h must be kept
* consistent with this definition. */
struct followerOrCandidateState
{
    /* Election timeout in milliseconds, re-randomized at every timer reset
     * to a value in [election_timeout, 2 * election_timeout]. */
    unsigned randomized_election_timeout;
};
/* Return a pointer to either the follower or candidate state. */
/* Return a pointer to the state fields shared between follower and
 * candidate.
 *
 * Both follower_state and candidate_state start with the common fields
 * declared in struct followerOrCandidateState (see the comment above that
 * struct), so either may be viewed through that type. */
struct followerOrCandidateState *getFollowerOrCandidateState(
    const struct raft *r)
{
    assert(r->state == RAFT_FOLLOWER || r->state == RAFT_CANDIDATE);
    return r->state == RAFT_FOLLOWER
               ? (struct followerOrCandidateState *)&r->follower_state
               : (struct followerOrCandidateState *)&r->candidate_state;
}
/* Pick a fresh randomized election timeout in [election_timeout,
 * 2 * election_timeout] and store it in the current (follower or candidate)
 * state. */
void electionUpdateRandomizedTimeout(struct raft *r)
{
    struct followerOrCandidateState *state = getFollowerOrCandidateState(r);
    unsigned randomized = RandomWithinRange(&r->random, r->election_timeout,
                                           2 * r->election_timeout);
    assert(randomized >= r->election_timeout);
    assert(randomized <= r->election_timeout * 2);
    state->randomized_election_timeout = randomized;
}
/* Restart the election timer: re-randomize the timeout, start counting from
 * now, and flag the timeout change in the update. */
void electionResetTimer(struct raft *r)
{
    electionUpdateRandomizedTimeout(r);
    r->update->flags |= RAFT_UPDATE_TIMEOUT;
    r->election_timer_start = r->now;
}
raft_time electionTimerExpiration(const struct raft *r)
{
struct followerOrCandidateState *state;
assert(r->state == RAFT_FOLLOWER || r->state == RAFT_CANDIDATE);
state = getFollowerOrCandidateState(r);
return r->election_timer_start + state->randomized_election_timeout;
}
/* Enqueue a RequestVote RPC to the given server.
 *
 * Errors: RAFT_NOMEM. */
static int electionSend(struct raft *r, const struct raft_server *server)
{
    struct raft_message message;
    raft_term term;
    int rv;
    assert(server->id != r->id);
    assert(server->id != 0);
    /* If we are in the pre-vote phase, we indicate our future term in the
     * request. */
    term = r->current_term;
    if (r->candidate_state.in_pre_vote) {
        term++;
    }
    /* Fill the RequestVote message.
     *
     * Note that we set last_log_index and last_log_term to the index and term
     * of the last persisted entry, not to the last entry in our in-memory log
     * cache, because we must advertise only log entries that can't be lost at
     * restart.
     *
     * Also note that, for a similar reason, we apply pending configuration
     * changes only once they are persisted. When running an election we then
     * use only persisted information, which is safe (while using unpersisted
     * information for the log and persisted information for the configuration
     * or viceversa would lead to inconsistencies and violations of Raft
     * invariants).
     */
    message.type = RAFT_REQUEST_VOTE;
    message.request_vote.version = MESSAGE__REQUEST_VOTE_VERSION;
    message.request_vote.term = term;
    message.request_vote.candidate_id = r->id;
    message.request_vote.last_log_index = r->last_stored;
    message.request_vote.last_log_term = TrailTermOf(&r->trail, r->last_stored);
    message.request_vote.disrupt_leader = r->candidate_state.disrupt_leader;
    message.request_vote.pre_vote = r->candidate_state.in_pre_vote;
    message.server_id = server->id;
    message.server_address = server->address;
    /* The message is enqueued for delivery; failure leaves no side effects. */
    rv = MessageEnqueue(r, &message);
    if (rv != 0) {
        assert(rv == RAFT_NOMEM);
        return rv;
    }
    return 0;
}
/* Start a new election round: unless in the pre-vote phase, increment the term
 * and vote for ourselves; then reset the election timer, initialize the votes
 * array and send RequestVote RPCs to every other voter.
 *
 * Must be called in candidate state. */
void electionStart(struct raft *r)
{
    size_t n_voters;
    size_t voting_index;
    size_t i;
    assert(r->state == RAFT_CANDIDATE);
    n_voters = configurationVoterCount(&r->configuration);
    voting_index = configurationIndexOfVoter(&r->configuration, r->id);
    /* This function should not be invoked if we are not a voting server, hence
     * voting_index must be lower than the number of servers in the
     * configuration (meaning that we are a voting server). */
    assert(voting_index < r->configuration.n);
    /* Coherence check that configurationVoterCount and
     * configurationIndexOfVoter have returned something that makes sense. */
    assert(n_voters <= r->configuration.n);
    assert(voting_index < n_voters);
    /* During pre-vote we don't increment our term, or reset our vote. Resetting
     * our vote could lead to double-voting if we were to receive a RequestVote
     * RPC during our Candidate state, while we actually already voted for a
     * server during the term. */
    if (!r->candidate_state.in_pre_vote) {
        /* Increment current term and vote for self */
        r->current_term += 1;
        r->voted_for = r->id;
        /* Mark both the current term and vote as changed. */
        r->update->flags |= RAFT_UPDATE_CURRENT_TERM | RAFT_UPDATE_VOTED_FOR;
    }
    /* Reset election timer. */
    electionResetTimer(r);
    assert(r->candidate_state.votes != NULL);
    /* Initialize the votes array and send vote requests. */
    for (i = 0; i < n_voters; i++) {
        if (i == voting_index) {
            /* Grant our own vote immediately, advertising our capacity. */
            r->candidate_state.votes[i].grant = true; /* Vote for self */
            r->candidate_state.votes[i].features = MESSAGE__FEATURE_CAPACITY;
            r->candidate_state.votes[i].capacity = r->capacity;
        } else {
            r->candidate_state.votes[i].grant = false;
            r->candidate_state.votes[i].features = 0;
            r->candidate_state.votes[i].capacity = 0;
        }
    }
    /* Request a vote from every other voting server. */
    for (i = 0; i < r->configuration.n; i++) {
        const struct raft_server *server = &r->configuration.servers[i];
        int rv;
        if (server->id == r->id || server->role != RAFT_VOTER) {
            continue;
        }
        rv = electionSend(r, server);
        if (rv != 0) {
            /* This is not a critical failure, let's just log it. */
            assert(rv == RAFT_NOMEM);
            infof("can't send vote request to server %llu: %s", server->id,
                  raft_strerror(rv));
        }
    }
}
/* Decide whether our vote (or pre-vote) should be granted to the requesting
 * candidate, applying the log up-to-date-ness rules of Figure 3.1 and the
 * Pre-Vote rules of Section 9.6. The decision is stored through @granted;
 * when a real (non-pre) vote is granted, voted_for is updated and the
 * election timer is reset. */
void electionVote(struct raft *r,
                  const struct raft_request_vote *args,
                  bool *granted)
{
    const struct raft_server *local_server;
    const char *grant_text;
    const char *deny_text;
    raft_index local_last_index;
    raft_term local_last_term;
    assert(r != NULL);
    assert(args != NULL);
    assert(granted != NULL);
    local_server = configurationGet(&r->configuration, r->id);
    *granted = false;
    /* Pick the log wording matching the kind of vote requested. */
    if (args->pre_vote) {
        grant_text = "pre-vote ok";
        deny_text = "deny pre-vote";
    } else {
        grant_text = "grant vote";
        deny_text = "don't grant vote";
    }
    /* Only voting servers may cast votes. */
    if (local_server == NULL || local_server->role != RAFT_VOTER) {
        infof("local server is not voting -> %s", deny_text);
        return;
    }
    /* A real vote may be cast at most once per term. */
    if (!args->pre_vote && r->voted_for != 0 &&
        r->voted_for != args->candidate_id) {
        infof("already voted for server %llu -> %s", r->voted_for, deny_text);
        return;
    }
    /* From Section 9.6:
     *
     *   In the Pre-Vote algorithm, a candidate only increments its term if it
     *   first learns from a majority of the cluster that they would be willing
     *   to grant the candidate their votes (if the candidate's log is
     *   sufficiently up-to-date, and the voters have not received heartbeats
     *   from a valid leader for at least a baseline election timeout).
     *
     * Arriving here means that in a pre-vote phase, we will cast our vote if
     * the candidate's log is sufficiently up-to-date, no matter what the
     * candidate's term is. We have already checked if we currently have a
     * leader upon reception of the RequestVote RPC, meaning the 2 conditions
     * will be satisfied if the candidate's log is up-to-date. */
    local_last_index = TrailLastIndex(&r->trail);
    /* Our log is definitely not more up-to-date if it's empty! */
    if (local_last_index == 0) {
        infof("local log is empty -> %s", grant_text);
        goto grant_vote;
    }
    local_last_term = TrailLastTerm(&r->trail);
    /* If the term of the last entry of the requesting server's log is lower
     * than the term of the last entry of our log, then our log is more
     * up-to-date and we don't grant the vote. */
    if (args->last_log_term < local_last_term) {
        infof("remote log older (%llu^%llu vs %llu^%llu) -> %s",
              args->last_log_index, args->last_log_term, local_last_index,
              local_last_term, deny_text);
        return;
    }
    /* If the term of the last entry of our log is lower than the term of the
     * last entry of the requesting server's log, then the requesting server's
     * log is more up-to-date and we grant our vote. */
    if (local_last_term < args->last_log_term) {
        infof("remote log is more recent (%llu^%llu vs %llu^%llu) -> %s",
              args->last_log_index, args->last_log_term, local_last_index,
              local_last_term, grant_text);
        goto grant_vote;
    }
    /* The term of the last log entry is the same, so let's compare the length
     * of the log. */
    assert(args->last_log_term == local_last_term);
    if (local_last_index <= args->last_log_index) {
        /* Our log is shorter or equal to the one of the requester. */
        if (local_last_index == args->last_log_index) {
            infof("remote log is equal (%llu^%llu) -> %s", args->last_log_index,
                  args->last_log_term, grant_text);
        } else {
            assert(local_last_index < args->last_log_index);
            infof("remote log is longer (%llu^%llu vs %llu^%llu) -> %s",
                  args->last_log_index, args->last_log_term, local_last_index,
                  local_last_term, grant_text);
        }
        goto grant_vote;
    }
    infof("remote log shorter (%llu^%llu vs %llu^%llu) -> %s",
          args->last_log_index, args->last_log_term, local_last_index,
          local_last_term, deny_text);
    return;
grant_vote:
    /* Only a real vote mutates persistent state; a pre-vote is side-effect
     * free. */
    if (!args->pre_vote) {
        /* Mark the vote as changed. */
        r->update->flags |= RAFT_UPDATE_CURRENT_TERM | RAFT_UPDATE_VOTED_FOR;
        r->voted_for = args->candidate_id;
        /* Reset the election timer. */
        r->election_timer_start = r->now;
        r->update->flags |= RAFT_UPDATE_TIMEOUT;
    }
    *granted = true;
}
/* Record the vote granted by the voter at @voter_index, then recount every
 * granted vote. Returns true when a strict majority of the voters has granted
 * its vote (i.e. the election is won). The total number of granted votes and
 * of voters is reported through the output parameters. */
bool electionTally(struct raft *r,
                   const size_t voter_index,
                   unsigned *votes,
                   unsigned *n_voters)
{
    unsigned granted = 0;
    size_t i;

    assert(r->state == RAFT_CANDIDATE);
    assert(r->candidate_state.votes != NULL);

    *n_voters = configurationVoterCount(&r->configuration);

    /* Register the newly received vote. */
    r->candidate_state.votes[voter_index].grant = true;

    for (i = 0; i < *n_voters; i++) {
        if (r->candidate_state.votes[i].grant) {
            granted++;
        }
    }
    *votes = granted;

    /* A majority is strictly more than half of the voters. */
    return granted > *n_voters / 2;
}
#undef infof
raft-0.22.1/src/election.h 0000664 0000000 0000000 00000006634 14601504142 0015300 0 ustar 00root root 0000000 0000000 /* Election-related logic and helpers. */
#ifndef ELECTION_H_
#define ELECTION_H_
#include "../include/raft.h"
/* This function must be called after the election timeout value has been
 * changed and the server is in follower or candidate state. It generates a new
 * value of the randomized election timeout. */
void electionUpdateRandomizedTimeout(struct raft *r);
/* Reset the election_timer clock and set randomized_election_timeout to a
 * random value between election_timeout and 2 * election_timeout.
 *
 * From Section 3.4:
 *
 *   Raft uses randomized election timeouts to ensure that split votes are rare
 *   and that they are resolved quickly. To prevent split votes in the first
 *   place, election timeouts are chosen randomly from a fixed interval (e.g.,
 *   150-300 ms). This spreads out the servers so that in most cases only a
 *   single server will time out.
 *
 * From Section 9.4:
 *
 *   We used AvailSim to approximate a WAN spanning the continental US. Each
 *   message was assigned a latency chosen randomly from the uniform range of
 *   30-40 ms, and the servers' election timeout range was set accordingly to
 *   300-600 ms (about 10-20 times the one-way network latency). When only one
 *   of the five servers has failed, the average election completes within about
 *   475 ms, and 99.9% of elections complete within 1.5 s. Even when two of the
 *   five servers have failed, the average election takes about 650 ms (about 20
 *   times the one-way network latency), and 99.9% of elections complete in 3
 *   s. We believe these election times are more than adequate for most WAN
 *   deployments.
 *
 * Must be called in follower or candidate state. */
void electionResetTimer(struct raft *r);
/* Return the time at which the election timer will expire next.
 *
 * Must be called in follower or candidate state. */
raft_time electionTimerExpiration(const struct raft *r);
/* Start a new election round.
 *
 * From Figure 3.1:
 *
 *   [Rules for Servers] Candidates: On conversion to candidates, start
 *   election:
 *
 *   - Increment current term
 *   - Vote for self
 *   - Reset election timer
 *   - Send RequestVote RPCs to all other servers
 *
 * From Section 3.4:
 *
 *   To begin an election, a follower increments its current term and
 *   transitions to candidate state. It then votes for itself and issues
 *   RequestVote RPCs in parallel to each of the other servers in the
 *   cluster.
 *
 * Must be called in candidate state. */
void electionStart(struct raft *r);
/* Decide whether our vote should be granted to the requesting server and update
 * our state accordingly.
 *
 * From Figure 3.1:
 *
 *   RequestVote RPC: Receiver Implementation:
 *
 *   - If votedFor is null or candidateId, and candidate's log is at least as
 *     up-to-date as receiver's log, grant vote.
 *
 * The outcome of the decision is stored through the @granted pointer. */
void electionVote(struct raft *r,
                  const struct raft_request_vote *args,
                  bool *granted);
/* Update the votes array by adding the vote from the server at the given
 * index. Return true if with this vote the server has reached the majority of
 * votes and won the election.
 *
 * The 'votes' and 'n_voters' output parameters indicate how many votes the
 * server has and how many voters are there. */
bool electionTally(struct raft *r,
                   size_t voter_index,
                   unsigned *votes,
                   unsigned *n_voters);
#endif /* ELECTION_H_ */
raft-0.22.1/src/entry.c 0000664 0000000 0000000 00000003612 14601504142 0014623 0 ustar 00root root 0000000 0000000 #include
#include
#include "assert.h"
#include "entry.h"
/* Release all memory associated with the given entries array, including the
 * array itself. Entries belonging to the same batch share the same batch
 * pointer; each distinct (consecutive) batch is freed exactly once. */
void entryBatchesDestroy(struct raft_entry *entries, const size_t n)
{
    void *last_batch = NULL;
    size_t i;

    if (entries == NULL) {
        assert(n == 0);
        return;
    }
    assert(n > 0);

    for (i = 0; i < n; i++) {
        void *batch = entries[i].batch;
        assert(batch != NULL);
        /* Free a batch only the first time we encounter it. */
        if (batch != last_batch) {
            last_batch = batch;
            raft_free(batch);
        }
    }

    raft_free(entries);
}
/* Create a deep copy of a single log entry, duplicating its data buffer.
 *
 * The copy is standalone: dst->batch is set to NULL and dst->buf.base owns its
 * own allocation (NULL for a zero-length buffer).
 *
 * Returns 0 on success, or RAFT_NOMEM if the buffer allocation fails. */
int entryCopy(const struct raft_entry *src, struct raft_entry *dst)
{
    dst->term = src->term;
    dst->type = src->type;
    dst->buf.len = src->buf.len;
    dst->batch = NULL;

    /* Handle zero-length buffers explicitly: raft_malloc(0) may legally
     * return NULL, and calling memcpy with a NULL pointer is undefined
     * behavior even when the size is 0. */
    if (dst->buf.len == 0) {
        dst->buf.base = NULL;
        return 0;
    }

    dst->buf.base = raft_malloc(dst->buf.len);
    if (dst->buf.base == NULL) {
        return RAFT_NOMEM;
    }
    memcpy(dst->buf.base, src->buf.base, dst->buf.len);

    return 0;
}
/* Create a single batch of entries containing a copy of the given @n entries,
 * including their data. All copied entries point into one shared batch buffer
 * and have their batch field set to it.
 *
 * On success *dst holds the new array (NULL when n is 0) and 0 is returned;
 * RAFT_NOMEM is returned on allocation failure. */
int entryBatchCopy(const struct raft_entry *src,
                   struct raft_entry **dst,
                   const size_t n)
{
    size_t size = 0;
    void *batch;
    uint8_t *cursor;
    size_t i;

    if (n == 0) {
        *dst = NULL;
        return 0;
    }

    /* Calculate the total size of the entries content and allocate the
     * batch. */
    for (i = 0; i < n; i++) {
        size += src[i].buf.len;
    }

    /* Allocate at least one byte: raft_malloc(0) may legally return NULL,
     * which would both be indistinguishable from an allocation failure and
     * break the invariant that every entry's batch pointer is non-NULL
     * (entryBatchesDestroy asserts on it). */
    batch = raft_malloc(size > 0 ? size : 1);
    if (batch == NULL) {
        return RAFT_NOMEM;
    }

    /* Copy the entries. */
    *dst = raft_malloc(n * sizeof **dst);
    if (*dst == NULL) {
        raft_free(batch);
        return RAFT_NOMEM;
    }

    cursor = batch;
    for (i = 0; i < n; i++) {
        (*dst)[i].term = src[i].term;
        (*dst)[i].type = src[i].type;
        (*dst)[i].buf.base = cursor;
        (*dst)[i].buf.len = src[i].buf.len;
        (*dst)[i].batch = batch;
        /* Skip memcpy for empty buffers: passing a NULL source pointer to
         * memcpy is undefined behavior even with a size of 0. */
        if (src[i].buf.len > 0) {
            memcpy((*dst)[i].buf.base, src[i].buf.base, src[i].buf.len);
        }
        cursor += src[i].buf.len;
    }

    return 0;
}
raft-0.22.1/src/entry.h 0000664 0000000 0000000 00000001217 14601504142 0014627 0 ustar 00root root 0000000 0000000 #ifndef ENTRY_H_
#define ENTRY_H_
#include "../include/raft.h"
/* Release all memory associated with the given entries, including the array
 * itself. The entries are supposed to belong to one or more batches. */
void entryBatchesDestroy(struct raft_entry *entries, size_t n);
/* Create a copy of a log entry, including its data. Returns 0 on success or
 * RAFT_NOMEM on allocation failure. */
int entryCopy(const struct raft_entry *src, struct raft_entry *dst);
/* Create a single batch of entries containing a copy of the given entries,
 * including their data. Returns 0 on success or RAFT_NOMEM on allocation
 * failure. */
int entryBatchCopy(const struct raft_entry *src,
                   struct raft_entry **dst,
                   size_t n);
#endif /* ENTRY_H_ */
raft-0.22.1/src/err.c 0000664 0000000 0000000 00000003445 14601504142 0014256 0 ustar 00root root 0000000 0000000 #include "err.h"
#include
#include "../include/raft.h"
#include "assert.h"
/* Separator inserted between the wrapping prefix and the wrapped message. */
#define WRAP_SEP ": "
#define WRAP_SEP_LEN ((size_t)strlen(WRAP_SEP))
/* Prefix the error message currently stored in @e (a buffer of
 * RAFT_ERRMSG_BUF_SIZE bytes) with @format followed by ": ", shifting the
 * existing message to the right. The result is truncated as needed to fit the
 * buffer; if the prefix alone (nearly) fills it, only the prefix is kept. */
void errMsgWrap(char *e, const char *format)
{
    size_t n = RAFT_ERRMSG_BUF_SIZE;
    size_t prefix_n;
    size_t prefix_and_sep_n;
    size_t trail_n;
    size_t i;
    /* Calculate the length of the prefix. */
    prefix_n = strlen(format);
    /* If there isn't enough space for the ": " separator and at least one
     * character of the wrapped error message, then just print the prefix. */
    if (prefix_n >= n - (WRAP_SEP_LEN + 1)) {
        /* We explicitly allow truncation here + silence clang about unknown
         * warning-group "-Wformat-truncation" */
#ifdef __GNUC__
#ifndef __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wformat-truncation"
#endif
#endif
        ErrMsgPrintf(e, "%s", format);
#ifdef __GNUC__
#ifndef __clang__
#pragma GCC diagnostic pop
#endif
#endif
        return;
    }
    /* Right-shift the wrapped message, to make room for the prefix. The
     * strnlen bound guarantees the shifted text plus terminator still fits. */
    prefix_and_sep_n = prefix_n + WRAP_SEP_LEN;
    trail_n = strnlen(e, n - prefix_and_sep_n - 1);
    memmove(e + prefix_and_sep_n, e, trail_n);
    e[prefix_and_sep_n + trail_n] = 0;
    /* Print the prefix. Note that this writes a NUL byte right where the
     * separator goes; the loop below overwrites it. */
    ErrMsgPrintf(e, "%s", format);
    /* Print the separator.
     *
     * Avoid using strncpy(e->msg + prefix_n, WRAP_SEP, WRAP_SEP_LEN) since it
     * generates a warning. */
    for (i = 0; i < WRAP_SEP_LEN; i++) {
        e[prefix_n + i] = WRAP_SEP[i];
    }
}
/* Expand one (code, message) pair of ERR_CODE_TO_STRING_MAP into a switch
 * case. */
#define ERR_CODE_TO_STRING_CASE(CODE, MSG) \
    case CODE:                             \
        return MSG;
/* Convert a numeric raft error code to a human-readable error message.
 * Unknown codes map to "unknown error". */
const char *errCodeToString(int code)
{
    switch (code) {
        ERR_CODE_TO_STRING_MAP(ERR_CODE_TO_STRING_CASE);
        default:
            return "unknown error";
    }
}
raft-0.22.1/src/err.h 0000664 0000000 0000000 00000006120 14601504142 0014254 0 ustar 00root root 0000000 0000000 /* Utilities around error handling. */
#ifndef ERROR_H_
#define ERROR_H_
#include
#include
/* X-macro table mapping raft error codes to their static messages. */
#define ERR_CODE_TO_STRING_MAP(X)                                        \
    X(RAFT_NOMEM, "out of memory")                                       \
    X(RAFT_BADID, "server ID is not valid")                              \
    X(RAFT_DUPLICATEID, "server ID already in use")                      \
    X(RAFT_DUPLICATEADDRESS, "server address already in use")            \
    X(RAFT_BADROLE, "server role is not valid")                          \
    X(RAFT_MALFORMED, "encoded data is malformed")                       \
    X(RAFT_NOTLEADER, "server is not the leader")                        \
    X(RAFT_LEADERSHIPLOST, "server has lost leadership")                 \
    X(RAFT_SHUTDOWN, "server is shutting down")                          \
    X(RAFT_CANTBOOTSTRAP, "bootstrap only works on new clusters")        \
    X(RAFT_CANTCHANGE, "a configuration change is already in progress")  \
    X(RAFT_CORRUPT, "persisted data is corrupted")                       \
    X(RAFT_CANCELED, "operation canceled")                               \
    X(RAFT_NAMETOOLONG, "resource name too long")                        \
    X(RAFT_TOOBIG, "data is too big")                                    \
    X(RAFT_NOCONNECTION, "no connection to remote server available")     \
    X(RAFT_BUSY, "operation can't be performed at this time")            \
    X(RAFT_IOERR, "I/O error")                                           \
    X(RAFT_NOTFOUND, "Resource not found")                               \
    X(RAFT_INVALID, "Invalid parameter")                                 \
    X(RAFT_UNAUTHORIZED, "No access to resource")                        \
    X(RAFT_NOSPACE, "Not enough disk space")                             \
    X(RAFT_TOOMANY, "System or raft limit met or exceeded")
/* Format an error message. */
#define ErrMsgPrintf(ERRMSG, ...) \
    snprintf(ERRMSG, RAFT_ERRMSG_BUF_SIZE, __VA_ARGS__)
/* Wrap the given error message with an additional prefix message. */
#define ErrMsgWrapf(ERRMSG, ...)                    \
    do {                                            \
        char _errmsg[RAFT_ERRMSG_BUF_SIZE];         \
        ErrMsgPrintf(_errmsg, __VA_ARGS__);         \
        errMsgWrap(ERRMSG, _errmsg);                \
    } while (0)
void errMsgWrap(char *e, const char *format);
/* Transfer an error message from an object to another, wrapping it.
 *
 * NOTE: these two macros expand to two statements (no do/while wrapper), so
 * they must not be used as the un-braced body of an if/else or loop. */
#define ErrMsgTransfer(ERRMSG1, ERRMSG2, FORMAT)    \
    memcpy(ERRMSG2, ERRMSG1, RAFT_ERRMSG_BUF_SIZE); \
    ErrMsgWrapf(ERRMSG2, FORMAT)
#define ErrMsgTransferf(ERRMSG1, ERRMSG2, FORMAT, ...) \
    memcpy(ERRMSG2, ERRMSG1, RAFT_ERRMSG_BUF_SIZE);    \
    ErrMsgWrapf(ERRMSG2, FORMAT, __VA_ARGS__)
/* Use the static error message for the error with the given code. */
#define ErrMsgFromCode(ERRMSG, CODE) \
    ErrMsgPrintf(ERRMSG, "%s", errCodeToString(CODE))
/* Format the out of memory error message. */
#define ErrMsgOom(ERRMSG) ErrMsgFromCode(ERRMSG, RAFT_NOMEM)
/* Convert a numeric raft error code to a human-readable error message. */
const char *errCodeToString(int code);
#endif /* ERROR_H_ */
raft-0.22.1/src/fixture.c 0000664 0000000 0000000 00000155512 14601504142 0015157 0 ustar 00root root 0000000 0000000 #include "../include/raft/fixture.h"
#include
#include
#include
#include
#include "assert.h"
#include "configuration.h"
#include "convert.h"
#include "entry.h"
#include "legacy.h"
#include "log.h"
#include "queue.h"
#include "random.h"
#include "snapshot.h"
#include "tracing.h"
#define tracef(...) Tracef(r->tracer, __VA_ARGS__)
/* Defaults (all durations are in milliseconds). */
#define HEARTBEAT_TIMEOUT 100
#define INSTALL_SNAPSHOT_TIMEOUT 30000
#define ELECTION_TIMEOUT 1000
#define NETWORK_LATENCY 15
#define DISK_LATENCY 10
#define WORK_DURATION 200
/* To keep in sync with raft.h */
#define N_MESSAGE_TYPES 6
/* Maximum number of peer stub instances connected to a certain stub
 * instance. This should be enough for testing purposes. */
#define MAX_PEERS 8
/* State of a single server of the cluster fixture. */
struct raft_fixture_server
{
    bool alive;                /* If false, the server is down. */
    raft_id id;                /* Server ID. */
    char address[16];          /* Server address (stringified ID). */
    struct raft_tracer tracer; /* Tracer. */
    struct raft_io io;         /* In-memory raft_io implementation. */
    struct raft raft;          /* Raft instance. */
};
/* An event that occurred on one of the fixture servers. */
struct raft_fixture_event
{
    unsigned server_index; /* Index of the server the event occurred on. */
    int type;              /* Type of the event. */
};
/* Return the type of the given fixture event. */
RAFT_API int raft_fixture_event_type(struct raft_fixture_event *event)
{
    assert(event != NULL);
    return event->type;
}
/* Return the index of the fixture server the given event occurred on. */
RAFT_API unsigned raft_fixture_event_server_index(
    struct raft_fixture_event *event)
{
    assert(event != NULL);
    return event->server_index;
}
/* Fields common across all request types. */
#define REQUEST                                                            \
    int type;                  /* Request code type. */                    \
    raft_time completion_time; /* When the request should be fulfilled. */ \
    queue queue                /* Link the I/O pending requests queue. */
/* Request type codes. */
enum { APPEND = 1, SEND, TRANSMIT, SNAPSHOT_PUT, SNAPSHOT_GET, ASYNC_WORK };
/* Abstract base type for an asynchronous request submitted to the stub I/O
 * implementation. */
struct ioRequest
{
    REQUEST;
};
/* Pending request to append entries to the log. */
struct append
{
    REQUEST;
    struct raft_io_append *req;
    const struct raft_entry *entries;
    unsigned n;
    unsigned start; /* Request timestamp. */
};
/* Pending request to send a message. */
struct send
{
    REQUEST;
    struct raft_io_send *req;
    struct raft_message message;
};
/* Pending request to store a snapshot. */
struct snapshot_put
{
    REQUEST;
    unsigned trailing;
    struct raft_io_snapshot_put *req;
    const struct raft_snapshot *snapshot;
};
/* Pending request to load a snapshot. */
struct snapshot_get
{
    REQUEST;
    struct raft_io_snapshot_get *req;
};
/* Message that has been written to the network and is waiting to be delivered
 * (or discarded). */
struct transmit
{
    REQUEST;
    struct raft_message message; /* Message to deliver */
    int timer;                   /* Deliver after this n of msecs. */
};
/* Information about a peer server. */
struct peer
{
    struct io *io;  /* The peer's I/O backend. */
    bool connected; /* Whether a connection is established. */
    bool saturated; /* Whether the established connection is saturated. */
};
/* Stub I/O implementation implementing all operations in-memory. */
struct io
{
    struct raft_io *io;  /* I/O object we're implementing. */
    unsigned index;      /* Fixture server index. */
    raft_time *time;     /* Global cluster time. */
    raft_time next_tick; /* Time the next tick should occur. */
    /* Term and vote */
    raft_term term;
    raft_id voted_for;
    /* Log */
    struct raft_snapshot *snapshot; /* Latest snapshot */
    struct raft_entry *entries;     /* Array of persisted entries */
    size_t n;                       /* Size of the persisted entries array */
    raft_index start_index;
    /* Parameters passed via raft_io->init and raft_io->start */
    raft_id id;
    const char *address;
    unsigned tick_interval;
    raft_io_tick_cb tick_cb;
    raft_io_recv_cb recv_cb;
    /* Queue of pending asynchronous requests, whose callbacks still haven't
     * been fired. */
    queue requests;
    /* Peers connected to us. */
    struct peer peers[MAX_PEERS];
    unsigned n_peers;
    /* The randomized_election_timeout field stores the value that the raft
     * instance will obtain the next time it calls RandomWithinRange() to obtain
     * a random number in the [election_timeout, election_timeout * 2] range. We
     * do that by passing raft_seed() a value that makes the pseudo-random
     * number generator produce exactly randomized_election_timeout. That value
     * is what we store in the seed field below. Since calculating the seed that
     * matches the desired randomized_election_timeout is somehow expensive, we
     * also use randomized_election_timeout_prev to store the previous value of
     * randomized_election_timeout, in order to re-use the same seed if nothing
     * has changed.
     *
     * See serverSeed for more details. */
    unsigned randomized_election_timeout;
    unsigned randomized_election_timeout_prev;
    unsigned seed;
    unsigned network_latency; /* Milliseconds to deliver RPCs */
    unsigned disk_latency;    /* Milliseconds to perform disk I/O */
    unsigned work_duration;   /* Milliseconds to run async work */
    /* Fault injection state. */
    struct
    {
        int countdown; /* Trigger the fault when this counter gets to zero. */
        int n;         /* Repeat the fault this many times. Default is -1. */
    } fault;
    /* If flag i is true, messages of type i will be silently dropped. */
    bool drop[N_MESSAGE_TYPES];
    /* Counters of events that happened so far. */
    unsigned n_send[N_MESSAGE_TYPES];
    unsigned n_recv[N_MESSAGE_TYPES];
    unsigned n_append;
};
/* Advance the fault-injection counters and return true if this I/O operation
 * should fail. A negative countdown disables faults; once the countdown hits
 * zero, the fault fires 'n' times (forever when n is negative), after which
 * injection is disabled again. */
static bool ioFaultTick(struct io *io)
{
    /* Faults are disabled. */
    if (io->fault.countdown < 0) {
        return false;
    }

    /* Still counting down: no fault yet. */
    if (io->fault.countdown > 0) {
        io->fault.countdown--;
        return false;
    }
    assert(io->fault.countdown == 0);

    /* Negative n: keep faulting forever. */
    if (io->fault.n < 0) {
        return true;
    }

    /* Positive n: fault this time and decrement the repeat count. */
    if (io->fault.n > 0) {
        io->fault.n--;
        return true;
    }
    assert(io->fault.n == 0);

    /* The fault already fired 'n' times: disable further injection by making
     * the countdown negative. */
    io->fault.countdown--;
    return false;
}
/* raft_io->init implementation: record the server ID and address. */
static int ioMethodInit(struct raft_io *raft_io,
                        raft_id id,
                        const char *address)
{
    struct io *io = raft_io->impl;
    io->id = id;
    io->address = address;
    return 0;
}
/* raft_io->start implementation: store the tick interval and the tick/recv
 * callbacks, and schedule the first tick. Fails with RAFT_IOERR under fault
 * injection. */
static int ioMethodStart(struct raft_io *raft_io,
                         unsigned msecs,
                         raft_io_tick_cb tick_cb,
                         raft_io_recv_cb recv_cb)
{
    struct io *io = raft_io->impl;
    if (ioFaultTick(io)) {
        return RAFT_IOERR;
    }
    io->tick_interval = msecs;
    io->tick_cb = tick_cb;
    io->recv_cb = recv_cb;
    io->next_tick = *io->time + io->tick_interval;
    return 0;
}
/* Flush an append entries request, appending deep copies of its entries to the
 * local in-memory log and then firing the request callback (with RAFT_IOERR
 * under fault injection). The request object is freed in all cases. */
static void ioFlushAppend(struct io *s, struct append *append)
{
    struct raft_entry *entries;
    unsigned i;
    int status = 0;
    /* Simulates a disk write failure. */
    if (ioFaultTick(s)) {
        status = RAFT_IOERR;
        goto done;
    }
    /* Allocate an array for the old entries plus the new ones. */
    entries = raft_realloc(s->entries, (s->n + append->n) * sizeof *s->entries);
    /* Allocation failures abort the test fixture rather than being reported. */
    assert(entries != NULL);
    /* Copy new entries into the new array. */
    for (i = 0; i < append->n; i++) {
        const struct raft_entry *src = &append->entries[i];
        struct raft_entry *dst = &entries[s->n + i];
        int rv = entryCopy(src, dst);
        assert(rv == 0);
    }
    s->entries = entries;
    s->n += append->n;
done:
    if (append->req->cb != NULL) {
        append->req->cb(append->req, status);
    }
    raft_free(append);
}
/* Flush a snapshot put request, copying the snapshot data over any previously
 * stored snapshot and firing the request callback. */
static void ioFlushSnapshotPut(struct io *s, struct snapshot_put *r)
{
    int rv;
    /* Allocate storage on first use, otherwise release the old contents. */
    if (s->snapshot == NULL) {
        s->snapshot = raft_malloc(sizeof *s->snapshot);
        assert(s->snapshot != NULL);
    } else {
        snapshotClose(s->snapshot);
    }
    rv = snapshotCopy(r->snapshot, s->snapshot);
    assert(rv == 0);
    /* With no trailing entries requested, drop the whole log and advance the
     * start index to the snapshot's index. */
    if (r->trailing == 0) {
        rv = s->io->truncate(s->io, 1);
        assert(rv == 0);
        s->start_index = s->snapshot->index;
    }
    if (r->req->cb != NULL) {
        r->req->cb(r->req, 0);
    }
    raft_free(r);
}
/* Flush a snapshot get request, returning to the client a copy of the local
 * snapshot (if any).
 *
 * NOTE(review): s->snapshot is passed unconditionally to snapshotCopy, which
 * suggests a stored snapshot is expected to exist when this is flushed —
 * confirm against callers. */
static void ioFlushSnapshotGet(struct io *s, struct snapshot_get *r)
{
    struct raft_snapshot *snapshot;
    int rv;
    snapshot = raft_malloc(sizeof *snapshot);
    assert(snapshot != NULL);
    rv = snapshotCopy(s->snapshot, snapshot);
    assert(rv == 0);
    r->req->cb(r->req, snapshot, 0);
    raft_free(r);
}
/* Search the peers table for the entry whose backend has the given server ID.
 * Returns NULL when no such peer is registered. */
static struct peer *ioGetPeer(struct io *io, raft_id id)
{
    unsigned i;
    for (i = 0; i < io->n_peers; i++) {
        if (io->peers[i].io->id == id) {
            return &io->peers[i];
        }
    }
    return NULL;
}
/* Copy the dynamically allocated memory of an AppendEntries message: the
 * entries are duplicated into a single fresh batch. */
static void copyAppendEntries(const struct raft_append_entries *src,
                              struct raft_append_entries *dst)
{
    int rv;
    rv = entryBatchCopy(src->entries, &dst->entries, src->n_entries);
    assert(rv == 0);
    dst->n_entries = src->n_entries;
}
/* Copy the dynamically allocated memory of an InstallSnapshot message: the
 * configuration and the snapshot data buffer.
 *
 * Assumes dst->data.len was already set by the caller (ioFlushSend copies the
 * whole message struct by value before invoking this helper). */
static void copyInstallSnapshot(const struct raft_install_snapshot *src,
                                struct raft_install_snapshot *dst)
{
    int rv;
    rv = configurationCopy(&src->conf, &dst->conf);
    assert(rv == 0);
    dst->data.base = raft_malloc(dst->data.len);
    assert(dst->data.base != NULL);
    memcpy(dst->data.base, src->data.base, src->data.len);
}
/* Flush a raft_io_send request, copying the message content into a new struct
 * transmit object (queued for delivery after network_latency) and invoking the
 * user callback. Fails the request with RAFT_NOCONNECTION when the target peer
 * is unknown or disconnected. The send request is freed in all cases. */
static void ioFlushSend(struct io *io, struct send *send)
{
    struct peer *peer;
    struct transmit *transmit;
    struct raft_message *src;
    struct raft_message *dst;
    int status;
    /* If the peer doesn't exist or was disconnected, fail the request. */
    peer = ioGetPeer(io, send->message.server_id);
    if (peer == NULL || !peer->connected) {
        status = RAFT_NOCONNECTION;
        goto out;
    }
    transmit = raft_calloc(1, sizeof *transmit);
    assert(transmit != NULL);
    transmit->type = TRANSMIT;
    transmit->completion_time = *io->time + io->network_latency;
    src = &send->message;
    dst = &transmit->message;
    QUEUE_PUSH(&io->requests, &transmit->queue);
    /* Shallow-copy the message, then deep-copy the dynamically allocated
     * parts of the message types that have any. */
    *dst = *src;
    switch (dst->type) {
        case RAFT_APPEND_ENTRIES:
            /* Make a copy of the entries being sent */
            copyAppendEntries(&src->append_entries, &dst->append_entries);
            break;
        case RAFT_INSTALL_SNAPSHOT:
            copyInstallSnapshot(&src->install_snapshot, &dst->install_snapshot);
            break;
        default:
            break;
    }
    /* tracef("io: flush: %s", describeMessage(&send->message)); */
    io->n_send[send->message.type]++;
    status = 0;
out:
    if (send->req->cb != NULL) {
        send->req->cb(send->req, status);
    }
    raft_free(send);
}
/* Release the memory used by the given message transmit object, including the
 * deep-copied parts of the message it carries. */
static void ioDestroyTransmit(struct transmit *transmit)
{
    struct raft_message *message;
    message = &transmit->message;
    switch (message->type) {
        case RAFT_APPEND_ENTRIES:
            if (message->append_entries.entries != NULL) {
                /* All entries share a single batch buffer (see
                 * copyAppendEntries), so freeing the first entry's batch
                 * releases the data of all of them. */
                raft_free(message->append_entries.entries[0].batch);
                raft_free(message->append_entries.entries);
            }
            break;
        case RAFT_INSTALL_SNAPSHOT:
            raft_configuration_close(&message->install_snapshot.conf);
            raft_free(message->install_snapshot.data.base);
            break;
        default:
            break;
    }
    raft_free(transmit);
}
/* Flush all requests in the queue, dispatching each to its type-specific
 * flush/destroy routine. In-flight TRANSMIT messages are discarded rather
 * than delivered. */
static void ioFlushAll(struct io *io)
{
    while (!QUEUE_IS_EMPTY(&io->requests)) {
        queue *head;
        struct ioRequest *r;
        head = QUEUE_HEAD(&io->requests);
        QUEUE_REMOVE(head);
        r = QUEUE_DATA(head, struct ioRequest, queue);
        switch (r->type) {
            case APPEND:
                ioFlushAppend(io, (struct append *)r);
                break;
            case SEND:
                ioFlushSend(io, (struct send *)r);
                break;
            case TRANSMIT:
                ioDestroyTransmit((struct transmit *)r);
                break;
            case SNAPSHOT_PUT:
                ioFlushSnapshotPut(io, (struct snapshot_put *)r);
                break;
            case SNAPSHOT_GET:
                ioFlushSnapshotGet(io, (struct snapshot_get *)r);
                break;
            default:
                assert(0);
        }
    }
}
/* raft_io->close implementation: nothing to release here, just invoke the
 * close callback immediately. */
static void ioMethodClose(struct raft_io *raft_io, raft_io_close_cb cb)
{
    if (cb != NULL) {
        cb(raft_io);
    }
}
/* raft_io->load implementation: return copies of the persisted term, vote,
 * snapshot and entries. The start index defaults to 1 and is advanced to the
 * snapshot index plus one when a snapshot exists. Fails with RAFT_IOERR under
 * fault injection. */
static int ioMethodLoad(struct raft_io *io,
                        raft_term *term,
                        raft_id *voted_for,
                        struct raft_snapshot **snapshot,
                        raft_index *start_index,
                        struct raft_entry **entries,
                        size_t *n_entries)
{
    struct io *s;
    int rv;
    s = io->impl;
    if (ioFaultTick(s)) {
        return RAFT_IOERR;
    }
    *term = s->term;
    *voted_for = s->voted_for;
    *start_index = 1;
    *n_entries = s->n;
    /* Make a copy of the persisted entries, storing their data into a single
     * batch. */
    rv = entryBatchCopy(s->entries, entries, s->n);
    assert(rv == 0);
    if (s->snapshot != NULL) {
        *snapshot = raft_malloc(sizeof **snapshot);
        assert(*snapshot != NULL);
        rv = snapshotCopy(s->snapshot, *snapshot);
        assert(rv == 0);
        *start_index = (*snapshot)->index + 1;
    } else {
        *snapshot = NULL;
    }
    return 0;
}
/* raft_io->bootstrap implementation: persist an initial log containing just
 * the given configuration as a RAFT_CHANGE entry at term 1. Fails with
 * RAFT_CANTBOOTSTRAP if some state was already persisted, or RAFT_IOERR under
 * fault injection. */
static int ioMethodBootstrap(struct raft_io *raft_io,
                             const struct raft_configuration *conf)
{
    struct io *io = raft_io->impl;
    struct raft_buffer buf;
    struct raft_entry *entries;
    int rv;
    if (ioFaultTick(io)) {
        return RAFT_IOERR;
    }
    /* A non-zero term means this server has state: refuse to bootstrap. */
    if (io->term != 0) {
        return RAFT_CANTBOOTSTRAP;
    }
    assert(io->voted_for == 0);
    assert(io->snapshot == NULL);
    assert(io->entries == NULL);
    assert(io->n == 0);
    /* Encode the given configuration. */
    rv = configurationEncode(conf, &buf);
    if (rv != 0) {
        return rv;
    }
    entries = raft_calloc(1, sizeof *io->entries);
    if (entries == NULL) {
        return RAFT_NOMEM;
    }
    entries[0].term = 1;
    entries[0].type = RAFT_CHANGE;
    entries[0].buf = buf;
    io->term = 1;
    io->voted_for = 0;
    io->snapshot = NULL;
    io->entries = entries;
    io->n = 1;
    return 0;
}
/* raft_io->recover implementation: not supported by the stub, always fails
 * with RAFT_IOERR. */
static int ioMethodRecover(struct raft_io *io,
                           const struct raft_configuration *conf)
{
    /* TODO: implement this API */
    (void)io;
    (void)conf;
    return RAFT_IOERR;
}
/* raft_io->set_term implementation: persist the new term and reset the vote
 * (a new term implies no vote has been cast in it yet). Fails with RAFT_IOERR
 * under fault injection. */
static int ioMethodSetTerm(struct raft_io *raft_io, const raft_term term)
{
    struct io *io = raft_io->impl;
    if (ioFaultTick(io)) {
        return RAFT_IOERR;
    }
    io->term = term;
    io->voted_for = 0;
    return 0;
}
/* raft_io->set_vote implementation: persist the vote for the given server.
 * Fails with RAFT_IOERR under fault injection. */
static int ioMethodSetVote(struct raft_io *raft_io, const raft_id server_id)
{
    struct io *io = raft_io->impl;
    if (ioFaultTick(io)) {
        return RAFT_IOERR;
    }
    /* tracef("io: set vote: %d %d", server_id, io->index); */
    io->voted_for = server_id;
    return 0;
}
/* raft_io->append implementation: queue an APPEND request that will complete
 * after disk_latency milliseconds. The entries array is referenced, not
 * copied, until the request is flushed. Fails with RAFT_IOERR under fault
 * injection. */
static int ioMethodAppend(struct raft_io *raft_io,
                          struct raft_io_append *req,
                          const struct raft_entry entries[],
                          unsigned n,
                          raft_io_append_cb cb)
{
    struct io *io = raft_io->impl;
    struct append *r;
    if (ioFaultTick(io)) {
        return RAFT_IOERR;
    }
    r = raft_malloc(sizeof *r);
    assert(r != NULL);
    r->type = APPEND;
    r->completion_time = *io->time + io->disk_latency;
    r->req = req;
    r->entries = entries;
    r->n = n;
    req->cb = cb;
    QUEUE_PUSH(&io->requests, &r->queue);
    return 0;
}
/* raft_io->truncate implementation: synchronously delete all persisted entries
 * from the given index (inclusive) onward.
 *
 * NOTE(review): the retained-entry count is computed as index - 1, which looks
 * correct only while start_index == 1; verify the intended behavior once a
 * snapshot has advanced start_index. */
static int ioMethodTruncate(struct raft_io *raft_io, raft_index index)
{
    struct io *io = raft_io->impl;
    raft_index last_index = io->start_index + io->n;
    size_t n;
    if (index >= last_index + 1) {
        /* Nothing to truncate */
        return 0;
    }
    if (ioFaultTick(io)) {
        return RAFT_IOERR;
    }
    n = (size_t)(index - 1); /* Number of entries left after truncation */
    if (n > 0) {
        struct raft_entry *entries;
        /* Create a new array of entries holding the non-truncated entries */
        entries = raft_malloc(n * sizeof *entries);
        if (entries == NULL) {
            return RAFT_NOMEM;
        }
        memcpy(entries, io->entries, n * sizeof *io->entries);
        /* Release any truncated entry */
        if (io->entries != NULL) {
            size_t i;
            for (i = n; i < io->n; i++) {
                raft_free(io->entries[i].buf.base);
            }
            raft_free(io->entries);
        }
        io->entries = entries;
    } else {
        /* Release everything we have */
        if (io->entries != NULL) {
            size_t i;
            for (i = 0; i < io->n; i++) {
                raft_free(io->entries[i].buf.base);
            }
            raft_free(io->entries);
            io->entries = NULL;
        }
    }
    io->n = n;
    return 0;
}
/* raft_io->snapshot_put implementation: queue a SNAPSHOT_PUT request that will
 * complete after disk_latency milliseconds (see ioFlushSnapshotPut). */
static int ioMethodSnapshotPut(struct raft_io *raft_io,
                               unsigned trailing,
                               struct raft_io_snapshot_put *req,
                               const struct raft_snapshot *snapshot,
                               raft_io_snapshot_put_cb cb)
{
    struct io *io = raft_io->impl;
    struct snapshot_put *r;
    r = raft_malloc(sizeof *r);
    assert(r != NULL);
    r->type = SNAPSHOT_PUT;
    r->req = req;
    r->req->cb = cb;
    r->snapshot = snapshot;
    r->completion_time = *io->time + io->disk_latency;
    r->trailing = trailing;
    QUEUE_PUSH(&io->requests, &r->queue);
    return 0;
}
static int ioMethodSnapshotGet(struct raft_io *raft_io,
struct raft_io_snapshot_get *req,
raft_io_snapshot_get_cb cb)
{
struct io *io = raft_io->impl;
struct snapshot_get *r;
r = raft_malloc(sizeof *r);
assert(r != NULL);
r->type = SNAPSHOT_GET;
r->req = req;
r->req->cb = cb;
r->completion_time = *io->time + io->disk_latency;
QUEUE_PUSH(&io->requests, &r->queue);
return 0;
}
static raft_time ioMethodTime(struct raft_io *raft_io)
{
struct io *io = raft_io->impl;
return *io->time;
}
/* raft_io->random implementation: always return 0.
 *
 * raft_init() calls this to obtain an initial seed for its pseudo random
 * number generator, but serverSeed() later calls raft_seed() and overwrites
 * whatever value is returned here, so the result is irrelevant. */
static int ioMethodRandom(struct raft_io *raft_io, int min, int max)
{
    (void)raft_io;
    (void)min;
    (void)max;
    return 0;
}
/* Queue up a request which will be processed later, when io_stub_flush()
 * is invoked. */
static int ioMethodSend(struct raft_io *raft_io,
                        struct raft_io_send *req,
                        const struct raft_message *message,
                        raft_io_send_cb cb)
{
    struct io *io = raft_io->impl;
    struct send *r;
    if (ioFaultTick(io)) {
        return RAFT_IOERR;
    }
    /* tracef("io: send: %s to server %d", describeMessage(message),
       message->server_id); */
    r = raft_malloc(sizeof *r);
    assert(r != NULL); /* Fixture code: allocation failure is fatal. */
    r->type = SEND;
    r->req = req;
    r->message = *message; /* Shallow copy of the message being sent. */
    r->req->cb = cb;
    /* TODO: simulate the presence of an OS send buffer, whose available size
     * might delay the completion of send requests */
    r->completion_time = *io->time;
    QUEUE_PUSH(&io->requests, &r->queue);
    return 0;
}

/* Hand @message to @io's server via its receive callback and bump the
 * per-type received-message counter. */
static void ioReceive(struct io *io, struct raft_message *message)
{
    /* tracef("io: recv: %s from server %d", describeMessage(message),
       message->server_id); */
    io->recv_cb(io->io, message);
    io->n_recv[message->type]++;
}
/* Deliver a pending transmit request to its destination peer, or silently
 * discard it if the message type is being dropped, the peer is unknown,
 * disconnected, or the connection is saturated. */
static void ioDeliverTransmit(struct io *io, struct transmit *transmit)
{
    struct raft_message *message = &transmit->message;
    struct peer *peer; /* Destination peer */
    /* If this message type is in the drop list, let's discard it */
    if (io->drop[message->type - 1]) {
        ioDestroyTransmit(transmit);
        return;
    }
    peer = ioGetPeer(io, message->server_id);
    /* We don't have any peer with this ID or it's disconnected or if the
     * connection is saturated, let's drop the message */
    if (peer == NULL || !peer->connected || peer->saturated) {
        ioDestroyTransmit(transmit);
        return;
    }
    /* Update the message object with our details. */
    message->server_id = io->id;
    message->server_address = io->address;
    ioReceive(peer->io, message);
    raft_free(transmit);
}
/* Connect @raft_io to @other, enabling delivery of messages sent from @io to
 * @other.
 */
static void ioConnect(struct raft_io *raft_io, struct raft_io *other)
{
    struct io *io = raft_io->impl;
    struct io *io_other = other->impl;
    assert(io->n_peers < MAX_PEERS);
    /* Register the new peer as connected and not saturated. */
    io->peers[io->n_peers].io = io_other;
    io->peers[io->n_peers].connected = true;
    io->peers[io->n_peers].saturated = false;
    io->n_peers++;
}

/* Return whether the connection with the given peer is saturated. */
static bool ioSaturated(struct raft_io *raft_io, struct raft_io *other)
{
    struct io *io = raft_io->impl;
    struct io *io_other = other->impl;
    struct peer *peer;
    peer = ioGetPeer(io, io_other->id);
    return peer != NULL && peer->saturated;
}
/* Disconnect @raft_io and @other, causing calls to @io->send() to fail
 * asynchronously when sending messages to @other. */
static void ioDisconnect(struct raft_io *raft_io, struct raft_io *other)
{
    struct io *io = raft_io->impl;
    struct io *io_other = other->impl;
    struct peer *peer;
    peer = ioGetPeer(io, io_other->id);
    assert(peer != NULL); /* The peer must have been connected before. */
    peer->connected = false;
}

/* Reconnect @raft_io and @other. */
static void ioReconnect(struct raft_io *raft_io, struct raft_io *other)
{
    struct io *io = raft_io->impl;
    struct io *io_other = other->impl;
    struct peer *peer;
    peer = ioGetPeer(io, io_other->id);
    assert(peer != NULL);
    peer->connected = true;
}
/* Saturate the connection from @io to @other, causing messages sent from @io to
 * @other to be dropped. */
static void ioSaturate(struct raft_io *io, struct raft_io *other)
{
    struct io *s;
    struct io *s_other;
    struct peer *peer;
    s = io->impl;
    s_other = other->impl;
    peer = ioGetPeer(s, s_other->id);
    assert(peer != NULL && peer->connected);
    peer->saturated = true;
}

/* Desaturate the connection from @raft_io to @other, re-enabling delivery of
 * messages sent from @raft_io to @other. */
static void ioDesaturate(struct raft_io *raft_io, struct raft_io *other)
{
    struct io *io = raft_io->impl;
    struct io *io_other = other->impl;
    struct peer *peer;
    peer = ioGetPeer(io, io_other->id);
    assert(peer != NULL && peer->connected);
    peer->saturated = false;
}

/* Enable or disable silently dropping all outgoing messages of type @type. */
void ioDrop(struct io *io, int type, bool flag)
{
    /* Message types are 1-based, the drop array is 0-based. */
    io->drop[type - 1] = flag;
}
/* Initialize the stub I/O implementation for the server at @index: allocate
 * its state, reset counters and latencies to their defaults, and wire up all
 * raft_io methods. @time points at the fixture's shared clock. */
static int ioInit(struct raft_io *raft_io, unsigned index, raft_time *time)
{
    struct io *io;
    io = raft_malloc(sizeof *io);
    assert(io != NULL); /* Fixture code: allocation failure is fatal. */
    io->io = raft_io;
    io->index = index;
    io->time = time; /* Shared fixture clock. */
    io->term = 0;
    io->voted_for = 0;
    io->snapshot = NULL;
    io->entries = NULL;
    io->n = 0;
    io->start_index = 0;
    QUEUE_INIT(&io->requests);
    io->n_peers = 0;
    /* Stagger the election timeout by server index, so elections are
     * deterministic. */
    io->randomized_election_timeout = ELECTION_TIMEOUT + index * 100;
    io->randomized_election_timeout_prev = 0;
    io->network_latency = NETWORK_LATENCY;
    io->disk_latency = DISK_LATENCY;
    io->work_duration = WORK_DURATION;
    io->fault.countdown = -1; /* Fault injection disabled by default. */
    io->fault.n = -1;
    memset(io->drop, 0, sizeof io->drop);
    memset(io->n_send, 0, sizeof io->n_send);
    memset(io->n_recv, 0, sizeof io->n_recv);
    io->n_append = 0;
    /* Wire up the raft_io vtable. */
    raft_io->impl = io;
    raft_io->version = 2;
    raft_io->capacity = 4096;
    raft_io->init = ioMethodInit;
    raft_io->close = ioMethodClose;
    raft_io->start = ioMethodStart;
    raft_io->load = ioMethodLoad;
    raft_io->bootstrap = ioMethodBootstrap;
    raft_io->recover = ioMethodRecover;
    raft_io->set_term = ioMethodSetTerm;
    raft_io->set_vote = ioMethodSetVote;
    raft_io->append = ioMethodAppend;
    raft_io->truncate = ioMethodTruncate;
    raft_io->send = ioMethodSend;
    raft_io->snapshot_put = ioMethodSnapshotPut;
    raft_io->snapshot_get = ioMethodSnapshotGet;
    raft_io->time = ioMethodTime;
    raft_io->random = ioMethodRandom;
    return 0;
}
/* Release all memory held by the given stub I/O implementation. */
void ioClose(struct raft_io *raft_io)
{
    struct io *io = raft_io->impl;
    size_t i;
    /* Release the in-memory log entries and their buffers. */
    for (i = 0; i < io->n; i++) {
        struct raft_entry *entry = &io->entries[i];
        raft_free(entry->buf.base);
    }
    if (io->entries != NULL) {
        raft_free(io->entries);
    }
    if (io->snapshot != NULL) {
        snapshotClose(io->snapshot);
        raft_free(io->snapshot);
    }
    raft_free(io);
}
/* Custom emit tracer function which includes the server ID. Only diagnostic
 * messages up to level 3 are emitted, to stderr. */
static void fixtureTrace(struct raft_tracer *t, int type, const void *data)
{
    struct raft_fixture_server *s = t->impl;
    const struct raft_tracer_info *info = data;
    if (type != RAFT_TRACER_DIAGNOSTIC) {
        return;
    }
    if (info->diagnostic.level > 3) {
        return;
    }
    fprintf(stderr, "[%4llu] %llu: %s\n", s->io.time(&s->io), s->id,
            info->diagnostic.message);
}
/* Allocate and initialize the i'th fixture server: its stub I/O, its raft
 * instance, its timeouts and its tracer. Returns RAFT_NOMEM or an error from
 * ioInit()/raft_init() on failure. */
static int serverInit(struct raft_fixture *f, unsigned i, struct raft_fsm *fsm)
{
    int rv;
    struct raft_fixture_server *s;
    s = raft_malloc(sizeof(*s));
    if (s == NULL) {
        return RAFT_NOMEM;
    }
    f->servers[i] = s;
    s->alive = true;
    s->id = i + 1; /* Server IDs are 1-based. */
    sprintf(s->address, "%llu", s->id);
    rv = ioInit(&s->io, i, &f->time);
    if (rv != 0) {
        return rv;
    }
    rv = raft_init(&s->raft, &s->io, fsm, s->id, s->address);
    if (rv != 0) {
        return rv;
    }
    raft_set_election_timeout(&s->raft, ELECTION_TIMEOUT);
    raft_set_heartbeat_timeout(&s->raft, HEARTBEAT_TIMEOUT);
    raft_set_install_snapshot_timeout(&s->raft, INSTALL_SNAPSHOT_TIMEOUT);
    /* Install the fixture tracer, which prefixes messages with the server
     * ID. */
    s->tracer.impl = s;
    s->tracer.version = 2;
    s->tracer.emit = fixtureTrace;
    s->raft.tracer = &s->tracer;
    return 0;
}

/* Tear down a fixture server: close its raft instance and stub I/O, then
 * release its memory. */
static void serverClose(struct raft_fixture_server *s)
{
    raft_close(&s->raft, NULL);
    ioClose(&s->io);
    raft_free(s);
}
/* Set the state of raft's internal pseudo random number generator so that the
 * next time RandomWithinRange() is run it will return the value configured by
 * the fixture, which is stored in the randomized_election_timeout field. */
static void serverSeed(struct raft_fixture_server *s)
{
    struct io *io = s->io.impl;
    unsigned timeout = s->raft.election_timeout;
    /* Nothing to do if the desired timeout hasn't changed since last time. */
    if (io->randomized_election_timeout ==
        io->randomized_election_timeout_prev) {
        goto done;
    }
    /* Brute-force search: advance the seed until RandomWithinRange() would
     * produce exactly the desired timeout. */
    io->seed = s->raft.random;
    while (1) {
        unsigned random = io->seed;
        unsigned n = RandomWithinRange(&random, timeout, timeout * 2);
        if (n == io->randomized_election_timeout) {
            goto done;
        }
        io->seed = random;
    }
done:
    raft_seed(&s->raft, io->seed);
    io->randomized_election_timeout_prev = io->randomized_election_timeout;
}
/* Connect the server with the given index to every other server in the
 * fixture (but not to itself). */
static void serverConnectToAll(struct raft_fixture *f, unsigned i)
{
    unsigned peer;
    struct raft_io *self = &f->servers[i]->io;
    for (peer = 0; peer < f->n; peer++) {
        if (peer == i) {
            continue;
        }
        ioConnect(self, &f->servers[peer]->io);
    }
}
/* Initialize an empty fixture: reset the simulated clock, allocate the cached
 * leader log and the event object. Returns RAFT_NOMEM on allocation failure.
 *
 * Fix: the original leaked f->log when the f->event allocation failed; we now
 * release it (and NULL it out) before returning. */
int raft_fixture_init(struct raft_fixture *f)
{
    f->time = 0;
    f->n = 0;
    f->log = logInit();
    if (f->log == NULL) {
        return RAFT_NOMEM;
    }
    f->commit_index = 0;
    f->hook = NULL;
    f->event = raft_malloc(sizeof(*f->event));
    if (f->event == NULL) {
        logClose(f->log); /* Don't leak the log allocated above. */
        f->log = NULL;
        return RAFT_NOMEM;
    }
    return 0;
}

/* Backward-compatible alias for raft_fixture_init(). */
int raft_fixture_initialize(struct raft_fixture *f)
{
    return raft_fixture_init(f);
}
/* Tear down the fixture: flush pending I/O, close every server, then release
 * the event object and the cached leader log. */
void raft_fixture_close(struct raft_fixture *f)
{
    unsigned i;
    /* Flush any pending I/O request before closing the servers. */
    for (i = 0; i < f->n; i++) {
        struct io *io = f->servers[i]->io.impl;
        ioFlushAll(io);
    }
    for (i = 0; i < f->n; i++) {
        serverClose(f->servers[i]);
    }
    raft_free(f->event);
    logClose(f->log);
}

/* Populate @configuration with all servers in the fixture: the first
 * @n_voting servers become voters, the rest standbys. */
int raft_fixture_configuration(struct raft_fixture *f,
                               unsigned n_voting,
                               struct raft_configuration *configuration)
{
    unsigned i;
    assert(f->n > 0);
    assert(n_voting > 0);
    assert(n_voting <= f->n);
    raft_configuration_init(configuration);
    for (i = 0; i < f->n; i++) {
        struct raft_fixture_server *s;
        int role = i < n_voting ? RAFT_VOTER : RAFT_STANDBY;
        int rv;
        s = f->servers[i];
        rv = raft_configuration_add(configuration, s->id, s->address, role);
        if (rv != 0) {
            return rv;
        }
    }
    return 0;
}
/* Bootstrap every server in the fixture with the given configuration. */
int raft_fixture_bootstrap(struct raft_fixture *f,
                           struct raft_configuration *configuration)
{
    unsigned i;
    for (i = 0; i < f->n; i++) {
        struct raft *raft = raft_fixture_get(f, i);
        int rv;
        rv = raft_bootstrap(raft, configuration);
        if (rv != 0) {
            return rv;
        }
    }
    return 0;
}

/* Seed the pseudo random number generator of every server. */
static void seedAll(struct raft_fixture *f)
{
    unsigned i;
    for (i = 0; i < f->n; i++) {
        serverSeed(f->servers[i]);
    }
}
/* Seed all PRNGs and start every server's raft instance. */
int raft_fixture_start(struct raft_fixture *f)
{
    unsigned i;
    int rv;
    seedAll(f);
    for (i = 0; i < f->n; i++) {
        struct raft_fixture_server *s = f->servers[i];
        rv = raft_start(&s->raft);
        if (rv != 0) {
            return rv;
        }
    }
    return 0;
}

/* Return the number of servers in the fixture. */
unsigned raft_fixture_n(struct raft_fixture *f)
{
    return f->n;
}

/* Return the current simulated time. */
raft_time raft_fixture_time(struct raft_fixture *f)
{
    return f->time;
}

/* Return the raft instance of the i'th server. */
struct raft *raft_fixture_get(struct raft_fixture *f, unsigned i)
{
    assert(i < f->n);
    return &f->servers[i]->raft;
}

/* Return whether the i'th server is alive. */
bool raft_fixture_alive(struct raft_fixture *f, unsigned i)
{
    assert(i < f->n);
    return f->servers[i]->alive;
}

/* Return the index of the current leader, or f->n if there is none. */
unsigned raft_fixture_leader_index(struct raft_fixture *f)
{
    if (f->leader_id != 0) {
        return (unsigned)(f->leader_id - 1);
    }
    return f->n;
}

/* Return the ID of the server the i'th server has voted for (0 if none). */
raft_id raft_fixture_voted_for(struct raft_fixture *f, unsigned i)
{
    struct io *io = f->servers[i]->io.impl;
    return io->voted_for;
}
/* Update the leader and check for election safety.
 *
 * From figure 3.2:
 *
 *   Election Safety -> At most one leader can be elected in a given
 *   term.
 *
 * Return true if the current leader turns out to be different from the one at
 * the time this function was called.
 */
static bool updateLeaderAndCheckElectionSafety(struct raft_fixture *f)
{
    raft_id leader_id = 0;
    unsigned leader_i = 0;
    raft_term leader_term = 0;
    unsigned i;
    bool changed;
    /* Scan all servers, tracking the leader with the highest term and
     * aborting if two leaders share a term. */
    for (i = 0; i < f->n; i++) {
        struct raft *raft = raft_fixture_get(f, i);
        unsigned j;
        /* If the server is not alive or is not the leader, skip to the next
         * server. */
        if (!raft_fixture_alive(f, i) || raft_state(raft) != RAFT_LEADER) {
            continue;
        }
        /* Check that no other server is leader for this term. */
        for (j = 0; j < f->n; j++) {
            struct raft *other = raft_fixture_get(f, j);
            if (other->id == raft->id || other->state != RAFT_LEADER) {
                continue;
            }
            if (other->current_term == raft->current_term) {
                fprintf(stderr,
                        "server %llu and %llu are both leaders in term %llu",
                        raft->id, other->id, raft->current_term);
                abort();
            }
        }
        if (raft->current_term > leader_term) {
            leader_id = raft->id;
            leader_i = i;
            leader_term = raft->current_term;
        }
    }
    /* Check that the leader is stable, in the sense that it has been
     * acknowledged by all alive servers connected to it, and those servers
     * together with the leader form a majority. */
    if (leader_id != 0) {
        unsigned n_acks = 0;
        bool acked = true;
        unsigned n_quorum = 0;
        for (i = 0; i < f->n; i++) {
            struct raft *raft = raft_fixture_get(f, i);
            const struct raft_server *server =
                configurationGet(&raft->configuration, raft->id);
            /* If the server is not in the configuration or is idle, then don't
             * count it. */
            if (server == NULL || server->role == RAFT_SPARE) {
                continue;
            }
            n_quorum++;
            /* If this server is itself the leader, or it's not alive or it's
             * not connected to the leader, then don't count it in for
             * stability. */
            if (i == leader_i || !raft_fixture_alive(f, i) ||
                raft_fixture_saturated(f, leader_i, i)) {
                continue;
            }
            /* The follower acknowledges the leader only if it is in the same
             * term and tracks exactly this leader. */
            if (raft->current_term != leader_term) {
                acked = false;
                break;
            }
            if (raft->state != RAFT_FOLLOWER) {
                acked = false;
                break;
            }
            if (raft->follower_state.current_leader.id == 0) {
                acked = false;
                break;
            }
            if (raft->follower_state.current_leader.id != leader_id) {
                acked = false;
                break;
            }
            n_acks++;
        }
        /* Not stable yet: report "no leader" for fixture purposes. */
        if (!acked || n_acks < (n_quorum / 2)) {
            leader_id = 0;
        }
    }
    changed = leader_id != f->leader_id;
    f->leader_id = leader_id;
    return changed;
}
/* Check for leader append-only.
 *
 * From figure 3.2:
 *
 *   Leader Append-Only -> A leader never overwrites or deletes entries in its
 *   own log; it only appends new entries.
 *
 * Compares the leader's current log against the copy cached by the fixture at
 * the previous iteration, aborting via assert() on any mismatch.
 *
 * Fix: removed a redundant recomputation of `last` (it was assigned the same
 * logLastIndex(f->log) value twice, with nothing mutating the cached log in
 * between). */
static void checkLeaderAppendOnly(struct raft_fixture *f)
{
    struct raft *raft;
    raft_index index;
    raft_index last = logLastIndex(f->log);
    /* If the cached log is empty it means there was no leader before. */
    if (last == 0) {
        return;
    }
    /* If there's no new leader, just return. */
    if (f->leader_id == 0) {
        return;
    }
    raft = raft_fixture_get(f, (unsigned)f->leader_id - 1);
    for (index = 1; index <= last; index++) {
        const struct raft_entry *entry1;
        const struct raft_entry *entry2;
        size_t i;
        entry1 = logGet(f->log, index);
        entry2 = logGet(raft->legacy.log, index);
        assert(entry1 != NULL);
        /* Check if the entry was snapshotted. */
        if (entry2 == NULL) {
            assert(raft->legacy.log->snapshot.last_index >= index);
            continue;
        }
        /* Entry was not overwritten: type, term and payload must match the
         * cached copy byte for byte. */
        assert(entry1->type == entry2->type);
        assert(entry1->term == entry2->term);
        for (i = 0; i < entry1->buf.len; i++) {
            assert(((uint8_t *)entry1->buf.base)[i] ==
                   ((uint8_t *)entry2->buf.base)[i]);
        }
    }
}
/* Make a copy of the current leader log, in order to perform the Leader
 * Append-Only check at the next iteration. */
static void copyLeaderLog(struct raft_fixture *f)
{
    struct raft *raft = raft_fixture_get(f, (unsigned)f->leader_id - 1);
    struct raft_entry *entries;
    unsigned n;
    size_t i;
    int rv;
    /* Throw away the previous copy and start from scratch. */
    logClose(f->log);
    f->log = logInit();
    if (f->log == NULL) {
        assert(false);
        return;
    }
    rv = logAcquire(raft->legacy.log, 1, &entries, &n);
    assert(rv == 0);
    /* Deep-copy every entry, payload included. */
    for (i = 0; i < n; i++) {
        struct raft_entry *entry = &entries[i];
        struct raft_buffer buf;
        buf.len = entry->buf.len;
        buf.base = raft_malloc(buf.len);
        assert(buf.base != NULL);
        memcpy(buf.base, entry->buf.base, buf.len);
        rv = logAppend(f->log, entry->term, entry->type, &buf, NULL);
        assert(rv == 0);
    }
    logRelease(raft->legacy.log, 1, entries, n);
}

/* Update the commit index to match the one from the current leader. */
static void updateCommitIndex(struct raft_fixture *f)
{
    struct raft *raft = raft_fixture_get(f, (unsigned)f->leader_id - 1);
    if (raft->commit_index > f->commit_index) {
        f->commit_index = raft->commit_index;
    }
}
/* Return the lowest tick time across all servers, along with the associated
 * server index */
static void getLowestTickTime(struct raft_fixture *f, raft_time *t, unsigned *i)
{
    unsigned j;
    *t = (raft_time)-1 /* Maximum value */;
    for (j = 0; j < f->n; j++) {
        struct io *io = f->servers[j]->io.impl;
        if (io->next_tick < *t) {
            *t = io->next_tick;
            *i = j;
        }
    }
}

/* Return the completion time of the request with the lowest completion time
 * across all servers, along with the associated server index. */
static void getLowestRequestCompletionTime(struct raft_fixture *f,
                                           raft_time *t,
                                           unsigned *i)
{
    unsigned j;
    *t = (raft_time)-1 /* Maximum value */;
    for (j = 0; j < f->n; j++) {
        struct io *io = f->servers[j]->io.impl;
        queue *head;
        /* Scan every pending request of this server. */
        QUEUE_FOREACH (head, &io->requests) {
            struct ioRequest *r = QUEUE_DATA(head, struct ioRequest, queue);
            if (r->completion_time < *t) {
                *t = r->completion_time;
                *i = j;
            }
        }
    }
}
/* Fire the tick callback of the i'th server. */
static void fireTick(struct raft_fixture *f, unsigned i)
{
    struct io *io = f->servers[i]->io.impl;
    /* Advance the fixture clock to this server's tick time. */
    f->time = io->next_tick;
    f->event->server_index = i;
    f->event->type = RAFT_FIXTURE_TICK;
    io->next_tick += io->tick_interval;
    /* Dead servers still advance their tick time, but their callback is not
     * invoked. */
    if (f->servers[i]->alive) {
        io->tick_cb(io->io);
    }
}

/* Complete the first request with completion time @t on the @i'th server. */
static void completeRequest(struct raft_fixture *f, unsigned i, raft_time t)
{
    struct io *io = f->servers[i]->io.impl;
    queue *head;
    struct ioRequest *r = NULL;
    bool found = false;
    f->time = t;
    f->event->server_index = i;
    /* Find the first queued request whose completion time is @t. */
    QUEUE_FOREACH (head, &io->requests) {
        r = QUEUE_DATA(head, struct ioRequest, queue);
        if (r->completion_time == t) {
            found = true;
            break;
        }
    }
    assert(found);
    QUEUE_REMOVE(head);
    /* Dispatch on the request type and record the matching event type. */
    switch (r->type) {
        case APPEND:
            ioFlushAppend(io, (struct append *)r);
            f->event->type = RAFT_FIXTURE_DISK;
            break;
        case SEND:
            ioFlushSend(io, (struct send *)r);
            f->event->type = RAFT_FIXTURE_NETWORK;
            break;
        case TRANSMIT:
            ioDeliverTransmit(io, (struct transmit *)r);
            f->event->type = RAFT_FIXTURE_NETWORK;
            break;
        case SNAPSHOT_PUT:
            ioFlushSnapshotPut(io, (struct snapshot_put *)r);
            f->event->type = RAFT_FIXTURE_DISK;
            break;
        case SNAPSHOT_GET:
            ioFlushSnapshotGet(io, (struct snapshot_get *)r);
            f->event->type = RAFT_FIXTURE_DISK;
            break;
        default:
            assert(0);
    }
}
/* Advance the simulation by one step: fire the earliest pending event (a tick
 * or an I/O request completion), then run the safety checks and the user
 * hook. Returns the event that was fired. */
struct raft_fixture_event *raft_fixture_step(struct raft_fixture *f)
{
    raft_time tick_time;
    raft_time completion_time;
    unsigned i = f->n;
    unsigned j = f->n;
    seedAll(f);
    getLowestTickTime(f, &tick_time, &i);
    getLowestRequestCompletionTime(f, &completion_time, &j);
    assert(i < f->n || j < f->n);
    /* Ties between a tick and a completion are broken in favor of the tick
     * when its server index is not higher. */
    if (tick_time < completion_time ||
        (tick_time == completion_time && i <= j)) {
        fireTick(f, i);
    } else {
        completeRequest(f, j, completion_time);
    }
    for (j = 0; j < f->n; j++) {
        struct raft *r = &f->servers[j]->raft;
        LegacyFireCompletedRequests(r);
    }
    /* If the leader has not changed check the Leader Append-Only
     * guarantee. */
    if (!updateLeaderAndCheckElectionSafety(f)) {
        checkLeaderAppendOnly(f);
    }
    /* If we have a leader, update leader-related state . */
    if (f->leader_id != 0) {
        copyLeaderLog(f);
        updateCommitIndex(f);
    }
    if (f->hook != NULL) {
        f->hook(f, f->event);
    }
    return f->event;
}
/* Step the fixture @n times, returning the last event. */
struct raft_fixture_event *raft_fixture_step_n(struct raft_fixture *f,
                                               unsigned n)
{
    unsigned i;
    assert(n > 0);
    for (i = 0; i < n - 1; i++) {
        raft_fixture_step(f);
    }
    return raft_fixture_step(f);
}

/* Step the fixture until @stop returns true or @max_msecs of simulated time
 * have elapsed; return true if @stop fired within the deadline. */
bool raft_fixture_step_until(struct raft_fixture *f,
                             bool (*stop)(struct raft_fixture *f, void *arg),
                             void *arg,
                             unsigned max_msecs)
{
    raft_time start = f->time;
    while (!stop(f, arg) && (f->time - start) < max_msecs) {
        raft_fixture_step(f);
    }
    return f->time - start < max_msecs;
}
/* A step function which always returns false, forcing
 * raft_fixture_step_n to advance time at each iteration. */
static bool spin(struct raft_fixture *f, void *arg)
{
    (void)f;
    (void)arg;
    return false;
}

/* Step the fixture until @msecs of simulated time have elapsed. */
void raft_fixture_step_until_elapsed(struct raft_fixture *f, unsigned msecs)
{
    raft_fixture_step_until(f, spin, NULL, msecs);
}

/* Stop condition: a stable leader has emerged. */
static bool hasLeader(struct raft_fixture *f, void *arg)
{
    (void)arg;
    return f->leader_id != 0;
}

bool raft_fixture_step_until_has_leader(struct raft_fixture *f,
                                        unsigned max_msecs)
{
    return raft_fixture_step_until(f, hasLeader, NULL, max_msecs);
}

/* Stop condition: there is currently no leader. */
static bool hasNoLeader(struct raft_fixture *f, void *arg)
{
    (void)arg;
    return f->leader_id == 0;
}

bool raft_fixture_step_until_has_no_leader(struct raft_fixture *f,
                                           unsigned max_msecs)
{
    return raft_fixture_step_until(f, hasNoLeader, NULL, max_msecs);
}
/* Enable/disable dropping outgoing messages of a certain type from all
 * servers except one. */
static void dropAllExcept(struct raft_fixture *f,
                          int type,
                          bool flag,
                          unsigned i)
{
    unsigned server;
    for (server = 0; server < f->n; server++) {
        if (server == i) {
            continue;
        }
        ioDrop(f->servers[server]->io.impl, type, flag);
    }
}
/* Set the randomized election timeout of the given server to the
 * minimum value compatible with its current state and timers. */
static void minimizeRandomizedElectionTimeout(struct raft_fixture *f,
                                              unsigned i)
{
    struct raft *raft = &f->servers[i]->raft;
    raft_time now = raft->io->time(raft->io);
    unsigned timeout = raft->election_timeout;
    assert(raft->state == RAFT_FOLLOWER);
    /* If the minimum election timeout value would make the timer expire
     * in the past, cap it. */
    if (now - raft->election_timer_start > timeout) {
        timeout = (unsigned)(now - raft->election_timer_start);
    }
    raft->follower_state.randomized_election_timeout = timeout;
}

/* Set the randomized election timeout to the maximum value on all
 * servers except the given one. */
static void maximizeAllRandomizedElectionTimeoutsExcept(struct raft_fixture *f,
                                                        unsigned i)
{
    unsigned j;
    for (j = 0; j < f->n; j++) {
        struct raft *raft = &f->servers[j]->raft;
        unsigned timeout = raft->election_timeout * 2;
        if (j == i) {
            continue;
        }
        assert(raft->state == RAFT_FOLLOWER);
        raft->follower_state.randomized_election_timeout = timeout;
    }
}
/* Install the event hook invoked after each fixture step. */
void raft_fixture_hook(struct raft_fixture *f, raft_fixture_event_cb hook)
{
    f->hook = hook;
}

/* Arrange election timeouts so that the i'th server will win the next
 * election. */
void raft_fixture_start_elect(struct raft_fixture *f, unsigned i)
{
    struct raft *raft = raft_fixture_get(f, i);
    unsigned j;
    /* Make sure there's currently no leader. */
    assert(f->leader_id == 0);
    /* Make sure that the given server is voting. */
    assert(configurationGet(&raft->configuration, raft->id)->role ==
           RAFT_VOTER);
    /* Make sure all servers are currently followers. */
    for (j = 0; j < f->n; j++) {
        assert(raft_state(&f->servers[j]->raft) == RAFT_FOLLOWER);
    }
    /* Pretend that the last randomized election timeout was set at the
     * maximum value on all servers except the one to be elected, which
     * is instead set to the minimum possible value compatible with its
     * current state. */
    minimizeRandomizedElectionTimeout(f, i);
    maximizeAllRandomizedElectionTimeoutsExcept(f, i);
}

/* Elect the i'th server, stepping the fixture until it becomes the stable
 * leader. */
void raft_fixture_elect(struct raft_fixture *f, unsigned i)
{
    struct raft *raft = raft_fixture_get(f, i);
    raft_fixture_start_elect(f, i);
    raft_fixture_step_until_has_leader(f, ELECTION_TIMEOUT * 20);
    assert(f->leader_id == raft->id);
}

/* Force the current leader to step down. */
void raft_fixture_depose(struct raft_fixture *f)
{
    unsigned leader_i;
    /* Make sure there's a leader. */
    assert(f->leader_id != 0);
    leader_i = (unsigned)f->leader_id - 1;
    assert(raft_state(&f->servers[leader_i]->raft) == RAFT_LEADER);
    /* Set a very large election timeout on all followers, to prevent
     * them from starting an election. */
    maximizeAllRandomizedElectionTimeoutsExcept(f, leader_i);
    /* Prevent all servers from sending append entries results, so the
     * leader will eventually step down. */
    dropAllExcept(f, RAFT_APPEND_ENTRIES_RESULT, true, leader_i);
    raft_fixture_step_until_has_no_leader(f, ELECTION_TIMEOUT * 3);
    assert(f->leader_id == 0);
    dropAllExcept(f, RAFT_APPEND_ENTRIES_RESULT, false, leader_i);
}
/* Argument for hasAppliedIndex: target server (or f->n, meaning "all
 * servers") and the index that must have been applied. */
struct step_apply
{
    unsigned i;
    raft_index index;
};

/* Stop condition: the target server (or every server) has applied the given
 * index. */
static bool hasAppliedIndex(struct raft_fixture *f, void *arg)
{
    struct step_apply *apply = (struct step_apply *)arg;
    struct raft *raft;
    unsigned n = 0;
    unsigned i;
    if (apply->i < f->n) {
        raft = raft_fixture_get(f, apply->i);
        return raft_last_applied(raft) >= apply->index;
    }
    for (i = 0; i < f->n; i++) {
        raft = raft_fixture_get(f, i);
        if (raft_last_applied(raft) >= apply->index) {
            n++;
        }
    }
    return n == f->n;
}

bool raft_fixture_step_until_applied(struct raft_fixture *f,
                                     unsigned i,
                                     raft_index index,
                                     unsigned max_msecs)
{
    struct step_apply apply = {i, index};
    return raft_fixture_step_until(f, hasAppliedIndex, &apply, max_msecs);
}

/* Argument for hasState: target server and the state it must reach. */
struct step_state
{
    unsigned i;
    enum raft_state state;
};

static bool hasState(struct raft_fixture *f, void *arg)
{
    struct step_state *target = (struct step_state *)arg;
    struct raft *raft;
    raft = raft_fixture_get(f, target->i);
    return raft_state(raft) == target->state;
}

bool raft_fixture_step_until_state_is(struct raft_fixture *f,
                                      unsigned i,
                                      int state,
                                      unsigned max_msecs)
{
    struct step_state target = {i, (enum raft_state)state};
    return raft_fixture_step_until(f, hasState, &target, max_msecs);
}

/* Argument for hasTerm: target server and the term it must reach. */
struct step_term
{
    unsigned i;
    raft_term term;
};

static bool hasTerm(struct raft_fixture *f, void *arg)
{
    struct step_term *target = (struct step_term *)arg;
    struct raft *raft;
    raft = raft_fixture_get(f, target->i);
    return raft->current_term == target->term;
}

bool raft_fixture_step_until_term_is(struct raft_fixture *f,
                                     unsigned i,
                                     raft_term term,
                                     unsigned max_msecs)
{
    struct step_term target = {i, term};
    return raft_fixture_step_until(f, hasTerm, &target, max_msecs);
}

/* Argument for hasVotedFor: server i must have voted for server j (both are
 * 0-based indexes; votes are stored as 1-based IDs). */
struct step_vote
{
    unsigned i;
    unsigned j;
};

static bool hasVotedFor(struct raft_fixture *f, void *arg)
{
    struct step_vote *target = (struct step_vote *)arg;
    struct raft *raft;
    raft = raft_fixture_get(f, target->i);
    return raft->voted_for == target->j + 1;
}

bool raft_fixture_step_until_voted_for(struct raft_fixture *f,
                                       unsigned i,
                                       unsigned j,
                                       unsigned max_msecs)
{
    struct step_vote target = {i, j};
    return raft_fixture_step_until(f, hasVotedFor, &target, max_msecs);
}
/* Argument for hasDelivered: all messages from server i addressed to server j
 * must have been delivered. */
struct step_deliver
{
    unsigned i;
    unsigned j;
};

/* Stop condition: no pending send/transmit request from server i addressed to
 * server j remains in the queue. */
static bool hasDelivered(struct raft_fixture *f, void *arg)
{
    struct step_deliver *target = (struct step_deliver *)arg;
    struct raft *raft;
    struct io *io;
    struct raft_message *message;
    queue *head;
    raft = raft_fixture_get(f, target->i);
    io = raft->io->impl;
    QUEUE_FOREACH (head, &io->requests) {
        struct ioRequest *r;
        r = QUEUE_DATA(head, struct ioRequest, queue);
        message = NULL;
        /* Only SEND and TRANSMIT requests carry a message. */
        switch (r->type) {
            case SEND:
                message = &((struct send *)r)->message;
                break;
            case TRANSMIT:
                message = &((struct transmit *)r)->message;
                break;
        }
        if (message != NULL && message->server_id == target->j + 1) {
            return false;
        }
    }
    return true;
}

bool raft_fixture_step_until_delivered(struct raft_fixture *f,
                                       unsigned i,
                                       unsigned j,
                                       unsigned max_msecs)
{
    struct step_deliver target = {i, j};
    return raft_fixture_step_until(f, hasDelivered, &target, max_msecs);
}
/* Disconnect server i from server j (one direction only). */
void raft_fixture_disconnect(struct raft_fixture *f, unsigned i, unsigned j)
{
    struct raft_io *io1 = &f->servers[i]->io;
    struct raft_io *io2 = &f->servers[j]->io;
    ioDisconnect(io1, io2);
}

/* Reconnect server i to server j (one direction only). */
void raft_fixture_reconnect(struct raft_fixture *f, unsigned i, unsigned j)
{
    struct raft_io *io1 = &f->servers[i]->io;
    struct raft_io *io2 = &f->servers[j]->io;
    ioReconnect(io1, io2);
}

/* Saturate the connection from server i to server j. */
void raft_fixture_saturate(struct raft_fixture *f, unsigned i, unsigned j)
{
    struct raft_io *io1 = &f->servers[i]->io;
    struct raft_io *io2 = &f->servers[j]->io;
    ioSaturate(io1, io2);
}

/* Cut server i off from all other servers. Note that despite the name this
 * saturates the connections in both directions rather than disconnecting
 * them. */
static void disconnectFromAll(struct raft_fixture *f, unsigned i)
{
    unsigned j;
    for (j = 0; j < f->n; j++) {
        if (j == i) {
            continue;
        }
        raft_fixture_saturate(f, i, j);
        raft_fixture_saturate(f, j, i);
    }
}

/* Re-enable traffic between server i and all other alive servers. */
static void reconnectToAll(struct raft_fixture *f, unsigned i)
{
    unsigned j;
    for (j = 0; j < f->n; j++) {
        if (j == i) {
            continue;
        }
        /* Don't reconnect to disconnected peers */
        if (!f->servers[j]->alive) {
            continue;
        }
        raft_fixture_desaturate(f, i, j);
        raft_fixture_desaturate(f, j, i);
    }
}
/* Return whether the connection from server i to server j is saturated. */
bool raft_fixture_saturated(struct raft_fixture *f, unsigned i, unsigned j)
{
    struct raft_io *io1 = &f->servers[i]->io;
    struct raft_io *io2 = &f->servers[j]->io;
    return ioSaturated(io1, io2);
}

/* Desaturate the connection from server i to server j. */
void raft_fixture_desaturate(struct raft_fixture *f, unsigned i, unsigned j)
{
    struct raft_io *io1 = &f->servers[i]->io;
    struct raft_io *io2 = &f->servers[j]->io;
    ioDesaturate(io1, io2);
}

/* Kill the i'th server: cut it off from the network and mark it dead so its
 * tick callback no longer runs. */
void raft_fixture_kill(struct raft_fixture *f, unsigned i)
{
    disconnectFromAll(f, i);
    f->servers[i]->alive = false;
}

/* Revive the i'th server: reconnect it to alive peers and mark it alive. */
void raft_fixture_revive(struct raft_fixture *f, unsigned i)
{
    reconnectToAll(f, i);
    f->servers[i]->alive = true;
}
/* Add a new server to the fixture, initializing it and connecting it in both
 * directions with every existing server.
 *
 * Fix: the reverse-connection loop ran with j up to f->n (which already
 * includes the new server), so at j == i it called ioConnect() with the new
 * server's own raft_io as both arguments, registering the server as its own
 * peer. Skip j == i. */
int raft_fixture_grow(struct raft_fixture *f, struct raft_fsm *fsm)
{
    unsigned i;
    unsigned j;
    int rc;
    i = f->n;
    f->n++;
    rc = serverInit(f, i, fsm);
    if (rc != 0) {
        return rc;
    }
    /* Connect the new server to all others... */
    serverConnectToAll(f, i);
    /* ...and all others to the new server, skipping the new server itself. */
    for (j = 0; j < f->n; j++) {
        struct raft_io *io1 = &f->servers[i]->io;
        struct raft_io *io2 = &f->servers[j]->io;
        if (j == i) {
            continue;
        }
        ioConnect(io2, io1);
    }
    return 0;
}
/* Fixture knob: set the randomized election timeout of the i'th server. */
void raft_fixture_set_randomized_election_timeout(struct raft_fixture *f,
                                                  unsigned i,
                                                  unsigned msecs)
{
    struct io *io = f->servers[i]->io.impl;
    io->randomized_election_timeout = msecs;
}

/* Fixture knob: set the simulated network latency of the i'th server. */
void raft_fixture_set_network_latency(struct raft_fixture *f,
                                      unsigned i,
                                      unsigned msecs)
{
    struct io *io = f->servers[i]->io.impl;
    io->network_latency = msecs;
}

/* Fixture knob: set the simulated disk latency of the i'th server. */
void raft_fixture_set_disk_latency(struct raft_fixture *f,
                                   unsigned i,
                                   unsigned msecs)
{
    struct io *io = f->servers[i]->io.impl;
    io->disk_latency = msecs;
}

/* Fixture knob: set the persisted term of the i'th server. */
void raft_fixture_set_term(struct raft_fixture *f, unsigned i, raft_term term)
{
    struct io *io = f->servers[i]->io.impl;
    io->term = term;
}

/* Install a snapshot on the i'th server's stub I/O; ioClose() will release
 * it. */
void raft_fixture_set_snapshot(struct raft_fixture *f,
                               unsigned i,
                               struct raft_snapshot *snapshot)
{
    struct io *io = f->servers[i]->io.impl;
    io->snapshot = snapshot;
}

/* Append an entry to the i'th server's persisted log; the entry struct is
 * copied, and its buffer will be released by ioClose(). */
void raft_fixture_add_entry(struct raft_fixture *f,
                            unsigned i,
                            struct raft_entry *entry)
{
    struct io *io = f->servers[i]->io.impl;
    struct raft_entry *entries;
    entries = raft_realloc(io->entries, (io->n + 1) * sizeof *entries);
    assert(entries != NULL);
    entries[io->n] = *entry;
    io->entries = entries;
    io->n++;
}
/* Schedule fault injection on the i'th server's stub I/O. The @delay and
 * @repeat values are stored in the fault state consumed by ioFaultTick()
 * (presumably: fail after @delay operations, for @repeat operations --
 * confirm against ioFaultTick, which is defined elsewhere in this file). */
void raft_fixture_io_fault(struct raft_fixture *f,
                           unsigned i,
                           int delay,
                           int repeat)
{
    struct io *io = f->servers[i]->io.impl;
    io->fault.countdown = delay;
    io->fault.n = repeat;
}

/* Return the number of messages of the given type sent by the i'th server. */
unsigned raft_fixture_n_send(struct raft_fixture *f, unsigned i, int type)
{
    struct io *io = f->servers[i]->io.impl;
    return io->n_send[type];
}

/* Return the number of messages of the given type received by the i'th
 * server. */
unsigned raft_fixture_n_recv(struct raft_fixture *f, unsigned i, int type)
{
    struct io *io = f->servers[i]->io.impl;
    return io->n_recv[type];
}
#undef tracef
raft-0.22.1/src/heap.c 0000664 0000000 0000000 00000004460 14601504142 0014401 0 ustar 00root root 0000000 0000000 #include "heap.h"
#include <stdlib.h>
#include "../include/raft.h"
/* Stdlib-backed malloc for the default heap; the custom @data pointer is
 * ignored. */
static void *defaultMalloc(void *data, size_t size)
{
    (void)data; /* The default heap carries no state. */
    return malloc(size);
}
/* Stdlib-backed free for the default heap; the custom @data pointer is
 * ignored. */
static void defaultFree(void *data, void *ptr)
{
    (void)data; /* The default heap carries no state. */
    free(ptr);
}
/* Stdlib-backed calloc for the default heap; the custom @data pointer is
 * ignored. */
static void *defaultCalloc(void *data, size_t nmemb, size_t size)
{
    (void)data; /* The default heap carries no state. */
    return calloc(nmemb, size);
}
/* Stdlib-backed realloc for the default heap; the custom @data pointer is
 * ignored. */
static void *defaultRealloc(void *data, void *ptr, size_t size)
{
    (void)data; /* The default heap carries no state. */
    return realloc(ptr, size);
}
/* Stdlib-backed aligned_alloc (C11) for the default heap; the custom @data
 * pointer is ignored. Per C11, @size should be a multiple of @alignment. */
static void *defaultAlignedAlloc(void *data, size_t alignment, size_t size)
{
    (void)data; /* The default heap carries no state. */
    return aligned_alloc(alignment, size);
}
/* Counterpart of defaultAlignedAlloc: the alignment is irrelevant when
 * freeing, so just delegate to defaultFree. */
static void defaultAlignedFree(void *data, size_t alignment, void *ptr)
{
    (void)alignment;
    defaultFree(data, ptr);
}
/* The default heap implementation, backed by the C standard library
 * allocator; its data pointer is unused. */
static struct raft_heap defaultHeap = {
    NULL,                /* data */
    defaultMalloc,       /* malloc */
    defaultFree,         /* free */
    defaultCalloc,       /* calloc */
    defaultRealloc,      /* realloc */
    defaultAlignedAlloc, /* aligned_alloc */
    defaultAlignedFree   /* aligned_free */
};

/* The heap currently in use; replaced via raft_heap_set(). */
static struct raft_heap *currentHeap = &defaultHeap;
void *RaftHeapMalloc(size_t size)
{
return currentHeap->malloc(currentHeap->data, size);
}
void RaftHeapFree(void *ptr)
{
if (ptr == NULL) {
return;
}
currentHeap->free(currentHeap->data, ptr);
}
void *RaftHeapCalloc(size_t nmemb, size_t size)
{
return currentHeap->calloc(currentHeap->data, nmemb, size);
}
/* Re-size an allocation through the currently configured heap. */
void *RaftHeapRealloc(void *ptr, size_t size)
{
    struct raft_heap *h = currentHeap;
    return h->realloc(h->data, ptr, size);
}
/* Public allocation entry point: delegate to the configured heap. */
void *raft_malloc(size_t size)
{
    return RaftHeapMalloc(size);
}
/* Public free entry point: delegate to the configured heap. */
void raft_free(void *ptr)
{
    RaftHeapFree(ptr);
}
/* Public calloc entry point: delegate to the configured heap. */
void *raft_calloc(size_t nmemb, size_t size)
{
    return RaftHeapCalloc(nmemb, size);
}
/* Public realloc entry point: delegate to the configured heap. */
void *raft_realloc(void *ptr, size_t size)
{
    return RaftHeapRealloc(ptr, size);
}
/* Aligned allocation through the currently configured heap. */
void *raft_aligned_alloc(size_t alignment, size_t size)
{
    struct raft_heap *h = currentHeap;
    return h->aligned_alloc(h->data, alignment, size);
}
/* Release aligned memory through the currently configured heap. */
void raft_aligned_free(size_t alignment, void *ptr)
{
    struct raft_heap *h = currentHeap;
    h->aligned_free(h->data, alignment, ptr);
}
/* Install a custom heap implementation to be used by all raft allocations. */
void raft_heap_set(struct raft_heap *heap)
{
    currentHeap = heap;
}
/* Restore the stdlib-backed default heap. */
void raft_heap_set_default(void)
{
    currentHeap = &defaultHeap;
}
/* Return the heap currently in use. */
const struct raft_heap *raft_heap_get(void)
{
    return currentHeap;
}
raft-0.22.1/src/heap.h 0000664 0000000 0000000 00000000413 14601504142 0014400 0 ustar 00root root 0000000 0000000 /* Internal heap APIs. */
#ifndef HEAP_H_
#define HEAP_H_
#include
void *RaftHeapMalloc(size_t size);
void *RaftHeapCalloc(size_t nmemb, size_t size);
void *RaftHeapRealloc(void *ptr, size_t size);
void RaftHeapFree(void *ptr);
#endif /* HEAP_H_ */
raft-0.22.1/src/legacy.c 0000664 0000000 0000000 00000137061 14601504142 0014734 0 ustar 00root root 0000000 0000000 #include "legacy.h"
#include "assert.h"
#include "configuration.h"
#include "entry.h"
#include "err.h"
#include "log.h"
#include "membership.h"
#include "queue.h"
#include "request.h"
#include "snapshot.h"
#include "tracing.h"
#define tracef(...) Tracef(r->tracer, __VA_ARGS__)
/* Tracks an in-flight outgoing message sent through the legacy raft_io
 * interface. Freed in legacySendMessageCb() (or in the error paths of
 * legacySendMessage()/legacyLoadSnapshot()). */
struct legacySendMessage
{
    struct raft_io_send send;        /* Underlying I/O send request. */
    struct raft_io_snapshot_get get; /* Loads data for InstallSnapshot. */
    struct raft *r;                  /* Server sending the message. */
    struct raft_message message;     /* Message being sent. */
};
/* Completion callback of an outgoing message: release the per-message
 * resources (log entry references for AppendEntries, configuration and data
 * buffer for InstallSnapshot) and free the request. */
static void legacySendMessageCb(struct raft_io_send *send, int status)
{
    struct legacySendMessage *req = send->data;
    struct raft *r = req->r;
    (void)status; /* Send failures are ignored; cleanup happens regardless. */
    switch (req->message.type) {
        case RAFT_APPEND_ENTRIES:
            /* Release the references acquired by legacyFillAppendEntries(). */
            logRelease(r->legacy.log,
                       req->message.append_entries.prev_log_index + 1,
                       req->message.append_entries.entries,
                       req->message.append_entries.n_entries);
            break;
        case RAFT_INSTALL_SNAPSHOT:
            /* Release the resources attached in legacyLoadSnapshotCb(). */
            configurationClose(&req->message.install_snapshot.conf);
            raft_free(req->message.install_snapshot.data.base);
            break;
        default:
            break;
    }
    raft_free(req);
}
static int legacyLoadSnapshot(struct legacySendMessage *req);
/* Populate the entries array of an AppendEntries message by acquiring
 * references from the in-memory log, starting at prev_log_index + 1.
 *
 * On success args->entries/args->n_entries hold log references that must be
 * released with logRelease() (see legacyAbortAppendEntries() and
 * legacySendMessageCb()). */
static int legacyFillAppendEntries(struct raft *r,
                                   struct raft_append_entries *args)
{
    raft_index index = args->prev_log_index + 1;
    unsigned n = args->n_entries;
    int rv;
    rv = logAcquireAtMost(r->legacy.log, index, (int)n, &args->entries,
                          &args->n_entries);
    if (rv != 0) {
        return rv;
    }
    /* All n requested entries must still be present in the log. */
    assert(args->n_entries == n);
    return 0;
}
/* Release the log entry references acquired by legacyFillAppendEntries(). */
static void legacyAbortAppendEntries(struct raft *r,
                                     struct raft_append_entries *args)
{
    logRelease(r->legacy.log, args->prev_log_index + 1, args->entries,
               args->n_entries);
}
/* Send a single outgoing message using the legacy raft_io->send() interface.
 *
 * For RAFT_APPEND_ENTRIES the entries referenced by the message are acquired
 * from the in-memory log and released in legacySendMessageCb(). For
 * RAFT_INSTALL_SNAPSHOT the snapshot data is first loaded asynchronously via
 * legacyLoadSnapshot(), which takes ownership of the request.
 *
 * Returns 0 on success, or a non-zero error code (e.g. RAFT_NOMEM). */
static int legacySendMessage(struct raft *r, struct raft_message *message)
{
    struct legacySendMessage *req;
    int rv;
    req = raft_malloc(sizeof *req);
    if (req == NULL) {
        return RAFT_NOMEM;
    }
    req->r = r;
    req->message = *message;
    req->send.data = req;
    switch (req->message.type) {
        case RAFT_APPEND_ENTRIES:
            rv = legacyFillAppendEntries(r, &req->message.append_entries);
            if (rv != 0) {
                /* Fix: the original code returned here without releasing req,
                 * leaking it. */
                raft_free(req);
                return rv;
            }
            break;
        case RAFT_INSTALL_SNAPSHOT:
            /* legacyLoadSnapshot() owns req from here on and frees it on
             * failure. */
            rv = legacyLoadSnapshot(req);
            if (rv != 0) {
                return rv;
            }
            return 0;
        default:
            break;
    }
    rv = r->io->send(r->io, &req->send, &req->message, legacySendMessageCb);
    if (rv != 0) {
        switch (req->message.type) {
            case RAFT_APPEND_ENTRIES:
                legacyAbortAppendEntries(r, &req->message.append_entries);
                break;
            default:
                break;
        }
        raft_free(req);
        ErrMsgTransferf(r->io->errmsg, r->errmsg,
                        "send message of type %d to %llu", message->type,
                        message->server_id);
        return rv;
    }
    return 0;
}
/* Tracks an in-flight disk append of log entries. Freed in
 * legacyPersistEntriesCb(). */
struct legacyPersistEntries
{
    struct raft_io_append append; /* Underlying I/O append request. */
    struct raft *r;               /* Server persisting the entries. */
    raft_index index;             /* Index of the first entry in the batch. */
    struct raft_entry *entries;   /* Log references being persisted. */
    unsigned n;                   /* Number of entries in the batch. */
};
/* Completion callback of a disk append: figure out how many of the persisted
 * entries are still present in the in-memory log (they might have been
 * truncated in the meantime) and forward a RAFT_PERSISTED_ENTRIES event for
 * that prefix. */
static void legacyPersistEntriesCb(struct raft_io_append *append, int status)
{
    struct legacyPersistEntries *req = append->data;
    struct raft *r = req->r;
    struct raft_event event;
    unsigned n = 0;
    unsigned i;
    /* Failures are expected only while shutting down. */
    if (status != 0) {
        assert(r->legacy.closing);
        assert(status == RAFT_CANCELED);
        goto out;
    }
    /* If we're installing a snapshot discard these entries because they are
     * supposed to be truncated. */
    if (r->legacy.snapshot_install) {
        goto out;
    }
    /* Check which of these entries is still in our in-memory log */
    for (i = 0; i < req->n; i++) {
        struct raft_entry *entry = &req->entries[i];
        raft_index index = req->index + i;
        raft_term local_term = logTermOf(r->legacy.log, index);
        /* If we have no entry at this index, or if the entry we have now has a
         * different term, it means that this entry got truncated, so let's stop
         * here. */
        if (local_term == 0 || (local_term > 0 && local_term != entry->term)) {
            if (i == 0) {
                goto out; /* No entries in this batch is still in our log */
            }
            break;
        }
        /* If we do have an entry at this index, its term must match the one of
         * the entry we wrote on disk. */
        assert(local_term != 0 && local_term == entry->term);
        n += 1;
    }
    assert(n > 0);
    event.type = RAFT_PERSISTED_ENTRIES;
    event.persisted_entries.index = req->index + n - 1;
    LegacyForwardToRaftIo(r, &event);
out:
    /* Always release the log references and the request itself. */
    logRelease(r->legacy.log, req->index, req->entries, req->n);
    raft_free(req);
}
/* Handle a RAFT_UPDATE_ENTRIES update: mirror the new entries into the legacy
 * in-memory log (truncating any conflicting suffix first), then submit an
 * asynchronous disk append via the legacy raft_io interface.
 *
 * Takes ownership of the entries' batch buffer. Returns 0 on success or a
 * non-zero error code. */
static int legacyHandleUpdateEntries(struct raft *r,
                                     raft_index index,
                                     struct raft_entry *entries,
                                     unsigned n)
{
    struct legacyPersistEntries *req;
    struct raft_entry *acquired;
    unsigned n_acquired;
    unsigned i;
    int rv;
    req = raft_malloc(sizeof *req);
    if (req == NULL) {
        return RAFT_NOMEM;
    }
    req->r = r;
    req->index = index;
    req->n = n;
    req->append.data = req;
    /* Drop any conflicting suffix from the in-memory log. */
    if (index <= logLastIndex(r->legacy.log)) {
        logTruncate(r->legacy.log, index);
    }
    /* Copy each entry into the in-memory log. */
    for (i = 0; i < n; i++) {
        struct raft_entry entry;
        rv = entryCopy(&entries[i], &entry);
        if (rv != 0) {
            goto err;
        }
        rv = logAppend(r->legacy.log, entry.term, entry.type, &entry.buf, NULL);
        if (rv != 0) {
            goto err;
        }
    }
    assert(n > 0);
    assert(entries[0].batch != NULL);
    raft_free(entries[0].batch);
    /* Mirror the truncation on disk before appending. */
    rv = r->io->truncate(r->io, index);
    if (rv != 0) {
        goto err;
    }
    rv = logAcquire(r->legacy.log, index, &acquired, &n_acquired);
    if (rv != 0) {
        goto err;
    }
    /* Fix: check rv before asserting, since n_acquired is meaningful only on
     * success (the original code asserted on it first). */
    assert(n_acquired == n);
    req->entries = acquired;
    rv =
        r->io->append(r->io, &req->append, acquired, n, legacyPersistEntriesCb);
    if (rv != 0) {
        goto err_after_acquired;
    }
    return 0;
err_after_acquired:
    logRelease(r->legacy.log, index, acquired, n_acquired);
err:
    logDiscard(r->legacy.log, index);
    raft_free(req);
    ErrMsgTransferf(r->io->errmsg, r->errmsg, "append %u entries", n);
    return rv;
}
/* Tracks an in-flight persist of a received snapshot chunk. Freed in
 * legacyPersistSnapshotCb() (or in the error paths that submit it). */
struct legacyPersistSnapshot
{
    struct raft_io_snapshot_put put;       /* Underlying I/O put request. */
    struct raft_snapshot snapshot;         /* Snapshot handed to raft_io. */
    struct raft *r;                        /* Server installing the snapshot. */
    struct raft_snapshot_metadata metadata; /* Index/term/configuration. */
    size_t offset;                         /* Offset of this chunk. */
    struct raft_buffer chunk;              /* Chunk data (owned). */
    bool last;                             /* True for the final chunk. */
};
/* Release the resources held by a persist-snapshot request (but not the
 * request object itself). */
static void legacyCancelPersistSnapshot(struct legacyPersistSnapshot *req)
{
    raft_configuration_close(&req->metadata.configuration);
    raft_free(req->chunk.base);
}
/* Completion callback of a persist-snapshot request: clear the install flag
 * and either forward a RAFT_PERSISTED_SNAPSHOT event (success) or release the
 * snapshot resources (cancellation at shutdown). */
static void legacyPersistSnapshotCb(struct raft_io_snapshot_put *put,
                                    int status)
{
    struct legacyPersistSnapshot *req = put->data;
    struct raft *r = req->r;
    struct raft_event event;
    r->legacy.snapshot_install = false;
    event.type = RAFT_PERSISTED_SNAPSHOT;
    event.persisted_snapshot.metadata = req->metadata;
    event.persisted_snapshot.offset = req->offset;
    event.persisted_snapshot.last = req->last;
    /* If we successfully persisted the snapshot, keep the snapshot data around,
     * since we'll then need it immediately after calling raft_step(), in order
     * to restore the FSM state.
     *
     * Otherwise, discard the snapshot data altogether. */
    if (status == 0) {
        assert(r->legacy.snapshot_index == 0);
        r->legacy.snapshot_index = req->metadata.index;
        r->legacy.snapshot_chunk = req->chunk;
        LegacyForwardToRaftIo(r, &event);
    } else {
        /* Failures are expected only while shutting down. */
        assert(r->legacy.closing);
        assert(status == RAFT_CANCELED);
        legacyCancelPersistSnapshot(req);
    }
    raft_free(req);
}
/* Restore the in-memory log to match the incoming snapshot and submit the
 * actual snapshot_put request to the legacy raft_io implementation.
 *
 * On failure req is freed and the error message is transferred to r->errmsg;
 * callers must not touch req after a non-zero return. */
static int legacyPersistSnapshotStart(struct legacyPersistSnapshot *req)
{
    struct raft *r = req->r;
    int rv;
    logRestore(r->legacy.log, req->metadata.index, req->metadata.term);
    rv = r->io->snapshot_put(r->io, 0, &req->put, &req->snapshot,
                             legacyPersistSnapshotCb);
    if (rv != 0) {
        goto err;
    }
    return 0;
err:
    /* Fix: format the error message *before* freeing req — the original code
     * freed req first and then dereferenced req->metadata.index. */
    ErrMsgTransferf(r->io->errmsg, r->errmsg, "put snapshot at %llu",
                    req->metadata.index);
    raft_free(req);
    return rv;
}
/* Handle a RAFT_UPDATE_SNAPSHOT update: start persisting the received
 * snapshot chunk, or put it on hold if we're currently taking a snapshot
 * ourselves (it will be resumed in takeSnapshotCb()).
 *
 * Takes ownership of the chunk buffer and of the metadata's configuration. */
static int legacyHandleUpdateSnapshot(struct raft *r,
                                      struct raft_snapshot_metadata *metadata,
                                      size_t offset,
                                      struct raft_buffer *chunk,
                                      bool last)
{
    struct legacyPersistSnapshot *req;
    int rv;
    assert(!r->legacy.snapshot_install);
    assert(r->legacy.snapshot_pending == NULL);
    req = raft_malloc(sizeof *req);
    if (req == NULL) {
        return RAFT_NOMEM;
    }
    req->r = r;
    req->metadata = *metadata;
    req->offset = offset;
    req->chunk = *chunk;
    req->last = last;
    req->put.data = req;
    req->snapshot.index = req->metadata.index;
    req->snapshot.term = req->metadata.term;
    req->snapshot.configuration = req->metadata.configuration;
    req->snapshot.configuration_index = req->metadata.configuration_index;
    req->snapshot.bufs = &req->chunk;
    req->snapshot.n_bufs = 1;
    r->legacy.snapshot_install = true;
    /* If we're taking a snapshot, put this install on hold until it's
     * completed. */
    if (r->legacy.snapshot_taking) {
        r->legacy.snapshot_pending = req;
        return 0;
    }
    rv = legacyPersistSnapshotStart(req);
    if (rv != 0) {
        /* Fix: legacyPersistSnapshotStart() already freed req and filled in
         * r->errmsg; the original code freed req again (double free) and then
         * dereferenced it. Also undo the install flag set above, since no
         * persist is in flight to clear it. */
        r->legacy.snapshot_install = false;
        return rv;
    }
    return 0;
}
/* Send out every message in the given batch, stopping at the first failure. */
static int legacyHandleUpdateMessages(struct raft *r,
                                      struct raft_message *messages,
                                      unsigned n)
{
    unsigned j;
    for (j = 0; j < n; j++) {
        int rv = legacySendMessage(r, &messages[j]);
        if (rv != 0) {
            return rv;
        }
    }
    return 0;
}
/* Completion callback of a snapshot load: attach the loaded data and
 * configuration to the pending InstallSnapshot message and send it. On
 * failure, release everything. */
static void legacyLoadSnapshotCb(struct raft_io_snapshot_get *get,
                                 struct raft_snapshot *snapshot,
                                 int status)
{
    struct legacySendMessage *req = get->data;
    struct raft *r = req->r;
    struct raft_install_snapshot *params = &req->message.install_snapshot;
    int rv;
    if (status != 0) {
        goto abort;
    }
    /* The old raft_io interface makes no guarantee about the index of the
     * loaded snapshot. */
    if (snapshot->index != params->last_index) {
        assert(snapshot->index > params->last_index);
        params->last_index = snapshot->index;
    }
    assert(snapshot->n_bufs == 1);
    /* Transfer ownership of the snapshot's data and configuration to the
     * message; legacySendMessageCb() releases them. */
    params->data = snapshot->bufs[0];
    params->conf = snapshot->configuration;
    params->conf_index = snapshot->configuration_index;
    raft_free(snapshot->bufs);
    raft_free(snapshot);
    rv = r->io->send(r->io, &req->send, &req->message, legacySendMessageCb);
    if (rv != 0) {
        ErrMsgTransferf(r->io->errmsg, r->errmsg,
                        "send message of type %d to %llu", req->message.type,
                        req->message.server_id);
        status = rv;
        goto abort;
    }
    return;
abort:
    configurationClose(&params->conf);
    raft_free(params->data.base);
    raft_free(req);
}
/* Asynchronously load the snapshot data needed by a pending InstallSnapshot
 * message; legacyLoadSnapshotCb() will send the message when done.
 *
 * Takes ownership of req: on failure req is freed and the error message is
 * transferred to r->errmsg. */
static int legacyLoadSnapshot(struct legacySendMessage *req)
{
    struct raft *r = req->r;
    int rv;
    req->get.data = req;
    rv = r->io->snapshot_get(r->io, &req->get, legacyLoadSnapshotCb);
    if (rv != 0) {
        /* Fix: format the error message *before* freeing req — the original
         * code freed req first and then dereferenced req->message. */
        ErrMsgTransferf(r->io->errmsg, r->errmsg, "load snapshot at %llu",
                        req->message.install_snapshot.last_index);
        raft_free(req);
        return rv;
    }
    return 0;
}
/* Tracks an in-flight snapshot that this server is taking of its own FSM.
 * Freed in takeSnapshotCb(). */
struct legacyTakeSnapshot
{
    struct raft *r;                         /* Server taking the snapshot. */
    struct raft_snapshot_metadata metadata; /* Index/term/configuration. */
    struct raft_snapshot snapshot;          /* Snapshot handed to raft_io. */
    struct raft_io_snapshot_put put;        /* Underlying I/O put request. */
};
/*
* When taking a snapshot, ownership of the snapshot data is with raft if
* `snapshot_finalize` is NULL.
*/
/* Release the FSM snapshot buffers once the put request has completed (or was
 * aborted). If the FSM provides snapshot_finalize, ownership of the buffers
 * is with the FSM; otherwise raft frees them itself. */
static void takeSnapshotClose(struct raft *r, struct raft_snapshot *s)
{
    r->legacy.snapshot_taking = false;
    if (r->fsm->version == 1 ||
        (r->fsm->version > 1 && r->fsm->snapshot_finalize == NULL)) {
        unsigned i;
        for (i = 0; i < s->n_bufs; i++) {
            raft_free(s->bufs[i].base);
        }
        raft_free(s->bufs);
        return;
    }
    r->fsm->snapshot_finalize(r->fsm, &s->bufs, &s->n_bufs);
}
/* Completion callback of a take-snapshot put request: release the snapshot
 * buffers, forward a RAFT_SNAPSHOT event on success, and resume any persist
 * request that was put on hold while the snapshot was being taken. */
static void takeSnapshotCb(struct raft_io_snapshot_put *put, int status)
{
    struct legacyTakeSnapshot *req = put->data;
    struct raft *r = req->r;
    struct raft_snapshot_metadata metadata = req->metadata;
    struct raft_snapshot *snapshot = &req->snapshot;
    struct raft_event event;
    takeSnapshotClose(r, snapshot);
    raft_free(req);
    assert(metadata.term != 0);
    assert(logTermOf(r->legacy.log, metadata.index) == metadata.term);
    /* If we are shutting down, cancel the snapshot. */
    if (r->legacy.closing) {
        tracef("cancelling snapshot");
        status = RAFT_CANCELED;
        /* Also cancel any persist snapshot request. */
        if (r->legacy.snapshot_pending != NULL) {
            struct legacyPersistSnapshot *persist;
            persist = r->legacy.snapshot_pending;
            legacyCancelPersistSnapshot(persist);
            /* NOTE(review): the persist request object itself is not freed
             * here and snapshot_pending is not reset — looks like a shutdown
             * leak; confirm against the shutdown path before changing. */
        }
    }
    if (status != 0) {
        assert(r->legacy.closing);
        assert(status == RAFT_CANCELED);
        configurationClose(&metadata.configuration);
        return;
    }
    logSnapshot(r->legacy.log, metadata.index, r->legacy.snapshot_trailing);
    event.type = RAFT_SNAPSHOT;
    memset(&event.reserved, 0, sizeof event.reserved);
    event.snapshot.metadata = metadata;
    event.snapshot.trailing = r->legacy.snapshot_trailing;
    LegacyForwardToRaftIo(r, &event);
    if (r->legacy.snapshot_pending != NULL) {
        struct legacyPersistSnapshot *persist;
        int rv;
        /* Fix: grab the pending request *before* clearing the pointer. The
         * original code cleared snapshot_pending first and then read it into
         * persist, passing NULL to legacyPersistSnapshotStart(). */
        persist = r->legacy.snapshot_pending;
        r->legacy.snapshot_pending = NULL;
        rv = legacyPersistSnapshotStart(persist);
        assert(rv == 0);
    }
}
/* Submit the snapshot_put request for a snapshot this server is taking of its
 * own FSM. */
static int putSnapshot(struct legacyTakeSnapshot *req)
{
    struct raft *r = req->r;
    assert(!r->snapshot.installing);
    req->put.data = req;
    return r->io->snapshot_put(r->io, r->legacy.snapshot_trailing, &req->put,
                               &req->snapshot, takeSnapshotCb);
}
/* Decide whether it's time to take a new snapshot of the FSM. */
static bool legacyShouldTakeSnapshot(const struct raft *r)
{
    /* We currently support only synchronous FSMs, where entries are applied
     * synchronously as soon as we advance the commit index, so the two
     * values always match when we get here. */
    if (r->last_applied < r->commit_index) {
        return false;
    }
    /* If we are shutting down, let's not do anything. */
    if (r->legacy.closing) {
        return false;
    }
    /* If a snapshot is already in progress or we're installing a snapshot, we
     * don't want to start another one. */
    if (r->legacy.snapshot_taking || r->snapshot.installing) {
        return false;
    };
    /* If we didn't reach the threshold yet, do nothing. */
    if (r->commit_index - r->legacy.log->snapshot.last_index <
        r->legacy.snapshot_threshold) {
        return false;
    }
    /* If the last committed index is not anymore in our log, it means that the
     * log got truncated because we have received an InstallSnapshot
     * message. Don't take a snapshot now.*/
    if (logTermOf(r->legacy.log, r->commit_index) == 0) {
        return false;
    }
    return true;
}
/* Take a snapshot of the FSM at the current commit index and submit it to the
 * legacy raft_io implementation. Best-effort: failures are swallowed after
 * undoing any partial work. */
static void legacyTakeSnapshot(struct raft *r)
{
    struct raft_snapshot_metadata metadata;
    struct raft_snapshot *snapshot;
    struct legacyTakeSnapshot *req;
    int rv;
    /* We currently support only synchronous FSMs, where entries are applied
     * synchronously as soon as we advance the commit index, so the two
     * values always match when we get here. */
    assert(r->last_applied == r->commit_index);
    assert(!r->snapshot.installing);
    assert(r->legacy.snapshot_pending == NULL);
    tracef("take snapshot at %lld", r->commit_index);
    metadata.index = r->commit_index;
    metadata.term = logTermOf(r->legacy.log, r->commit_index);
    req = raft_malloc(sizeof *req);
    if (req == NULL) {
        goto abort;
    }
    req->r = r;
    /* Snapshot the committed configuration along with the FSM state. */
    rv =
        configurationCopy(&r->configuration_committed, &metadata.configuration);
    if (rv != 0) {
        goto abort_after_req_alloc;
    }
    metadata.configuration_index = r->configuration_committed_index;
    req->metadata = metadata;
    snapshot = &req->snapshot;
    snapshot->index = metadata.index;
    snapshot->term = metadata.term;
    snapshot->configuration = metadata.configuration;
    snapshot->configuration_index = metadata.configuration_index;
    snapshot->bufs = NULL;
    snapshot->n_bufs = 0;
    /* Ask the FSM for its state; newer FSMs may complete the snapshot via
     * snapshot_async. */
    rv = r->fsm->snapshot(r->fsm, &snapshot->bufs, &snapshot->n_bufs);
    if (rv == 0 && r->fsm->version >= 3 && r->fsm->snapshot_async != NULL) {
        rv = r->fsm->snapshot_async(r->fsm, &snapshot->bufs, &snapshot->n_bufs);
    }
    if (rv != 0) {
        ErrMsgTransferf(r->io->errmsg, r->errmsg, "load snapshot at %llu",
                        metadata.index);
        goto abort_after_conf_fetched;
    }
    /* putSnapshot will clean up config and buffers in case of error */
    rv = putSnapshot(req);
    if (rv != 0) {
        goto abort_after_snapshot;
    }
    r->legacy.snapshot_taking = true;
    return;
abort_after_snapshot:
    takeSnapshotClose(r, snapshot);
abort_after_conf_fetched:
    configurationClose(&metadata.configuration);
abort_after_req_alloc:
    raft_free(req);
abort:
    return;
}
/* Mark an apply request as failed with RAFT_LEADERSHIPLOST and queue its
 * completion. */
static void legacyFailApply(struct raft *r, struct raft_apply *req)
{
    if (req == NULL || req->cb == NULL) {
        return;
    }
    req->status = RAFT_LEADERSHIPLOST;
    req->result = NULL;
    QUEUE_PUSH(&r->legacy.requests, &req->queue);
}
/* Mark a barrier request as failed with RAFT_LEADERSHIPLOST and queue its
 * completion. */
static void legacyFailBarrier(struct raft *r, struct raft_barrier *req)
{
    if (req == NULL || req->cb == NULL) {
        return;
    }
    req->status = RAFT_LEADERSHIPLOST;
    QUEUE_PUSH(&r->legacy.requests, &req->queue);
}
/* Fail every outstanding client request (apply/barrier/change) with
 * RAFT_LEADERSHIPLOST, moving them to the completed-requests queue. Called
 * when leadership is lost or at shutdown. */
void LegacyFailPendingRequests(struct raft *r)
{
    /* Fail any promote request that is still outstanding because the server is
     * still catching up and no entry was submitted. */
    if (r->legacy.change != NULL) {
        struct raft_change *req = r->legacy.change;
        if (req != NULL && req->cb != NULL) {
            /* XXX: set the type here, since it's not done in client.c */
            req->type = RAFT_CHANGE;
            req->status = RAFT_LEADERSHIPLOST;
            QUEUE_PUSH(&r->legacy.requests, &req->queue);
        }
        r->legacy.change = NULL;
    }
    /* Fail all outstanding requests */
    while (!QUEUE_IS_EMPTY(&r->legacy.pending)) {
        struct request *req;
        queue *head;
        head = QUEUE_HEAD(&r->legacy.pending);
        QUEUE_REMOVE(head);
        req = QUEUE_DATA(head, struct request, queue);
        assert(req->type == RAFT_COMMAND || req->type == RAFT_BARRIER);
        switch (req->type) {
            case RAFT_COMMAND:
                legacyFailApply(r, (struct raft_apply *)req);
                break;
            case RAFT_BARRIER:
                legacyFailBarrier(r, (struct raft_barrier *)req);
                break;
        };
    }
}
/* Invoke the user callback of a completed apply request. */
static void legacyFireApply(struct raft_apply *req)
{
    req->cb(req, req->status, req->result);
}
/* Invoke the user callback of a completed barrier request. */
static void legacyFireBarrier(struct raft_barrier *req)
{
    req->cb(req, req->status);
}
/* Invoke the user callback of a completed configuration-change request. */
static void legacyFireChange(struct raft_change *req)
{
    req->cb(req, req->status);
}
/* Invoke the user callback of a completed leadership-transfer request. */
static void legacyFireTransfer(struct raft_transfer *req)
{
    req->cb(req);
}
/* Drain the completed-requests queue, firing each request's user callback
 * according to its type. */
void LegacyFireCompletedRequests(struct raft *r)
{
    while (!QUEUE_IS_EMPTY(&r->legacy.requests)) {
        struct request *req;
        queue *head;
        head = QUEUE_HEAD(&r->legacy.requests);
        QUEUE_REMOVE(head);
        req = QUEUE_DATA(head, struct request, queue);
        switch (req->type) {
            case RAFT_COMMAND:
                legacyFireApply((struct raft_apply *)req);
                break;
            case RAFT_BARRIER:
                legacyFireBarrier((struct raft_barrier *)req);
                break;
            case RAFT_CHANGE:
                legacyFireChange((struct raft_change *)req);
                break;
            case RAFT_TRANSFER_:
                legacyFireTransfer((struct raft_transfer *)req);
                break;
            default:
                tracef("unknown request type, shutdown.");
                assert(false);
                break;
        };
    }
}
/* Check whether a raft_change request has been completed, and put it in the
* completed requests queue if so. */
/* Check the progress of a pending raft_change promotion: if catch-up was
 * aborted, fail the request; if it finished, encode a new configuration with
 * the server promoted to voter into *entry and append a RAFT_SUBMIT event to
 * the events array (growing it via raft_realloc). */
static void legacyCheckChangeRequest(struct raft *r,
                                     struct raft_entry *entry,
                                     struct raft_event **events,
                                     unsigned *n_events)
{
    struct raft_change *change;
    int status;
    int rv;
    if (r->legacy.change == NULL) {
        return;
    }
    if (r->legacy.change->catch_up_id == 0) {
        return;
    }
    change = r->legacy.change;
    /* A raft_catch_up() call can fail only if the server is not the
     * leader or if the given ID is invalid. If the server was not the
     * leader then r->legacy.change would be NULL, and we know that the
     * ID is valid, otherwise the request couldn't have been submitted.
     */
    rv = raft_catch_up(r, r->legacy.change->catch_up_id, &status);
    assert(rv == 0);
    if (status == RAFT_CATCH_UP_ABORTED) {
        r->legacy.change = NULL;
        if (change->cb != NULL) {
            change->type = RAFT_CHANGE;
            change->status = RAFT_NOCONNECTION;
            QUEUE_PUSH(&r->legacy.requests, &change->queue);
        }
    }
    if (status == RAFT_CATCH_UP_FINISHED) {
        struct raft_configuration configuration;
        struct raft_server *server;
        struct raft_event *event;
        unsigned i;
        /* If we're transferring leadership, fail the request. */
        if (raft_transferee(r) != 0) {
            r->legacy.change = NULL;
            if (change->cb != NULL) {
                change->type = RAFT_CHANGE;
                change->status = RAFT_LEADERSHIPLOST;
                QUEUE_PUSH(&r->legacy.requests, &change->queue);
            }
            return;
        }
        i = configurationIndexOf(&r->configuration, change->catch_up_id);
        assert(i < r->configuration.n);
        server = &r->configuration.servers[i];
        assert(server->role != RAFT_VOTER);
        change->catch_up_id = 0;
        /* Update our current configuration. */
        rv = configurationCopy(&r->configuration, &configuration);
        assert(rv == 0);
        configuration.servers[i].role = RAFT_VOTER;
        entry->type = RAFT_CHANGE;
        entry->term = r->current_term;
        /* Encode the configuration. */
        rv = configurationEncode(&configuration, &entry->buf);
        assert(rv == 0);
        entry->batch = entry->buf.base;
        /* Grow the events array and append a submit event carrying the new
         * configuration entry. */
        *n_events += 1;
        *events = raft_realloc(*events, *n_events * sizeof **events);
        assert(*events != NULL);
        event = &(*events)[*n_events - 1];
        event->type = RAFT_SUBMIT;
        event->submit.entries = entry;
        event->submit.n = 1;
        configurationClose(&configuration);
    }
}
/* Get the request matching the given @index and @type, if any.
* The type check is skipped when @type == -1. */
/* Get the pending request matching the given @index and @type, removing it
 * from the pending queue. The type check is skipped when @type == -1.
 * Returns NULL if no pending request has that index. */
static struct request *legacyGetRequest(struct raft *r,
                                        const raft_index index,
                                        int type)
{
    queue *head;
    struct request *req;
    QUEUE_FOREACH (head, &r->legacy.pending) {
        req = QUEUE_DATA(head, struct request, queue);
        if (req->index == index) {
            if (type != -1) {
                assert(req->type == type);
            }
            QUEUE_REMOVE(&req->queue);
            return req;
        }
    }
    return NULL;
}
/* Apply a RAFT_COMMAND entry that has been committed. */
/* Apply a RAFT_COMMAND entry that has been committed: feed it to the FSM,
 * advance last_applied, and queue the completion of the matching apply
 * request (if any). */
static int applyCommand(struct raft *r,
                        const raft_index index,
                        const struct raft_buffer *buf)
{
    struct raft_apply *req;
    void *result;
    int rv;
    rv = r->fsm->apply(r->fsm, buf, &result);
    if (rv != 0) {
        return rv;
    }
    r->last_applied = index;
    req = (struct raft_apply *)legacyGetRequest(r, index, RAFT_COMMAND);
    if (req != NULL && req->cb != NULL) {
        req->status = 0;
        req->result = result;
        QUEUE_PUSH(&r->legacy.requests, &req->queue);
    }
    return 0;
}
/* Fire the callback of a barrier request whose entry has been committed. */
/* Complete the barrier request associated with a committed RAFT_BARRIER
 * entry, if any, and advance last_applied. */
static void applyBarrier(struct raft *r, const raft_index index)
{
    struct raft_barrier *req;
    r->last_applied = index;
    req = (struct raft_barrier *)legacyGetRequest(r, index, RAFT_BARRIER);
    if (req == NULL || req->cb == NULL) {
        return;
    }
    req->status = 0;
    QUEUE_PUSH(&r->legacy.requests, &req->queue);
}
/* Apply a RAFT_CHANGE entry that has been committed. */
/* Apply a RAFT_CHANGE entry that has been committed: advance last_applied
 * and, if we're the leader, queue the completion of the pending change
 * request. */
static void applyChange(struct raft *r, const raft_index index)
{
    struct raft_change *req;
    assert(index > 0);
    r->last_applied = index;
    if (r->state == RAFT_LEADER) {
        req = r->legacy.change;
        r->legacy.change = NULL;
        if (req != NULL && req->cb != NULL) {
            /* XXX: set the type here, since it's not done in client.c */
            req->type = RAFT_CHANGE;
            req->status = 0;
            QUEUE_PUSH(&r->legacy.requests, &req->queue);
        }
    }
}
/* Apply all committed-but-unapplied entries to the FSM, dispatching by entry
 * type. Committed RAFT_CHANGE entries additionally append a
 * RAFT_CONFIGURATION event to the events array. */
static int legacyApply(struct raft *r,
                       struct raft_event **events,
                       unsigned *n_events)
{
    raft_index index;
    struct raft_event *event;
    int rv = 0;
    assert(r->state == RAFT_LEADER || r->state == RAFT_FOLLOWER);
    assert(r->last_applied <= r->commit_index);
    if (r->last_applied == r->commit_index) {
        /* Nothing to do. */
        return 0;
    }
    for (index = r->last_applied + 1; index <= r->commit_index; index++) {
        const struct raft_entry *entry = logGet(r->legacy.log, index);
        if (entry == NULL) {
            /* This can happen while installing a snapshot */
            tracef("replicationApply - ENTRY NULL");
            return 0;
        }
        assert(entry->type == RAFT_COMMAND || entry->type == RAFT_BARRIER ||
               entry->type == RAFT_CHANGE);
        switch (entry->type) {
            case RAFT_COMMAND:
                rv = applyCommand(r, index, &entry->buf);
                break;
            case RAFT_BARRIER:
                applyBarrier(r, index);
                rv = 0;
                break;
            case RAFT_CHANGE:
                applyChange(r, index);
                /* Grow the events array and queue a configuration event for
                 * this committed configuration change. */
                *n_events += 1;
                *events = raft_realloc(*events, *n_events * sizeof **events);
                assert(*events != NULL);
                event = &(*events)[*n_events - 1];
                event->type = RAFT_CONFIGURATION;
                event->configuration.index = index;
                rv = configurationDecode(&entry->buf,
                                         &event->configuration.conf);
                break;
            default:
                rv = 0; /* For coverity. This case can't be taken. */
                break;
        }
        if (rv != 0) {
            break;
        }
    }
    return rv;
}
/* Finish the pending leadership-transfer request, queueing its user callback
 * for completion. */
void LegacyLeadershipTransferClose(struct raft *r)
{
    struct raft_transfer *req = r->transfer;
    /* Only assert raft_trasferee() if we're not closing, because the result is
     * effectively undefined in that case. */
    if (!r->legacy.closing) {
        assert(raft_transferee(r) == 0);
    }
    r->transfer = NULL;
    if (req->cb != NULL) {
        req->type = RAFT_TRANSFER_;
        QUEUE_PUSH(&r->legacy.requests, &req->queue);
    }
}
/* React to a state change reported by raft_step(): fail pending requests on
 * leadership loss and flush everything at shutdown. */
static void legacyHandleStateUpdate(struct raft *r)
{
    assert(r->legacy.prev_state != r->state);
    if (r->legacy.prev_state == RAFT_LEADER) {
        LegacyFailPendingRequests(r);
        assert(QUEUE_IS_EMPTY(&r->legacy.pending));
    }
    if (raft_state(r) == RAFT_LEADER) {
        assert(r->legacy.change == NULL);
    }
    if (r->legacy.closing) {
        if (r->transfer != NULL) {
            LegacyLeadershipTransferClose(r);
        }
        LegacyFailPendingRequests(r);
        LegacyFireCompletedRequests(r);
    }
    r->legacy.prev_state = r->state;
}
/* Whether the state_cb callback should be invoked. */
/* Whether the step_cb callback should be invoked after this step. */
static bool legacyShouldFireStepCb(struct raft *r)
{
    queue *head;
    struct request *req;
    if (r->legacy.step_cb == NULL) {
        return false;
    }
    /* Check if there's a client request in the completion queue which has
     * failed due to a RAFT_NOSPACE error. In that case we will not call the
     * step_cb just yet, because otherwise cowsql/dqlite would notice that
     * the leader has stepped down and immediately close all connections,
     * without a chance of properly returning the error to the client. */
    QUEUE_FOREACH (head, &r->legacy.requests) {
        req = QUEUE_DATA(head, struct request, queue);
        if (req->type == RAFT_COMMAND) {
            if (((struct raft_apply *)req)->status == RAFT_NOSPACE) {
                return false;
            }
        }
    }
    return true;
}
/* Handle a RAFT_UPDATE_COMMIT_INDEX update: restore the FSM from a freshly
 * persisted snapshot if the commit index caught up with it, then apply any
 * newly committed entries. */
static int legacyHandleUpdateCommitIndex(struct raft *r,
                                         struct raft_event **events,
                                         unsigned *n_events)
{
    raft_index commit_index = raft_commit_index(r);
    int rv;
    /* If the new commit index matches the index of a snapshot we have just
     * persisted, then restore the FSM state using its cached data. */
    if (commit_index != 0 && commit_index == r->legacy.snapshot_index) {
        /* From Figure 5.3:
         *
         *   8. Reset state machine using snapshot contents.
         */
        r->legacy.snapshot_index = 0;
        rv = r->fsm->restore(r->fsm, &r->legacy.snapshot_chunk);
        if (rv != 0) {
            tracef("restore snapshot: %s", errCodeToString(rv));
            return rv;
        }
        r->last_applied = commit_index;
    }
    rv = legacyApply(r, events, n_events);
    if (rv != 0) {
        return rv;
    }
    return 0;
}
/* Handle a single event, possibly adding more events. */
/* Handle a single event: run it through raft_step() and translate every flag
 * in the resulting raft_update into legacy raft_io operations. Handling may
 * append further events to the events array. */
static int legacyHandleEvent(struct raft *r,
                             struct raft_entry *entry,
                             struct raft_event **events,
                             unsigned *n_events,
                             unsigned i)
{
    struct raft_event *event;
    struct raft_update update;
    int rv;
    event = &(*events)[i];
    event->time = r->io->time(r->io);
    event->capacity = r->io->capacity;
    rv = raft_step(r, event, &update);
    if (rv != 0) {
        return rv;
    }
    if (update.flags & RAFT_UPDATE_STATE) {
        legacyHandleStateUpdate(r);
    }
    /* Check whether a raft_change request has been completed. */
    legacyCheckChangeRequest(r, entry, events, n_events);
    if (legacyShouldFireStepCb(r)) {
        r->legacy.step_cb(r);
    }
    if (legacyShouldTakeSnapshot(r)) {
        legacyTakeSnapshot(r);
    }
    /* If the current term was updated, persist it. */
    if (update.flags & RAFT_UPDATE_CURRENT_TERM) {
        rv = r->io->set_term(r->io, raft_current_term(r));
        if (rv != 0) {
            return rv;
        }
    }
    /* If the current vote was updated, persist it. */
    if (update.flags & RAFT_UPDATE_VOTED_FOR) {
        rv = r->io->set_vote(r->io, raft_voted_for(r));
        if (rv != 0) {
            return rv;
        }
    }
    if (update.flags & RAFT_UPDATE_ENTRIES) {
        rv = legacyHandleUpdateEntries(r, update.entries.index,
                                       update.entries.batch, update.entries.n);
        if (rv != 0) {
            return rv;
        }
    }
    if (update.flags & RAFT_UPDATE_SNAPSHOT) {
        rv = legacyHandleUpdateSnapshot(
            r, &update.snapshot.metadata, update.snapshot.offset,
            &update.snapshot.chunk, update.snapshot.last);
        if (rv != 0) {
            return rv;
        }
    }
    if (update.flags & RAFT_UPDATE_MESSAGES) {
        rv = legacyHandleUpdateMessages(r, update.messages.batch,
                                        update.messages.n);
        if (rv != 0) {
            return rv;
        }
    }
    if (update.flags & RAFT_UPDATE_COMMIT_INDEX) {
        rv = legacyHandleUpdateCommitIndex(r, events, n_events);
        if (rv != 0) {
            return rv;
        }
    }
    /* If there's a pending leadership transfer request, and no leadership
     * transfer is in progress, check if it has completed. */
    if (r->transfer != NULL && raft_transferee(r) == 0) {
        /* If we are leader it means that the request was aborted. If we are
         * follower we wait until we find a new leader. */
        if (raft_state(r) == RAFT_LEADER) {
            LegacyLeadershipTransferClose(r);
        } else if (raft_state(r) == RAFT_FOLLOWER) {
            raft_id leader_id;
            const char *leader_address;
            raft_leader(r, &leader_id, &leader_address);
            if (leader_id != 0) {
                LegacyLeadershipTransferClose(r);
            }
        }
    }
    return 0;
}
/* Entry point of the legacy compatibility layer: process the given event
 * (and any events its handling generates) through raft_step(), translating
 * each resulting update into legacy raft_io calls.
 *
 * Returns 0 on success, or the first non-zero error returned by event
 * handling. */
int LegacyForwardToRaftIo(struct raft *r, struct raft_event *event)
{
    struct raft_event *events;
    unsigned n_events;
    unsigned i;
    struct raft_entry entry; /* Used for actual promotion of RAFT_CHANGE reqs */
    /* Fix: initialize rv — the original left it uninitialized, so it was read
     * indeterminate when the loop broke on r->legacy.closing before any
     * iteration assigned it. */
    int rv = 0;
    assert(r->io != NULL);
    /* Initially the set of events contains only the event passed as
     * argument, but might grow if some further events get generated by the
     * handling code. */
    events = raft_malloc(sizeof *events);
    if (events == NULL) {
        return RAFT_NOMEM;
    }
    events[0] = *event;
    n_events = 1;
    for (i = 0; i < n_events; i++) {
        if (r->legacy.closing) {
            break;
        }
        rv = legacyHandleEvent(r, &entry, &events, &n_events, i);
        if (rv != 0) {
            break;
        }
    }
    raft_free(events);
    if (rv != 0) {
        return rv;
    }
    return 0;
}
/* Initialize a leadership-transfer request and register it on the server. */
static void legacyLeadershipTransferInit(struct raft *r,
                                         struct raft_transfer *req,
                                         raft_id id,
                                         raft_transfer_cb cb)
{
    assert(r->state == RAFT_LEADER);
    req->cb = cb;
    req->id = id;
    r->transfer = req;
}
/* Legacy client API: submit a command entry to be applied to the FSM. Only
 * single-buffer submissions are supported (n must be 1). The callback fires
 * once the entry is committed and applied. */
int raft_apply(struct raft *r,
               struct raft_apply *req,
               const struct raft_buffer bufs[],
               const unsigned n,
               raft_apply_cb cb)
{
    raft_index index;
    struct raft_event event;
    struct raft_entry entry;
    int rv;
    assert(r != NULL);
    assert(bufs != NULL);
    assert(n == 1);
    /* Index of the first entry being appended. */
    index = logLastIndex(r->legacy.log) + 1;
    req->type = RAFT_COMMAND;
    req->index = index;
    req->cb = cb;
    entry.type = RAFT_COMMAND;
    entry.term = r->current_term;
    entry.buf = bufs[0];
    entry.batch = entry.buf.base;
    event.time = r->io->time(r->io);
    event.type = RAFT_SUBMIT;
    event.submit.entries = &entry;
    event.submit.n = 1;
    rv = LegacyForwardToRaftIo(r, &event);
    if (rv != 0) {
        return rv;
    }
    /* Track the request until its entry gets committed and applied. */
    QUEUE_PUSH(&r->legacy.pending, &req->queue);
    return 0;
}
/* Legacy client API: submit a barrier entry. The callback fires once all
 * entries up to the barrier have been committed and applied. */
int raft_barrier(struct raft *r, struct raft_barrier *req, raft_barrier_cb cb)
{
    struct raft_event event;
    struct raft_entry entry;
    raft_index index;
    int rv;
    /* Index of the barrier entry being appended. */
    index = logLastIndex(r->legacy.log) + 1;
    req->type = RAFT_BARRIER;
    req->index = index;
    req->cb = cb;
    entry.type = RAFT_BARRIER;
    entry.term = r->current_term;
    /* Barrier entries carry an (unused) 8-byte payload. */
    entry.buf.len = 8;
    entry.buf.base = raft_malloc(entry.buf.len);
    if (entry.buf.base == NULL) {
        rv = RAFT_NOMEM;
        goto err;
    }
    entry.batch = entry.buf.base;
    event.time = r->io->time(r->io);
    event.type = RAFT_SUBMIT;
    event.submit.entries = &entry;
    event.submit.n = 1;
    rv = LegacyForwardToRaftIo(r, &event);
    if (rv != 0) {
        goto err_after_buf_alloc;
    }
    QUEUE_PUSH(&r->legacy.pending, &req->queue);
    return 0;
err_after_buf_alloc:
    raft_free(entry.buf.base);
err:
    assert(rv != 0);
    return rv;
}
/* Encode the given configuration into a RAFT_CHANGE entry and submit it.
 * Must be called while leader. */
static int clientChangeConfiguration(
    struct raft *r,
    const struct raft_configuration *configuration)
{
    struct raft_entry entry;
    struct raft_event event;
    int rv;
    assert(r->state == RAFT_LEADER);
    entry.type = RAFT_CHANGE;
    entry.term = r->current_term;
    /* Encode the configuration. */
    rv = configurationEncode(configuration, &entry.buf);
    if (rv != 0) {
        return rv;
    }
    entry.batch = entry.buf.base;
    event.time = r->io->time(r->io);
    event.type = RAFT_SUBMIT;
    event.submit.entries = &entry;
    event.submit.n = 1;
    rv = LegacyForwardToRaftIo(r, &event);
    if (rv != 0) {
        return rv;
    }
    return 0;
}
/* Legacy client API: add a new server (as a spare) to the cluster
 * configuration. The callback fires once the configuration change is
 * committed. */
int raft_add(struct raft *r,
             struct raft_change *req,
             raft_id id,
             const char *address,
             raft_change_cb cb)
{
    struct raft_configuration configuration;
    int rv;
    /* Make a copy of the current configuration, and add the new server to
     * it. */
    rv = configurationCopy(&r->configuration, &configuration);
    if (rv != 0) {
        goto err;
    }
    rv = raft_configuration_add(&configuration, id, address, RAFT_SPARE);
    if (rv != 0) {
        goto err_after_configuration_copy;
    }
    req->cb = cb;
    req->catch_up_id = 0;
    rv = clientChangeConfiguration(r, &configuration);
    if (rv != 0) {
        goto err_after_configuration_copy;
    }
    assert(r->legacy.change == NULL);
    r->legacy.change = req;
    raft_configuration_close(&configuration);
    return 0;
err_after_configuration_copy:
    raft_configuration_close(&configuration);
err:
    assert(rv != 0);
    return rv;
}
/* Submit a request to change the role of the server with the given @id to
 * @role.
 *
 * When promoting a server to voter whose log is not up-to-date, a catch-up
 * round is started first (RAFT_CATCH_UP event) and the configuration change
 * is submitted only after the server has caught up. In all other cases the
 * new configuration is submitted immediately.
 *
 * Returns 0 on success. On failure an error code is returned and no request
 * is left registered in r->legacy.change. */
int raft_assign(struct raft *r,
                struct raft_change *req,
                raft_id id,
                int role,
                raft_change_cb cb)
{
    const struct raft_server *server;
    struct raft_event event;
    raft_index match_index;
    int rv;

    /* Only a leader that is not currently transferring leadership may change
     * the configuration. */
    if (r->state != RAFT_LEADER || r->leader_state.transferee != 0) {
        rv = RAFT_NOTLEADER;
        goto err;
    }

    if (role != RAFT_STANDBY && role != RAFT_VOTER && role != RAFT_SPARE) {
        rv = RAFT_BADROLE;
        ErrMsgFromCode(r->errmsg, rv);
        goto err;
    }

    rv = membershipCanChangeConfiguration(r);
    if (rv != 0) {
        goto err;
    }

    server = configurationGet(&r->configuration, id);
    if (server == NULL) {
        rv = RAFT_NOTFOUND;
        ErrMsgPrintf(r->errmsg, "no server has ID %llu", id);
        goto err;
    }

    /* Check if we have already the desired role. */
    if (server->role == role) {
        const char *name;
        rv = RAFT_BADROLE;
        switch (role) {
            case RAFT_VOTER:
                name = "voter";
                break;
            case RAFT_STANDBY:
                name = "stand-by";
                break;
            case RAFT_SPARE:
                name = "spare";
                break;
            default:
                name = NULL;
                assert(0);
                break;
        }
        ErrMsgPrintf(r->errmsg, "server is already %s", name);
        goto err;
    }

    rv = raft_match_index(r, id, &match_index);
    assert(rv == 0);

    req->cb = cb;
    req->catch_up_id = 0;

    assert(r->legacy.change == NULL);
    r->legacy.change = req;

    /* If we are not promoting to the voter role or if the log of this
     * server is already up-to-date, we can submit the configuration change
     * immediately. */
    if (role != RAFT_VOTER || match_index == raft_last_index(r)) {
        unsigned server_index = configurationIndexOf(&r->configuration, id);
        int old_role = r->configuration.servers[server_index].role;
        r->configuration.servers[server_index].role = role;
        rv = clientChangeConfiguration(r, &r->configuration);
        if (rv != 0) {
            /* Roll back the in-memory role change. */
            r->configuration.servers[server_index].role = old_role;
            goto err_after_register;
        }
        return 0;
    }

    /* Ask the I/O layer to start catching up the server. */
    event.time = r->now;
    event.type = RAFT_CATCH_UP;
    event.catch_up.server_id = server->id;
    rv = LegacyForwardToRaftIo(r, &event);
    if (rv != 0) {
        goto err_after_register;
    }
    req->catch_up_id = server->id;

    return 0;

err_after_register:
    /* Bug fix: don't leave r->legacy.change pointing at a request that was
     * never actually submitted. raft_add() and raft_remove() register the
     * request only on success; do the same here by unregistering it on
     * failure. */
    r->legacy.change = NULL;
err:
    assert(rv != 0);
    return rv;
}
/* Submit a request to remove the server with the given @id from the
 * cluster. */
int raft_remove(struct raft *r,
                struct raft_change *req,
                raft_id id,
                raft_change_cb cb)
{
    struct raft_configuration copy;
    int rv;

    /* Clone the current configuration and drop the server from the clone. */
    rv = configurationCopy(&r->configuration, &copy);
    if (rv != 0) {
        goto err;
    }
    rv = configurationRemove(&copy, id);
    if (rv != 0) {
        goto err_after_copy;
    }

    req->cb = cb;
    req->catch_up_id = 0;

    rv = clientChangeConfiguration(r, &copy);
    if (rv != 0) {
        goto err_after_copy;
    }

    /* Register the in-flight membership change; only one may be pending. */
    assert(r->legacy.change == NULL);
    r->legacy.change = req;

    raft_configuration_close(&copy);

    return 0;

err_after_copy:
    raft_configuration_close(&copy);
err:
    assert(rv != 0);
    return rv;
}
/* Submit a request to transfer leadership to the server with the given @id,
 * invoking @cb once the transfer completes or fails. */
int raft_transfer(struct raft *r,
                  struct raft_transfer *req,
                  raft_id id,
                  raft_transfer_cb cb)
{
    struct raft_event event;
    int rv;

    event.type = RAFT_TRANSFER;
    event.time = r->io->time(r->io);
    event.transfer.server_id = id;

    rv = LegacyForwardToRaftIo(r, &event);
    if (rv != 0) {
        assert(rv != 0);
        return rv;
    }

    /* The state machine must now have picked a transfer target. */
    assert(raft_transferee(r) != 0);
    legacyLeadershipTransferInit(r, req, raft_transferee(r), cb);

    return 0;
}
/* Bootstrap the on-disk state with the given initial configuration, by
 * delegating to the raft_io backend. */
int raft_bootstrap(struct raft *r, const struct raft_configuration *conf)
{
    return r->io->bootstrap(r->io, conf);
}
/* Force a new configuration onto the on-disk state to recover a cluster that
 * lost quorum, by delegating to the raft_io backend. */
int raft_recover(struct raft *r, const struct raft_configuration *conf)
{
    return r->io->recover(r->io, conf);
}
/* raft_io tick callback: forward a RAFT_TIMEOUT event to the state
 * machine. */
static void tickCb(struct raft_io *io)
{
    struct raft *r = io->data;
    struct raft_event event;
    int rv;

    event.type = RAFT_TIMEOUT;
    event.time = r->io->time(io);

    rv = LegacyForwardToRaftIo(r, &event);
    assert(rv == 0); /* TODO: just log warning? */
}
/* raft_io receive callback: forward an incoming RPC message to the raft state
 * machine as a RAFT_RECEIVE event.
 *
 * If the instance is shutting down the message is dropped, but the payload
 * memory attached to it (entry batches, snapshot configuration and data) is
 * still released here to avoid leaking it. */
static void recvCb(struct raft_io *io, struct raft_message *message)
{
    struct raft *r = io->data;
    struct raft_event event;
    int rv;
    if (r->legacy.closing) {
        /* Drop the message, freeing any payload it carries. */
        switch (message->type) {
            case RAFT_APPEND_ENTRIES:
                entryBatchesDestroy(message->append_entries.entries,
                                    message->append_entries.n_entries);
                break;
            case RAFT_INSTALL_SNAPSHOT:
                raft_configuration_close(&message->install_snapshot.conf);
                raft_free(message->install_snapshot.data.base);
                break;
            default:
                break;
        }
        return;
    }
    event.type = RAFT_RECEIVE;
    event.time = r->io->time(r->io);
    event.receive.message = message;
    rv = LegacyForwardToRaftIo(r, &event);
    /* For AppendEntries, the entries array itself is released here after the
     * event has been processed. On failure the entry data is released too;
     * NOTE(review): only entries[0].batch is freed, which assumes all entries
     * of the message share a single batch -- confirm against the decoder. */
    switch (message->type) {
        case RAFT_APPEND_ENTRIES:
            if (message->append_entries.n_entries > 0) {
                if (rv != 0) {
                    raft_free(message->append_entries.entries[0].batch);
                }
                raft_free(message->append_entries.entries);
            }
            break;
        default:
            break;
    }
    assert(rv == 0); /* TODO: just log warning? */
}
/* Start the raft instance: load persisted state, restore the last snapshot
 * (if any) into the FSM, rebuild the in-memory log cache, forward a
 * RAFT_START event to the state machine, and finally start the raft_io
 * backend (tick and receive callbacks). Returns 0 on success or an error
 * code. */
int raft_start(struct raft *r)
{
    struct raft_snapshot *snapshot;
    struct raft_snapshot_metadata metadata;
    raft_term term;
    raft_id voted_for;
    raft_index start_index;
    struct raft_entry *entries;
    size_t n_entries;
    struct raft_event event;
    raft_index snapshot_index = 0;
    raft_term snapshot_term = 0;
    unsigned i;
    int rv;
    /* The instance must be fully configured and not started yet. */
    assert(r != NULL);
    assert(r->heartbeat_timeout != 0);
    assert(r->heartbeat_timeout < r->election_timeout);
    assert(r->install_snapshot_timeout != 0);
    assert(logNumEntries(r->legacy.log) == 0);
    assert(logSnapshotIndex(r->legacy.log) == 0);
    assert(r->last_stored == 0);
    tracef("starting");
    /* Synchronously load all persisted state. */
    rv = r->io->load(r->io, &term, &voted_for, &snapshot, &start_index,
                     &entries, &n_entries);
    if (rv != 0) {
        ErrMsgTransfer(r->io->errmsg, r->errmsg, "io");
        return rv;
    }
    assert(start_index >= 1);
    tracef("current_term:%llu voted_for:%llu start_index:%llu n_entries:%zu",
           term, voted_for, start_index, n_entries);
    /* If we have a snapshot, let's restore it. */
    if (snapshot != NULL) {
        tracef("restore snapshot with last index %llu and last term %llu",
               snapshot->index, snapshot->term);
        /* Save the snapshot data in the cache, it will be used by legacy
         * compat code to avoid loading the snapshot asynchronously. */
        rv = r->fsm->restore(r->fsm, &snapshot->bufs[0]);
        if (rv != 0) {
            tracef("restore snapshot %llu: %s", snapshot->index,
                   errCodeToString(rv));
            snapshotDestroy(snapshot);
            entryBatchesDestroy(entries, n_entries);
            return rv;
        }
        r->last_applied = snapshot->index;
        snapshot_index = snapshot->index;
        snapshot_term = snapshot->term;
    } else if (n_entries > 1) {
        /* NOTE(review): with no snapshot and more than one entry, the first
         * entry is treated as already applied -- presumably the initial
         * configuration entry; confirm. */
        r->last_applied = 1;
    }
    /* Rebuild the in-memory log cache from the loaded entries. */
    logStart(r->legacy.log, snapshot_index, snapshot_term, start_index);
    for (i = 0; i < n_entries; i++) {
        struct raft_entry *entry = &entries[i];
        rv = logAppend(r->legacy.log, entry->term, entry->type, &entry->buf,
                       entry->batch);
        if (rv != 0) {
            /* NOTE(review): this path leaks `entries`, its batches and
             * `snapshot` -- consider routing it through cleanup. */
            return rv;
        }
    }
    event.time = r->now;
    event.type = RAFT_START;
    event.start.term = term;
    event.start.voted_for = voted_for;
    event.start.metadata = NULL;
    if (snapshot != NULL) {
        /* metadata stays on the stack; the event is consumed synchronously by
         * the call below. */
        metadata.index = snapshot->index;
        metadata.term = snapshot->term;
        metadata.configuration = snapshot->configuration;
        metadata.configuration_index = snapshot->configuration_index;
        event.start.metadata = &metadata;
    }
    event.start.start_index = start_index;
    event.start.entries = entries;
    event.start.n_entries = (unsigned)n_entries;
    /* NOTE(review): the return value of this call is ignored, unlike all the
     * other call sites in this file -- confirm whether that is intended. */
    LegacyForwardToRaftIo(r, &event);
    if (entries != NULL) {
        raft_free(entries);
    }
    /* Start the I/O backend. The tickCb function is expected to fire every
     * r->heartbeat_timeout milliseconds and recvCb whenever an RPC is
     * received. */
    rv = r->io->start(r->io, r->heartbeat_timeout, tickCb, recvCb);
    if (rv != 0) {
        tracef("io start failed %d", rv);
        goto out;
    }
out:
    if (snapshot != NULL) {
        raft_free(snapshot->bufs);
        raft_free(snapshot);
    }
    if (rv != 0) {
        return rv;
    }
    return 0;
}
/* Return the index of the last log entry that was applied to the FSM. */
raft_index raft_last_applied(struct raft *r)
{
    return r->last_applied;
}
/* Set the number of applied entries after which a new snapshot is taken. */
void raft_set_snapshot_threshold(struct raft *r, unsigned n)
{
    r->legacy.snapshot_threshold = n;
}
/* Set the number of log entries to keep in memory after a snapshot. */
void raft_set_snapshot_trailing(struct raft *r, unsigned n)
{
    r->legacy.snapshot_trailing = n;
}
#undef tracef
raft-0.22.1/src/legacy.h 0000664 0000000 0000000 00000001147 14601504142 0014734 0 ustar 00root root 0000000 0000000 /* Compatibility layer between v1.x and v0.x. */
#ifndef RAFT_LEGACY_H_
#define RAFT_LEGACY_H_

#include "../include/raft.h"

/* Pass the given event to raft_step() and execute the resulting tasks using the
 * legacy raft_io interface. */
int LegacyForwardToRaftIo(struct raft *r, struct raft_event *event);

/* Fail all pending client requests with RAFT_LEADERSHIPLOST. */
void LegacyFailPendingRequests(struct raft *r);

/* Fire the callbacks of all completed client requests. */
void LegacyFireCompletedRequests(struct raft *r);

/* NOTE(review): presumably tears down the state of an in-flight leadership
 * transfer -- confirm against the implementation in legacy.c. */
void LegacyLeadershipTransferClose(struct raft *r);

#endif /* RAFT_LEGACY_H_ */
raft-0.22.1/src/log.c 0000664 0000000 0000000 00000062761 14601504142 0014255 0 ustar 00root root 0000000 0000000 #include "log.h"
#include <string.h>

#include "../include/raft.h"
#include "assert.h"
#include "configuration.h"
/* Calculate the reference count hash table key for the given log entry index in
 * an hash table of the given size.
 *
 * The hash is simply the log entry index minus one modulo the size. This
 * minimizes conflicts in the most frequent case, where a new log entry is
 * simply appended to the log and can use the hash table bucket next to the
 * bucket for the entry with the previous index (possibly resizing the table if
 * its cap is reached). */
static size_t refsKey(const raft_index index, const size_t size)
{
    /* Log indexes are 1-based, so 0 is never a valid input. */
    assert(index > 0);
    assert(size > 0);
    return (size_t)((index - 1) % size);
}
/* Try to insert a new reference count item for the given log entry index into
 * the given reference count hash table.
 *
 * A collision happens when the bucket associated with the hash key of the given
 * log entry index is already used to refcount log entries with a different
 * index. In that case the collision output parameter will be set to true and no
 * new reference count item is inserted into the hash table.
 *
 * If two log entries have the same index but different terms, the associated
 * bucket will be grown accordingly.
 *
 * Returns 0 on success (including the collision case -- check *collision),
 * RAFT_BUSY if a slot with the same index and term already exists, or
 * RAFT_NOMEM if allocating an overflow slot fails. */
static int refsTryInsert(struct raft_entry_ref *table,
                         const size_t size,
                         const raft_term term,
                         const raft_index index,
                         const unsigned short count,
                         bool *collision)
{
    struct raft_entry_ref *bucket; /* Bucket associated with this index. */
    struct raft_entry_ref *next_slot; /* For traversing the bucket slots. */
    struct raft_entry_ref *last_slot; /* To track the last traversed slot. */
    struct raft_entry_ref *slot; /* Actual slot to use for this entry. */
    size_t key;
    assert(table != NULL);
    assert(size > 0);
    assert(term > 0);
    assert(index > 0);
    assert(count > 0);
    assert(collision != NULL);
    /* Calculate the hash table key for the given index. */
    key = refsKey(index, size);
    bucket = &table[key];
    /* If a bucket is empty, then there's no collision and we can fill its first
     * slot. */
    if (bucket->count == 0) {
        assert(bucket->next == NULL);
        slot = bucket;
        goto fill;
    }
    /* If the bucket is already used to refcount entries with a different
     * index, then we have a collision and we must abort here. */
    if (bucket->index != index) {
        *collision = true;
        return 0;
    }
    /* If we get here it means that the bucket is in use to refcount one or more
     * entries with the same index as the given one, but different terms.
     *
     * We must append a newly allocated slot to refcount the entry with this
     * term.
     *
     * So first let's find the last slot in the bucket. */
    for (next_slot = bucket; next_slot != NULL; next_slot = next_slot->next) {
        /* All entries in a bucket must have the same index. */
        assert(next_slot->index == index);
        /* It might happen that two entries with the same index and term get
         * appended. For example if the same entry was truncated by a leader
         * because it failed to be written to disk, but it is then received
         * again from a new leader, while the old leader's reference count for
         * that entry hasn't dropped to zero yet (e.g. it's being sent). In that
         * case we return an error, and let the leader retry the AppendEntries,
         * until it eventually succeed when all references to the old entry are
         * gone. */
        if (next_slot->term == term) {
            return RAFT_BUSY;
        }
        last_slot = next_slot;
    }
    /* The last slot must have no next slot. */
    assert(last_slot->next == NULL);
    /* Overflow slots (same index, different term) are heap-allocated; only
     * the first slot of each bucket lives in the table array itself. */
    slot = raft_malloc(sizeof *slot);
    if (slot == NULL) {
        return RAFT_NOMEM;
    }
    last_slot->next = slot;
fill:
    slot->term = term;
    slot->index = index;
    slot->count = count;
    slot->next = NULL;
    *collision = false;
    return 0;
}
/* Move the slots of the given bucket into the given reference count hash
 * table. The key of the bucket to use in the given table will be re-calculated
 * according to the given size.
 *
 * Returns 0 on success, or an error code propagated from refsTryInsert(). */
static int refsMove(struct raft_entry_ref *bucket,
                    struct raft_entry_ref *table,
                    const size_t size)
{
    struct raft_entry_ref *slot;
    struct raft_entry_ref *next_slot;
    assert(bucket != NULL);
    assert(table != NULL);
    assert(size > 0);
    /* Only non-empty buckets should be moved. */
    assert(bucket->count > 0);
    /* For each slot in the bucket, insert the relevant entry in the given
     * table, then free it. */
    next_slot = bucket;
    while (next_slot != NULL) {
        bool collision;
        int rv;
        slot = next_slot;
        /* Insert the reference count for this entry into the new table. */
        rv = refsTryInsert(table, size, slot->term, slot->index, slot->count,
                           &collision);
        /* Save the next pointer before the slot is freed below. */
        next_slot = slot->next;
        /* Unless this is the very first slot in the bucket, we need to free the
         * slot. */
        if (slot != bucket) {
            raft_free(slot);
        }
        if (rv != 0) {
            return rv;
        }
        /* The given hash table is assumed to be large enough to hold all ref
         * counts without any conflict. */
        assert(!collision);
    };
    return 0;
}
/* Grow the size of the reference count hash table.
 *
 * Doubles the table size, re-keys every existing bucket into a freshly
 * allocated table and frees the old one. Returns 0 on success or RAFT_NOMEM. */
static int refsGrow(struct raft_log *l)
{
    struct raft_entry_ref *table; /* New hash table. */
    size_t size; /* Size of the new hash table. */
    size_t i;
    assert(l != NULL);
    assert(l->refs_size > 0);
    size = l->refs_size * 2; /* Double the table size */
    table = raft_calloc(size, sizeof *table);
    if (table == NULL) {
        return RAFT_NOMEM;
    }
    /* Populate the new hash table, inserting all entries existing in the
     * current hash table. Each bucket will have a different key in the new hash
     * table, since the size has changed. */
    for (i = 0; i < l->refs_size; i++) {
        struct raft_entry_ref *bucket = &l->refs[i];
        if (bucket->count > 0) {
            int rv = refsMove(bucket, table, size);
            if (rv != 0) {
                /* NOTE(review): `table` (and the slots already moved into it)
                 * is leaked on this path -- consider a cleanup routine. */
                return rv;
            }
        } else {
            /* If the count is zero, we expect that the bucket is unused. */
            assert(bucket->next == NULL);
        }
    }
    raft_free(l->refs);
    l->refs = table;
    l->refs_size = size;
    return 0;
}
/* Initialize the reference count of the entry with the given index, setting it
 * to 1.
 *
 * Lazily allocates the hash table on first use. Returns 0 on success,
 * RAFT_NOMEM on allocation failure or when growing the table 10 times was not
 * enough, or RAFT_BUSY propagated from refsTryInsert(). */
static int refsInit(struct raft_log *l,
                    const raft_term term,
                    const raft_index index)
{
    int i;
    assert(l != NULL);
    assert(term > 0);
    assert(index > 0);
    /* Initialize the hash map with a reasonable size */
    if (l->refs == NULL) {
        l->refs_size = LOG__REFS_INITIAL_SIZE;
        l->refs = raft_calloc(l->refs_size, sizeof *l->refs);
        if (l->refs == NULL) {
            return RAFT_NOMEM;
        }
    }
    /* Check if the bucket associated with the given index is available
     * (i.e. there are no collisions), or grow the table and re-key it
     * otherwise.
     *
     * We limit the number of times we try to grow the table to 10, to avoid
     * eating up too much memory. In practice, there should never be a case
     * where this is not enough. */
    for (i = 0; i < 10; i++) {
        bool collision;
        int rc;
        rc = refsTryInsert(l->refs, l->refs_size, term, index, 1, &collision);
        if (rc != 0) {
            return rc;
        }
        if (!collision) {
            return 0;
        }
        rc = refsGrow(l);
        if (rc != 0) {
            return rc;
        }
    };
    return RAFT_NOMEM;
}
/* Increment the refcount of the entry with the given term and index.
 *
 * The slot must have been created by a previous refsInit() call; the lookup
 * loop asserts this invariant rather than handling a miss. */
static void refsIncr(struct raft_log *l,
                     const raft_term term,
                     const raft_index index)
{
    size_t key; /* Hash table key for the given index. */
    struct raft_entry_ref *slot; /* Slot for the given term/index */
    assert(l != NULL);
    assert(term > 0);
    assert(index > 0);
    key = refsKey(index, l->refs_size);
    /* Lookup the slot associated with the given term/index, which must have
     * been previously inserted. */
    slot = &l->refs[key];
    while (1) {
        assert(slot != NULL);
        assert(slot->index == index);
        if (slot->term == term) {
            break;
        }
        slot = slot->next;
    }
    assert(slot != NULL);
    slot->count++;
}
/* Decrement the refcount of the entry with the given index. Return a boolean
 * indicating whether the entry has now zero references.
 *
 * When the count drops to zero the slot is removed from the bucket. Since the
 * first slot of each bucket is embedded in the table array, removing it is
 * done by copying the second slot over it and freeing the second slot. */
static bool refsDecr(struct raft_log *l,
                     const raft_term term,
                     const raft_index index)
{
    size_t key; /* Hash table key for the given index. */
    struct raft_entry_ref *slot; /* Slot for the given term/index */
    struct raft_entry_ref *prev_slot; /* Slot preceeding the one to decrement */
    assert(l != NULL);
    assert(term > 0);
    assert(index > 0);
    key = refsKey(index, l->refs_size);
    prev_slot = NULL;
    /* Lookup the slot associated with the given term/index, keeping track of
     * its previous slot in the bucket list. */
    slot = &l->refs[key];
    while (1) {
        assert(slot != NULL);
        assert(slot->index == index);
        if (slot->term == term) {
            break;
        }
        prev_slot = slot;
        slot = slot->next;
    }
    slot->count--;
    if (slot->count > 0) {
        /* The entry is still referenced. */
        return false;
    }
    /* If the refcount has dropped to zero, delete the slot. */
    if (prev_slot != NULL) {
        /* This isn't the very first slot, simply unlink it from the slot
         * list. */
        prev_slot->next = slot->next;
        raft_free(slot);
    } else if (slot->next != NULL) {
        /* This is the very first slot, and slot list is not empty. Copy the
         * second slot into the first one, then delete it. */
        struct raft_entry_ref *second_slot = slot->next;
        *slot = *second_slot;
        raft_free(second_slot);
    }
    return true;
}
/* Allocate and initialize an empty in-memory log. Returns NULL on out of
 * memory. */
struct raft_log *logInit(void)
{
    struct raft_log *l = raft_malloc(sizeof *l);
    if (l == NULL) {
        return NULL;
    }

    /* Empty circular buffer and empty refcount table. */
    l->entries = NULL;
    l->size = 0;
    l->front = 0;
    l->back = 0;
    l->offset = 0;
    l->refs = NULL;
    l->refs_size = 0;
    l->snapshot.last_index = 0;
    l->snapshot.last_term = 0;

    return l;
}
/* Return the log index of the i'th entry in the log (0-based position,
 * 1-based index). */
static raft_index indexAt(struct raft_log *l, size_t i)
{
    return l->offset + i + 1;
}
/* Return the circular buffer position of the i'th entry in the log. */
static size_t positionAt(struct raft_log *l, size_t i)
{
    return (l->front + i) % l->size;
}
/* Return a pointer to the i'th entry in the log (0-based position). */
static struct raft_entry *entryAt(struct raft_log *l, size_t i)
{
    return &l->entries[positionAt(l, i)];
}
/* Release all memory used by the given log object: the payload of every live
 * entry (or its shared batch, freed once), the entries array, the refcount
 * table and the log object itself. Requires that the log itself holds the only
 * remaining reference to each entry. */
void logClose(struct raft_log *l)
{
    void *batch = NULL; /* Last batch that has been freed */
    assert(l != NULL);
    if (l->entries != NULL) {
        size_t i;
        size_t n = logNumEntries(l);
        for (i = 0; i < n; i++) {
            struct raft_entry *entry = entryAt(l, i);
            raft_index index = indexAt(l, i);
            size_t key = refsKey(index, l->refs_size);
            struct raft_entry_ref *slot = &l->refs[key];
            /* We require that there are no outstanding references to active
             * entries. */
            assert(slot->count == 1);
            /* TODO: we should support the case where the bucket has more than
             * one slot. */
            assert(slot->next == NULL);
            /* Release the memory used by the entry data (either directly or via
             * a batch). */
            if (entry->batch == NULL) {
                if (entry->buf.base != NULL) {
                    raft_free(entry->buf.base);
                }
            } else {
                if (entry->batch != batch) {
                    /* This batch was not released yet, so let's do it now. */
                    batch = entry->batch;
                    raft_free(entry->batch);
                }
            }
        }
        raft_free(l->entries);
    }
    if (l->refs != NULL) {
        raft_free(l->refs);
    }
    raft_free(l);
}
/* Set the starting state of an empty log: snapshot coordinates (zero if there
 * is no snapshot) and the index preceding the first entry that will be
 * appended. Must be called before any entry is added. */
void logStart(struct raft_log *l,
              raft_index snapshot_index,
              raft_term snapshot_term,
              raft_index start_index)
{
    assert(logNumEntries(l) == 0);
    assert(start_index > 0);
    assert(start_index <= snapshot_index + 1);
    assert(snapshot_index == 0 || snapshot_term != 0);
    l->snapshot.last_index = snapshot_index;
    l->snapshot.last_term = snapshot_term;
    /* Entries are 1-based: the first entry will have index offset + 1. */
    l->offset = start_index - 1;
}
/* Ensure that the entries array has enough free slots for adding a new entry.
 *
 * One slot is always kept unused: if the buffer could fill up completely,
 * front == back would be ambiguous between "empty" and "full" in
 * logNumEntries(). Returns 0 on success or RAFT_NOMEM. */
static int ensureCapacity(struct raft_log *l)
{
    struct raft_entry *entries; /* New entries array */
    size_t n; /* Current number of entries */
    size_t size; /* Size of the new array */
    size_t i;
    n = logNumEntries(l);
    if (n + 1 < l->size) {
        return 0;
    }
    /* Make the new size twice the current size plus one (for the new
     * entry). Over-allocating now avoids smaller allocations later. */
    size = (l->size + 1) * 2;
    entries = raft_calloc(size, sizeof *entries);
    if (entries == NULL) {
        return RAFT_NOMEM;
    }
    /* Copy all active old entries to the beginning of the newly allocated
     * array. */
    for (i = 0; i < n; i++) {
        memcpy(&entries[i], entryAt(l, i), sizeof *entries);
    }
    /* Release the old entries array. */
    if (l->entries != NULL) {
        raft_free(l->entries);
    }
    l->entries = entries;
    l->size = size;
    /* The copy un-wrapped the circular buffer. */
    l->front = 0;
    l->back = n;
    return 0;
}
/* Append a new entry to the log, taking ownership of @buf (and, if non-NULL,
 * sharing @batch with other entries of the same batch). The entry's refcount
 * is initialized to 1. Returns 0 on success, RAFT_NOMEM or RAFT_BUSY. */
int logAppend(struct raft_log *l,
              const raft_term term,
              const enum raft_entry_type type,
              const struct raft_buffer *buf,
              void *batch)
{
    int rv;
    struct raft_entry *entry;
    raft_index index;
    assert(l != NULL);
    assert(term > 0);
    assert(type == RAFT_CHANGE || type == RAFT_BARRIER || type == RAFT_COMMAND);
    assert(buf != NULL);
    rv = ensureCapacity(l);
    if (rv != 0) {
        assert(rv == RAFT_NOMEM);
        goto err;
    }
    index = logLastIndex(l) + 1;
    /* Create the refcount slot for the new entry, set to 1. */
    rv = refsInit(l, term, index);
    if (rv != 0) {
        assert(rv == RAFT_NOMEM || rv == RAFT_BUSY);
        goto err;
    }
    entry = &l->entries[l->back];
    entry->term = term;
    entry->type = type;
    entry->buf = *buf;
    entry->batch = batch;
    /* Advance the back of the circular buffer, wrapping around if needed. */
    l->back += 1;
    l->back = l->back % l->size;
    return 0;
err:
    assert(rv == RAFT_NOMEM || rv == RAFT_BUSY);
    return rv;
}
/* Append @n RAFT_COMMAND entries with the given @term, one per buffer. Stops
 * and returns the error of the first failing append, leaving the previously
 * appended entries in place. */
int logAppendCommands(struct raft_log *l,
                      const raft_term term,
                      const struct raft_buffer bufs[],
                      const unsigned n)
{
    unsigned i;
    int rv;

    assert(l != NULL);
    assert(term > 0);
    assert(bufs != NULL);
    assert(n > 0);

    for (i = 0; i < n; i++) {
        rv = logAppend(l, term, RAFT_COMMAND, &bufs[i], NULL);
        if (rv != 0) {
            return rv;
        }
    }

    return 0;
}
/* Encode @configuration and append it to the log as a RAFT_CHANGE entry with
 * the given @term. On failure the encoded buffer is released. */
int logAppendConfiguration(struct raft_log *l,
                           const raft_term term,
                           const struct raft_configuration *configuration)
{
    struct raft_buffer buf;
    int rv;

    assert(l != NULL);
    assert(term > 0);
    assert(configuration != NULL);

    /* Serialize the configuration. */
    rv = configurationEncode(configuration, &buf);
    if (rv != 0) {
        assert(rv != 0);
        return rv;
    }

    /* Hand ownership of the encoded buffer over to the log. */
    rv = logAppend(l, term, RAFT_CHANGE, &buf, NULL);
    if (rv != 0) {
        raft_free(buf.base);
        assert(rv != 0);
        return rv;
    }

    return 0;
}
/* Return how many entries the log currently holds. */
size_t logNumEntries(struct raft_log *l)
{
    assert(l != NULL);
    if (l->front <= l->back) {
        /* Not wrapped: the used slots form one contiguous range. */
        return l->back - l->front;
    }
    /* Wrapped: the ranges [front, size) and [0, back) are both in use. */
    return l->size - l->front + l->back;
}
/* Return the index of the last entry in the log, or the snapshot's last index
 * if the log is empty but a snapshot exists, or 0 if the log is empty and
 * there is no snapshot. */
raft_index logLastIndex(struct raft_log *l)
{
    /* If there are no entries in the log, but there is a snapshot available
     * check that it's last index is consistent with the offset. */
    if (logNumEntries(l) == 0 && l->snapshot.last_index != 0) {
        assert(l->offset <= l->snapshot.last_index);
    }
    return l->offset + logNumEntries(l);
}
/* Return the position of the entry with the given index in the entries array.
 *
 * If no entry with the given index is in the log return the size of the entries
 * array (l->size acts as the "not found" sentinel). */
static size_t locateEntry(struct raft_log *l, const raft_index index)
{
    size_t n = logNumEntries(l);
    /* Out of the range currently covered by the log. */
    if (n == 0 || index < indexAt(l, 0) || index > indexAt(l, n - 1)) {
        return l->size;
    }
    /* Get the circular buffer position of the desired entry. Log indexes start
     * at 1, so we subtract one to get array indexes. We also need to subtract
     * any index offset this log might start at. */
    return positionAt(l, (size_t)((index - 1) - l->offset));
}
/* Return the term of the entry with the given index, or 0 if the index is
 * newer than the last entry or older than the most recent snapshot's last
 * index (the oldest index whose term is still known). */
raft_term logTermOf(struct raft_log *l, const raft_index index)
{
    size_t i;
    assert(index > 0);
    assert(l->offset <= l->snapshot.last_index);
    if ((index < l->offset + 1 && index != l->snapshot.last_index) ||
        index > logLastIndex(l)) {
        return 0;
    }
    if (index == l->snapshot.last_index) {
        assert(l->snapshot.last_term != 0);
        /* Coherence check that if we still have the entry at last_index, its
         * term matches the one in the snapshot. */
        i = locateEntry(l, index);
        if (i != l->size) {
            assert(l->entries[i].term == l->snapshot.last_term);
        }
        return l->snapshot.last_term;
    }
    i = locateEntry(l, index);
    assert(i < l->size);
    return l->entries[i].term;
}
/* Return the last index of the most recent snapshot, or 0 if none. */
raft_index logSnapshotIndex(struct raft_log *l)
{
    return l->snapshot.last_index;
}
/* Return the term of the last index of the most recent snapshot, or 0 if
 * none. */
raft_term logSnapshotTerm(struct raft_log *l)
{
    return l->snapshot.last_term;
}
/* Return the term of the last entry in the log, or 0 if the log is empty and
 * there is no snapshot. */
raft_term logLastTerm(struct raft_log *l)
{
    raft_index last = logLastIndex(l);
    if (last == 0) {
        return 0;
    }
    return logTermOf(l, last);
}
/* Return a pointer to the entry with the given index, or NULL if the entry is
 * not currently in the log. */
const struct raft_entry *logGet(struct raft_log *l, const raft_index index)
{
    size_t i;

    assert(l != NULL);

    /* Translate the log index into a position in the circular buffer; the
     * sentinel value l->size means the entry is not present. */
    i = locateEntry(l, index);

    return i == l->size ? NULL : &l->entries[i];
}
/* Acquire up to @max entries starting at @index (all of them if @max is -1),
 * returning a newly allocated array of shallow copies in @entries and their
 * number in @n. The refcount of every acquired entry is incremented; the
 * caller must eventually hand the array back via logRelease(). Returns 0 on
 * success (with *n == 0 and *entries == NULL if there's nothing to acquire)
 * or RAFT_NOMEM. */
int logAcquireAtMost(struct raft_log *l,
                     const raft_index index,
                     int max,
                     struct raft_entry *entries[],
                     unsigned *n)
{
    size_t i;
    size_t j;
    assert(l != NULL);
    assert(index > 0);
    assert(entries != NULL);
    assert(n != NULL);
    /* Get the array index of the first entry to acquire. */
    i = locateEntry(l, index);
    if (i == l->size || max == 0) {
        *n = 0;
        *entries = NULL;
        return 0;
    }
    if (i < l->back) {
        /* The last entry does not wrap with respect to i, so the number of
         * entries is simply the length of the range [i...l->back). */
        *n = (unsigned)(l->back - i);
    } else {
        /* The last entry wraps with respect to i, so the number of entries is
         * the sum of the lengths of the ranges [i...l->size) and [0...l->back),
         * which is l->size - i + l->back.*/
        *n = (unsigned)(l->size - i + l->back);
    }
    assert(*n > 0);
    if (max != -1 && *n > (unsigned)max) {
        *n = (unsigned)max;
    }
    *entries = raft_calloc(*n, sizeof **entries);
    if (*entries == NULL) {
        return RAFT_NOMEM;
    }
    /* Shallow-copy each entry and take a reference on it. */
    for (j = 0; j < *n; j++) {
        size_t k = (i + j) % l->size;
        struct raft_entry *entry = &(*entries)[j];
        *entry = l->entries[k];
        refsIncr(l, entry->term, index + j);
    }
    return 0;
}
/* Convenience wrapper around logAcquireAtMost() that acquires all entries
 * from @index onward (no maximum). */
int logAcquire(struct raft_log *l,
               raft_index index,
               struct raft_entry *entries[],
               unsigned *n)
{
    return logAcquireAtMost(l, index, -1, entries, n);
}
/* Return true if the given batch is referenced by any entry currently in the
* log. */
static bool isBatchReferenced(struct raft_log *l, const void *batch)
{
size_t i;
/* Iterate through all live entries to see if there's one
* belonging to the same batch. This is slightly inefficient but
* this code path should be taken very rarely in practice. */
for (i = 0; i < logNumEntries(l); i++) {
struct raft_entry *entry = entryAt(l, i);
if (entry->batch == batch) {
return true;
}
}
return false;
}
/* Release @n entries previously returned by logAcquire()/logAcquireAtMost(),
 * starting at @index: decrement each entry's refcount, free the payload of any
 * entry whose count dropped to zero (or its batch, once no live log entry
 * still points at it), and finally free the @entries array itself. */
void logRelease(struct raft_log *l,
                const raft_index index,
                struct raft_entry entries[],
                const unsigned n)
{
    size_t i;
    void *batch = NULL; /* Last batch whose memory was freed */
    assert(l != NULL);
    assert((entries == NULL && n == 0) || (entries != NULL && n > 0));
    for (i = 0; i < n; i++) {
        struct raft_entry *entry = &entries[i];
        bool unref;
        unref = refsDecr(l, entry->term, index + i);
        /* If there are no outstanding references to this entry, free its
         * payload if it's not part of a batch, or check if we can free the
         * batch itself. */
        if (unref) {
            if (entries[i].batch == NULL) {
                if (entry->buf.base != NULL) {
                    raft_free(entries[i].buf.base);
                }
            } else {
                if (entry->batch != batch) {
                    if (!isBatchReferenced(l, entry->batch)) {
                        batch = entry->batch;
                        raft_free(batch);
                    }
                }
            }
        }
    }
    if (entries != NULL) {
        raft_free(entries);
    }
}
/* If the log just became empty, release the entries array and reset the
 * circular buffer bookkeeping. */
static void clearIfEmpty(struct raft_log *l)
{
    if (logNumEntries(l) != 0) {
        return;
    }
    raft_free(l->entries);
    l->entries = NULL;
    l->size = 0;
    l->front = 0;
    l->back = 0;
}
/* Destroy an entry, possibly releasing the memory of its buffer. */
static void destroyEntry(struct raft_log *l, struct raft_entry *entry)
{
    if (entry->batch != NULL) {
        /* Batched entry: free the batch only once no live log entry still
         * points at it. */
        if (!isBatchReferenced(l, entry->batch)) {
            raft_free(entry->batch);
        }
        return;
    }
    /* Standalone entry: the buffer is owned by the entry itself. */
    if (entry->buf.base != NULL) {
        raft_free(entry->buf.base);
    }
}
/* Core logic of @logTruncate and @logDiscard, removing all log entries from
 * @index onward. If @destroy is true, also destroy the removed entries.
 *
 * Entries are removed back-to-front, shrinking the circular buffer from the
 * back; each removed entry's refcount is decremented and its payload freed
 * only when the count drops to zero (and @destroy is set). */
static void removeSuffix(struct raft_log *l,
                         const raft_index index,
                         bool destroy)
{
    size_t i;
    size_t n;
    raft_index start = index;
    assert(l != NULL);
    assert(index > l->offset);
    assert(index <= logLastIndex(l));
    /* Number of entries to delete */
    n = (size_t)(logLastIndex(l) - start) + 1;
    for (i = 0; i < n; i++) {
        struct raft_entry *entry;
        bool unref;
        /* Step the back pointer one slot backwards, wrapping around. */
        if (l->back == 0) {
            l->back = l->size - 1;
        } else {
            l->back--;
        }
        entry = &l->entries[l->back];
        /* start + n - i - 1 is the log index of the entry being removed,
         * counting down from the last one. */
        unref = refsDecr(l, entry->term, start + n - i - 1);
        if (unref && destroy) {
            destroyEntry(l, entry);
        }
    }
    clearIfEmpty(l);
}
/* Remove all entries from @index onward, destroying their payloads once
 * unreferenced. A no-op on an empty log. */
void logTruncate(struct raft_log *l, const raft_index index)
{
    if (logNumEntries(l) > 0) {
        removeSuffix(l, index, true);
    }
}
/* Remove all entries from @index onward without destroying their payloads
 * (unlike logTruncate, which does destroy them). */
void logDiscard(struct raft_log *l, const raft_index index)
{
    removeSuffix(l, index, false);
}
/* Delete all entries up to the given index (included).
 *
 * Entries are removed front-to-back, advancing the circular buffer's front
 * pointer and the log offset; each removed entry's payload is destroyed once
 * its refcount drops to zero. */
static void removePrefix(struct raft_log *l, const raft_index index)
{
    size_t i;
    size_t n;
    assert(l != NULL);
    assert(index > 0);
    assert(index <= logLastIndex(l));
    /* Number of entries to delete */
    n = (size_t)(index - indexAt(l, 0)) + 1;
    for (i = 0; i < n; i++) {
        struct raft_entry *entry;
        bool unref;
        entry = &l->entries[l->front];
        /* Advance the front pointer, wrapping around. */
        if (l->front == l->size - 1) {
            l->front = 0;
        } else {
            l->front++;
        }
        /* After the increment, l->offset equals the log index of the entry
         * just removed (the first index was offset + 1). */
        l->offset++;
        unref = refsDecr(l, entry->term, l->offset);
        if (unref) {
            destroyEntry(l, entry);
        }
    }
    clearIfEmpty(l);
}
/* Record that a snapshot up to @last_index was taken, then drop all entries
 * older than @last_index - @trailing, keeping @trailing entries in memory for
 * serving slow followers without re-sending the snapshot. */
void logSnapshot(struct raft_log *l, raft_index last_index, unsigned trailing)
{
    raft_term last_term = logTermOf(l, last_index);
    /* We must have an entry at this index */
    assert(last_term != 0);
    l->snapshot.last_index = last_index;
    l->snapshot.last_term = last_term;
    /* If we have not at least n entries preceeding the given last index, then
     * there's nothing to remove and we're done. */
    if (last_index <= trailing ||
        locateEntry(l, last_index - trailing) == l->size) {
        return;
    }
    removePrefix(l, last_index - trailing);
}
/* Reset the log to reflect a freshly installed snapshot with the given last
 * index and term: drop every in-memory entry and make the log start right
 * after @last_index. */
void logRestore(struct raft_log *l, raft_index last_index, raft_term last_term)
{
    size_t n = logNumEntries(l);

    assert(last_index > 0);
    assert(last_term > 0);

    /* Truncate from the very first in-memory entry, i.e. drop everything. */
    if (n > 0) {
        logTruncate(l, logLastIndex(l) - n + 1);
    }

    l->snapshot.last_index = last_index;
    l->snapshot.last_term = last_term;
    l->offset = last_index;
}
raft-0.22.1/src/log.h 0000664 0000000 0000000 00000016353 14601504142 0014256 0 ustar 00root root 0000000 0000000 /* In-memory cache of the persistent raft log stored on disk. */
#ifndef RAFT_LOG_H_
#define RAFT_LOG_H_
#include "../include/raft.h"
/* Initial size of the entry reference count hash table. */
#define LOG__REFS_INITIAL_SIZE 256
/**
* Counter for outstanding references to a log entry.
*
* When an entry is first appended to the log, its refcount is set to one (the
* log itself is the only one referencing the entry). Whenever an entry is
* included in an I/O request (to write it to disk or to send it to other
* servers) its refcount is increased by one. Whenever an entry gets deleted
* from the log its refcount is decreased by one. Likewise, whenever an I/O
* request is completed the refcount of the relevant entries is decreased by
* one. When the refcount drops to zero the memory that its @buf attribute
* points to gets released, or, if the @batch attribute is non-NULL, a check is
* made to see if all other entries of the same batch also have a zero refcount,
* and the memory that @batch points to gets released if that's the case.
*/
struct raft_entry_ref
{
    /* The first slot of each bucket is embedded directly in the hash table
     * array; only overflow slots (same index, different term) are
     * heap-allocated and linked via @next. */
    raft_term term; /* Term of the entry being ref-counted. */
    raft_index index; /* Index of the entry being ref-counted. */
    unsigned short count; /* Number of references. */
    struct raft_entry_ref *next; /* Next item in the bucket (for collisions). */
};
/**
* In-memory cache of the persistent raft log stored on disk.
*
* The raft log cache is implemented as a circular buffer of log entries, which
* makes some frequent operations very efficient (e.g. deleting the first N
* entries when snapshotting).
*/
struct raft_log
{
struct raft_entry *entries; /* Circular buffer of log entries. */
size_t size; /* Number of available slots in the buffer. */
size_t front, back; /* Indexes of used slots [front, back). */
raft_index offset; /* Index of first entry is offset+1. */
struct raft_entry_ref *refs; /* Log entries reference counts hash table. */
size_t refs_size; /* Size of the reference counts hash table. */
struct /* Information about last snapshot, or zero. */
{
raft_index last_index; /* Snapshot replaces all entries up to here. */
raft_term last_term; /* Term of last index. */
} snapshot;
};
/* Initialize an empty in-memory log of raft entries. */
struct raft_log *logInit(void);
/* Release all memory used by the given log object. */
void logClose(struct raft_log *l);
/* Called at startup when populating the log with entries loaded from disk. It
* sets the starting state of the log. The start index must be lower or equal
* than snapshot_index + 1. */
void logStart(struct raft_log *l,
raft_index snapshot_index,
raft_term snapshot_term,
raft_index start_index);
/* Get the number of entries the log currently contains. */
size_t logNumEntries(struct raft_log *l);
/* Get the index of the last entry in the log. Return #0 if the log is empty. */
raft_index logLastIndex(struct raft_log *l);
/* Get the term of the last entry in the log. Return #0 if the log is empty. */
raft_term logLastTerm(struct raft_log *l);
/* Get the term of the entry with the given index. Return #0 if @index is *
* greater than the last index of the log, or if it's lower than oldest index we
* know the term of (either because it's outstanding or because it's the last
* entry in the most recent snapshot). */
raft_term logTermOf(struct raft_log *l, raft_index index);
/* Get the index of the last entry in the most recent snapshot. Return #0 if
* there are no snapshots. */
raft_index logSnapshotIndex(struct raft_log *l);
/* Get the term of the last entry of the most recent snapshot. Return #0 if
* there are no snapshots. */
raft_term logSnapshotTerm(struct raft_log *l);
/* Get the entry with the given index. * The returned pointer remains valid only
* as long as no API that might delete the entry with the given index is
* invoked. Return #NULL if there is no such entry. */
const struct raft_entry *logGet(struct raft_log *l, const raft_index index);
/* Append a new entry to the log.
*
* Errors:
*
* RAFT_BUSY
* Attempt to append an index with the same index and term of a referenced
* one (e.g. the referenced entry is being persisted).
*
* RAFT_NOMEM
* Memory for the new entry could not be allocated.
*/
int logAppend(struct raft_log *l,
raft_term term,
enum raft_entry_type type,
const struct raft_buffer *buf,
void *batch);
/* Convenience to append a series of #RAFT_COMMAND entries. */
int logAppendCommands(struct raft_log *l,
const raft_term term,
const struct raft_buffer bufs[],
const unsigned n);
/* Convenience to encode and append a single #RAFT_CHANGE entry. */
int logAppendConfiguration(struct raft_log *l,
const raft_term term,
const struct raft_configuration *configuration);
/* Acquire at most @max entries from the given index onwards.
*
* If @max is -1, no limit is applied. */
int logAcquireAtMost(struct raft_log *l,
raft_index index,
int max,
struct raft_entry *entries[],
unsigned *n);
/* Acquire an array of entries from the given index onwards.
*
* The payload memory referenced by the @buf attribute of the returned entries
* is guaranteed to be valid until logRelease() is called. */
int logAcquire(struct raft_log *l,
raft_index index,
struct raft_entry *entries[],
unsigned *n);
/* Release a previously acquired array of entries. */
void logRelease(struct raft_log *l,
raft_index index,
struct raft_entry entries[],
unsigned n);
/* Delete all entries from the given index (included) onwards. If the log is
* empty this is a no-op. If @index is lower than or equal to the index of the
* first entry in the log, then the log will become empty. */
void logTruncate(struct raft_log *l, const raft_index index);
/* Discard all entries from the given index (included) onwards. This is exactly
 * the same as truncate, but the memory of the entries does not get
* released. This is called as part of error handling, when reverting the effect
* of previous logAppend calls. */
void logDiscard(struct raft_log *l, const raft_index index);
/* To be called when taking a new snapshot. The log must contain an entry at
* last_index, which is the index of the last entry included in the
* snapshot. The function will update the last snapshot information and delete
* all entries up to last_index - trailing (included). If the log contains no
* entry at last_index - trailing, then no entry will be deleted. */
void logSnapshot(struct raft_log *l, raft_index last_index, unsigned trailing);
/* To be called when installing a snapshot.
*
* The log can be in any state. All outstanding entries will be discarded, the
* last index and last term of the most recent snapshot will be set to the given
* values, and the offset adjusted accordingly. */
void logRestore(struct raft_log *l, raft_index last_index, raft_term last_term);
#endif /* RAFT_LOG_H_ */
raft-0.22.1/src/membership.c 0000664 0000000 0000000 00000014062 14601504142 0015616 0 ustar 00root root 0000000 0000000 #include "membership.h"
#include "../include/raft.h"
#include "assert.h"
#include "configuration.h"
#include "err.h"
#include "heap.h"
#include "message.h"
#include "progress.h"
#include "queue.h"
#include "tracing.h"
#include "trail.h"
#define infof(...) Infof(r->tracer, " " __VA_ARGS__)
int membershipCanChangeConfiguration(struct raft *r)
{
    /* Refuse if a previous configuration change is still uncommitted, or if a
     * non-voting server is currently being promoted. */
    if (r->configuration_uncommitted_index != 0 ||
        r->leader_state.promotee_id != 0) {
        ErrMsgFromCode(r->errmsg, RAFT_CANTCHANGE);
        return RAFT_CANTCHANGE;
    }

    /* In order to become leader at all we are supposed to have committed at
     * least the initial configuration at index 1. */
    assert(r->configuration_committed_index > 0);

    /* The index of the last committed configuration can't be greater than the
     * last log index. */
    assert(TrailLastIndex(&r->trail) >= r->configuration_committed_index);

    /* No catch-up round should be in progress. */
    assert(r->leader_state.round_number == 0);
    assert(r->leader_state.round_index == 0);
    assert(r->leader_state.round_start == 0);

    return 0;
}
bool membershipUpdateCatchUpRound(struct raft *r)
{
    unsigned i;
    raft_index match_index;
    raft_index last_index;
    raft_time round_duration;
    bool is_up_to_date;
    bool is_fast_enough;

    assert(r->state == RAFT_LEADER);
    assert(r->leader_state.promotee_id != 0);

    i = configurationIndexOf(&r->configuration, r->leader_state.promotee_id);
    assert(i < r->configuration.n);

    match_index = progressMatchIndex(r, i);

    /* The promotee has not reached this round's target index yet, so it has
     * not caught up. */
    if (match_index < r->leader_state.round_index) {
        infof(
            "member (index: %u) not yet caught up match_index:%llu "
            "round_index:%llu",
            i, match_index, r->leader_state.round_index);
        return false;
    }

    last_index = TrailLastIndex(&r->trail);
    round_duration = r->now - r->leader_state.round_start;

    is_up_to_date = match_index == last_index;
    is_fast_enough = round_duration < r->election_timeout;

    infof("member is_up_to_date:%d is_fast_enough:%d", is_up_to_date,
          is_fast_enough);

    /* The catch-up is complete if the promotee's log is fully up-to-date, or
     * if the round that just terminated took less than an election timeout. */
    if (is_up_to_date || is_fast_enough) {
        r->leader_state.round_number = 0;
        r->leader_state.round_index = 0;
        r->leader_state.round_start = 0;
        progressCatchUpFinish(r, i);
        return true;
    }

    /* There are more entries to replicate, or the round was too slow: start a
     * new one targeting the current last index. */
    r->leader_state.round_number++;
    r->leader_state.round_index = last_index;
    r->leader_state.round_start = r->now;
    return false;
}
int membershipUncommittedChange(struct raft *r,
const raft_index index,
const struct raft_entry *entry)
{
struct raft_configuration configuration;
int rv;
assert(r != NULL);
assert(r->state == RAFT_FOLLOWER || r->state == RAFT_LEADER);
assert(entry != NULL);
assert(entry->type == RAFT_CHANGE);
rv = configurationDecode(&entry->buf, &configuration);
if (rv != 0) {
assert(rv == RAFT_NOMEM || rv == RAFT_MALFORMED);
goto err;
}
if (r->state == RAFT_LEADER) {
/* Rebuild the progress array if the new configuration has a different
* number of servers than the old one. */
if (configuration.n != r->configuration.n) {
rv = progressRebuildArray(r, &configuration);
if (rv != 0) {
assert(rv == RAFT_NOMEM);
goto err_after_decode;
}
}
}
raft_configuration_close(&r->configuration);
r->configuration = configuration;
r->configuration_uncommitted_index = index;
return 0;
err_after_decode:
configurationClose(&configuration);
err:
assert(rv == RAFT_NOMEM || rv == RAFT_MALFORMED);
return rv;
}
int membershipRollback(struct raft *r)
{
    int rv;
    assert(r != NULL);
    assert(r->state == RAFT_FOLLOWER);
    assert(r->configuration_uncommitted_index > 0);
    infof("roll back uncommitted configuration (%llu^%llu)",
          r->configuration_uncommitted_index,
          TrailTermOf(&r->trail, r->configuration_uncommitted_index));
    /* Replace the current configuration with the last committed one. */
    assert(r->configuration_committed_index > 0);
    /* Close first, then copy over it: if the copy fails the current
     * configuration has already been released, so the error is returned to
     * the caller as-is (RAFT_NOMEM). */
    configurationClose(&r->configuration);
    rv = configurationCopy(&r->configuration_committed, &r->configuration);
    if (rv != 0) {
        assert(rv == RAFT_NOMEM);
        return rv;
    }
    /* No uncommitted configuration is pending anymore. */
    r->configuration_uncommitted_index = 0;
    return 0;
}
int membershipLeadershipTransferStart(struct raft *r)
{
const struct raft_server *server;
struct raft_message message;
int rv;
assert(r->state == RAFT_LEADER);
assert(r->leader_state.transferee != 0);
assert(!r->leader_state.transferring);
server = configurationGet(&r->configuration, r->leader_state.transferee);
assert(server != NULL);
message.type = RAFT_TIMEOUT_NOW;
message.timeout_now.version = MESSAGE__TIMEOUT_NOW_VERSION;
message.timeout_now.term = r->current_term;
message.timeout_now.last_log_index = TrailLastIndex(&r->trail);
message.timeout_now.last_log_term = TrailLastTerm(&r->trail);
message.server_id = server->id;
message.server_address = server->address;
infof("send timeout to %llu", server->id);
rv = MessageEnqueue(r, &message);
if (rv != 0) {
assert(rv == RAFT_NOMEM);
ErrMsgPrintf(r->errmsg, "send timeout now to %llu", server->id);
return rv;
}
/* Set the leadership transfer in progress flag. */
r->leader_state.transferring = true;
return 0;
}
#undef infof
raft-0.22.1/src/membership.h 0000664 0000000 0000000 00000004472 14601504142 0015627 0 ustar 00root root 0000000 0000000 /* Membership-related APIs. */
#ifndef MEMBERSHIP_H_
#define MEMBERSHIP_H_
#include "../include/raft.h"
/* XXX Internal code for transfer request objects. Used by the legacy layer to
* differentiate between items in the legacy.requests queue. */
#define RAFT_TRANSFER_ (RAFT_CHANGE + 1)
/* Helper returning an error if the configuration can't be changed,
*
* Errors:
*
* RAFT_CANTCHANGE
* A configuration change or a promotion are in progress.
*/
int membershipCanChangeConfiguration(struct raft *r);
/* Update the information about the progress that the non-voting server
* currently being promoted is making in catching with logs.
*
* Return false if the server being promoted did not yet catch-up with logs, and
* true if it did.
*
* This function must be called only by leaders after a @raft_assign request
* has been submitted. */
bool membershipUpdateCatchUpRound(struct raft *r);
/* Update the local configuration replacing it with the content of the given
* RAFT_CHANGE entry, which has just been received in as part of a RAFT_SUBMIT
* event on a leader or a RAFT_RECEIVE event of an AppendEntries message on a
* follower. The uncommitted configuration index will be updated accordingly.
*
* It must be called only by followers or leaders.
*
* Errors:
*
* RAFT_NOMEM
* A new raft_configuration object to hold the decoded configuration could
* not be allocated.
*
* RAFT_MALFORMED
* The entry data does not contain a valid encoded configuration.
*/
int membershipUncommittedChange(struct raft *r,
const raft_index index,
const struct raft_entry *entry);
/* Rollback an uncommitted configuration change that was applied locally, but
* failed to be committed. It must be called by followers after they receive an
* AppendEntries RPC request that instructs them to evict the uncommitted entry
* from their log.
*
* Errors:
*
* RAFT_NOMEM
* A copy of the last committed configuration to rollback to could not be
* made.
*/
int membershipRollback(struct raft *r);
/* Start the leadership transfer by sending a TimeoutNow message to the target
* server.
*
* Errors:
*
* RAFT_NOMEM
* The TimeoutNow message could not be enqueued.
*/
int membershipLeadershipTransferStart(struct raft *r);
#endif /* MEMBERSHIP_H_ */
raft-0.22.1/src/message.c 0000664 0000000 0000000 00000002502 14601504142 0015103 0 ustar 00root root 0000000 0000000 #include "message.h"
#include "assert.h"
#include "heap.h"
#include "queue.h"
/* Ensure that the r->messages array has at least n_messages slots, and expand
 * it if needed.
 *
 * Return RAFT_NOMEM if no memory is available. */
static int messageEnsureQueueCapacity(struct raft *r, const unsigned n_messages)
{
    unsigned new_cap = r->n_messages_cap;
    struct raft_message *expanded;

    /* Nothing to do if the current capacity is already sufficient. */
    if (n_messages <= new_cap) {
        return 0;
    }

    /* Grow from a small initial capacity, doubling thereafter. */
    new_cap = new_cap == 0 ? 16 : new_cap * 2;

    expanded = raft_realloc(r->messages, sizeof *r->messages * new_cap);
    if (expanded == NULL) {
        return RAFT_NOMEM;
    }

    r->messages = expanded;
    r->n_messages_cap = new_cap;

    /* The buffer may have moved: refresh the pointer handed out through the
     * update struct. */
    r->update->messages.batch = r->messages;

    return 0;
}
int MessageEnqueue(struct raft *r, struct raft_message *message)
{
    unsigned n = r->update->messages.n + 1;
    int rv;

    assert(r->update->messages.batch == r->messages);

    /* Make room for one more message. */
    rv = messageEnsureQueueCapacity(r, n);
    if (rv != 0) {
        assert(rv == RAFT_NOMEM);
        return rv;
    }

    /* Copy the message into the newly claimed slot and flag that there are
     * messages to deliver. */
    r->update->messages.batch[n - 1] = *message;
    r->update->messages.n = n;
    r->update->flags |= RAFT_UPDATE_MESSAGES;

    return 0;
}
raft-0.22.1/src/message.h 0000664 0000000 0000000 00000001400 14601504142 0015104 0 ustar 00root root 0000000 0000000 /* Enqueue messages to be sent. */
#ifndef RAFT_MESSAGE_H_
#define RAFT_MESSAGE_H_
#include "../include/raft.h"
/* Message types */
#define MESSAGE__REQUEST_VOTE_VERSION 2
#define MESSAGE__REQUEST_VOTE_RESULT_VERSION 2
#define MESSAGE__APPEND_ENTRIES_VERSION 0
#define MESSAGE__APPEND_ENTRIES_RESULT_VERSION 2
#define MESSAGE__INSTALL_SNAPSHOT_VERSION 0
#define MESSAGE__TIMEOUT_NOW_VERSION 0
/* Feature flags */
#define MESSAGE__FEATURE_CAPACITY 1 << 0
/* Add the given message to the array of messages attached to the struct
* raft_update to be returned.
*
* Errors:
*
* RAFT_NOMEM
* The r->messages array could not be resized to fit the new message.
*/
int MessageEnqueue(struct raft *r, struct raft_message *message);
#endif /* RAFT_MESSAGE_H_ */
raft-0.22.1/src/progress.c 0000664 0000000 0000000 00000033117 14601504142 0015331 0 ustar 00root root 0000000 0000000 #include <limits.h>
#include "progress.h"
#include "assert.h"
#include "configuration.h"
#include "tracing.h"
#include "trail.h"
#define infof(...) Infof(r->tracer, " " __VA_ARGS__)
#define tracef(...) Tracef(r->tracer, __VA_ARGS__)
#ifndef max
#define max(a, b) ((a) < (b) ? (b) : (a))
#endif
#ifndef min
#define min(a, b) ((a) < (b) ? (a) : (b))
#endif
/* Initialize a single progress object.
 *
 * next_index starts right past the leader's last entry and match_index at
 * zero; timestamps start at ULLONG_MAX, used throughout as "never". */
static void initProgress(struct raft_progress *p, raft_index last_index)
{
    p->next_index = last_index + 1;
    p->match_index = 0;
    p->last_send = ULLONG_MAX;
    p->last_recv = ULLONG_MAX;
    p->snapshot.index = 0; /* No snapshot sent yet. */
    p->snapshot.last_send = ULLONG_MAX;
    p->state = PROGRESS__PROBE; /* Start conservatively, in probe mode. */
    p->catch_up = RAFT_CATCH_UP_NONE;
    p->features = 0;
    p->capacity = 0;
}
struct raft_progress *progressBuildArray(struct raft *r)
{
    struct raft_progress *array;
    raft_index last_index = TrailLastIndex(&r->trail);
    unsigned i;

    array = raft_malloc(r->configuration.n * sizeof *array);
    if (array == NULL) {
        return NULL;
    }

    for (i = 0; i < r->configuration.n; i++) {
        struct raft_progress *p = &array[i];
        initProgress(p, last_index);
        /* For ourselves, we know exactly how much of the log is stored. */
        if (r->configuration.servers[i].id == r->id) {
            p->match_index = r->last_stored;
        }
    }

    return array;
}
int progressRebuildArray(struct raft *r,
                         const struct raft_configuration *configuration)
{
    struct raft_progress *rebuilt;
    raft_index last_index = TrailLastIndex(&r->trail);
    unsigned i;

    rebuilt = raft_malloc(configuration->n * sizeof *rebuilt);
    if (rebuilt == NULL) {
        return RAFT_NOMEM;
    }

    /* Carry over the progress of servers that appear both in the current and
     * in the new configuration. */
    for (i = 0; i < r->configuration.n; i++) {
        raft_id id = r->configuration.servers[i].id;
        unsigned j = configurationIndexOf(configuration, id);
        if (j < configuration->n) {
            rebuilt[j] = r->leader_state.progress[i];
        }
    }

    /* Initialize from scratch the progress of servers that appear only in the
     * new configuration. */
    for (i = 0; i < configuration->n; i++) {
        raft_id id = configuration->servers[i].id;
        unsigned j = configurationIndexOf(&r->configuration, id);
        if (j == r->configuration.n) {
            initProgress(&rebuilt[i], last_index);
        }
    }

    raft_free(r->leader_state.progress);
    r->leader_state.progress = rebuilt;

    return 0;
}
bool progressIsUpToDate(struct raft *r, unsigned i)
{
    /* A follower is up-to-date when the next entry to send is one past our
     * last entry, i.e. there is nothing left to replicate. */
    return r->leader_state.progress[i].next_index ==
           TrailLastIndex(&r->trail) + 1;
}
bool progressIsOnline(struct raft *r, unsigned i)
{
    raft_time t = r->leader_state.progress[i].last_recv;

    /* ULLONG_MAX means we never heard from this server at all. */
    if (t == ULLONG_MAX) {
        return false;
    }
    assert(r->now >= t);

    /* Online means we heard from it within the last election timeout. */
    return r->now - t < r->election_timeout;
}
bool progressHasContactedRecently(struct raft *r, unsigned i)
{
    raft_time t = r->leader_state.progress[i].last_recv;

    /* Recent contact means a message arrived since the election timer was
     * last reset (ULLONG_MAX means no message was ever received). */
    return t != ULLONG_MAX && t >= r->election_timer_start;
}
bool progressShouldReplicate(struct raft *r, unsigned i)
{
    struct raft_progress *p = &r->leader_state.progress[i];
    raft_index last_index = TrailLastIndex(&r->trail);
    bool needs_heartbeat;

    /* We must be in a valid state. */
    assert(p->state == PROGRESS__PROBE || p->state == PROGRESS__PIPELINE ||
           p->state == PROGRESS__SNAPSHOT);

    /* The next index to send can't be past the end of our log. */
    assert(p->next_index <= last_index + 1);

    /* The last_send field is either at its max value (we never sent any
     * message), or it must be lower or equal than the current time. */
    assert(p->last_send == ULLONG_MAX || p->last_send <= r->now);

    /* A heartbeat is due if we never sent an AppendEntries to this follower,
     * or the last one was at least a heartbeat timeout ago. */
    needs_heartbeat = p->last_send == ULLONG_MAX ||
                      r->now - p->last_send >= r->heartbeat_timeout;

    if (p->state == PROGRESS__SNAPSHOT) {
        /* We are in snapshot mode, so we must have sent a snapshot. */
        assert(p->snapshot.last_send != ULLONG_MAX);
        /* If the snapshot timed out, fall back to probe mode and replicate. */
        if (r->now - p->snapshot.last_send >= r->install_snapshot_timeout) {
            infof("timeout install snapshot at index %llu", p->snapshot.index);
            progressAbortSnapshot(r, i);
            return true;
        }
        /* Keep asserting leadership while the follower installs. */
        return needs_heartbeat;
    }

    if (p->state == PROGRESS__PROBE) {
        /* In probe mode we send at most one message per heartbeat interval. */
        return needs_heartbeat;
    }

    /* In pipeline mode we replicate new entries immediately, and send empty
     * append entries messages if we haven't sent anything in the last
     * heartbeat interval (i.e. there were no new entries in that period). */
    return !progressIsUpToDate(r, i) || needs_heartbeat;
}
raft_index progressNextIndex(struct raft *r, unsigned i)
{
return r->leader_state.progress[i].next_index;
}
raft_index progressMatchIndex(const struct raft *r, unsigned i)
{
return r->leader_state.progress[i].match_index;
}
void progressUpdateLastSend(struct raft *r, unsigned i)
{
r->leader_state.progress[i].last_send = r->now;
r->update->flags |= RAFT_UPDATE_TIMEOUT;
}
void progressUpdateSnapshotLastSend(struct raft *r, unsigned i)
{
r->leader_state.progress[i].snapshot.last_send = r->now;
}
void progressUpdateLastRecv(struct raft *r, unsigned i)
{
r->leader_state.progress[i].last_recv = r->now;
}
void progressSetFeatures(struct raft *r,
const unsigned i,
unsigned short features)
{
r->leader_state.progress[i].features = features;
}
unsigned short progressGetFeatures(const struct raft *r, const unsigned i)
{
return r->leader_state.progress[i].features;
}
void progressSetCapacity(struct raft *r,
const unsigned i,
unsigned short capacity)
{
r->leader_state.progress[i].capacity = capacity;
}
unsigned short progressGetCapacity(const struct raft *r, const unsigned i)
{
return r->leader_state.progress[i].capacity;
}
raft_time progressGetLastSend(const struct raft *r, const unsigned i)
{
struct raft_progress *p = &r->leader_state.progress[i];
raft_time last_send = p->last_send;
if (p->snapshot.last_send != ULLONG_MAX &&
p->snapshot.last_send > last_send) {
last_send = p->snapshot.last_send;
}
return last_send;
}
void progressToSnapshot(struct raft *r, unsigned i)
{
    struct raft_progress *p = &r->leader_state.progress[i];

    p->state = PROGRESS__SNAPSHOT;
    p->snapshot.index = TrailSnapshotIndex(&r->trail);

    /* While the snapshot is being installed (or while we wait for the server
     * to come online, before even sending the snapshot) we'll send heartbeats
     * using this next index, so when we get back results we don't consider
     * them as stale. */
    p->next_index = p->snapshot.index + 1;
}
void progressAbortSnapshot(struct raft *r, const unsigned i)
{
    /* Leave snapshot mode and fall back to probing from right after the last
     * index the follower confirmed. */
    struct raft_progress *p = &r->leader_state.progress[i];
    p->state = PROGRESS__PROBE;
    p->snapshot.index = 0;
    p->next_index = p->match_index + 1;
}
int progressState(struct raft *r, const unsigned i)
{
    /* Current replication mode of the i'th server. */
    return r->leader_state.progress[i].state;
}
const char *progressStateName(struct raft *r, unsigned i)
{
    /* Human-readable name of the i'th server's replication mode. */
    switch (r->leader_state.progress[i].state) {
        case PROGRESS__PROBE:
            return "probe";
        case PROGRESS__PIPELINE:
            return "pipeline";
        case PROGRESS__SNAPSHOT:
            return "snapshot";
        default:
            assert(0);
            return NULL;
    }
}
bool progressMaybeDecrement(struct raft *r,
                            const unsigned i,
                            raft_index rejected,
                            raft_index last_index)
{
    struct raft_progress *p = &r->leader_state.progress[i];

    assert(p->state == PROGRESS__PROBE || p->state == PROGRESS__PIPELINE ||
           p->state == PROGRESS__SNAPSHOT);

    /* We must be called only when receiving an AppendEntries rejection. */
    assert(rejected > 0);

    /* From figure 3.1:
     *
     *   Reply false if log doesn't contain an entry at prevLogIndex whose
     *   term matches prevLogTerm.
     *
     * This means that there are two cases for rejection:
     *
     * - The follower does not have an entry at #rejected at all. In that case
     *   its #last_index is clearly lower than #rejected.
     *
     * - The follower has an entry at #rejected, but with a different term. In
     *   that case the follower must set #last_index to #rejected - 1. */
    assert(last_index < rejected);

    /* The next index must always be non-zero, and the match index must always
     * be strictly lower than the next index. */
    assert(p->next_index > 0);
    assert(p->match_index < p->next_index);

    if (p->state == PROGRESS__SNAPSHOT) {
        /* The rejection must be stale or spurious if the rejected index does
         * not match the last snapshot index. */
        if (rejected != p->snapshot.index) {
            infof(
                "stale rejected index (%llu vs snapshot index %llu) -> ignore",
                rejected, p->snapshot.index);
            return false;
        }
        progressAbortSnapshot(r, i);
        assert(p->match_index < p->next_index);
        return true;
    }

    if (p->state == PROGRESS__PIPELINE) {
        /* The rejection must be stale if the rejected index is smaller than
         * the matched one. */
        if (rejected <= p->match_index) {
            infof("stale rejected index (%llu vs match index %llu) -> ignore",
                  rejected, p->match_index);
            return false;
        }
        /* Directly decrease next to match + 1 */
        p->next_index = min(rejected, p->match_index + 1);
        progressToProbe(r, i);
        assert(p->match_index < p->next_index);
        return true;
    }

    /* The rejection must be stale or spurious if the rejected index does not
     * match the next index minus one. */
    if (rejected != p->next_index - 1) {
        /* Fixed: raft_index is unsigned, so print with %llu (was %lld). */
        tracef("rejected index %llu different from next index %llu -> ignore ",
               rejected, p->next_index);
        return false;
    }

    p->next_index = min(rejected, last_index + 1);
    assert(p->next_index > 0);
    assert(p->match_index < p->next_index);
    return true;
}
void progressSetNextIndex(struct raft *r, unsigned i, raft_index next_index)
{
    /* Overwrite the next index of the i'th server. */
    r->leader_state.progress[i].next_index = next_index;
}
bool progressMaybeUpdate(struct raft *r, unsigned i, raft_index last_index)
{
    struct raft_progress *p = &r->leader_state.progress[i];
    bool advanced = false;

    /* Advance the match index if the follower acknowledged new entries. */
    if (p->match_index < last_index) {
        p->match_index = last_index;
        advanced = true;
    }

    /* Never let the next index fall behind the acknowledged entries. */
    if (p->next_index < last_index + 1) {
        p->next_index = last_index + 1;
    }

    return advanced;
}
void progressToProbe(struct raft *r, const unsigned i)
{
    struct raft_progress *p = &r->leader_state.progress[i];

    if (p->state == PROGRESS__SNAPSHOT) {
        /* Leaving snapshot mode: forget the snapshot index, keeping the next
         * index that was set when entering snapshot mode. */
        assert(p->snapshot.index > 0);
        p->snapshot.index = 0;
    } else {
        /* Restart probing from right after the last confirmed entry. */
        p->next_index = p->match_index + 1;
    }
    p->state = PROGRESS__PROBE;
}
void progressToPipeline(struct raft *r, const unsigned i)
{
    /* Switch the i'th server to optimistic streaming of entries. */
    r->leader_state.progress[i].state = PROGRESS__PIPELINE;
}
bool progressSnapshotDone(struct raft *r, const unsigned i)
{
    struct raft_progress *p = &r->leader_state.progress[i];
    assert(p->state == PROGRESS__SNAPSHOT);
    /* Done once the follower acknowledged up to the snapshot index. */
    return p->match_index >= p->snapshot.index;
}
void progressCatchUpStart(struct raft *r, unsigned i)
{
    /* Begin a catch-up for the i'th server; one must not be running. */
    struct raft_progress *p = &r->leader_state.progress[i];
    assert(p->catch_up != RAFT_CATCH_UP_RUNNING);
    p->catch_up = RAFT_CATCH_UP_RUNNING;
}
void progressCatchUpAbort(struct raft *r, unsigned i)
{
    /* Abort the running catch-up of the i'th server. */
    struct raft_progress *p = &r->leader_state.progress[i];
    assert(p->catch_up == RAFT_CATCH_UP_RUNNING);
    p->catch_up = RAFT_CATCH_UP_ABORTED;
}
void progressCatchUpFinish(struct raft *r, unsigned i)
{
    /* Mark the running catch-up of the i'th server as completed. */
    struct raft_progress *p = &r->leader_state.progress[i];
    assert(p->catch_up == RAFT_CATCH_UP_RUNNING);
    p->catch_up = RAFT_CATCH_UP_FINISHED;
}
int progressCatchUpStatus(const struct raft *r, unsigned i)
{
    /* Current catch-up phase of the i'th server. */
    return r->leader_state.progress[i].catch_up;
}
#undef infof
#undef tracef
raft-0.22.1/src/progress.h 0000664 0000000 0000000 00000015074 14601504142 0015340 0 ustar 00root root 0000000 0000000 /* Track replication progress on followers. */
#ifndef PROGRESS_H_
#define PROGRESS_H_
#include "../include/raft.h"
/* Possible values for the state field of struct raft_progress. */
enum {
PROGRESS__PROBE = 0, /* At most one AppendEntries per heartbeat interval */
PROGRESS__PIPELINE, /* Optimistically stream AppendEntries */
PROGRESS__SNAPSHOT /* Sending a snapshot */
};
/**
* Used by leaders to keep track of replication progress for each server.
*/
struct raft_progress
{
unsigned short state; /* Probe, pipeline or snapshot. */
unsigned short catch_up; /* None, running, aborted, finished. */
unsigned short features; /* What the server is capable of. */
unsigned short capacity; /* Guaranteed capacity. */
raft_index next_index; /* Next entry to send. */
raft_index match_index; /* Highest index reported as replicated. */
raft_time last_send; /* Timestamp of last AppendEntries RPC. */
raft_time last_recv; /* Timestamp of last AppendEntries result. */
struct
{
raft_index index; /* Last index of most recent snapshot sent. */
raft_time last_send; /* Timestamp of last InstallSnaphot RPC. */
} snapshot;
};
/* Create and initialize the array of progress objects used by the leader to
* track followers. The match index will be set to zero, and the next index to
* the current last index plus 1.
*
* Return NULL if memory for the progress array could not be allocated.
*/
struct raft_progress *progressBuildArray(struct raft *r);
/* Re-build the progress array against a new configuration.
*
* Progress information for servers existing both in the new and in the current
* configuration will remain unchanged.
*
* Progress information for servers existing only in the new configuration will
* be initialized as in progressBuildArray().
*
* RAFT_NOMEM
* Memory for the progress array could not be allocated.
*/
int progressRebuildArray(struct raft *r,
const struct raft_configuration *configuration);
/* Whether the log of the i'th server in the configuration is up-to-date with
* ours. */
bool progressIsUpToDate(struct raft *r, unsigned i);
/* Whether the i'th server in the configuration is online or not.
*
* A server is online if we received a message from it within the last election
* timeout. */
bool progressIsOnline(struct raft *r, unsigned i);
/* Whether the i'th server in the configuration has contacted us recently.
*
* A server has contacted us recently if we received a message from it within
* the last election timer reset. */
bool progressHasContactedRecently(struct raft *r, unsigned i);
/* Whether a new AppendEntries or InstallSnapshot message should be sent to the
* i'th server at this time.
*
* See the docstring of replicationProgress() for details about how the decision
* is taken. */
bool progressShouldReplicate(struct raft *r, unsigned i);
/* Return the index of the next entry that should be sent to the i'th server. */
raft_index progressNextIndex(struct raft *r, unsigned i);
/* Return the index of the most recent entry that the i'th server has reported
* as replicated. */
raft_index progressMatchIndex(const struct raft *r, unsigned i);
/* Update the last_send timestamp after an AppendEntries request has been
* sent. */
void progressUpdateLastSend(struct raft *r, unsigned i);
/* Update the snapshot_last_send timestamp after an InstallSnaphot request has
* been sent. */
void progressUpdateSnapshotLastSend(struct raft *r, unsigned i);
/* Update the last_recv timestamp after an AppendEntries response has been
* received. */
void progressUpdateLastRecv(struct raft *r, unsigned i);
/* Reset to false all the recent_recv flags. */
void progressResetRecentRecv(struct raft *r);
/* Return the value of the last_send timestamp, or of the snapshot.last_send
* timestamp if more recent. */
raft_time progressGetLastSend(const struct raft *r, unsigned i);
/* Convert to the i'th server to snapshot mode. */
void progressToSnapshot(struct raft *r, unsigned i);
/* Convert to probe mode. */
void progressToProbe(struct raft *r, unsigned i);
/* Convert to pipeline mode. */
void progressToPipeline(struct raft *r, unsigned i);
/* Abort snapshot mode and switch to back to probe.
*
* Called after sending the snapshot has failed or timed out. */
void progressAbortSnapshot(struct raft *r, unsigned i);
/* Return the progress mode code for the i'th server. */
int progressState(struct raft *r, unsigned i);
/* Return the progress mode name for the i'th server. */
const char *progressStateName(struct raft *r, unsigned i);
/* Update the next index of the given server.
*
* Called in pipeline mode after sending new entries, or before sending a
* snapshot when waiting for a server to come online. */
void progressSetNextIndex(struct raft *r, unsigned i, raft_index next_index);
/* Return false if the given @index comes from an outdated message. Otherwise
* update the progress and returns true. To be called when receiving a
* successful AppendEntries RPC response. */
bool progressMaybeUpdate(struct raft *r, unsigned i, raft_index last_index);
/* Return false if the given rejected index comes from an out of order
* message. Otherwise decrease the progress next index to min(rejected,
* last_index) and returns true. To be called when receiving an unsuccessful
* AppendEntries RPC response. */
bool progressMaybeDecrement(struct raft *r,
unsigned i,
raft_index rejected,
raft_index last_index);
/* Return true if match_index is equal or higher than the snapshot_index. */
bool progressSnapshotDone(struct raft *r, unsigned i);
/* Sets the feature flags of a server. */
void progressSetFeatures(struct raft *r, unsigned i, unsigned short features);
/* Gets the feature flags of a server. */
unsigned short progressGetFeatures(const struct raft *r, unsigned i);
/* Sets the capacity of a server. */
void progressSetCapacity(struct raft *r, unsigned i, unsigned short features);
/* Gets the capacity of a server. */
unsigned short progressGetCapacity(const struct raft *r, unsigned i);
/* Start catching up a server. */
void progressCatchUpStart(struct raft *r, unsigned i);
/* Stop catching up a server because it's not fast enough or it's
* unresponsive. */
void progressCatchUpAbort(struct raft *r, unsigned i);
/* Stop catching up a server because it has now caught up. */
void progressCatchUpFinish(struct raft *r, unsigned i);
/* Return the information about the catch-up progress of a server. */
int progressCatchUpStatus(const struct raft *r, unsigned i);
#endif /* PROGRESS_H_ */
raft-0.22.1/src/queue.h 0000664 0000000 0000000 00000003324 14601504142 0014613 0 ustar 00root root 0000000 0000000 #ifndef QUEUE_H_
#define QUEUE_H_
#include
/* Intrusive circular doubly-linked list: an element is a pair of pointers,
 * slot [0] pointing at the next element and slot [1] at the previous one. */
typedef void *queue[2];
/* Private macros accessing the two link slots. */
#define QUEUE_NEXT(q) (*(queue **)&((*(q))[0]))
#define QUEUE_PREV(q) (*(queue **)&((*(q))[1]))
#define QUEUE_PREV_NEXT(q) (QUEUE_NEXT(QUEUE_PREV(q)))
#define QUEUE_NEXT_PREV(q) (QUEUE_PREV(QUEUE_NEXT(q)))
/* Initialize an empty queue: both links point back at the head itself. */
#define QUEUE_INIT(q)        \
    {                        \
        QUEUE_NEXT(q) = (q); \
        QUEUE_PREV(q) = (q); \
    }
/* Return true if the queue has no element. */
#define QUEUE_IS_EMPTY(q) ((const queue *)(q) == (const queue *)QUEUE_NEXT(q))
/* Insert an element at the back of a queue. */
#define QUEUE_PUSH(q, e)               \
    {                                  \
        QUEUE_NEXT(e) = (q);           \
        QUEUE_PREV(e) = QUEUE_PREV(q); \
        QUEUE_PREV_NEXT(e) = (e);      \
        QUEUE_PREV(q) = (e);           \
    }
/* Remove the given element from the queue. Any element can be removed at any
 * time. */
#define QUEUE_REMOVE(e)                     \
    {                                       \
        QUEUE_PREV_NEXT(e) = QUEUE_NEXT(e); \
        QUEUE_NEXT_PREV(e) = QUEUE_PREV(e); \
    }
/* Return the element at the front of the queue. */
#define QUEUE_HEAD(q) (QUEUE_NEXT(q))
/* Return the element at the back of the queue. */
#define QUEUE_TAIL(q) (QUEUE_PREV(q))
/* Iterate over the elements of a queue. Mutating the queue while iterating
 * results in undefined behavior. */
#define QUEUE_FOREACH(q, e) \
    for ((q) = QUEUE_NEXT(e); (q) != (e); (q) = QUEUE_NEXT(q))
/* Return the structure holding the given element. */
#define QUEUE_DATA(e, type, field) \
    ((type *)((void *)((char *)(e)-offsetof(type, field))))
#endif /* QUEUE_H_*/
raft-0.22.1/src/raft.c 0000664 0000000 0000000 00000055045 14601504142 0014425 0 ustar 00root root 0000000 0000000 #include "../include/raft.h"
#include
#include
#include "assert.h"
#include "byte.h"
#include "client.h"
#include "configuration.h"
#include "convert.h"
#include "election.h"
#include "entry.h"
#include "err.h"
#include "heap.h"
#include "membership.h"
#include "message.h"
#include "progress.h"
#include "queue.h"
#include "random.h"
#include "recv.h"
#include "replication.h"
#include "restore.h"
#include "timeout.h"
#include "tracing.h"
#include "trail.h"
#ifndef RAFT__LEGACY_no
#include "legacy.h"
#include "log.h"
#endif
/* Default tunables, overridable via the raft_set_*() setters. */
#define DEFAULT_ELECTION_TIMEOUT 1000 /* One second */
#define DEFAULT_HEARTBEAT_TIMEOUT 100 /* One tenth of a second */
#define DEFAULT_INSTALL_SNAPSHOT_TIMEOUT 30000 /* 30 seconds */
#if !defined(RAFT__LEGACY_no)
/* Legacy-mode snapshot tuning: entries between snapshots, and entries kept
 * behind the latest snapshot. */
#define DEFAULT_SNAPSHOT_THRESHOLD 1024
#define DEFAULT_SNAPSHOT_TRAILING 2048
#endif
/* Maximum number of catch-up rounds before a server promotion is aborted, and
 * the maximum duration in milliseconds of a single round. */
#define DEFAULT_MAX_CATCH_UP_ROUNDS 10
#define DEFAULT_MAX_CATCH_UP_ROUND_DURATION (5 * 1000)
#define DEFAULT_MAX_INFLIGHT_ENTRIES 32
/* Tracing shorthand used throughout this file (expects `r` in scope). */
#define infof(...) Infof(r->tracer, "> " __VA_ARGS__)
/* Return the library's compile-time version number. */
int raft_version_number(void)
{
    return RAFT_VERSION_NUMBER;
}
#ifndef RAFT__LEGACY_no
/* Validate that both legacy interfaces declare which struct version they
 * implement. Return 0 on success, -1 (with r->errmsg set) otherwise. */
static int ioFsmVersionCheck(struct raft *r,
                             struct raft_io *io,
                             struct raft_fsm *fsm)
{
    int rv = 0;
    if (io->version == 0) {
        ErrMsgPrintf(r->errmsg, "io->version must be set");
        rv = -1;
    } else if (fsm->version == 0) {
        ErrMsgPrintf(r->errmsg, "fsm->version must be set");
        rv = -1;
    }
    return rv;
}
#endif
/* Initialize @r. When @io and @fsm are non-NULL the instance runs in legacy
 * mode, driving the raft_io/raft_fsm interfaces directly; otherwise the
 * caller drives the state machine through raft_step().
 *
 * Errors:
 *   RAFT_NOMEM - a copy of @address (or the legacy log) could not be
 *                allocated.
 *   Other codes may be propagated from io->init(). */
int raft_init(struct raft *r,
              struct raft_io *io,
              struct raft_fsm *fsm,
              const raft_id id,
              const char *address)
{
    int rv;
    assert(r != NULL);
    r->tracer = &StderrTracer;
    raft_tracer_maybe_enable(r->tracer, true);
    r->id = id;
    /* Make a copy of the address */
    r->address = RaftHeapMalloc(strlen(address) + 1);
    if (r->address == NULL) {
        ErrMsgOom(r->errmsg);
        rv = RAFT_NOMEM;
        goto err;
    }
    strcpy(r->address, address);
    /* Caches of the persisted state. */
    r->current_term = 0;
    r->voted_for = 0;
    TrailInit(&r->trail);
    raft_configuration_init(&r->configuration);
    raft_configuration_init(&r->configuration_committed);
    r->configuration_committed_index = 0;
    r->configuration_uncommitted_index = 0;
    r->configuration_last_snapshot_index = 0;
    /* Tunables (see raft_set_*() setters). */
    r->election_timeout = DEFAULT_ELECTION_TIMEOUT;
    r->heartbeat_timeout = DEFAULT_HEARTBEAT_TIMEOUT;
    r->install_snapshot_timeout = DEFAULT_INSTALL_SNAPSHOT_TIMEOUT;
    r->commit_index = 0;
    r->last_stored = 0;
    /* Every server starts as follower. */
    r->state = RAFT_FOLLOWER;
    r->follower_state.current_leader.id = 0;
    r->follower_state.current_leader.address = NULL;
    r->follower_state.match = 0;
    r->snapshot.installing = false;
    memset(r->errmsg, 0, sizeof r->errmsg);
    r->pre_vote = false;
    r->max_catch_up_rounds = DEFAULT_MAX_CATCH_UP_ROUNDS;
    r->max_catch_up_round_duration = DEFAULT_MAX_CATCH_UP_ROUND_DURATION;
    r->now = 0;
    r->messages = NULL;
    r->n_messages_cap = 0;
    r->max_inflight_entries = DEFAULT_MAX_INFLIGHT_ENTRIES;
    r->update = NULL;
    r->capacity = 0;
    r->capacity_threshold = 0;
#if defined(RAFT__LEGACY_no)
    (void)io;
    (void)fsm;
#else
    r->io = NULL;
    r->fsm = NULL;
    /* Legacy mode: initialize the raft_io backend and the compatibility
     * bookkeeping. */
    if (io != NULL) {
        assert(fsm != NULL);
        rv = ioFsmVersionCheck(r, io, fsm);
        if (rv != 0) {
            goto err_after_address_alloc;
        }
        r->io = io;
        r->fsm = fsm;
        r->last_applied = 0;
        r->close_cb = NULL;
        r->io->data = r;
        rv = r->io->init(r->io, r->id, r->address);
        if (rv != 0) {
            ErrMsgTransfer(r->io->errmsg, r->errmsg, "io");
            goto err_after_address_alloc;
        }
        r->now = r->io->time(r->io);
        raft_seed(r, (unsigned)r->io->random(r->io, 0, INT_MAX));
        r->legacy.prev_state = r->state;
        r->legacy.closing = false;
        QUEUE_INIT(&r->legacy.pending);
        QUEUE_INIT(&r->legacy.requests);
        r->legacy.step_cb = NULL;
        r->legacy.change = NULL;
        r->legacy.snapshot_index = 0;
        r->legacy.snapshot_taking = false;
        r->legacy.snapshot_install = false;
        r->legacy.snapshot_pending = NULL;
        r->transfer = NULL;
        r->legacy.log = logInit();
        r->legacy.snapshot_threshold = DEFAULT_SNAPSHOT_THRESHOLD;
        r->legacy.snapshot_trailing = DEFAULT_SNAPSHOT_TRAILING;
        if (r->legacy.log == NULL) {
            /* BUGFIX: the original jumped to the error path with rv still 0,
             * tripping the assert below (or returning success in release
             * builds). Report out-of-memory explicitly. */
            ErrMsgOom(r->errmsg);
            rv = RAFT_NOMEM;
            goto err_after_address_alloc;
        }
        r->capacity_threshold = 4 * 1024; /* 4 megabytes, i.e. 1 open segment */
    }
#endif
    return 0;

#ifndef RAFT__LEGACY_no
err_after_address_alloc:
    RaftHeapFree(r->address);
#endif
err:
    assert(rv != 0);
    return rv;
}
/* Synchronously release all memory owned by @r. */
static void finalClose(struct raft *r)
{
    raft_free(r->address);
    TrailClose(&r->trail);
#ifndef RAFT__LEGACY_no
    /* The legacy in-memory log exists only when a raft_io backend was passed
     * to raft_init(). */
    if (r->io != NULL) {
        logClose(r->legacy.log);
    }
#endif
    raft_configuration_close(&r->configuration);
    raft_configuration_close(&r->configuration_committed);
    if (r->messages != NULL) {
        raft_free(r->messages);
    }
}
#ifndef RAFT__LEGACY_no
/* Completion callback for the legacy raft_io close: finish releasing the raft
 * instance and notify the user, if a close callback was registered. */
static void ioCloseCb(struct raft_io *io)
{
    struct raft *r;
    r = io->data;
    finalClose(r);
    if (r->close_cb == NULL) {
        return;
    }
    r->close_cb(r);
}
#endif
/* Release all resources held by @r. In legacy mode the raft_io backend is
 * closed asynchronously and @cb is invoked once tear-down completes;
 * otherwise tear-down is synchronous and @cb is ignored. */
void raft_close(struct raft *r, void (*cb)(struct raft *r))
{
    /* Must not be called from within raft_step(). */
    assert(r->update == NULL);
    convertClear(r);
#if defined(RAFT__LEGACY_no)
    (void)cb;
    finalClose(r);
#else
    if (r->io != NULL) {
        assert(!r->legacy.closing);
        r->legacy.closing = true;
        if (r->transfer != NULL) {
            LegacyLeadershipTransferClose(r);
        }
        LegacyFailPendingRequests(r);
        LegacyFireCompletedRequests(r);
        r->close_cb = cb;
        /* finalClose() runs from ioCloseCb() once the backend is down. */
        r->io->close(r->io, ioCloseCb);
    } else {
        finalClose(r);
    }
#endif
}
/* Seed the pseudo-random number generator used e.g. for randomizing the
 * election timeout. */
void raft_seed(struct raft *r, unsigned random)
{
    r->random = random;
}
/* If this server is the only voter in the current configuration, skip the
 * election timeout and become leader immediately. */
static int maybeSelfElect(struct raft *r)
{
    const struct raft_server *self;
    int rv;
    self = configurationGet(&r->configuration, r->id);
    if (self == NULL) {
        return 0;
    }
    if (self->role != RAFT_VOTER) {
        return 0;
    }
    if (configurationVoterCount(&r->configuration) > 1) {
        return 0;
    }
    /* convertToCandidate() detects the single-voter case and promotes us
     * straight to leader. */
    rv = convertToCandidate(r, false /* disrupt leader */);
    if (rv != 0) {
        return rv;
    }
    assert(r->state == RAFT_LEADER);
    /* Send initial heartbeat. */
    replicationHeartbeat(r);
    return 0;
}
/* Emit a start message containing information about the current state.
 *
 * BUGFIX: the original omitted the ", " separator between the "term X" and
 * "voted for Y" fragments when no snapshot/entries followed, and left a
 * trailing ", " after "voted for Y" in that same case. Separators are now
 * emitted only when another fragment follows. */
static void stepStartEmitMessage(const struct raft *r)
{
    char msg[512] = {0};
    raft_index snapshot_index = TrailSnapshotIndex(&r->trail);
    unsigned n_entries = TrailNumEntries(&r->trail);
    if (r->current_term == 0) {
        strcat(msg, "no state");
        goto emit;
    }
    if (r->current_term > 0) {
        char msg_term[64];
        sprintf(msg_term, "term %llu", r->current_term);
        strcat(msg, msg_term);
        if (r->voted_for > 0 || snapshot_index > 0 || n_entries > 0) {
            strcat(msg, ", ");
        }
    }
    if (r->voted_for > 0) {
        char msg_vote[64];
        sprintf(msg_vote, "voted for %llu", r->voted_for);
        strcat(msg, msg_vote);
        if (snapshot_index > 0 || n_entries > 0) {
            strcat(msg, ", ");
        }
    }
    if (snapshot_index) {
        char msg_snapshot[64];
        sprintf(msg_snapshot, "1 snapshot (%llu^%llu)", snapshot_index,
                TrailSnapshotTerm(&r->trail));
        strcat(msg, msg_snapshot);
        if (n_entries > 0) {
            strcat(msg, ", ");
        }
    }
    if (n_entries > 0) {
        char msg_entries[64];
        raft_index first =
            TrailLastIndex(&r->trail) - TrailNumEntries(&r->trail) + 1;
        if (n_entries == 1) {
            sprintf(msg_entries, "1 entry (%llu^%llu)", first,
                    TrailTermOf(&r->trail, first));
        } else {
            raft_index last = TrailLastIndex(&r->trail);
            sprintf(msg_entries, "%u entries (%llu^%llu..%llu^%llu)", n_entries,
                    first, TrailTermOf(&r->trail, first), last,
                    TrailTermOf(&r->trail, last));
        }
        strcat(msg, msg_entries);
    }
emit:
    infof("%s", msg);
}
/* Handle a RAFT_START event: restore the persisted term, vote, optional
 * snapshot and log entries loaded from storage, then possibly self-elect.
 * On failure the entry batches are destroyed before returning. */
static int stepStart(struct raft *r,
                     raft_term term,
                     raft_id voted_for,
                     struct raft_snapshot_metadata *metadata,
                     raft_index start_index,
                     struct raft_entry *entries,
                     unsigned n_entries)
{
    raft_index snapshot_index = 0;
    raft_term snapshot_term = 0;
    int rv;
    r->current_term = term;
    r->voted_for = voted_for;
    /* If no term is set, there must be no persisted state. */
    if (r->current_term == 0) {
        assert(r->voted_for == 0);
        assert(metadata == NULL);
        assert(n_entries == 0);
    }
    if (metadata != NULL) {
        snapshot_index = metadata->index;
        snapshot_term = metadata->term;
        rv = RestoreSnapshot(r, metadata);
        if (rv != 0) {
            entryBatchesDestroy(entries, n_entries);
            return rv;
        }
    } else if (n_entries > 0) {
        /* If we don't have a snapshot and the on-disk log is not empty, then
         * the first entry must be a configuration entry. */
        assert(start_index == 1);
        assert(entries[0].type == RAFT_CHANGE);
        /* As a small optimization, bump the commit index to 1 since we require
         * the first entry to be the same on all servers. */
        r->commit_index = 1;
        r->update->flags |= RAFT_UPDATE_COMMIT_INDEX;
    }
    /* Append the entries to the log, possibly restoring the last
     * configuration. */
    rv = RestoreEntries(r, snapshot_index, snapshot_term, start_index, entries,
                        n_entries);
    if (rv != 0) {
        entryBatchesDestroy(entries, n_entries);
        return rv;
    }
    stepStartEmitMessage(r);
    /* By default we start as followers. */
    assert(r->state == RAFT_FOLLOWER);
    electionResetTimer(r);
    r->follower_state.current_leader.id = 0;
    r->follower_state.current_leader.address = NULL;
    /* If there's only one voting server, and that is us, it's safe to convert
     * to leader right away. If that is not us, we're either joining the
     * cluster or we're simply configured as non-voter, and we'll stay
     * follower. */
    rv = maybeSelfElect(r);
    if (rv != 0) {
        return rv;
    }
    return 0;
}
/* Handle a RAFT_PERSISTED_ENTRIES event: all entries up to @index have been
 * durably stored.
 *
 * BUGFIX: the multi-entry trace message said "entry" instead of "entries";
 * also the two term locals were declared as raft_index. */
static int stepPersistedEntries(struct raft *r, raft_index index)
{
    raft_index first_index;
    raft_term first_term;
    raft_term last_term;
    unsigned n;
    /* The newly persisted index must be greater than our previous last stored
     * mark. */
    assert(index > r->last_stored);
    n = (unsigned)(index - r->last_stored);
    first_index = index - n + 1;
    assert(TrailLastIndex(&r->trail) >= index);
    first_term = TrailTermOf(&r->trail, first_index);
    last_term = TrailTermOf(&r->trail, index);
    assert(first_term > 0);
    assert(last_term > 0);
    if (n == 1) {
        infof("persisted 1 entry (%llu^%llu)", first_index, first_term);
    } else {
        infof("persisted %u entries (%llu^%llu..%llu^%llu)", n, first_index,
              first_term, index, last_term);
    }
    return replicationPersistEntriesDone(r, index);
}
/* Handle a RAFT_PERSISTED_SNAPSHOT event: a snapshot chunk (possibly the
 * final one, per @last) has been durably stored at @offset. */
static int stepPersistedSnapshot(struct raft *r,
                                 struct raft_snapshot_metadata *metadata,
                                 size_t offset,
                                 bool last)
{
    /* No new writes are issued while candidate, and all in-flight writes are
     * settled before converting to candidate, so only followers and leaders
     * can observe a snapshot persistence completion. */
    assert(r->state == RAFT_FOLLOWER || r->state == RAFT_LEADER);
    infof("persisted snapshot (%llu^%llu)", metadata->index, metadata->term);
    return replicationPersistSnapshotDone(r, metadata, offset, last);
}
/* Handle a RAFT_RECEIVE event: trace the incoming message type and dispatch
 * it to the RPC receive logic. */
static int stepReceive(struct raft *r, struct raft_message *message)
{
    const char *desc = "unknown message";
    if (message->type == RAFT_REQUEST_VOTE) {
        desc = "request vote";
    } else if (message->type == RAFT_REQUEST_VOTE_RESULT) {
        desc = "request vote result";
    } else if (message->type == RAFT_APPEND_ENTRIES) {
        desc = "append entries";
    } else if (message->type == RAFT_APPEND_ENTRIES_RESULT) {
        desc = "append entries result";
    } else if (message->type == RAFT_INSTALL_SNAPSHOT) {
        desc = "install snapshot";
    } else if (message->type == RAFT_TIMEOUT_NOW) {
        desc = "timeout now";
    }
    infof("recv %s from server %llu", desc, message->server_id);
    return recvMessage(r, message);
}
/* Handle a RAFT_SNAPSHOT event: a new snapshot has been taken, keeping
 * @trailing entries behind it. */
int stepSnapshot(struct raft *r,
                 struct raft_snapshot_metadata *metadata,
                 unsigned trailing)
{
    infof("new snapshot (%llu^%llu), %u trailing entr%s", metadata->index,
          metadata->term, trailing, trailing == 1 ? "y" : "ies");
    return replicationSnapshot(r, metadata, trailing);
}
/* Handle a RAFT_TIMEOUT event, tracing the state we time out in. */
int stepTimeout(struct raft *r)
{
    infof("timeout as %s", raft_state_name(r->state));
    return Timeout(r);
}
int raft_step(struct raft *r,
struct raft_event *event,
struct raft_update *update)
{
int rv;
assert(event != NULL);
assert(update != NULL);
assert(r->update == NULL);
r->update = update;
r->update->flags = 0;
r->update->messages.batch = r->messages;
r->update->messages.n = 0;
r->now = event->time;
r->capacity = event->capacity;
/* Possibly update this server's capacity in the progress array. */
if (r->state == RAFT_LEADER) {
unsigned i = configurationIndexOf(&r->configuration, r->id);
if (i < r->configuration.n) {
progressSetFeatures(r, i, MESSAGE__FEATURE_CAPACITY);
progressSetCapacity(r, i, r->capacity);
}
}
switch (event->type) {
case RAFT_START:
rv = stepStart(r, event->start.term, event->start.voted_for,
event->start.metadata, event->start.start_index,
event->start.entries, event->start.n_entries);
break;
case RAFT_PERSISTED_ENTRIES:
rv = stepPersistedEntries(r, event->persisted_entries.index);
break;
case RAFT_PERSISTED_SNAPSHOT:
rv = stepPersistedSnapshot(r, &event->persisted_snapshot.metadata,
event->persisted_snapshot.offset,
event->persisted_snapshot.last);
break;
case RAFT_RECEIVE:
rv = stepReceive(r, event->receive.message);
break;
case RAFT_CONFIGURATION:
rv = replicationApplyConfigurationChange(
r, &event->configuration.conf, event->configuration.index);
break;
case RAFT_SNAPSHOT:
rv = stepSnapshot(r, &event->snapshot.metadata,
event->snapshot.trailing);
break;
case RAFT_TIMEOUT:
rv = stepTimeout(r);
break;
case RAFT_SUBMIT:
infof("submit %u new client entr%s", event->submit.n,
event->submit.n == 1 ? "y" : "ies");
rv = ClientSubmit(r, event->submit.entries, event->submit.n);
break;
case RAFT_CATCH_UP:
infof("catch-up server %llu", event->catch_up.server_id);
ClientCatchUp(r, event->catch_up.server_id);
rv = 0;
break;
case RAFT_TRANSFER:
infof("transfer leadership to %llu", event->transfer.server_id);
rv = ClientTransfer(r, event->transfer.server_id);
break;
default:
rv = RAFT_INVALID;
break;
}
if (rv != 0) {
goto out;
}
out:
r->update = NULL;
if (rv != 0) {
return rv;
}
return 0;
}
/* Return the latest term this server has seen. */
raft_term raft_current_term(const struct raft *r)
{
    return r->current_term;
}
/* Return the server voted for in the current term, or 0 if none. Note: the
 * public return type is raft_term although the value is semantically a
 * server ID. */
raft_term raft_voted_for(const struct raft *r)
{
    return r->voted_for;
}
/* Return the index of the highest log entry known to be committed. */
raft_index raft_commit_index(const struct raft *r)
{
    return r->commit_index;
}
/* Return the time at which the next leader-side timeout (heartbeat or quorum
 * check) should be triggered. */
static raft_time leaderTimeout(const struct raft *r)
{
    raft_time oldest_send = ULLONG_MAX;
    raft_time heartbeat_due;
    raft_time quorum_due;
    unsigned i;
    /* Find the oldest last_send timestamp across all servers. */
    for (i = 0; i < r->configuration.n; i++) {
        raft_time t = progressGetLastSend(r, i);
        if (t < oldest_send) {
            oldest_send = t;
        }
    }
    quorum_due = r->election_timer_start + r->election_timeout;
    /* We always send a heartbeat at the beginning of our term, so if all
     * last_send timestamps are still ULLONG_MAX there are no voters or
     * stand-bys to heartbeat: only the quorum check deadline matters. */
    if (oldest_send == ULLONG_MAX) {
        return quorum_due;
    }
    /* The next timeout is whichever of the two comes first. */
    heartbeat_due = oldest_send + r->heartbeat_timeout;
    return heartbeat_due < quorum_due ? heartbeat_due : quorum_due;
}
/* Return the time at which raft_step() should next be called with a
 * RAFT_TIMEOUT event, given the current state. */
raft_time raft_timeout(const struct raft *r)
{
    if (r->state == RAFT_FOLLOWER || r->state == RAFT_CANDIDATE) {
        return electionTimerExpiration(r);
    }
    if (r->state == RAFT_LEADER) {
        /* The next timeout is either for heartbeat or a quorum check. */
        return leaderTimeout(r);
    }
    return 0;
}
/* Fetch the highest log index known to be replicated on the server with the
 * given @id. Fails with RAFT_NOTLEADER unless we're leader, or RAFT_BADID if
 * the server is not in the current configuration. */
int raft_match_index(const struct raft *r, raft_id id, raft_index *index)
{
    unsigned slot;
    if (r->state != RAFT_LEADER) {
        return RAFT_NOTLEADER;
    }
    slot = configurationIndexOf(&r->configuration, id);
    if (slot == r->configuration.n) {
        return RAFT_BADID;
    }
    *index = progressMatchIndex(r, slot);
    return 0;
}
/* Fetch the catch-up status of the server with the given @id. Fails with
 * RAFT_NOTLEADER unless we're leader, or RAFT_BADID if the server is not in
 * the current configuration. */
int raft_catch_up(const struct raft *r, raft_id id, int *status)
{
    unsigned slot;
    if (r->state != RAFT_LEADER) {
        return RAFT_NOTLEADER;
    }
    slot = configurationIndexOf(&r->configuration, id);
    if (slot == r->configuration.n) {
        return RAFT_BADID;
    }
    *status = progressCatchUpStatus(r, slot);
    return 0;
}
/* Return the ID of the server leadership is being transferred to, or 0 when
 * no transfer is in progress (or we're not leader). */
raft_id raft_transferee(const struct raft *r)
{
    return r->state == RAFT_LEADER ? r->leader_state.transferee : 0;
}
/* Set the election timeout (milliseconds) and, if currently follower or
 * candidate, refresh the randomized election timer accordingly. */
void raft_set_election_timeout(struct raft *r, const unsigned msecs)
{
    r->election_timeout = msecs;
    /* FIXME: workaround for failures in the dqlite test suite, which sets
     * timeouts too low and ends up failing when run on slow hardware. */
#ifndef RAFT__LEGACY_no
    if (r->io != NULL && r->election_timeout == 150 &&
        r->heartbeat_timeout == 15) {
        r->election_timeout *= 3;
        r->heartbeat_timeout *= 3;
    }
#endif
    switch (r->state) {
        case RAFT_FOLLOWER:
        case RAFT_CANDIDATE:
            electionUpdateRandomizedTimeout(r);
            break;
    }
}
/* Set the heartbeat interval used when leader (milliseconds). */
void raft_set_heartbeat_timeout(struct raft *r, const unsigned msecs)
{
    r->heartbeat_timeout = msecs;
}
/* Set how long an InstallSnapshot round may take before being aborted
 * (milliseconds). */
void raft_set_install_snapshot_timeout(struct raft *r, const unsigned msecs)
{
    r->install_snapshot_timeout = msecs;
}
/* Enable or disable the pre-vote protocol extension. */
void raft_set_pre_vote(struct raft *r, bool enabled)
{
    r->pre_vote = enabled;
}
/* Set the maximum number of catch-up rounds attempted during a promotion. */
void raft_set_max_catch_up_rounds(struct raft *r, unsigned n)
{
    r->max_catch_up_rounds = n;
}
/* Set the maximum duration of a single catch-up round (milliseconds). */
void raft_set_max_catch_up_round_duration(struct raft *r, unsigned msecs)
{
    r->max_catch_up_round_duration = msecs;
}
/* Set the maximum number of sent-but-unacknowledged entries per follower. */
void raft_set_max_inflight_entries(struct raft *r, unsigned n)
{
    r->max_inflight_entries = n;
}
/* Set the capacity threshold (presumably a minimum free-space level, see the
 * "4 megabytes" default in raft_init() — TODO confirm units). */
void raft_set_capacity_threshold(struct raft *r, unsigned short min)
{
    r->capacity_threshold = min;
}
/* Return a human-readable description of the last error occurred on @r. */
const char *raft_errmsg(struct raft *r)
{
    return r->errmsg;
}
/* Return a static string describing the given raft error code. */
const char *raft_strerror(int errnum)
{
    return errCodeToString(errnum);
}
/* Public wrappers around the internal configuration helpers. */
void raft_configuration_init(struct raft_configuration *c)
{
    configurationInit(c);
}
/* Release all memory held by @c. */
void raft_configuration_close(struct raft_configuration *c)
{
    configurationClose(c);
}
/* Add a server with the given id, address and role to @c. */
int raft_configuration_add(struct raft_configuration *c,
                           const raft_id id,
                           const char *address,
                           const int role)
{
    return configurationAdd(c, id, address, role);
}
/* Serialize @c into @buf. */
int raft_configuration_encode(const struct raft_configuration *c,
                              struct raft_buffer *buf)
{
    return configurationEncode(c, buf);
}
/* Parse a configuration previously produced by raft_configuration_encode().
 */
int raft_configuration_decode(const struct raft_buffer *buf,
                              struct raft_configuration *c)
{
    return configurationDecode(buf, c);
}
/* Compute a 64-bit digest of @text combined with @n, by hashing both with
 * SHA-1 and folding the tail of the hash into an integer. */
unsigned long long raft_digest(const char *text, unsigned long long n)
{
    struct byteSha1 sha1;
    uint8_t hash[20];
    uint64_t n_flipped = byteFlip64((uint64_t)n);
    uint64_t tail;
    byteSha1Init(&sha1);
    byteSha1Update(&sha1, (const uint8_t *)text, (uint32_t)strlen(text));
    byteSha1Update(&sha1, (const uint8_t *)&n_flipped,
                   (uint32_t)(sizeof n_flipped));
    byteSha1Digest(&sha1, hash);
    /* Take the last 8 of the 20 SHA-1 bytes as the digest value. */
    memcpy(&tail, hash + (sizeof hash - sizeof tail), sizeof tail);
    return byteFlip64(tail);
}
/* Public wrapper around the internal PRNG: return a number in [min, max],
 * advancing *state. */
unsigned raft_random(unsigned *state, unsigned min, unsigned max)
{
    return RandomWithinRange(state, min, max);
}
/* Return a human-readable name for the given state code, or NULL if the code
 * is unknown. */
const char *raft_state_name(int state)
{
    if (state == RAFT_FOLLOWER) {
        return "follower";
    }
    if (state == RAFT_CANDIDATE) {
        return "candidate";
    }
    if (state == RAFT_LEADER) {
        return "leader";
    }
    return NULL;
}
/* Return a human-readable name for the given role code, or NULL if the code
 * is unknown. */
const char *raft_role_name(int role)
{
    if (role == RAFT_VOTER) {
        return "voter";
    }
    if (role == RAFT_STANDBY) {
        return "stand-by";
    }
    if (role == RAFT_SPARE) {
        return "spare";
    }
    return NULL;
}
#undef infof
raft-0.22.1/src/random.c 0000664 0000000 0000000 00000002404 14601504142 0014740 0 ustar 00root root 0000000 0000000 #include
#include "assert.h"
#include "random.h"
#define RANDOM_MULTIPLIER (747796405U)
#define RANDOM_INCREMENT (1729U)

/* Advance the PCG-style generator state and return a 32-bit output derived
 * from the pre-advance state. */
static uint32_t randomAdvance(unsigned *random)
{
    uint32_t state;
    uint32_t n;
    state = *random;
    n = ((state >> ((state >> 28) + 4)) ^ state) * (277803737U);
    n ^= n >> 22;
    *random = state * RANDOM_MULTIPLIER + RANDOM_INCREMENT;
    return n;
}

/* Generate a uniformly distributed random number in [0, max], inclusive.
 *
 * Uses rejection sampling: outputs below 2^32 mod (max + 1) are discarded so
 * that the final modulo does not bias low values.
 *
 * BUGFIX: the original computed the rejection threshold and final modulo
 * inconsistently — the threshold used `% max` while the result used
 * `% (max + 1)`, which is slightly biased, and `% max` divides by zero when
 * max == 0 (reachable via RandomWithinRange(state, t, t)). */
static uint32_t randomAtMost(unsigned *random, uint32_t max)
{
    uint32_t bound;
    uint32_t min;
    uint32_t x;
    /* Full 32-bit range: every output is acceptable. */
    if (max == (~((uint32_t)0U))) {
        return randomAdvance(random);
    }
    /* Number of distinct values wanted; always in [1, 2^32 - 1] here. */
    bound = max + 1;
    /* (2^32) % bound, computed as (2^32 - bound) % bound to stay within
     * 32-bit unsigned arithmetic. */
    min = (~bound + 1U) % bound;
    do {
        x = randomAdvance(random);
    } while (x < min);
    return x % bound;
}
/* Return a random number in [min, max], advancing *state. */
unsigned RandomWithinRange(unsigned *state, unsigned min, unsigned max)
{
    uint64_t span;
    assert(min <= max);
    span = (uint64_t)max - (uint64_t)min;
    /* Clamp the span to what the 32-bit generator can produce. */
    if (span > (~((uint32_t)0U))) {
        span = (~((uint32_t)0U));
    }
    return min + (unsigned)randomAtMost(state, (uint32_t)span);
}
raft-0.22.1/src/random.h 0000664 0000000 0000000 00000000360 14601504142 0014744 0 ustar 00root root 0000000 0000000 /* Pseudo-random number generator. */
#ifndef RAFT_RANDOM_H_
#define RAFT_RANDOM_H_
/* Generate a random number between min and max. */
unsigned RandomWithinRange(unsigned *state, unsigned min, unsigned max);
#endif /* RAFT_RANDOM_H_ */
raft-0.22.1/src/recv.c 0000664 0000000 0000000 00000011214 14601504142 0014416 0 ustar 00root root 0000000 0000000 #include "recv.h"
#include "assert.h"
#include "convert.h"
#include "entry.h"
#include "heap.h"
#include "membership.h"
#include "message.h"
#include "recv_append_entries.h"
#include "recv_append_entries_result.h"
#include "recv_install_snapshot.h"
#include "recv_request_vote.h"
#include "recv_request_vote_result.h"
#include "recv_timeout_now.h"
#include "string.h"
#include "tracing.h"
#define infof(...) Infof(r->tracer, " " __VA_ARGS__)
/* Dispatch a single RPC message to the appropriate handler. Messages of
 * unknown type are silently dropped, as are RAFT_NOCONNECTION failures. */
int recvMessage(struct raft *r, struct raft_message *message)
{
    raft_id id = message->server_id;
    const char *address = message->server_address;
    int rv;
    if (message->type == RAFT_APPEND_ENTRIES) {
        rv = recvAppendEntries(r, id, address, &message->append_entries);
    } else if (message->type == RAFT_APPEND_ENTRIES_RESULT) {
        rv = recvAppendEntriesResult(r, id, address,
                                     &message->append_entries_result);
    } else if (message->type == RAFT_REQUEST_VOTE) {
        rv = recvRequestVote(r, id, address, &message->request_vote);
    } else if (message->type == RAFT_REQUEST_VOTE_RESULT) {
        rv = recvRequestVoteResult(r, id, address,
                                   &message->request_vote_result);
    } else if (message->type == RAFT_INSTALL_SNAPSHOT) {
        rv = recvInstallSnapshot(r, id, address, &message->install_snapshot);
        /* Already installing a snapshot, wait for it and ignore this one */
        if (rv == RAFT_BUSY) {
            raft_free(message->install_snapshot.data.base);
            raft_configuration_close(&message->install_snapshot.conf);
            rv = 0;
        }
    } else if (message->type == RAFT_TIMEOUT_NOW) {
        rv = recvTimeoutNow(r, id, address, &message->timeout_now);
    } else {
        /* Drop message */
        return 0;
    }
    if (rv != 0 && rv != RAFT_NOCONNECTION) {
        return rv;
    }
    return 0;
}
/* Bump the current term to @term (which must be higher), clear the vote, and
 * step down to follower if currently candidate or leader.
 *
 * BUGFIX: the trace message used %lld for raft_term values; the rest of the
 * codebase prints terms with %llu (they are unsigned), so %lld would misprint
 * terms with the top bit set. */
void recvBumpCurrentTerm(struct raft *r, raft_term term)
{
    char msg[128];
    assert(r != NULL);
    assert(term > r->current_term);
    sprintf(msg, "remote term is higher (%llu vs %llu) -> bump term", term,
            r->current_term);
    if (r->state != RAFT_FOLLOWER) {
        strcat(msg, ", step down");
    }
    infof("%s", msg);
    /* Mark both the current term and vote as changed. */
    r->update->flags |= RAFT_UPDATE_CURRENT_TERM | RAFT_UPDATE_VOTED_FOR;
    /* Update our cache too. */
    r->current_term = term;
    r->voted_for = 0;
    if (r->state != RAFT_FOLLOWER) {
        /* Also convert to follower. */
        convertToFollower(r);
    }
    /* Reset the match index, because we don't know anything about the leader
     * of this new term yet. */
    r->follower_state.match = 0;
}
/* Compare @term with the server's current term: -1 if @term is lower, 1 if
 * higher, 0 if they match. */
int recvCheckMatchingTerms(const struct raft *r, raft_term term)
{
    if (term < r->current_term) {
        return -1;
    }
    if (term > r->current_term) {
        return 1;
    }
    return 0;
}
/* Compare @term with our current term and, when it is higher, bump our term
 * to match (stepping down to follower if needed). Returns the comparison
 * result as in recvCheckMatchingTerms(). */
int recvEnsureMatchingTerms(struct raft *r, raft_term term)
{
    int match;
    assert(r != NULL);
    match = recvCheckMatchingTerms(r, term);
    /* From Figure 3.1:
     *
     *   Rules for Servers: All Servers: If RPC request or response contains
     *   term T > currentTerm: set currentTerm = T, convert to follower.
     *
     * From state diagram in Figure 3.3:
     *
     *   [leader]: discovers server with higher term -> [follower]
     *
     * From Section 3.3:
     *
     *   If a candidate or leader discovers that its term is out of date, it
     *   immediately reverts to follower state.
     */
    if (match == 1) {
        recvBumpCurrentTerm(r, term);
    }
    return match;
}
/* If different from the current one, update information about the current
 * leader. Must be called only by followers.
 *
 * BUGFIX: the original freed the old address before attempting the new
 * allocation, so on RAFT_NOMEM the previously known leader address was lost.
 * Allocate first, then swap, so failure leaves the old address intact.
 *
 * Errors:
 *
 *   RAFT_NOMEM
 *       A copy of @address could not be made
 */
int recvUpdateLeader(struct raft *r, const raft_id id, const char *address)
{
    char *copy;
    assert(r->state == RAFT_FOLLOWER);
    r->follower_state.current_leader.id = id;
    /* If the address of the current leader is the same as the given one,
     * we're done. */
    if (r->follower_state.current_leader.address != NULL &&
        strcmp(address, r->follower_state.current_leader.address) == 0) {
        return 0;
    }
    copy = RaftHeapMalloc(strlen(address) + 1);
    if (copy == NULL) {
        return RAFT_NOMEM;
    }
    strcpy(copy, address);
    if (r->follower_state.current_leader.address != NULL) {
        RaftHeapFree(r->follower_state.current_leader.address);
    }
    r->follower_state.current_leader.address = copy;
    return 0;
}
#undef infof
raft-0.22.1/src/recv.h 0000664 0000000 0000000 00000003251 14601504142 0014425 0 ustar 00root root 0000000 0000000 /* Receive an RPC message. */
#ifndef RECV_H_
#define RECV_H_
#include "../include/raft.h"
/* Function to be invoked upon receiving an RPC message. */
int recvMessage(struct raft *r, struct raft_message *message);
/* Compare a request's term with the server's current term.
*
* Return 0 if the local term matches the request's term, to -1 if the request's
* term is lower, and to 1 if the request's term is higher. */
int recvCheckMatchingTerms(const struct raft *r, raft_term term);
/* Bump the current term and possibly step down from candidate or leader
* state. */
void recvBumpCurrentTerm(struct raft *r, raft_term term);
/* Common logic for RPC handlers, comparing the request's term with the server's
* current term and possibly deciding to reject the request or step down from
* candidate or leader.
*
* From Section 3.3:
*
* If a candidate or leader discovers that its term is out of date, it
* immediately reverts to follower state. If a server receives a request with
* a stale term number, it rejects the request.
*
* The return value will be set to 0 if the local term matches the request's
* term, to -1 if the request's term is lower, and to 1 if the request's term
* was higher and we have bumped the local one to match it (and stepped down to
* follower in that case, if we were not follower already). */
int recvEnsureMatchingTerms(struct raft *r, raft_term term);
/* If different from the current one, update information about the current
* leader. Must be called only by followers.
*
* Errors:
*
* RAFT_NOMEM
* A copy of @address could not be made
*/
int recvUpdateLeader(struct raft *r, raft_id id, const char *address);
#endif /* RECV_H_ */
raft-0.22.1/src/recv_append_entries.c 0000664 0000000 0000000 00000014243 14601504142 0017503 0 ustar 00root root 0000000 0000000 #include "recv_append_entries.h"
#include "assert.h"
#include "convert.h"
#include "entry.h"
#include "heap.h"
#include "message.h"
#include "recv.h"
#include "replication.h"
#include "tracing.h"
#include "trail.h"
#define infof(...) Infof(r->tracer, " " __VA_ARGS__)
#define tracef(...) Tracef(r->tracer, __VA_ARGS__)
/* Process an AppendEntries RPC from the server with the given @id and
 * @address.
 *
 * Unless the entries are handed off for asynchronous persistence (in which
 * case the result message is produced later), an AppendEntries result is
 * enqueued before returning. The entries batch attached to @args is released
 * here on every path except the asynchronous one. */
int recvAppendEntries(struct raft *r,
                      raft_id id,
                      const char *address,
                      const struct raft_append_entries *args)
{
    struct raft_message message;
    struct raft_append_entries_result *result = &message.append_entries_result;
    raft_index last_index;
    int match; /* <0: request term lower; 0: equal; >0: ours was bumped. */
    bool async; /* Whether entries are being persisted asynchronously. */
    int rv;
    assert(r != NULL);
    assert(id > 0);
    assert(args != NULL);
    assert(address != NULL);
    result->rejected = args->prev_log_index;
    result->version = MESSAGE__APPEND_ENTRIES_RESULT_VERSION;
    result->features = MESSAGE__FEATURE_CAPACITY;
    match = recvEnsureMatchingTerms(r, args->term);
    /* From Figure 3.1:
     *
     *   AppendEntries RPC: Receiver implementation: Reply false if term <
     *   currentTerm.
     */
    if (match < 0) {
        infof("local term is higher (%llu vs %llu) -> reject", r->current_term,
              args->term);
        goto reply;
    }
    /* If we get here it means that the term in the request matches our current
     * term or it was higher and we have possibly stepped down, because we
     * discovered the current leader:
     *
     * From Figure 3.1:
     *
     *   Rules for Servers: Candidates: if AppendEntries RPC is received from
     *   new leader: convert to follower.
     *
     * From Section 3.4:
     *
     *   While waiting for votes, a candidate may receive an AppendEntries RPC
     *   from another server claiming to be leader. If the leader's term
     *   (included in its RPC) is at least as large as the candidate's current
     *   term, then the candidate recognizes the leader as legitimate and
     *   returns to follower state. If the term in the RPC is smaller than the
     *   candidate's current term, then the candidate rejects the RPC and
     *   continues in candidate state.
     *
     * From state diagram in Figure 3.3:
     *
     *   [candidate]: discovers current leader -> [follower]
     *
     * Note that it should not be possible for us to be in leader state, because
     * the leader that is sending us the request should have either a lower term
     * (and in that case we reject the request above), or a higher term (and in
     * that case we step down). It can't have the same term because at most one
     * leader can be elected at any given term.
     */
    assert(r->state == RAFT_FOLLOWER || r->state == RAFT_CANDIDATE);
    assert(r->current_term == args->term);
    if (r->state == RAFT_CANDIDATE) {
        /* The current term and the peer one must match, otherwise we would have
         * either rejected the request or stepped down to followers. */
        assert(match == 0);
        infof("discovered leader (%llu) -> step down ", id);
        convertToFollower(r);
    }
    assert(r->state == RAFT_FOLLOWER);
    /* Update current leader because the term in this AppendEntries RPC is up to
     * date. */
    rv = recvUpdateLeader(r, id, address);
    if (rv != 0) {
        assert(rv == RAFT_NOMEM);
        return rv;
    }
    /* Reset the election timer. */
    r->election_timer_start = r->now;
    r->update->flags |= RAFT_UPDATE_TIMEOUT;
    /* If we are installing a snapshot, ignore these entries. TODO: we should do
     * something smarter, e.g. buffering the entries in the I/O backend, which
     * should be in charge of serializing everything. */
    if (r->snapshot.installing && args->n_entries > 0) {
        infof("snapshot install in progress -> ignore");
        /* The guard above guarantees n_entries > 0, so a batch is present and
         * must be released before bailing out. */
        assert(args->entries[0].batch != NULL);
        raft_free(args->entries[0].batch);
        return 0;
    }
    rv = replicationAppend(r, args, &result->rejected, &async);
    if (rv != 0 && rv != RAFT_BUSY) {
        goto err;
    }
    if (async) {
        /* Entries (and their batch) are now owned by the replication code;
         * the result will be produced once persistence completes. */
        return 0;
    }
    /* Set the last_log_index field of the response. */
    last_index = TrailLastIndex(&r->trail);
    if (result->rejected > 0) {
        /* In case of rejection we have two cases:
         *
         * 1. If our log is shorter and is missing the entry at #rejected, then
         *    we set last_log_index to our actual last log index.
         * 2. If our log is equal or longer, but the entry at #rejected has a
         *    different term, then we set last_log_index to #rejected - 1 and
         *    the leader will eventually retry with that index. */
        result->last_log_index = last_index;
        if (result->last_log_index >= result->rejected) {
            result->last_log_index = result->rejected - 1;
        }
    } else {
        /* In case of synchronous success we expect to have all entries, and no
         * new entry needs to be persisted. However we might still be persisting
         * some of them, so we set last_log_index to the index of the last
         * stored index that is lower or equal than the last index in this
         * message.
         *
         * We use a stored index instead of an in-memory one because the leader
         * will use it to update our match index and to check quorum. */
        result->last_log_index = args->prev_log_index + args->n_entries;
        assert(last_index >= result->last_log_index);
        if (result->last_log_index > r->last_stored) {
            result->last_log_index = r->last_stored;
        }
    }
reply:
    result->term = r->current_term;
    /* Free the entries batch, if any. */
    if (args->n_entries) {
        assert(args->entries[0].batch != NULL);
        raft_free(args->entries[0].batch);
    }
    result->capacity = r->capacity;
    message.type = RAFT_APPEND_ENTRIES_RESULT;
    message.server_id = id;
    message.server_address = address;
    rv = MessageEnqueue(r, &message);
    if (rv != 0) {
        /* The batch was already released above: return directly instead of
         * jumping to the err label, which would free it a second time. */
        return rv;
    }
    return 0;
err:
    assert(rv != 0);
    if (args->n_entries) {
        assert(args->entries[0].batch != NULL);
        raft_free(args->entries[0].batch);
    }
    return rv;
}
#undef infof
#undef tracef
raft-0.22.1/src/recv_append_entries.h 0000664 0000000 0000000 00000000627 14601504142 0017511 0 ustar 00root root 0000000 0000000 /* Receive an AppendEntries message. */
#ifndef RECV_APPEND_ENTRIES_H_
#define RECV_APPEND_ENTRIES_H_
#include "../include/raft.h"
/* Process an AppendEntries RPC from the given server. */
int recvAppendEntries(struct raft *r,
raft_id id,
const char *address,
const struct raft_append_entries *args);
#endif /* RECV_APPEND_ENTRIES_H_ */
raft-0.22.1/src/recv_append_entries_result.c 0000664 0000000 0000000 00000004434 14601504142 0021102 0 ustar 00root root 0000000 0000000 #include "recv_append_entries_result.h"
#include "assert.h"
#include "configuration.h"
#include "recv.h"
#include "replication.h"
#include "tracing.h"
#define infof(...) Infof(r->tracer, " " __VA_ARGS__)
#define tracef(...) Tracef(r->tracer, __VA_ARGS__)
/* Process an AppendEntries result from the server with the given @id and
 * update its replication progress. Results received while not leader, with a
 * stale term, or from servers no longer in the configuration are ignored. */
int recvAppendEntriesResult(struct raft *r,
                            const raft_id id,
                            const char *address,
                            const struct raft_append_entries_result *result)
{
    const struct raft_server *server;
    int match; /* <0: result term lower; 0: equal; >0: ours was bumped. */
    int rv;
    assert(r != NULL);
    assert(id > 0);
    assert(address != NULL);
    assert(result != NULL);
    /* XXX: Up to version 0.19.1 followers were erroneously setting
     * last_log_index to whatever their last log index was, regardless of the
     * index being rejected. If we detect such a case, we manually amend it
     * here. This code can be dropped once sufficient time has passed that we
     * are confident that no server is running the old buggy code. */
    if (result->rejected > 0 && result->last_log_index >= result->rejected) {
        /* Deliberate cast away of const: the received message is amended in
         * place. */
        ((struct raft_append_entries_result *)result)->last_log_index =
            result->rejected - 1;
    }
    if (r->state != RAFT_LEADER) {
        infof("local server is not leader -> ignore");
        return 0;
    }
    /* Note: this may bump our term and step us down to follower. */
    match = recvEnsureMatchingTerms(r, result->term);
    if (match < 0) {
        infof("local term is higher (%llu vs %llu) -> ignore", r->current_term,
              result->term);
        return 0;
    }
    /* If we have stepped down, abort here.
     *
     * From Figure 3.1:
     *
     *   [Rules for Servers] All Servers: If RPC request or response contains
     *   term T > currentTerm: set currentTerm = T, convert to follower.
     */
    if (match > 0) {
        assert(r->state == RAFT_FOLLOWER);
        return 0;
    }
    assert(result->term == r->current_term);
    /* Ignore responses from servers that have been removed */
    server = configurationGet(&r->configuration, id);
    if (server == NULL) {
        infof("unknown server -> ignore");
        return 0;
    }
    /* Update the progress of this server, possibly sending further entries. */
    rv = replicationUpdate(r, server, result);
    if (rv != 0) {
        return rv;
    }
    return 0;
}
#undef infof
#undef tracef
raft-0.22.1/src/recv_append_entries_result.h 0000664 0000000 0000000 00000000733 14601504142 0021105 0 ustar 00root root 0000000 0000000 /* Receive an AppendEntries result message. */
#ifndef RECV_APPEND_ENTRIES_RESULT_H_
#define RECV_APPEND_ENTRIES_RESULT_H_
#include "../include/raft.h"
/* Process an AppendEntries RPC result from the given server. */
int recvAppendEntriesResult(struct raft *r,
raft_id id,
const char *address,
const struct raft_append_entries_result *result);
#endif /* RECV_APPEND_ENTRIES_RESULT_H_ */
raft-0.22.1/src/recv_install_snapshot.c 0000664 0000000 0000000 00000004670 14601504142 0020073 0 ustar 00root root 0000000 0000000 #include "recv_install_snapshot.h"
#include "assert.h"
#include "convert.h"
#include "message.h"
#include "recv.h"
#include "replication.h"
#include "tracing.h"
#include "trail.h"
#define infof(...) Infof(r->tracer, " " __VA_ARGS__)
/* Process an InstallSnapshot RPC from the server with the given @id and
 * @address, replying with an AppendEntries result message unless the snapshot
 * is being installed asynchronously. */
int recvInstallSnapshot(struct raft *r,
                        const raft_id id,
                        const char *address,
                        struct raft_install_snapshot *args)
{
    struct raft_message message;
    struct raft_append_entries_result *result = &message.append_entries_result;
    int rv;
    int match; /* <0: request term lower; 0: equal; >0: ours was bumped. */
    bool async; /* Whether installation proceeds asynchronously. */
    assert(address != NULL);
    result->version = MESSAGE__APPEND_ENTRIES_RESULT_VERSION;
    result->features = MESSAGE__FEATURE_CAPACITY;
    match = recvEnsureMatchingTerms(r, args->term);
    if (match < 0) {
        infof("local term is higher (%llu vs %llu) -> reject", r->current_term,
              args->term);
        goto reply;
    }
    /* TODO: this logic duplicates the one in the AppendEntries handler */
    assert(r->state == RAFT_FOLLOWER || r->state == RAFT_CANDIDATE);
    assert(r->current_term == args->term);
    if (r->state == RAFT_CANDIDATE) {
        assert(match == 0);
        infof("discovered leader (%llu) -> step down ", id);
        convertToFollower(r);
    }
    rv = recvUpdateLeader(r, id, address);
    if (rv != 0) {
        /* NOTE(review): on this and the following error returns the snapshot
         * payload (args->conf, args->data) is not released here, unlike the
         * reply path below — presumably the caller retains ownership on
         * failure; verify against the dispatching code. */
        return rv;
    }
    /* Reset the election timer. */
    r->election_timer_start = r->now;
    r->update->flags |= RAFT_UPDATE_TIMEOUT;
    rv = replicationInstallSnapshot(r, args, &async);
    if (rv != 0) {
        return rv;
    }
    /* In the asynchronous case no reply is sent now; presumably the result is
     * produced once the installation completes. */
    if (async) {
        return 0;
    }
    /* If we got here it means that we either have a more recent snapshot than
     * the one being sent, or that we already have all snapshot entries in our
     * log. */
    assert(TrailLastIndex(&r->trail) >= args->last_index);
    /* Echo back to the leader the point that we reached. */
    result->last_log_index = args->last_index;
    if (r->last_stored < result->last_log_index) {
        result->last_log_index = r->last_stored;
    }
reply:
    result->term = r->current_term;
    result->rejected = 0;
    /* Free the snapshot data. */
    raft_configuration_close(&args->conf);
    raft_free(args->data.base);
    result->capacity = r->capacity;
    message.type = RAFT_APPEND_ENTRIES_RESULT;
    message.server_id = id;
    message.server_address = address;
    rv = MessageEnqueue(r, &message);
    if (rv != 0) {
        return rv;
    }
    return 0;
}
#undef infof
raft-0.22.1/src/recv_install_snapshot.h 0000664 0000000 0000000 00000000637 14601504142 0020077 0 ustar 00root root 0000000 0000000 /* InstallSnapshot RPC handlers. */
#ifndef RECV_INSTALL_SNAPSHOT_H_
#define RECV_INSTALL_SNAPSHOT_H_
#include "../include/raft.h"
/* Process an InstallSnapshot RPC from the given server. */
int recvInstallSnapshot(struct raft *r,
raft_id id,
const char *address,
struct raft_install_snapshot *args);
#endif /* RECV_INSTALL_SNAPSHOT_H_ */
raft-0.22.1/src/recv_request_vote.c 0000664 0000000 0000000 00000011004 14601504142 0017220 0 ustar 00root root 0000000 0000000 #include "recv_request_vote.h"
#include "assert.h"
#include "election.h"
#include "message.h"
#include "recv.h"
#include "replication.h"
#include "tracing.h"
#define infof(...) Infof(r->tracer, " " __VA_ARGS__)
#define tracef(...) Tracef(r->tracer, __VA_ARGS__)
/* Process a RequestVote RPC (pre-vote or actual) from the server with the
 * given @id and @address, and enqueue a RequestVote result message carrying
 * the outcome. */
int recvRequestVote(struct raft *r,
                    const raft_id id,
                    const char *address,
                    const struct raft_request_vote *args)
{
    struct raft_message message;
    struct raft_request_vote_result *result = &message.request_vote_result;
    bool has_leader;
    int match; /* <0: request term lower; 0: equal; >0: higher. */
    int rv;
    assert(r != NULL);
    assert(id > 0);
    assert(args != NULL);
    result->vote_granted = false;
    result->pre_vote = args->pre_vote;
    result->version = MESSAGE__REQUEST_VOTE_RESULT_VERSION;
    /* Reject the request if we have a leader.
     *
     * From Section 4.2.3:
     *
     *   [Removed] servers should not be able to disrupt a leader whose cluster
     *   is receiving heartbeats. [...] If a server receives a RequestVote
     *   request within the minimum election timeout of hearing from a current
     *   leader, it does not update its term or grant its vote
     *
     * From Section 4.2.3:
     *
     *   This change conflicts with the leadership transfer mechanism as
     *   described in Chapter 3, in which a server legitimately starts an
     *   election without waiting an election timeout. In that case, RequestVote
     *   messages should be processed by other servers even when they believe a
     *   current cluster leader exists. Those RequestVote requests can include a
     *   special flag to indicate this behavior ("I have permission to disrupt
     *   the leader - it told me to!").
     */
    has_leader =
        r->state == RAFT_LEADER ||
        (r->state == RAFT_FOLLOWER && r->follower_state.current_leader.id != 0);
    if (has_leader && !args->disrupt_leader) {
        if (r->state == RAFT_LEADER) {
            infof("local server is leader -> reject");
        } else {
            assert(r->state == RAFT_FOLLOWER);
            infof("local server has a leader (server %llu) -> reject",
                  r->follower_state.current_leader.id);
        }
        goto reply;
    }
    /* If this is a pre-vote request, don't actually increment our term or
     * persist the vote. */
    if (args->pre_vote) {
        match = recvCheckMatchingTerms(r, args->term);
    } else {
        match = recvEnsureMatchingTerms(r, args->term);
    }
    /* Reject the request if we are installing a snapshot.
     *
     * This condition should only be reachable if the disrupt_leader flag is
     * set, since otherwise we wouldn't have passed the have_leader check above
     * (follower state is not cleared while a snapshot is being installed). */
    if (r->snapshot.installing) {
        tracef("installing snapshot -> reject (disrupt_leader:%d)",
               (int)args->disrupt_leader);
        goto reply;
    }
    /* From Figure 3.1:
     *
     *   RequestVote RPC: Receiver implementation: Reply false if
     *   term < currentTerm.
     *
     */
    if (match < 0) {
        infof("remote term is lower (%llu vs %llu) -> reject", args->term,
              r->current_term);
        goto reply;
    }
    /* Unless this is a pre-vote request, at this point our term must be the
     * same as the request term (otherwise we would have rejected the request or
     * bumped our term). */
    if (!args->pre_vote) {
        assert(r->current_term == args->term);
    }
    /* Let the election module decide whether the vote is granted. */
    electionVote(r, args, &result->vote_granted);
reply:
    result->term = r->current_term;
    /* Nodes don't update their term when seeing a Pre-Vote RequestVote RPC.
     * To prevent the candidate from ignoring the response of this node if it
     * has a smaller term than the candidate, we include the term of the
     * request. The smaller term can occur if this node was partitioned from the
     * cluster and has reestablished connectivity. This prevents a cluster
     * deadlock when a majority of the nodes is online, but they fail to
     * establish quorum because the vote of a former partitioned node with a
     * smaller term is needed for majority.*/
    if (args->pre_vote) {
        result->term = args->term;
    }
    result->features = MESSAGE__FEATURE_CAPACITY;
    result->capacity = r->capacity;
    message.type = RAFT_REQUEST_VOTE_RESULT;
    message.server_id = id;
    message.server_address = address;
    rv = MessageEnqueue(r, &message);
    if (rv != 0) {
        return rv;
    }
    return 0;
}
#undef infof
#undef tracef
raft-0.22.1/src/recv_request_vote.h 0000664 0000000 0000000 00000000573 14601504142 0017236 0 ustar 00root root 0000000 0000000 /* RequestVote RPC handler. */
#ifndef RECV_REQUEST_VOTE_H_
#define RECV_REQUEST_VOTE_H_
#include "../include/raft.h"
/* Process a RequestVote RPC from the given server. */
int recvRequestVote(struct raft *r,
raft_id id,
const char *address,
const struct raft_request_vote *args);
#endif /* RECV_REQUEST_VOTE_H_ */
raft-0.22.1/src/recv_request_vote_result.c 0000664 0000000 0000000 00000012000 14601504142 0020613 0 ustar 00root root 0000000 0000000 #include "recv_request_vote_result.h"
#include "assert.h"
#include "configuration.h"
#include "convert.h"
#include "election.h"
#include "recv.h"
#include "replication.h"
#include "tracing.h"
#define infof(...) Infof(r->tracer, " " __VA_ARGS__)
/* Process a RequestVote result (pre-vote or actual) from the server with the
 * given @id: tally the vote and, on quorum, either start the real election
 * (pre-vote phase) or convert to leader. */
int recvRequestVoteResult(struct raft *r,
                          raft_id id,
                          const char *address,
                          const struct raft_request_vote_result *result)
{
    size_t votes_index; /* Position of @id among voters, for vote tracking. */
    int match;          /* <0: result term lower; 0: equal; >0: higher. */
    int rv;
    (void)address;
    assert(r != NULL);
    assert(id > 0);
    /* If this is a pre-vote result, don't actually increment our term right
     * now, because the term included in this message is not necessarily the
     * term the remote peer is at (pre-vote results contain the term that the
     * peer would bump to if the request it receives was an actual request, and
     * that term is typically our current term plus one). */
    if (r->candidate_state.in_pre_vote) {
        match = recvCheckMatchingTerms(r, result->term);
    } else {
        match = recvEnsureMatchingTerms(r, result->term);
    }
    /* Ignore responses if we are not candidate anymore */
    if (r->state != RAFT_CANDIDATE) {
        assert(r->state == RAFT_LEADER || r->state == RAFT_FOLLOWER);
        infof("local server is %s -> ignore", raft_state_name(r->state));
        return 0;
    }
    votes_index = configurationIndexOfVoter(&r->configuration, id);
    if (votes_index == r->configuration.n) {
        infof("non-voting or unknown server -> reject");
        return 0;
    }
    if (match < 0) {
        /* If the term in the result is older than ours, this is an old message
         * we should ignore, because the node who voted for us would have
         * obtained our term. This happens if the network is pretty choppy. */
        infof("remote term is lower (%llu vs %llu) -> ignore", result->term,
              r->current_term);
        return 0;
    }
    /* Avoid counting pre-vote votes as regular votes. */
    if (result->version > 1 && result->pre_vote &&
        !r->candidate_state.in_pre_vote) {
        infof("receive stale pre-vote response -> ignore");
        return 0;
    }
    /* This can happen when a candidate wins a pre-vote, bumps its term,
     * sends real RequestVote RPCs, crashes, comes online, starts a pre-vote
     * and then receives the response to the RequestVote RPC it sent
     * out before crashing. */
    if (result->version > 1 && !result->pre_vote &&
        r->candidate_state.in_pre_vote) {
        infof("receive vote response during pre-vote -> ignore");
        return 0;
    }
    /* If we're in the pre-vote phase, check that the peer's term is at most
     * one term ahead (possibly stepping down). If we're in the actual voting
     * phase, we expect our term to be the same as the response term (otherwise
     * we would have either ignored the result or bumped our term). */
    if (r->candidate_state.in_pre_vote) {
        if (match > 0) {
            if (result->term > r->current_term + 1) {
                assert(!result->vote_granted);
                recvBumpCurrentTerm(r, result->term);
                return 0;
            }
        }
    } else {
        assert(result->term == r->current_term);
    }
    /* Updates features and capacity */
    r->candidate_state.votes[votes_index].features = result->features;
    r->candidate_state.votes[votes_index].capacity = result->capacity;
    /* If the vote was granted and we reached quorum, convert to leader.
     *
     * From Figure 3.1:
     *
     *   If votes received from majority of severs: become leader.
     *
     * From state diagram in Figure 3.3:
     *
     *   [candidate]: receives votes from majority of servers -> [leader]
     *
     * From Section 3.4:
     *
     *   A candidate wins an election if it receives votes from a majority of
     *   the servers in the full cluster for the same term. Each server will
     *   vote for at most one candidate in a given term, on a
     *   firstcome-first-served basis [...]. Once a candidate wins an election,
     *   it becomes leader.
     */
    if (result->vote_granted) {
        unsigned votes;
        unsigned n_voters;
        if (electionTally(r, votes_index, &votes, &n_voters)) {
            if (r->candidate_state.in_pre_vote) {
                infof("votes quorum reached -> pre-vote successful");
                r->candidate_state.in_pre_vote = false;
                electionStart(r);
            } else {
                infof(
                    "quorum reached with %u votes out of %u -> convert to "
                    "leader",
                    votes, n_voters);
                rv = convertToLeader(r);
                if (rv != 0) {
                    return rv;
                }
                /* Send initial heartbeat. */
                replicationHeartbeat(r);
            }
        } else {
            infof("quorum not reached, only %u votes out of %u", votes,
                  n_voters);
        }
    } else {
        infof("vote not granted");
    }
    return 0;
}
#undef infof
raft-0.22.1/src/recv_request_vote_result.h 0000664 0000000 0000000 00000000702 14601504142 0020626 0 ustar 00root root 0000000 0000000 /* Receive a RequestVote result. */
#ifndef RECV_REQUEST_VOTE_RESULT_H_
#define RECV_REQUEST_VOTE_RESULT_H_
#include "../include/raft.h"
/* Process a RequestVote RPC result from the given server. */
int recvRequestVoteResult(struct raft *r,
                          raft_id id,
                          const char *address,
                          const struct raft_request_vote_result *result);
#endif /* RECV_REQUEST_VOTE_RESULT_H_ */
raft-0.22.1/src/recv_timeout_now.c 0000664 0000000 0000000 00000004130 14601504142 0017046 0 ustar 00root root 0000000 0000000 #include "recv_timeout_now.h"
#include "assert.h"
#include "configuration.h"
#include "convert.h"
#include "recv.h"
#include "tracing.h"
#include "trail.h"
#define infof(...) Infof(r->tracer, " " __VA_ARGS__)
/* Process a TimeoutNow RPC (leadership transfer) from the server with the
 * given @id: when all preconditions hold, convert to candidate immediately
 * and start an election for the next term. */
int recvTimeoutNow(struct raft *r,
                   const raft_id id,
                   const char *address,
                   const struct raft_timeout_now *args)
{
    const struct raft_server *self;
    raft_index our_last_index;
    raft_term our_last_term;
    int term_cmp;
    int rv;

    assert(r != NULL);
    assert(id > 0);
    assert(args != NULL);

    (void)address;

    /* Only voters may honor a leadership transfer request. */
    self = configurationGet(&r->configuration, r->id);
    if (self == NULL || self->role != RAFT_VOTER) {
        infof("non-voter");
        return 0;
    }

    /* The request is only meaningful when it comes from the leader we are
     * currently following. */
    if (r->state != RAFT_FOLLOWER ||
        r->follower_state.current_leader.id != id) {
        infof("ignore - r->state:%d current_leader.id:%llu", r->state,
              r->follower_state.current_leader.id);
        return 0;
    }

    /* Possibly update our term; bail out if ours turns out to be higher. */
    term_cmp = recvEnsureMatchingTerms(r, args->term);
    if (term_cmp < 0) {
        return 0;
    }

    /* Our log must exactly match the leader's view of it. */
    our_last_index = TrailLastIndex(&r->trail);
    our_last_term = TrailLastTerm(&r->trail);
    if (our_last_index != args->last_log_index ||
        our_last_term != args->last_log_term) {
        return 0;
    }

    /* Don't start an election while entries are still being persisted or a
     * snapshot is being installed. */
    if (r->last_stored < our_last_index || r->snapshot.installing) {
        return 0;
    }

    infof("convert to candidate, start election for term %llu",
          r->current_term + 1);
    rv = convertToCandidate(r, true /* disrupt leader */);
    if (rv != 0) {
        return rv;
    }
    return 0;
}
#undef infof
raft-0.22.1/src/recv_timeout_now.h 0000664 0000000 0000000 00000000567 14601504142 0017065 0 ustar 00root root 0000000 0000000 /* Receive a TimeoutNow message. */
#ifndef RECV_TIMEOUT_NOW_H_
#define RECV_TIMEOUT_NOW_H_
#include "../include/raft.h"
/* Process a TimeoutNow RPC from the given server. */
int recvTimeoutNow(struct raft *r,
raft_id id,
const char *address,
const struct raft_timeout_now *args);
#endif /* RECV_TIMEOUT_NOW_H_ */
raft-0.22.1/src/replication.c 0000664 0000000 0000000 00000107025 14601504142 0015776 0 ustar 00root root 0000000 0000000 #include
#include
#include "assert.h"
#include "configuration.h"
#include "convert.h"
#include "entry.h"
#ifdef __GLIBC__
#include "error.h"
#endif
#include "err.h"
#include "heap.h"
#include "membership.h"
#include "message.h"
#include "progress.h"
#include "queue.h"
#include "replication.h"
#include "request.h"
#include "restore.h"
#include "tracing.h"
#include "trail.h"
#define infof(...) Infof(r->tracer, " " __VA_ARGS__)
#define tracef(...) Tracef(r->tracer, __VA_ARGS__)
#ifndef max
#define max(a, b) ((a) < (b) ? (b) : (a))
#endif
#ifndef min
#define min(a, b) ((a) < (b) ? (a) : (b))
#endif
/* Send an AppendEntries message to the i'th server, including all log entries
* from the given point onwards. */
static int sendAppendEntries(struct raft *r,
                             const unsigned i,
                             const raft_index prev_index,
                             const raft_term prev_term,
                             bool heartbeat)
{
    struct raft_server *peer = &r->configuration.servers[i];
    struct raft_message msg;
    struct raft_append_entries *req = &msg.append_entries;
    raft_index next = prev_index + 1;
    int rv;

    req->term = r->current_term;
    req->prev_log_index = prev_index;
    req->prev_log_term = prev_term;

    /* Decide how many entries to ship, honoring the in-flight limit. */
    if (heartbeat || !TrailHasEntry(&r->trail, next)) {
        req->n_entries = 0;
    } else {
        raft_index matched = progressMatchIndex(r, i);
        unsigned inflight; /* Current N of un-acknowledged entries. */
        assert(TrailHasEntry(&r->trail, next));
        assert(matched < next);
        inflight = (unsigned)(next - matched) - 1;
        /* If we have already reached the maximum amount of allowed in-flight
         * entries, don't send any other entry. Otherwise, send as many
         * entries as possible without exceeding the configured limit. */
        if (inflight >= r->max_inflight_entries) {
            req->n_entries = 0;
        } else {
            raft_index last = TrailLastIndex(&r->trail);
            unsigned available = (unsigned)(last - next) + 1;
            unsigned budget = r->max_inflight_entries - inflight;
            assert(budget > 0);
            req->n_entries = min(available, budget);
        }
    }

    /* Per Section 3.5, the commit index is piggybacked on every
     * AppendEntries message (heartbeats included) so followers eventually
     * learn it and apply committed entries. */
    req->leader_commit = r->commit_index;

    if (req->n_entries == 0) {
        infof("%s server %llu sending a heartbeat (no entries)",
              progressStateName(r, i), peer->id);
    } else if (req->n_entries == 1) {
        infof("%s server %llu sending 1 entry (%llu^%llu)",
              progressStateName(r, i), peer->id, next,
              TrailTermOf(&r->trail, next));
    } else {
        infof("%s server %llu sending %u entries (%llu^%llu..%llu^%llu)",
              progressStateName(r, i), peer->id, req->n_entries, next,
              TrailTermOf(&r->trail, next), next + req->n_entries - 1,
              TrailTermOf(&r->trail, next + req->n_entries - 1));
    }

    req->version = MESSAGE__APPEND_ENTRIES_VERSION;
    msg.type = RAFT_APPEND_ENTRIES;
    msg.server_id = peer->id;
    msg.server_address = peer->address;

    rv = MessageEnqueue(r, &msg);
    if (rv != 0) {
        return rv;
    }

    /* In pipeline mode, optimistically assume the entries will be stored. */
    if (progressState(r, i) == PROGRESS__PIPELINE) {
        progressSetNextIndex(r, i, req->prev_log_index + 1 + req->n_entries);
    }
    progressUpdateLastSend(r, i);
    return 0;
}
/* Send the latest snapshot to the i'th server */
static int sendSnapshot(struct raft *r, const unsigned i)
{
struct raft_message message;
struct raft_install_snapshot *args = &message.install_snapshot;
struct raft_server *server;
int rv;
progressToSnapshot(r, i);
progressUpdateSnapshotLastSend(r, i);
server = &r->configuration.servers[i];
message.type = RAFT_INSTALL_SNAPSHOT;
message.server_id = server->id;
message.server_address = server->address;
args->version = MESSAGE__INSTALL_SNAPSHOT_VERSION;
args->term = r->current_term;
args->last_index = TrailSnapshotIndex(&r->trail);
args->last_term = TrailTermOf(&r->trail, args->last_index);
args->conf_index = r->configuration_last_snapshot_index;
infof("sending snapshot (%llu^%llu) to server %llu", args->last_index,
args->last_term, server->id);
rv = MessageEnqueue(r, &message);
if (rv != 0) {
goto err;
}
return 0;
err:
progressAbortSnapshot(r, i);
assert(rv != 0);
return rv;
}
/* Trigger replication towards the i'th server: either send an AppendEntries
 * message (possibly a bare heartbeat), or a snapshot when the entries the
 * follower needs are no longer in our log. Must be called only by leaders. */
int replicationProgress(struct raft *r, unsigned i)
{
    struct raft_server *server = &r->configuration.servers[i];
    bool progress_state_is_snapshot = progressState(r, i) == PROGRESS__SNAPSHOT;
    raft_index next_index = progressNextIndex(r, i);
    raft_index prev_index;
    raft_term prev_term;
    bool heartbeat = false; /* Whether to just send a heartbeat (no entries) */
    bool needs_snapshot = false;
    assert(r->state == RAFT_LEADER);
    assert(server->id != r->id);
    assert(next_index >= 1);
    /* From Section 3.5:
     *
     *   When sending an AppendEntries RPC, the leader includes the index and
     *   term of the entry in its log that immediately precedes the new
     *   entries. If the follower does not find an entry in its log with the
     *   same index and term, then it refuses the new entries. The consistency
     *   check acts as an induction step: the initial empty state of the logs
     *   satisfies the Log Matching Property, and the consistency check
     *   preserves the Log Matching Property whenever logs are extended. As a
     *   result, whenever AppendEntries returns successfully, the leader knows
     *   that the follower's log is identical to its own log up through the new
     *   entries (Log Matching Property in Figure 3.2).
     */
    prev_index = next_index - 1;
    if (next_index == 1) {
        /* This is the first entry, so prevIndex and prevTerm are null. */
        prev_term = 0;
        /* If we don't have entry 1 anymore in our log, we need to send a
         * snapshot. */
        if (TrailTermOf(&r->trail, 1) == 0) {
            needs_snapshot = true;
        }
    } else {
        /* Set prevIndex and prevTerm to the index and term of the entry at
         * next_index - 1. */
        prev_term = TrailTermOf(&r->trail, prev_index);
        /* We need to send a snapshot if prev_term is 0, because in
         * this case we don't have anymore information about the previous entry,
         * i.e. it's not in the log (it was truncated) and also it's not the
         * last entry included in the last snapshot.
         */
        if (prev_term == 0) {
            needs_snapshot = true;
        }
    }
    /* If we have to send entries that are not anymore in our log, send the last
     * snapshot if we're not doing so already. */
    if (needs_snapshot || progress_state_is_snapshot) {
        raft_index snapshot_index = TrailSnapshotIndex(&r->trail);
        /* NOTE(review): prev_index is a raft_index printed with %lld —
         * consider %llu if raft_index is unsigned; verify raft.h. */
        infof("missing previous entry at index %lld -> needs snapshot",
              prev_index);
        assert(snapshot_index > 0);
        if (!progress_state_is_snapshot && progressIsOnline(r, i)) {
            return sendSnapshot(r, i);
        }
        /* Set the next index to the snapshot index + 1, so when we receive
         * the AppendEntries result for the empty heartbeat, we don't
         * consider the result as stale. */
        progressSetNextIndex(r, i, snapshot_index + 1);
        /* Send heartbeats anchored to the snapshot index */
        prev_index = snapshot_index;
        prev_term = TrailSnapshotTerm(&r->trail);
        assert(prev_term > 0);
        heartbeat = true;
    }
    return sendAppendEntries(r, i, prev_index, prev_term, heartbeat);
}
/* Possibly trigger I/O requests for newly appended log entries or heartbeats.
*
* This function loops through all followers and triggers replication on them.
*
* It must be called only by leaders. */
static int triggerAll(struct raft *r)
{
    unsigned j;
    int rv;

    assert(r->state == RAFT_LEADER);

    /* Walk the configuration and trigger replication towards every peer
     * that should receive entries. */
    for (j = 0; j < r->configuration.n; j++) {
        const struct raft_server *peer = &r->configuration.servers[j];
        if (peer->id == r->id) {
            continue;
        }
        /* Spare servers don't get entries, unless they're being promoted. */
        if (peer->role == RAFT_SPARE &&
            peer->id != r->leader_state.promotee_id) {
            continue;
        }
        if (!progressShouldReplicate(r, j)) {
            continue;
        }
        rv = replicationProgress(r, j);
        if (rv != 0 && rv != RAFT_NOCONNECTION) {
            /* Non-fatal: just trace the failure and keep going. */
            tracef("failed to send append entries to server %llu: %s (%d)",
                   peer->id, raft_strerror(rv), rv);
        }
    }
    return 0;
}
/* Send a heartbeat round: a heartbeat is just a regular replication trigger,
 * where followers with nothing pending get an empty AppendEntries message. */
int replicationHeartbeat(struct raft *r)
{
    return triggerAll(r);
}
/* Called after a successful append entries I/O request to update the index of
 * the last entry stored on disk. */
static void updateLastStored(struct raft *r, raft_index index)
{
    /* The stored index must strictly advance. */
    assert(r->last_stored < index);
    r->last_stored = index;
}
static void replicationQuorum(struct raft *r, const raft_index index);
/* Invoked once a disk write request for new entries has been completed. */
static int leaderPersistEntriesDone(struct raft *r, raft_index index)
{
    size_t server_index; /* Our own position in the current configuration. */
    assert(r->state == RAFT_LEADER);
    updateLastStored(r, index);
    /* Only update the next index if we are part of the current
     * configuration. The only case where this is not true is when we were
     * asked to remove ourselves from the cluster.
     *
     * From Section 4.2.2:
     *
     *   there will be a period of time (while it is committing Cnew) when a
     *   leader can manage a cluster that does not include itself; it
     *   replicates log entries but does not count itself in majorities.
     */
    server_index = configurationIndexOf(&r->configuration, r->id);
    if (server_index < r->configuration.n) {
        r->leader_state.progress[server_index].match_index = r->last_stored;
    }
    /* Check if we can commit some new entries. */
    replicationQuorum(r, r->last_stored);
    /* Always succeeds; the int return matches the dispatching switch in
     * replicationPersistEntriesDone. */
    return 0;
}
static void followerPersistEntriesDone(struct raft *r, raft_index index);
/* Invoked once a disk write request for new entries has been completed.
 * Dispatches to the leader- or follower-specific completion handler. */
int replicationPersistEntriesDone(struct raft *r, raft_index index)
{
    /* No new disk writes are issued while in candidate state, and all pending
     * writes are awaited before converting to candidate, so at this point the
     * state can only be leader or follower. */
    assert(r->state == RAFT_LEADER || r->state == RAFT_FOLLOWER);
    switch (r->state) {
        case RAFT_LEADER:
            return leaderPersistEntriesDone(r, index);
        case RAFT_FOLLOWER:
            followerPersistEntriesDone(r, index);
            return 0;
        default:
            /* Unreachable: no write can complete in candidate state. */
            assert(0);
            return RAFT_SHUTDOWN;
    }
}
/* Schedule the given entries (starting at the given index) to be persisted to
 * disk, by filling in the entries section of the update object and setting
 * the RAFT_UPDATE_ENTRIES flag. The batch pointer is stored as-is, so the
 * entries must stay alive until the write completes. */
static void persistEntries(struct raft *r,
                           raft_index index,
                           struct raft_entry entries[],
                           unsigned n)
{
    assert(n > 0);
    assert(entries != NULL);
    /* This must be the first time during this raft_step() call where we set new
     * entries to be persisted. */
    assert(!(r->update->flags & RAFT_UPDATE_ENTRIES));
    r->update->flags |= RAFT_UPDATE_ENTRIES;
    r->update->entries.index = index;
    r->update->entries.batch = entries;
    r->update->entries.n = n;
}
/* Schedule a local disk write for the given entries starting at the given
 * index, then trigger replication against all followers. */
int replicationTrigger(struct raft *r,
                       raft_index index,
                       struct raft_entry *entries,
                       unsigned n)
{
    persistEntries(r, index, entries, n);
    return triggerAll(r);
}
/* Update the replication progress of the given server using the given
 * AppendEntries result, possibly sending it further entries or a snapshot,
 * handling promotion/leadership-transfer bookkeeping, and checking for newly
 * committable entries. Must be called only by leaders. Always returns 0. */
int replicationUpdate(struct raft *r,
                      const struct raft_server *server,
                      const struct raft_append_entries_result *result)
{
    bool is_being_promoted;
    raft_index last_index;
    unsigned i;
    int rv;
    assert(r->state == RAFT_LEADER);
    assert(server->id != 0);
    i = configurationIndexOf(&r->configuration, server->id);
    assert(i < r->configuration.n);
    /* Record that we heard from this server, along with the features and
     * capacity it advertised in the result. */
    progressUpdateLastRecv(r, i);
    progressSetFeatures(r, i, result->features);
    progressSetCapacity(r, i, result->capacity);
    /* If the RPC failed because of a log mismatch, retry.
     *
     * From Figure 3.1:
     *
     *   [Rules for servers] Leaders:
     *
     *   - If AppendEntries fails because of log inconsistency:
     *     decrement nextIndex and retry.
     */
    if (result->rejected > 0) {
        bool retry;
        retry = progressMaybeDecrement(r, i, result->rejected,
                                       result->last_log_index);
        if (retry) {
            /* Retry, ignoring errors. */
            infof("log mismatch -> send old entries");
            replicationProgress(r, i);
        }
        return 0;
    }
    /* In case of success the remote server is expected to send us back the
     * value of prevLogIndex + len(entriesToAppend). If it has a longer log, it
     * might be a leftover from previous terms. Clamp to our own last index. */
    last_index = result->last_log_index;
    if (last_index > TrailLastIndex(&r->trail)) {
        last_index = TrailLastIndex(&r->trail);
    }
    /* If the RPC succeeded, update our counters for this server.
     *
     * From Figure 3.1:
     *
     *   [Rules for servers] Leaders:
     *
     *   If successful update nextIndex and matchIndex for follower.
     */
    if (!progressMaybeUpdate(r, i, last_index)) {
        return 0;
    }
    /* Adjust the progress mode of this follower after the success. */
    switch (progressState(r, i)) {
        case PROGRESS__SNAPSHOT:
            /* If a snapshot has been installed, transition back to probe */
            if (progressSnapshotDone(r, i)) {
                progressToPipeline(r, i);
            }
            break;
        case PROGRESS__PROBE:
            /* Transition to pipeline (last case, no break needed; the
             * PROGRESS__PIPELINE state requires no transition). */
            progressToPipeline(r, i);
    }
    /* If the server is currently being promoted and is catching with logs,
     * update the information about the current catch-up round, and possibly
     * proceed with the promotion. */
    is_being_promoted = r->leader_state.promotee_id != 0 &&
                        r->leader_state.promotee_id == server->id;
    if (is_being_promoted) {
        bool is_up_to_date = membershipUpdateCatchUpRound(r);
        if (is_up_to_date) {
            r->leader_state.promotee_id = 0;
        }
    }
    /* If we are transferring leadership to this follower, check if its log
     * is now up-to-date and, if so, send it a TimeoutNow RPC (unless we
     * already did). On failure the transfer is aborted by clearing the
     * transferee. */
    if (r->leader_state.transferee == server->id) {
        raft_index match_index = progressMatchIndex(r, i);
        if (match_index == last_index && !r->leader_state.transferring) {
            rv = membershipLeadershipTransferStart(r);
            if (rv != 0) {
                r->leader_state.transferee = 0;
            }
        }
    }
    /* If this follower is in pipeline mode, send it more entries if
     * needed. */
    if (progressState(r, i) == PROGRESS__PIPELINE &&
        progressShouldReplicate(r, i)) {
        replicationProgress(r, i);
    }
    /* Check if we can commit some new entries. */
    replicationQuorum(r, last_index);
    return 0;
}
/* Enqueue an AppendEntries result message addressed to our current leader.
 * Silently does nothing if no leader is known. Must be called only by
 * followers. */
static void sendAppendEntriesResult(
    struct raft *r,
    const struct raft_append_entries_result *result)
{
    struct raft_message message;
    raft_id id;
    const char *address;
    int rv;
    assert(r->state == RAFT_FOLLOWER);
    /* There are two cases in which a follower can have no leader:
     *
     * - It never had a leader before (e.g. it just started).
     *
     * - It became a follower after stepping down from leader because it
     *   could not contact a majority of servers.
     *
     * The first case can't reach this function because no entries have been
     * received. The second case can: a raft_persist_entries task originally
     * created by a leader may complete after the leader has stepped down and
     * become a follower. In that circumstance current_leader.address is NULL
     * and no message is sent. */
    address = r->follower_state.current_leader.address;
    if (address == NULL) {
        return;
    }
    id = r->follower_state.current_leader.id;
    if (result->rejected == 0) {
        infof("send success result to %llu", id);
    }
    message.type = RAFT_APPEND_ENTRIES_RESULT;
    message.append_entries_result = *result;
    message.server_id = id;
    message.server_address = address;
    rv = MessageEnqueue(r, &message);
    if (rv != 0) {
        /* Not fatal: the result will be sent again on a later occasion. */
        (void)rv;
    }
}
/* Invoked on a follower once a disk write request for new entries has been
 * completed: advance the last stored index and report the outcome to the
 * leader. */
static void followerPersistEntriesDone(struct raft *r, raft_index index)
{
    struct raft_append_entries_result result;
    assert(r->state == RAFT_FOLLOWER);
    /* An InstallSnapshot RPC arrived while these entries were being persisted
     * to disk: don't record or report anything. */
    if (r->snapshot.installing) {
        return;
    }
    updateLastStored(r, index);
    /* Until the first AppendEntries request has been received we have no idea
     * what the leader's log contains, so don't report anything. */
    if (r->follower_state.match == 0) {
        return;
    }
    result.term = r->current_term;
    result.version = MESSAGE__APPEND_ENTRIES_RESULT_VERSION;
    result.features = MESSAGE__FEATURE_CAPACITY;
    result.rejected = 0;
    result.last_log_index = min(r->last_stored, r->follower_state.match);
    result.capacity = r->capacity;
    sendAppendEntriesResult(r, &result);
}
/* Check the log matching property against an incoming AppendEntries request.
 *
 * From Figure 3.1:
 *
 *   [AppendEntries RPC] Receiver implementation:
 *
 *   2. Reply false if log doesn't contain an entry at prevLogIndex whose
 *   term matches prevLogTerm.
 *
 * Return 0 if the check passed.
 *
 * Return 1 if the check did not pass and the request needs to be rejected.
 *
 * Return -1 if there's a conflict and we need to shutdown. */
static int checkLogMatchingProperty(const struct raft *r,
                                    const struct raft_append_entries *args)
{
    raft_term local_prev_term;
    /* The very first entry has nothing before it to match against. */
    if (args->prev_log_index == 0) {
        assert(args->prev_log_term == 0);
        return 0;
    }
    assert(args->prev_log_term != 0);
    local_prev_term = TrailTermOf(&r->trail, args->prev_log_index);
    /* We don't have the previous entry at all: reject. */
    if (local_prev_term == 0) {
        infof("missing previous entry (%llu^%llu) -> reject",
              args->prev_log_index, args->prev_log_term);
        return 1;
    }
    /* Terms match: the check passed. */
    if (local_prev_term == args->prev_log_term) {
        return 0;
    }
    /* A term conflict on a committed entry should never happen; something is
     * seriously wrong. */
    if (args->prev_log_index <= r->commit_index) {
        infof(
            "conflicting terms %llu and %llu for entry %llu (commit "
            "index %llu) -> shutdown",
            local_prev_term, args->prev_log_term, args->prev_log_index,
            r->commit_index);
        return -1;
    }
    infof("previous term mismatch -> reject");
    return 1;
}
/* Check if our log has entries that conflict with the ones in the given
 * AppendEntries request.
 *
 * The i output parameter will be set to the array index of the first new log
 * entry that we don't have yet in our log, among the ones included in the given
 * AppendEntries request. If all entries are already present, it is set to
 * args->n_entries.
 *
 * The truncate output parameter will be set to the index of the first
 * conflicting entry that was found, or 0 if no such entry was found.
 *
 * Errors:
 *
 * RAFT_SHUTDOWN
 *     A committed entry with a conflicting term has been found.
 */
static int checkConflictingEntries(const struct raft *r,
                                   const struct raft_append_entries *args,
                                   size_t *i,
                                   raft_index *truncate)
{
    size_t j;
    *truncate = 0;
    /* Walk the incoming entries until the first one that is either missing
     * from our log or conflicts with it. */
    for (j = 0; j < args->n_entries; j++) {
        struct raft_entry *entry = &args->entries[j];
        raft_index entry_index = args->prev_log_index + 1 + j;
        /* Term of our local entry at the same index, or 0 if we don't have
         * one. */
        raft_term local_term = TrailTermOf(&r->trail, entry_index);
        assert(entry->term != 0);
        if (local_term > 0 && local_term != entry->term) {
            if (entry_index <= r->commit_index) {
                /* Should never happen; something is seriously wrong! */
                infof(
                    "conflicting terms %llu and %llu for entry %llu (commit "
                    "index %llu) -> shutdown",
                    local_term, entry->term, entry_index, r->commit_index);
                return RAFT_SHUTDOWN;
            }
            infof("log mismatch (%llu^%llu vs %llu^%llu) -> truncate",
                  entry_index, local_term, entry_index, entry->term);
            *truncate = entry_index;
            /* We want to append all entries from here on, replacing anything
             * that we had before. */
            break;
        } else if (local_term == 0) {
            /* We don't have an entry at this index, so we want to append this
             * new one and all the subsequent ones. */
            break;
        }
    }
    *i = j;
    return 0;
}
/* Delete all entries from the given index onwards, possibly rolling back any
 * affected uncommitted configuration.
 *
 * Errors:
 *
 * RAFT_NOMEM
 *     In case a configuration rollback is needed, a copy of the last committed
 *     configuration could not be made.
 */
static int deleteConflictingEntries(struct raft *r, raft_index index)
{
    /* If an uncommitted configuration change sits in the range being deleted,
     * roll it back first. */
    if (r->configuration_uncommitted_index >= index) {
        int rv = membershipRollback(r);
        if (rv != 0) {
            assert(rv == RAFT_NOMEM);
            return rv;
        }
    }
    /* Delete all entries from this index on because they don't match. */
    TrailTruncate(&r->trail, index);
    /* Drop information about previously stored entries that have just been
     * discarded. */
    if (r->last_stored >= index) {
        r->last_stored = index - 1;
    }
    return 0;
}
/* Append the entries carried by the given AppendEntries request, if the Log
 * Matching Property is satisfied.
 *
 * On success *rejected is set to 0, otherwise it keeps its initial value of
 * args->prev_log_index. *async is set to true when a disk write for new
 * entries was started; the AppendEntries result must then be sent only once
 * the write completes.
 *
 * Errors: RAFT_SHUTDOWN on a conflict with a committed entry, RAFT_NOMEM on
 * allocation failure. Must be called only by followers. */
int replicationAppend(struct raft *r,
                      const struct raft_append_entries *args,
                      raft_index *rejected,
                      bool *async)
{
    struct raft_entry *entries;
    raft_index index;
    raft_index truncate;
    unsigned n_entries;
    int match;
    size_t n;
    size_t i;
    size_t j;
    int rv;
    assert(r->state == RAFT_FOLLOWER);
    assert(*rejected == args->prev_log_index);
    *async = false;
    /* Check the log matching property. */
    match = checkLogMatchingProperty(r, args);
    if (match != 0) {
        assert(match == 1 || match == -1);
        return match == 1 ? 0 : RAFT_SHUTDOWN;
    }
    /* Check for conflicting entries: i becomes the offset of the first entry
     * we don't have yet, truncate the index of the first conflict (if any). */
    rv = checkConflictingEntries(r, args, &i, &truncate);
    if (rv != 0) {
        assert(rv == RAFT_SHUTDOWN);
        return rv;
    }
    /* From Figure 3.1:
     *
     *   [AppendEntries RPC] Receiver implementation:
     *
     *   3. If an existing entry conflicts with a new one (same index but
     *   different terms), delete the existing entry and all that follow it.
     */
    if (truncate > 0) {
        rv = deleteConflictingEntries(r, truncate);
        if (rv != 0) {
            assert(rv == RAFT_NOMEM);
            return rv;
        }
    }
    n = args->n_entries - i; /* Number of new entries */
    /* Index of first new entry */
    index = args->prev_log_index + 1 + i;
    /* Update our in-memory log to reflect that we received these entries. We'll
     * notify the leader of a successful append once the write entries request
     * that we issue below actually completes. */
    for (j = 0; j < n; j++) {
        struct raft_entry *entry = &args->entries[i + j];
        rv = TrailAppend(&r->trail, entry->term);
        if (rv != 0) {
            goto err;
        }
    }
    /* Update our local match index, since we can be sure that all entries in
     * our log up to the last one in this AppendEntries request now match the
     * ones of the leader of the current term. */
    if (args->prev_log_index + args->n_entries >= r->follower_state.match) {
        r->follower_state.match = args->prev_log_index + args->n_entries;
    }
    *rejected = 0;
    /* From Figure 3.1:
     *
     *   AppendEntries RPC: Receiver implementation: If leaderCommit >
     *   commitIndex, set commitIndex = min(leaderCommit, index of last new
     *   entry).
     */
    if (args->leader_commit > r->commit_index) {
        raft_index last_new_entry = args->prev_log_index + args->n_entries;
        r->commit_index = min(args->leader_commit, last_new_entry);
        r->update->flags |= RAFT_UPDATE_COMMIT_INDEX;
    }
    if (n == 0) {
        infof("no new entries to persist");
        return 0;
    }
    *async = true;
    /* Double check we are tracking the relevant entries in the log trail. The
     * count must be exactly n, the number of new entries in the message (i.e.
     * entries we didn't have yet in our in-memory log), because TrailAppend
     * was called above exactly n times, once for each new log entry. */
    n_entries = (unsigned)(TrailLastIndex(&r->trail) - index) + 1;
    assert(n_entries > 0);
    assert(n_entries == n);
    entries = &args->entries[i];
    if (n_entries == 1) {
        infof("start persisting 1 new entry (%llu^%llu)", index,
              entries[0].term);
    } else {
        infof("start persisting %u new entries (%llu^%llu..%llu^%llu)",
              n_entries, index, entries[0].term, index + n_entries - 1,
              entries[n_entries - 1].term);
    }
    /* Possibly apply configuration changes as uncommitted. */
    for (i = 0; i < n_entries; i++) {
        struct raft_entry *entry = &entries[i];
        if (entry->type == RAFT_CHANGE) {
            rv = membershipUncommittedChange(r, index, entry);
            if (rv != 0) {
                goto err;
            }
        }
    }
    persistEntries(r, index, entries, n_entries);
    return 0;
err:
    /* Release all entries added to the in-memory log, making
     * sure the in-memory log and disk don't diverge, leading
     * to future log entries not being persisted to disk. */
    if (j != 0) {
        TrailTruncate(&r->trail, index);
    }
    assert(rv != 0);
    return rv;
}
/* Invoked once a disk write request for a snapshot has been completed: restore
 * the in-memory state from the snapshot and report the outcome to the
 * leader. */
int replicationPersistSnapshotDone(struct raft *r,
                                   struct raft_snapshot_metadata *metadata,
                                   size_t offset,
                                   bool last)
{
    struct raft_append_entries_result result;
    int rv;
    (void)offset;
    (void)last;
    /* Conversion to candidate state is suppressed while a snapshot install is
     * in progress, so we must still be a follower here. */
    assert(r->state == RAFT_FOLLOWER);
    r->snapshot.installing = false;
    result.term = r->current_term;
    result.version = MESSAGE__APPEND_ENTRIES_RESULT_VERSION;
    result.features = MESSAGE__FEATURE_CAPACITY;
    result.rejected = 0;
    /* From Figure 5.3:
     *
     *   7. Discard the entire log
     *   8. ... load lastConfig as cluster configuration
     */
    rv = RestoreSnapshot(r, metadata);
    if (rv != 0) {
        /* Restoring failed: report the snapshot index as rejected, and free
         * the configuration contained in the metadata. */
        tracef("restore snapshot %llu: %s", metadata->index, raft_strerror(rv));
        result.rejected = metadata->index;
        raft_configuration_close(&metadata->configuration);
    }
    if (r->state == RAFT_FOLLOWER) {
        result.last_log_index = r->last_stored;
        result.capacity = r->capacity;
        sendAppendEntriesResult(r, &result);
    }
    return 0;
}
/* Handle an incoming InstallSnapshot request: if the snapshot is more
 * up-to-date than our log, start persisting it.
 *
 * The async output parameter is set to true both when a disk write was
 * started (a result message must then be sent only once it completes) and
 * when the request is ignored because another install is in progress.
 * Must be called only by followers. */
int replicationInstallSnapshot(struct raft *r,
                               const struct raft_install_snapshot *args,
                               bool *async)
{
    struct raft_snapshot_metadata metadata;
    raft_term local_term;
    assert(r->state == RAFT_FOLLOWER);
    assert(args->last_index != 0);
    assert(args->last_term != 0);
    *async = false;
    /* If we are installing a snapshot, ignore the request, the leader will
     * eventually retry.
     *
     * Note that if we are taking a snapshot, the consuming code is supposed to
     * wait until taking the snapshot completes before starting to persist this
     * one.
     *
     * TODO: we should do something smarter. */
    if (r->snapshot.installing) {
        *async = true;
        infof("already taking or installing snapshot");
        return 0;
    }
    /* If our last snapshot is more up-to-date, this is a no-op */
    if (TrailSnapshotIndex(&r->trail) >= args->last_index) {
        infof("have more recent snapshot");
        return 0;
    }
    /* If we already have all entries in the snapshot, this is a no-op */
    local_term = TrailTermOf(&r->trail, args->last_index);
    if (local_term == args->last_term) {
        infof("have all entries");
        return 0;
    }
    *async = true;
    /* Preemptively update our in-memory state. */
    TrailRestore(&r->trail, args->last_index, args->last_term);
    r->last_stored = 0;
    assert(!r->snapshot.installing);
    r->snapshot.installing = true;
    /* Fill the metadata of the snapshot to be persisted. (The original code
     * assigned metadata.index twice; the duplicate has been removed.) */
    metadata.index = args->last_index;
    metadata.term = args->last_term;
    metadata.configuration_index = args->conf_index;
    metadata.configuration = args->conf;
    assert(!(r->update->flags & RAFT_UPDATE_SNAPSHOT));
    infof("start persisting snapshot (%llu^%llu)", metadata.index,
          metadata.term);
    r->update->flags |= RAFT_UPDATE_SNAPSHOT;
    r->update->snapshot.metadata = metadata;
    r->update->snapshot.offset = 0;
    r->update->snapshot.chunk = args->data;
    r->update->snapshot.last = true;
    return 0;
}
/* Apply a committed RAFT_CHANGE entry carrying the given configuration at the
 * given log index. Takes ownership of the configuration: it either becomes
 * the committed configuration or is released. A leader that is no longer a
 * voter in the new configuration steps down. */
int replicationApplyConfigurationChange(struct raft *r,
                                        struct raft_configuration *conf,
                                        raft_index index)
{
    assert(index > 0);
    if (r->configuration_uncommitted_index != index) {
        configurationClose(conf);
        return 0;
    }
    /* If this is an uncommitted configuration that we had already applied when
     * submitting the configuration change (for leaders) or upon receiving it
     * via an AppendEntries RPC (for followers), then reset the uncommitted
     * index, since that uncommitted configuration is now committed. */
    r->configuration_uncommitted_index = 0;
    r->configuration_committed_index = index;
    configurationClose(&r->configuration_committed);
    r->configuration_committed = *conf;
    if (r->state != RAFT_LEADER) {
        return 0;
    }
    /* If we are leader but not part of this new configuration, step down.
     *
     * From Section 4.2.2:
     *
     *   In this approach, a leader that is removed from the configuration
     *   steps down once the Cnew entry is committed.
     */
    {
        const struct raft_server *server =
            configurationGet(&r->configuration, r->id);
        if (server == NULL || server->role != RAFT_VOTER) {
            const char *reason = server == NULL ? "leader removed from config"
                                                : "leader no longer voter";
            infof("%s -> step down", reason);
            convertToFollower(r);
        }
    }
    return 0;
}
/* Invoked when a new snapshot at the given metadata has been taken: record
 * the snapshot's configuration info and compact the in-memory trail, keeping
 * `trailing` entries behind the snapshot index. Takes ownership of
 * metadata->configuration. Always returns 0. */
int replicationSnapshot(struct raft *r,
                        struct raft_snapshot_metadata *metadata,
                        unsigned trailing)
{
    /* Make also a copy of the index of the configuration contained in the
     * snapshot, we'll need it in case we send out an InstallSnapshot RPC. */
    r->configuration_last_snapshot_index = metadata->configuration_index;
    /* Keep the snapshot's configuration as the committed one only if it is
     * newer than what we have; otherwise release it. */
    if (metadata->configuration_index > r->configuration_committed_index) {
        configurationClose(&r->configuration_committed);
        r->configuration_committed = metadata->configuration;
    } else {
        configurationClose(&metadata->configuration);
    }
    /* Note: the stale `(void)trailing;` cast has been dropped, since the
     * parameter is actually used below. */
    TrailSnapshot(&r->trail, metadata->index, trailing);
    return 0;
}
/* Return the number of voting servers whose match index is at least the given
 * log index. */
static unsigned replicationCountVotes(struct raft *r, raft_index index)
{
    unsigned count = 0;
    unsigned i;
    for (i = 0; i < r->configuration.n; i++) {
        const struct raft_server *server = &r->configuration.servers[i];
        if (server->role == RAFT_VOTER &&
            r->leader_state.progress[i].match_index >= index) {
            count++;
        }
    }
    return count;
}
/* Check if a quorum has been reached for the given log index or some earlier
 * index, and update the commit index accordingly if so.
 *
 * From Figure 3.1:
 *
 *   [Rules for servers] Leaders:
 *
 *   If there exists an N such that N > commitIndex, a majority of
 *   matchIndex[i] >= N, and log[N].term == currentTerm: set commitIndex = N */
static void replicationQuorum(struct raft *r, raft_index index)
{
    unsigned votes;
    raft_term term;
    unsigned n_voters;
    raft_index uncommitted = 0; /* Lowest uncommitted entry in current term */
    const char *suffix;
    assert(r->state == RAFT_LEADER);
    n_voters = configurationVoterCount(&r->configuration);
    /* Scan backwards from the given index down to the first uncommitted
     * entry, looking for an index replicated on a majority of voters. */
    while (index > r->commit_index) {
        term = TrailTermOf(&r->trail, index);
        /* TODO: fuzzy-test --seed 0x8db5fccc replication/entries/partitioned
         * fails the assertion below. */
        if (term == 0) {
            return;
        }
        // assert(logTermOf(r->log, index) > 0);
        assert(term <= r->current_term);
        /* Note: `votes` is assigned in the same iteration that assigns
         * `uncommitted`, so it is always initialized when `uncommitted` is
         * non-zero and read below. */
        uncommitted = index;
        votes = replicationCountVotes(r, index);
        /* Don't commit entries from previous terms by counting replicas. */
        if (term < r->current_term) {
            break;
        }
        if (votes > n_voters / 2) {
            unsigned n = (unsigned)(index - r->commit_index);
            if (n == 1) {
                infof("commit 1 new entry (%llu^%llu)", index, term);
            } else {
                infof("commit %u new entries (%llu^%llu..%llu^%llu)", n,
                      r->commit_index + 1,
                      TrailTermOf(&r->trail, r->commit_index + 1), index, term);
            }
            r->commit_index = index;
            r->update->flags |= RAFT_UPDATE_COMMIT_INDEX;
            return;
        }
        /* Try with the previous uncommitted index, if any. */
        index -= 1;
    }
    if (uncommitted != 0) {
        if (votes == 1) {
            suffix = "";
        } else {
            suffix = "s";
        }
        infof("next uncommitted entry (%llu^%llu) has %u vote%s out of %u",
              uncommitted, TrailTermOf(&r->trail, uncommitted), votes, suffix,
              n_voters);
    }
}
#undef infof
#undef tracef
raft-0.22.1/src/replication.h 0000664 0000000 0000000 00000010216 14601504142 0015776 0 ustar 00root root 0000000 0000000 /* Log replication logic and helpers. */
#ifndef REPLICATION_H_
#define REPLICATION_H_
#include "../include/raft.h"
/* Send AppendEntries RPC messages to all followers to which no AppendEntries
* was sent in the last heartbeat interval. */
int replicationHeartbeat(struct raft *r);
/* Start a local disk write for entries from the given index onwards, and
* trigger replication against all followers, typically sending AppendEntries
* RPC messages with outstanding log entries. */
int replicationTrigger(struct raft *r,
raft_index index,
struct raft_entry *entries,
unsigned n);
/* Possibly send an AppendEntries or an InstallSnapshot RPC message to the
* server with the given index.
*
* The rules to decide whether or not to send a message are:
*
* - If we have sent an InstallSnapshot RPC recently and we haven't yet received
* a response, then don't send any new message.
*
* - If we are probing the follower (i.e. we haven't received a successful
* response during the last heartbeat interval), then send a message only if
* haven't sent any during the last heartbeat interval.
*
* - If we are pipelining entries to the follower, then send any new entries we
* haven't yet sent.
*
* If a message should be sent, the rules to decide what type of message to send
* and what it should contain are:
*
* - If we don't have anymore the first entry that should be sent to the
* follower, then send an InstallSnapshot RPC with the last snapshot.
*
* - If we still have the first entry to send, then send all entries from that
index onward (possibly zero).
*
* This function must be called only by leaders. */
int replicationProgress(struct raft *r, unsigned i);
/* Update the replication state (match and next indexes) for the given server
* using the given AppendEntries RPC result.
*
* Possibly send to the server a new set of entries or a snapshot if the result
* was unsuccessful because of missing entries or if new entries were added to
* our log in the meantime.
*
* It must be called only by leaders. */
int replicationUpdate(struct raft *r,
const struct raft_server *server,
const struct raft_append_entries_result *result);
/* Append the log entries in the given request if the Log Matching Property is
* satisfied.
*
* The rejected output parameter will be set to 0 if the Log Matching Property
* was satisfied, or to args->prev_log_index if not.
*
* The async output parameter will be set to true if some of the entries in the
* request were not present in our log, and a disk write was started to persist
* them to disk. The entries will still be appended immediately to our in-memory
* copy of the log, but an AppendEntries result message will be sent only once
* the disk write completes and the I/O callback is invoked.
*
* It must be called only by followers. */
int replicationAppend(struct raft *r,
const struct raft_append_entries *args,
raft_index *rejected,
bool *async);
int replicationInstallSnapshot(struct raft *r,
const struct raft_install_snapshot *args,
bool *async);
/* Called when handling a RAFT_PERSISTED_ENTRIES event. */
int replicationPersistEntriesDone(struct raft *r, raft_index index);
/* Called when handling RAFT_PERSISTED_SNAPSHOT event. */
int replicationPersistSnapshotDone(struct raft *r,
struct raft_snapshot_metadata *metadata,
size_t offset,
bool last);
/* Called when a RAFT_SNAPSHOT event is fired, signalling the completion of a
* new snapshot. */
int replicationSnapshot(struct raft *r,
struct raft_snapshot_metadata *metadata,
unsigned trailing);
/* Apply a RAFT_CHANGE entry that has been committed. */
int replicationApplyConfigurationChange(struct raft *r,
struct raft_configuration *conf,
raft_index index);
#endif /* REPLICATION_H_ */
raft-0.22.1/src/request.h 0000664 0000000 0000000 00000000572 14601504142 0015161 0 ustar 00root root 0000000 0000000 #ifndef REQUEST_H_
#define REQUEST_H_
#include "../include/raft.h"
/* Abstract request type, shared layout for the various request objects tracked
 * by a raft instance. */
struct request
{
    /* Must be kept in sync with RAFT__REQUEST in raft.h */
    void *data;            /* User data pointer. */
    int type;              /* Request type code. */
    raft_index index;      /* Log index associated with the request
                              (presumably; confirm against raft.h). */
    void *queue[2];        /* Double link for inclusion in a requests queue. */
    uint8_t req_id[16];    /* Request identifier. */
    uint8_t client_id[16]; /* Client identifier. */
    uint8_t unique_id[16]; /* Unique identifier. */
    uint64_t reserved[4];  /* Reserved for future extensions. */
};
raft-0.22.1/src/restore.c 0000664 0000000 0000000 00000012762 14601504142 0015153 0 ustar 00root root 0000000 0000000 #include "../include/raft.h"
#include "assert.h"
#include "configuration.h"
#include "convert.h"
#include "entry.h"
#include "err.h"
#include "tracing.h"
#include "trail.h"
#define tracef(...) Tracef(r->tracer, __VA_ARGS__)
/* Restore the most recent configuration entry found in the log.
 *
 * The entry is decoded and installed as the current configuration. If it sits
 * at log index 1 it is the bootstrap configuration and is treated as
 * committed, otherwise it is treated as uncommitted. */
static int restoreMostRecentConfigurationEntry(struct raft *r,
                                               struct raft_entry *entry,
                                               raft_index index)
{
    struct raft_configuration configuration;
    int rv;
    rv = configurationDecode(&entry->buf, &configuration);
    if (rv != 0) {
        /* NOTE(review): closing a configuration that failed to decode assumes
         * configurationDecode() leaves it in a closeable state on error —
         * confirm against its contract. */
        configurationClose(&configuration);
        return rv;
    }
    configurationClose(&r->configuration);
    r->configuration = configuration;
    /* If the configuration comes from entry at index 1 in the log, we know it's
     * the bootstrap configuration and it's committed by default. Otherwise we
     * can't know if it's committed or not and treat it as uncommitted. */
    if (index == 1) {
        assert(r->configuration_uncommitted_index == 0);
        r->configuration_committed_index = 1;
        configurationClose(&r->configuration_committed);
        rv = configurationCopy(&r->configuration, &r->configuration_committed);
        if (rv != 0) {
            return rv;
        }
    } else {
        assert(r->configuration_committed_index < index);
        r->configuration_uncommitted_index = index;
    }
    return 0;
}
/* Restore the entries loaded from persistent storage, rebuilding the in-memory
 * trail and the current/committed configurations.
 *
 * Note that if the last configuration entry in the log has index greater than
 * one we cannot know if it is committed or not. Therefore we also need to track
 * the second-to-last configuration entry. This second-to-last entry is
 * committed by default as raft doesn't allow multiple uncommitted configuration
 * entries. That entry is used in case of configuration rollback scenarios. If
 * we don't find the second-to-last configuration entry in the log, it means
 * that the log was truncated after a snapshot and second-to-last configuration
 * is still available in r->configuration_committed, which we populated earlier
 * when the snapshot was restored. */
int RestoreEntries(struct raft *r,
                   raft_index snapshot_index,
                   raft_term snapshot_term,
                   raft_index start_index,
                   struct raft_entry *entries,
                   unsigned n)
{
    struct raft_entry *conf = NULL; /* Most recent config entry seen so far */
    raft_index conf_index = 0;      /* Log index of that entry, 0 if none */
    unsigned i;
    int rv;
    TrailStart(&r->trail, snapshot_index, snapshot_term, start_index);
    r->last_stored = start_index - 1;
    for (i = 0; i < n; i++) {
        struct raft_entry *entry = &entries[i];
        rv = TrailAppend(&r->trail, entry->term);
        if (rv != 0) {
            goto err;
        }
        r->last_stored++;
        /* Only take into account configurations that are newer than the
         * configuration restored from the snapshot. */
        if (entry->type == RAFT_CHANGE &&
            r->last_stored > r->configuration_committed_index) {
            /* If there is a previous configuration it must have been committed
             * as we don't allow multiple uncommitted configurations. At the end
             * of the loop r->configuration_committed_index will point to the
             * second to last configuration entry, if any. */
            if (conf_index != 0) {
                assert(conf != NULL);
                r->configuration_committed_index = conf_index;
                configurationClose(&r->configuration_committed);
                rv = configurationDecode(&conf->buf,
                                         &r->configuration_committed);
                if (rv != 0) {
                    goto err;
                }
                /* We also indirectly know that the commit index must be at
                 * least as high as the index of this second to last
                 * configuration entry. */
                /* FIXME: this currently breaks incus/cowsql tests
                r->commit_index = r->configuration_committed_index;
                r->update->flags |= RAFT_UPDATE_COMMIT_INDEX;
                */
            }
            conf = entry;
            conf_index = r->last_stored;
        }
    }
    /* Install the most recent configuration entry, if any was found. */
    if (conf != NULL) {
        rv = restoreMostRecentConfigurationEntry(r, conf, conf_index);
        if (rv != 0) {
            goto err;
        }
    }
    return 0;
err:
    return rv;
}
/* Reset the current state of the raft object as if the last entry contained in
 * the snapshot with the given metadata had just been persisted, committed and
 * applied. Takes ownership of metadata->configuration. */
int RestoreSnapshot(struct raft *r, struct raft_snapshot_metadata *metadata)
{
    int rv;
    configurationClose(&r->configuration);
    r->configuration = metadata->configuration;
    r->configuration_committed_index = metadata->configuration_index;
    r->configuration_uncommitted_index = 0;
    /* Make a copy of the configuration contained in the snapshot, in case
     * r->configuration gets overriden with an uncommitted configuration and we
     * then need to rollback, but the log does not contain anymore the entry at
     * r->configuration_committed_index because it was truncated. */
    configurationClose(&r->configuration_committed);
    rv = configurationCopy(&r->configuration, &r->configuration_committed);
    if (rv != 0) {
        return rv;
    }
    /* Make also a copy of the index of the configuration contained in the
     * snapshot, we'll need it in case we send out an InstallSnapshot RPC. */
    r->configuration_last_snapshot_index = metadata->configuration_index;
    r->commit_index = metadata->index;
    r->last_stored = metadata->index;
    r->update->flags |= RAFT_UPDATE_COMMIT_INDEX;
    return 0;
}
raft-0.22.1/src/restore.h 0000664 0000000 0000000 00000001703 14601504142 0015151 0 ustar 00root root 0000000 0000000 /* Restore the in-memory raft state with data loaded from persistent storage. */
#ifndef RAFT_RESTORE_H_
#define RAFT_RESTORE_H_
#include "../include/raft.h"
/* Restore the entries that were loaded from persistent storage. The most recent
* configuration entry will be restored as well, if any. */
int RestoreEntries(struct raft *r,
raft_index snapshot_index,
raft_term snapshot_term,
raft_index start_index,
struct raft_entry *entries,
unsigned n);
/* Function to be called when restoring a snapshot.
*
* This will reset the current state of the raft object as if the last entry
* contained in the snapshot with the given metadata had just been persisted,
* committed and applied.
*
* The in-memory log must be empty when calling this function. */
int RestoreSnapshot(struct raft *r, struct raft_snapshot_metadata *metadata);
#endif /* RAFT_RESTORE_H_ */
raft-0.22.1/src/snapshot.c 0000664 0000000 0000000 00000002624 14601504142 0015323 0 ustar 00root root 0000000 0000000 #include "snapshot.h"
#include <stdint.h>
#include <string.h>
#include "assert.h"
#include "configuration.h"
#include "err.h"
#include "tracing.h"
#define tracef(...) Tracef(r->tracer, __VA_ARGS__)
/* Release the configuration and all data buffers held by the given snapshot,
 * but not the snapshot object itself. */
void snapshotClose(struct raft_snapshot *s)
{
    unsigned remaining = s->n_bufs;
    configurationClose(&s->configuration);
    /* Free every data buffer, then the buffer array itself. */
    while (remaining > 0) {
        remaining--;
        raft_free(s->bufs[remaining].base);
    }
    raft_free(s->bufs);
}
/* Like snapshotClose(), but also release the snapshot object itself. */
void snapshotDestroy(struct raft_snapshot *s)
{
    snapshotClose(s);
    raft_free(s);
}
/* Make a full deep copy of a snapshot object, compacting all data buffers of
 * the source into a single buffer in the destination.
 *
 * Errors: RAFT_NOMEM (or whatever configurationCopy() returns) on allocation
 * failure; in that case no memory is leaked and dst must not be used. */
int snapshotCopy(const struct raft_snapshot *src, struct raft_snapshot *dst)
{
    int rv;
    unsigned i;
    size_t size;
    uint8_t *cursor;
    dst->term = src->term;
    dst->index = src->index;
    dst->configuration_index = src->configuration_index;
    rv = configurationCopy(&src->configuration, &dst->configuration);
    if (rv != 0) {
        return rv;
    }
    /* Total size of all source buffers: the copy is compacted into one. */
    size = 0;
    for (i = 0; i < src->n_bufs; i++) {
        size += src->bufs[i].len;
    }
    /* Check the array allocation instead of asserting it, and unwind all
     * earlier allocations on failure. */
    dst->bufs = raft_malloc(sizeof *dst->bufs);
    if (dst->bufs == NULL) {
        rv = RAFT_NOMEM;
        goto err_after_config_copy;
    }
    dst->bufs[0].base = raft_malloc(size);
    dst->bufs[0].len = size;
    if (dst->bufs[0].base == NULL) {
        rv = RAFT_NOMEM;
        goto err_after_bufs_alloc;
    }
    /* Concatenate all source buffers into the single destination buffer. */
    cursor = dst->bufs[0].base;
    for (i = 0; i < src->n_bufs; i++) {
        memcpy(cursor, src->bufs[i].base, src->bufs[i].len);
        cursor += src->bufs[i].len;
    }
    dst->n_bufs = 1;
    return 0;
err_after_bufs_alloc:
    raft_free(dst->bufs);
err_after_config_copy:
    configurationClose(&dst->configuration);
    return rv;
}
#undef tracef
raft-0.22.1/src/snapshot.h 0000664 0000000 0000000 00000001106 14601504142 0015322 0 ustar 00root root 0000000 0000000 #ifndef RAFT_SNAPSHOT_H_
#define RAFT_SNAPSHOT_H_
#include "../include/raft.h"
/* Release all memory associated with the given snapshot. */
void snapshotClose(struct raft_snapshot *s);
/* Like snapshotClose(), but also release the snapshot object itself. */
void snapshotDestroy(struct raft_snapshot *s);
/* Make a full deep copy of a snapshot object.
*
* All data buffers in the source snapshot will be compacted in a single buffer
* in the destination snapshot. */
int snapshotCopy(const struct raft_snapshot *src, struct raft_snapshot *dst);
#endif /* RAFT_SNAPSHOT_H_ */
raft-0.22.1/src/state.c 0000664 0000000 0000000 00000002074 14601504142 0014603 0 ustar 00root root 0000000 0000000 #include "assert.h"
#include "configuration.h"
#include "election.h"
#include "queue.h"
#include "trail.h"
/* Return the current state of the given raft server object. */
enum raft_state raft_state(struct raft *r)
{
    return r->state;
}
/* Report the current known leader through ID and ADDRESS.
 *
 * A candidate knows no leader; a leader in the middle of a leadership
 * transfer also reports none. A follower reports the leader it is tracking
 * and a (stable) leader reports itself. For any other state the output
 * parameters are left untouched. */
void raft_leader(struct raft *r, raft_id *id, const char **address)
{
    if (r->state == RAFT_CANDIDATE ||
        (r->state == RAFT_LEADER && r->leader_state.transferee != 0)) {
        *id = 0;
        *address = NULL;
        return;
    }
    if (r->state == RAFT_FOLLOWER) {
        *id = r->follower_state.current_leader.id;
        *address = r->follower_state.current_leader.address;
        return;
    }
    if (r->state == RAFT_LEADER) {
        *id = r->id;
        *address = r->address;
    }
}
/* Return the index of the last entry in the server's log (or of the last
 * snapshot, when the log is empty), as tracked by the trail. */
raft_index raft_last_index(struct raft *r)
{
    return TrailLastIndex(&r->trail);
}
/* Return the role of this server in the current configuration, or -1 if the
 * server is not part of the configuration. */
int raft_role(struct raft *r)
{
    const struct raft_server *local =
        configurationGet(&r->configuration, r->id);
    return local != NULL ? local->role : -1;
}
raft-0.22.1/src/syscall.c 0000664 0000000 0000000 00000002715 14601504142 0015137 0 ustar 00root root 0000000 0000000 #include "syscall.h"
#if HAVE_LINUX_AIO_ABI_H || HAVE_LINUX_IO_URING_H
#include
#include
#endif
#if HAVE_LINUX_AIO_ABI_H
/* Thin wrappers for the kernel AIO system calls, which libc does not
 * expose: each one just forwards its arguments through syscall(2) with the
 * corresponding __NR_* number. */
int io_setup(unsigned nr_events, aio_context_t *ctx_idp)
{
    return (int)syscall(__NR_io_setup, nr_events, ctx_idp);
}
int io_destroy(aio_context_t ctx_id)
{
    return (int)syscall(__NR_io_destroy, ctx_id);
}
int io_submit(aio_context_t ctx_id, long nr, struct iocb **iocbpp)
{
    return (int)syscall(__NR_io_submit, ctx_id, nr, iocbpp);
}
int io_getevents(aio_context_t ctx_id,
                 long min_nr,
                 long nr,
                 struct io_event *events,
                 struct timespec *timeout)
{
    return (int)syscall(__NR_io_getevents, ctx_id, min_nr, nr, events, timeout);
}
#endif
#if HAVE_LINUX_IO_URING_H
/* Thin wrappers for the io_uring system calls, for libc versions that do
 * not provide them. */
int io_uring_register(int fd,
                      unsigned int opcode,
                      const void *arg,
                      unsigned int nr_args)
{
    return (int)syscall(__NR_io_uring_register, fd, opcode, arg, nr_args);
}
int io_uring_setup(unsigned int entries, struct io_uring_params *p)
{
    return (int)syscall(__NR_io_uring_setup, entries, p);
}
int io_uring_enter(int fd,
                   unsigned int to_submit,
                   unsigned int min_complete,
                   unsigned int flags,
                   sigset_t *sig)
{
    /* The trailing _NSIG / 8 argument is the size in bytes of the signal
     * set, which the raw syscall expects after the sigset pointer. */
    return (int)syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags,
                        sig, _NSIG / 8);
}
#endif
raft-0.22.1/src/syscall.h 0000664 0000000 0000000 00000002156 14601504142 0015143 0 ustar 00root root 0000000 0000000 /* Wrappers for system calls not yet defined in libc. */
#ifndef SYSCALL_H_
#define SYSCALL_H_
#if HAVE_LINUX_AIO_ABI_H
#include
#include
#include
#endif
#if HAVE_LINUX_IO_URING_H
#include
#endif
#if HAVE_LINUX_AIO_ABI_H
/* AIO */
int io_setup(unsigned nr_events, aio_context_t *ctx_idp);
int io_destroy(aio_context_t ctx_id);
int io_submit(aio_context_t ctx_id, long nr, struct iocb **iocbpp);
int io_getevents(aio_context_t ctx_id,
long min_nr,
long nr,
struct io_event *events,
struct timespec *timeout);
#endif
#if HAVE_LINUX_IO_URING_H
/* uring */
int io_uring_register(int fd,
unsigned int opcode,
const void *arg,
unsigned int nr_args);
int io_uring_setup(unsigned int entries, struct io_uring_params *p);
int io_uring_enter(int fd,
unsigned int to_submit,
unsigned int min_complete,
unsigned int flags,
sigset_t *sig);
#endif
#endif /* SYSCALL_H_ */
raft-0.22.1/src/timeout.c 0000664 0000000 0000000 00000021344 14601504142 0015152 0 ustar 00root root 0000000 0000000 #include
#include "../include/raft.h"
#include "assert.h"
#include "configuration.h"
#include "convert.h"
#include "election.h"
#include "progress.h"
#include "replication.h"
#include "tracing.h"
#include "trail.h"
#define infof(...) Infof(r->tracer, " " __VA_ARGS__)
#define tracef(...) Tracef(r->tracer, __VA_ARGS__)
/* Apply time-dependent rules for followers (Figure 3.1).
 *
 * If the election timer has expired, convert to candidate and start a new
 * (pre-)election, unless one of the stay-follower conditions below holds.
 * Returns 0 on success, or an error code from convertToCandidate(). */
static int timeoutFollower(struct raft *r)
{
    const struct raft_server *server;
    int rv;
    assert(r != NULL);
    assert(r->state == RAFT_FOLLOWER);
    server = configurationGet(&r->configuration, r->id);
    /* If we have been removed from the configuration, or maybe we didn't
     * receive one yet, just stay follower. */
    if (server == NULL) {
        infof("server not in current configuration -> stay follower");
        electionResetTimer(r);
        goto out;
    }
    /* Check if we need to start an election.
     *
     * From Section 3.3:
     *
     *   If a follower receives no communication over a period of time called
     *   the election timeout, then it assumes there is no viable leader and
     *   begins an election to choose a new leader.
     *
     * Figure 3.1:
     *
     *   If election timeout elapses without receiving AppendEntries RPC from
     *   current leader or granting vote to candidate, convert to candidate.
     */
    if (r->now >= electionTimerExpiration(r)) {
        raft_index last_index = TrailLastIndex(&r->trail);
        const char *pre_vote_text = r->pre_vote ? "pre-" : "";
        /* Non-voters never campaign. */
        if (server->role != RAFT_VOTER) {
            infof("%s server -> stay follower", raft_role_name(server->role));
            electionResetTimer(r);
            goto out;
        }
        /* Don't campaign in the middle of installing a snapshot. */
        if (r->snapshot.installing) {
            infof("installing snapshot -> stay follower");
            electionResetTimer(r);
            goto out;
        }
        /* Don't campaign while entries are still being persisted. */
        if (r->last_stored < last_index) {
            infof("persisting %u entries -> stay follower",
                  (unsigned)(last_index - r->last_stored));
            electionResetTimer(r);
            goto out;
        }
        infof("convert to candidate, start %selection for term %llu",
              pre_vote_text, r->current_term + 1);
        rv = convertToCandidate(r, false /* disrupt leader */);
        if (rv != 0) {
            return rv;
        }
    }
out:
    return 0;
}
/* Apply time-dependent rules for candidates (Figure 3.1).
 *
 * If the election timer has expired without the election being decided,
 * start a fresh election for the next term. Always returns 0. */
static int timeoutCandidate(struct raft *r)
{
    assert(r != NULL);
    assert(r->state == RAFT_CANDIDATE);
    /* Check if we need to start an election.
     *
     * From Section 3.4:
     *
     *   The third possible outcome is that a candidate neither wins nor loses
     *   the election: if many followers become candidates at the same time,
     *   votes could be split so that no candidate obtains a majority. When this
     *   happens, each candidate will time out and start a new election by
     *   incrementing its term and initiating another round of RequestVote RPCs
     */
    if (r->now >= electionTimerExpiration(r)) {
        infof("stay candidate, start election for term %llu",
              r->current_term + 1);
        electionStart(r);
    }
    return 0;
}
/* Return true if we received an AppendEntries RPC result from a majority of
 * voting servers since we became leaders or since the last time this function
 * returned true (i.e. since the last time the check was successful and hence we
 * reset the election timer).
 *
 * For each server the function checks the recent_recv flag of the associated
 * progress object, and resets the flag after the check. It returns true if a
 * majority of voting server had the flag set to true.
 *
 * As a side effect, any server that has not been heard from recently has its
 * replication progress demoted (pipeline -> probe, snapshot aborted). */
static bool checkContactQuorum(struct raft *r)
{
    unsigned i;
    unsigned contacts = 0;
    assert(r->state == RAFT_LEADER);
    for (i = 0; i < r->configuration.n; i++) {
        struct raft_server *server = &r->configuration.servers[i];
        bool is_recent = progressHasContactedRecently(r, i);
        /* We always count ourselves as "contacted". */
        if ((server->role == RAFT_VOTER && is_recent) || server->id == r->id) {
            contacts++;
        }
        if (!is_recent) {
            switch (progressState(r, i)) {
                case PROGRESS__PIPELINE:
                    infof("server %llu is unreachable -> abort pipeline",
                          server->id);
                    progressToProbe(r, i);
                    break;
                case PROGRESS__SNAPSHOT:
                    infof("server %llu is unreachable -> abort snapshot",
                          server->id);
                    progressAbortSnapshot(r, i);
                    break;
            }
        }
    }
    return contacts > configurationVoterCount(&r->configuration) / 2;
}
/* Apply time-dependent rules for leaders (Figure 3.1).
 *
 * In order: step down if we lost contact with a majority, send heartbeats,
 * check the progress of an ongoing promotion (catch-up), and expire a stale
 * leadership transfer. Always returns 0. */
static int timeoutLeader(struct raft *r)
{
    assert(r->state == RAFT_LEADER);
    /* Check if we still can reach a majority of servers.
     *
     * From Section 6.2:
     *
     *   A leader in Raft steps down if an election timeout elapses without a
     *   successful round of heartbeats to a majority of its cluster; this
     *   allows clients to retry their requests with another server.
     */
    /* If a majority of servers have contacted us recently, reset the
     * recent_recv flags and the election timer. Otherwise, check if we have run
     * past the election timer and step down in that case. */
    if (checkContactQuorum(r)) {
        r->election_timer_start = r->now;
        r->update->flags |= RAFT_UPDATE_TIMEOUT;
    } else if (r->now - r->election_timer_start >= r->election_timeout) {
        infof("unable to contact majority of cluster -> step down");
        convertToFollower(r);
        return 0;
    }
    /* Possibly send heartbeats.
     *
     * From Figure 3.1:
     *
     *   Send empty AppendEntries RPC during idle periods to prevent election
     *   timeouts.
     */
    replicationHeartbeat(r);
    /* If a server is being promoted, increment the timer of the current
     * round or abort the promotion.
     *
     * From Section 4.2.1:
     *
     *   The algorithm waits a fixed number of rounds (such as 10). If the last
     *   round lasts less than an election timeout, then the leader adds the new
     *   server to the cluster, under the assumption that there are not enough
     *   unreplicated entries to create a significant availability
     *   gap. Otherwise, the leader aborts the configuration change with an
     *   error.
     */
    if (r->leader_state.promotee_id != 0) {
        raft_id id = r->leader_state.promotee_id;
        unsigned server_index;
        raft_time round_duration = r->now - r->leader_state.round_start;
        bool is_too_slow;
        bool is_unresponsive;
        /* If a promotion is in progress, we expect that our configuration
         * contains an entry for the server being promoted, and that the server
         * is not yet considered as voting. */
        server_index = configurationIndexOf(&r->configuration, id);
        assert(server_index < r->configuration.n);
        assert(r->configuration.servers[server_index].role != RAFT_VOTER);
        is_too_slow = (r->leader_state.round_number == r->max_catch_up_rounds &&
                       round_duration > r->election_timeout);
        is_unresponsive = round_duration > r->max_catch_up_round_duration;
        /* Abort the promotion if we are at the 10'th round and it's still
         * taking too long, or if the server is unresponsive. */
        if (is_too_slow || is_unresponsive) {
            infof("server %llu is %s", id,
                  is_too_slow ? "too slow" : "unresponsive -> abort catch-up");
            r->leader_state.promotee_id = 0;
            r->leader_state.round_index = 0;
            r->leader_state.round_number = 0;
            r->leader_state.round_start = 0;
            progressCatchUpAbort(r, server_index);
        }
    }
    /* If there is a leadership transfer request in progress, check if it's
     * expired. */
    if (r->leader_state.transferee != 0) {
        if (r->now - r->leader_state.transfer_start >= r->election_timeout) {
            infof("server %llu not replicating fast enough -> abort transfer",
                  r->leader_state.transferee);
            r->leader_state.transferee = 0;
            r->leader_state.transferring = false;
        }
    }
    return 0;
}
/* Dispatch the periodic time-dependent logic for the current server state.
 *
 * Returns 0 on success or an error code from the state-specific handler. */
int Timeout(struct raft *r)
{
    int rv = -1;
    assert(r->state == RAFT_FOLLOWER || r->state == RAFT_CANDIDATE ||
           r->state == RAFT_LEADER);
    if (r->state == RAFT_FOLLOWER) {
        rv = timeoutFollower(r);
    } else if (r->state == RAFT_CANDIDATE) {
        rv = timeoutCandidate(r);
    } else if (r->state == RAFT_LEADER) {
        rv = timeoutLeader(r);
    }
    return rv;
}
#undef infof
#undef tracef
raft-0.22.1/src/timeout.h 0000664 0000000 0000000 00000000320 14601504142 0015146 0 ustar 00root root 0000000 0000000 /* Logic to be invoked periodically. */
#ifndef TIMEOUT_H_
#define TIMEOUT_H_
#include "../include/raft.h"
/* Called upon RAFT_TIMEOUT events. */
int Timeout(struct raft *r);
#endif /* TIMEOUT_H_ */
raft-0.22.1/src/tracing.c 0000664 0000000 0000000 00000002462 14601504142 0015113 0 ustar 00root root 0000000 0000000 #include
#include
#include
#include "tracing.h"
/* Emitter that discards every trace event; backs the default NoopTracer. */
static inline void noopEmit(struct raft_tracer *t, int type, const void *info)
{
    (void)t;
    (void)type;
    (void)info;
}
/* Tracer that drops all events. */
struct raft_tracer NoopTracer = {.impl = NULL, .version = 2, .emit = noopEmit};
static bool stderrTraceEnabled = false;
/* Emitter backing StderrTracer: print RAFT_TRACER_DIAGNOSTIC events to
 * stderr with a nanosecond wall-clock timestamp. Events are dropped when
 * tracing has not been enabled via raft_tracer_maybe_enable(), and so are
 * all non-diagnostic event types. */
static inline void stderrTracerEmit(struct raft_tracer *t,
                                    int type,
                                    const void *data)
{
    const struct raft_tracer_info *info = data;
    struct timespec ts = {0};
    int64_t ns;
    (void)t;
    if (!stderrTraceEnabled) {
        return;
    }
    if (type != RAFT_TRACER_DIAGNOSTIC) {
        return;
    }
    /* ignore errors */
    clock_gettime(CLOCK_REALTIME, &ts);
    /* Widen tv_sec before the multiplication: on platforms where time_t is
     * 32 bits the product would otherwise overflow before being converted
     * to int64_t. */
    ns = (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
    fprintf(stderr, "LIBRAFT %" PRId64 " %s:%d %s\n", ns,
            info->diagnostic.file, info->diagnostic.line,
            info->diagnostic.message);
}
/* Tracer that prints diagnostic events to stderr (when enabled). */
struct raft_tracer StderrTracer = {.impl = NULL,
                                   .version = 2,
                                   .emit = stderrTracerEmit};
/* Apply the ENABLED flag to the global stderr-trace switch, but only when
 * the LIBRAFT_TRACE environment variable is set; otherwise the call is a
 * no-op. The TRACER argument is currently unused, since the switch is a
 * file-scope global shared by all tracers. */
void raft_tracer_maybe_enable(struct raft_tracer *tracer, bool enabled)
{
    (void)tracer;
    if (getenv(LIBRAFT_TRACE) != NULL) {
        stderrTraceEnabled = enabled;
    }
}
raft-0.22.1/src/tracing.h 0000664 0000000 0000000 00000006257 14601504142 0015126 0 ustar 00root root 0000000 0000000 /* Tracing functions and helpers. */
#ifndef TRACING_H_
#define TRACING_H_
#include
#include "../include/raft.h"
#include "utils.h"
/* If an env var with this name is found, tracing can be enabled */
#define LIBRAFT_TRACE "LIBRAFT_TRACE"
extern struct raft_tracer NoopTracer;
/* Default stderr tracer. */
extern struct raft_tracer StderrTracer;
/* Legacy raft tracer */
typedef void (*raft_tracer_emit)(struct raft_tracer *t,
const char *file,
int line,
const char *message);
/* Use TRACER to trace an event of type TYPE with the given INFO. */
#define Trace(TRACER, TYPE, INFO) \
do { \
if (LIKELY(TRACER == NULL)) { \
break; \
} \
if (LIKELY(TRACER->version == 2)) { \
TRACER->emit(TRACER, TYPE, INFO); \
} \
} while (0)
/* Emit a diagnostic message with the given tracer at level 3. */
#define Infof(TRACER, ...) Logf(TRACER, 3, __VA_ARGS__)
/* Emit diagnostic message with the given tracer at level 5. */
#define Tracef(TRACER, ...) Logf(TRACER, 5, __VA_ARGS__)
/* Use the tracer to log an event at the given level.
*
* The LEVEL parameter should be one of:
*
* - 1: error
* - 2: warning
* - 3: info
* - 4: debug
* - 5: trace
*/
#define Logf(TRACER, LEVEL, ...) \
do { \
int _type; \
struct raft_tracer_info _info; \
static char _msg[1024]; \
\
if (LIKELY(TRACER == NULL)) { \
break; \
} \
\
snprintf(_msg, sizeof _msg, __VA_ARGS__); \
\
if (LIKELY(TRACER->version == 2)) { \
_type = RAFT_TRACER_DIAGNOSTIC; \
_info.version = 1; \
_info.diagnostic.level = LEVEL; \
_info.diagnostic.message = _msg; \
_info.diagnostic.file = __FILE__; \
_info.diagnostic.line = __LINE__; \
TRACER->emit(TRACER, _type, &_info); \
} else if (UNLIKELY((bool)TRACER->version)) { \
raft_tracer_emit emit = \
(raft_tracer_emit)((uintptr_t)TRACER->emit); \
emit(TRACER, __FILE__, __LINE__, _msg); \
} \
} while (0)
/* Enable the tracer if the env variable is set or disable the tracer */
void raft_tracer_maybe_enable(struct raft_tracer *tracer, bool enabled);
#endif /* TRACING_H_ */
raft-0.22.1/src/trail.c 0000664 0000000 0000000 00000025622 14601504142 0014602 0 ustar 00root root 0000000 0000000 #include "trail.h"
#include "assert.h"
/* Set up T as a fresh, empty trail: no records allocated, empty circular
 * buffer, zero offset and no snapshot. */
void TrailInit(struct raft_trail *t)
{
    t->records = NULL;
    t->size = 0;
    t->front = 0;
    t->back = 0;
    t->offset = 0;
    t->snapshot.index = 0;
    t->snapshot.term = 0;
}
/* Release the records array, if one was ever allocated. */
void TrailClose(struct raft_trail *t)
{
    if (t->records == NULL) {
        return;
    }
    raft_free(t->records);
}
/* Seed the trail with the state loaded from disk at startup.
 *
 * Must be called on an empty trail. A snapshot_index of 0 means there is no
 * snapshot (and then snapshot_term may be 0 too). The offset is set so that
 * the first appended entry will get index start_index. */
void TrailStart(struct raft_trail *t,
                raft_index snapshot_index,
                raft_term snapshot_term,
                raft_index start_index)
{
    assert(TrailNumEntries(t) == 0);
    assert(start_index > 0);
    assert(start_index <= snapshot_index + 1);
    assert(snapshot_index == 0 || snapshot_term != 0);
    t->snapshot.index = snapshot_index;
    t->snapshot.term = snapshot_term;
    /* The offset is the index preceding the first entry in the log. */
    t->offset = start_index - 1;
}
/* Get the current number of active records in the trail's circular buffer. */
static unsigned trailNumRecords(const struct raft_trail *t)
{
    /* When back has not wrapped past the end of the buffer the count is a
     * plain difference; otherwise the active records span the wrap point. */
    return t->front <= t->back ? t->back - t->front
                               : t->size - t->front + t->back;
}
/* Return the circular buffer position of the i'th record in the log.
 *
 * Requires t->size > 0 (i.e. the records array has been allocated). */
static unsigned trailPositionAt(const struct raft_trail *t, unsigned i)
{
    return (t->front + i) % t->size;
}
/* Get the current number of entries in the log.
 *
 * The last record always holds the index of the last entry, so the count is
 * that index minus the trail's offset. */
unsigned TrailNumEntries(const struct raft_trail *t)
{
    unsigned n = trailNumRecords(t);
    unsigned i;
    if (n == 0) {
        return 0;
    }
    i = trailPositionAt(t, n - 1);
    assert(t->records[i].index > 0);
    assert(t->records[i].index > t->offset);
    return (unsigned)(t->records[i].index - t->offset);
}
/* Get the index of the last entry in the log, falling back to the last
 * snapshot's index when the log holds no entries. Return 0 when the trail
 * has neither entries nor a snapshot. */
raft_index TrailLastIndex(const struct raft_trail *t)
{
    unsigned n = TrailNumEntries(t);
    /* If there are no entries in the log, but there is a snapshot available
     * check that its last index is consistent with the offset. */
    if (n == 0) {
        if (t->snapshot.index != 0) {
            assert(t->offset <= t->snapshot.index);
            return t->snapshot.index;
        }
        return 0;
    }
    return t->offset + n;
}
/* Get the term of the last entry in the log (or of the last snapshot when
 * the log is empty). Return 0 when the trail is completely empty. */
raft_term TrailLastTerm(const struct raft_trail *t)
{
    raft_index last = TrailLastIndex(t);
    if (last == 0) {
        return 0;
    }
    return TrailTermOf(t, last);
}
/* Get the term of the entry with the given index.
 *
 * Return 0 if the index falls outside the range currently covered by the
 * trail, unless it matches the last snapshot's index, in which case the
 * snapshot's term is returned. */
raft_term TrailTermOf(const struct raft_trail *t, raft_index index)
{
    unsigned n;
    unsigned i;
    unsigned j;
    assert(index > 0);
    assert(t->offset <= t->snapshot.index);
    /* If the given index is lower than the first index, or higher than the last
     * index, return 0, unless there is a matching snapshot. */
    if (index < t->offset + 1 || index > t->offset + TrailNumEntries(t)) {
        if (index == t->snapshot.index) {
            return t->snapshot.term;
        }
        return 0;
    }
    /* Go through all records, starting from the last one, looking for a record
     * whose previous record has an index lower than the given one. Stop when we
     * find such a record, or when we reach the first record, which has no
     * previous record. */
    n = trailNumRecords(t);
    assert(n > 0);
    do {
        i = trailPositionAt(t, n - 1);
        assert(index <= t->records[i].index);
        if (n == 1) {
            break;
        }
        j = trailPositionAt(t, n - 2);
        if (index > t->records[j].index) {
            break;
        }
        n -= 1;
    } while (1);
    return t->records[i].term;
}
/* Ensure that the last record in the circular buffer is at the given term,
 * creating a new record if necessary.
 *
 * Each record covers a run of consecutive entries sharing the same term, so
 * a new record is needed only when the term changes (or the trail is empty).
 *
 * Errors:
 *
 * RAFT_NOMEM
 * Memory for the records array could not be allocated.
 */
static int trailEnsureRecord(struct raft_trail *t, raft_term term)
{
    unsigned size;
    unsigned n;
    unsigned i;
    /* clang-format off */
    struct
    {
        raft_index index;
        raft_term term;
    } *records;
    /* clang-format on */
    n = trailNumRecords(t);
    /* If there are already some records, and the term of the last record
     * matches the given one, then there's nothing to do, as we already have a
     * record for the given term. */
    if (n > 0) {
        i = trailPositionAt(t, n - 1);
        assert(t->records[i].term <= term);
        if (t->records[i].term == term) {
            return 0;
        }
    }
    /* If the circular buffer is big enough to hold an additional record, just
     * append it to the back. */
    if (n + 1 < t->size) {
        t->back += 1;
        t->back = t->back % t->size;
        goto out;
    }
    /* Otherwise we need to resize the circular buffer.
     *
     * Make the new size twice the current size plus one (for the record
     * associated with the new term). Over-allocating now avoids smaller
     * allocations later. */
    size = (t->size + 1) * 2;
    records = raft_calloc(size, sizeof *t->records);
    if (records == NULL) {
        return RAFT_NOMEM;
    }
    /* Copy all active old records to the beginning of the newly allocated
     * array. */
    for (i = 0; i < n; i++) {
        unsigned j = trailPositionAt(t, i);
        records[i].index = t->records[j].index;
        records[i].term = t->records[j].term;
    }
    /* Release the old records array. */
    if (t->records != NULL) {
        raft_free(t->records);
    }
    t->records = (void *)records;
    t->size = size;
    t->front = 0;
    t->back = n + 1;
out:
    /* Initialize the new record: index 0 marks it as not yet holding any
     * entry; TrailAppend() will fill the index in. */
    i = trailPositionAt(t, n);
    t->records[i].index = 0;
    t->records[i].term = term;
    return 0;
}
/* Record a new entry at TrailLastIndex() + 1, with the given term.
 *
 * Errors:
 *
 * RAFT_NOMEM
 * Memory for the records array could not be allocated.
 */
int TrailAppend(struct raft_trail *t, raft_term term)
{
    unsigned n;
    unsigned i;
    unsigned j;
    int rv;
    rv = trailEnsureRecord(t, term);
    if (rv != 0) {
        assert(rv == RAFT_NOMEM);
        return rv;
    }
    n = trailNumRecords(t);
    i = trailPositionAt(t, n - 1);
    /* If we have already recorded an index for this term, then the next index
     * is just that index plus 1. */
    if (t->records[i].index > 0) {
        t->records[i].index += 1;
        goto out;
    }
    assert(t->records[i].index == 0);
    /* Otherwise, if there is a previous record, then the next index is the
     * index of the previous record plus 1. */
    if (n > 1) {
        j = trailPositionAt(t, n - 2);
        assert(t->records[j].index > 0);
        t->records[i].index = t->records[j].index + 1;
        goto out;
    }
    /* Otherwise, we're appending the very first entry to an empty trail. */
    t->records[i].index = t->offset + 1;
out:
    return 0;
}
/* Delete all entries from the given index (included) onwards, walking back
 * from the last entry and shrinking or dropping records as needed. */
void TrailTruncate(struct raft_trail *t, const raft_index index)
{
    raft_index last;
    unsigned n;
    unsigned i;
    unsigned j;
    assert(index > t->offset);
    assert(index <= TrailLastIndex(t));
    /* Delete all entries, starting from the last, down to the given index
     * included. */
    for (last = TrailLastIndex(t); last >= index; last--) {
        n = trailNumRecords(t);
        i = trailPositionAt(t, n - 1);
        /* The record must be valid */
        assert(t->records[i].index > 0);
        assert(t->records[i].term > 0);
        /* The record must refer to the entry being deleted. */
        assert(t->records[i].index == last);
        if (n == 1) {
            /* If we're deleting the very last entry of the trail, clear
             * everything. */
            if (last == t->offset + 1) {
                t->records[i].index = 0;
                t->records[i].term = 0;
                t->front = t->back = 0;
                return;
            }
            /* Otherwise just delete this entry. */
            t->records[i].index -= 1;
            assert(t->records[i].index > 0);
            continue;
        }
        j = trailPositionAt(t, n - 2);
        /* If the preceding index belongs to the previous record, clear the
         * current record. */
        if (last - 1 == t->records[j].index) {
            t->records[i].index = 0;
            t->records[i].term = 0;
            /* Move back one slot, wrapping around the circular buffer. */
            if (t->back == 0) {
                t->back = t->size - 1;
            } else {
                t->back -= 1;
            }
            continue;
        }
        /* Otherwise just delete this entry. */
        t->records[i].index -= 1;
        assert(t->records[i].index > 0);
    }
}
/* Delete all entries up to the given index (included), advancing the front
 * of the circular buffer and updating the offset accordingly. */
static void trailRemovePrefix(struct raft_trail *t, const raft_index index)
{
    unsigned n = trailNumRecords(t);
    unsigned i;
    unsigned j;
    unsigned front = t->front;
    raft_index record_index;
    assert(index > 0);
    assert(index <= TrailLastIndex(t));
    for (i = 0; i < n; i++) {
        j = trailPositionAt(t, i);
        record_index = t->records[j].index;
        /* If the index belongs to this record, and there are larger indexes
         * too, just update the starting offset, but leave the record. */
        if (record_index > index) {
            break;
        }
        /* Drop the entire record. */
        t->records[j].index = 0;
        t->records[j].term = 0;
        front += 1;
        /* If the index is the last one of the record, stop here, the next
         * record will be unchanged. */
        if (record_index == index) {
            break;
        }
    }
    t->front = front % t->size;
    t->offset = index;
}
/* To be called when taking a new snapshot at last_index: record the new
 * snapshot metadata and drop all entries older than last_index - trailing
 * (included), keeping a trailing window of entries behind the snapshot. */
void TrailSnapshot(struct raft_trail *t,
                   const raft_index last_index,
                   const unsigned trailing)
{
    raft_term last_term;
    assert(last_index > 0);
    /* We must not already have a snapshot at this index */
    assert(t->snapshot.index != last_index);
    /* We must have an entry at this index */
    last_term = TrailTermOf(t, last_index);
    assert(last_term != 0);
    t->snapshot.index = last_index;
    t->snapshot.term = last_term;
    /* If we have not at least trailing entries preceding the given last index,
     * then there's nothing to remove and we're done. */
    if (last_index <= trailing || TrailTermOf(t, last_index - trailing) == 0) {
        return;
    }
    trailRemovePrefix(t, last_index - trailing);
}
/* Get the index of the last entry in the most recent snapshot, or 0 if
 * there is no snapshot. */
raft_index TrailSnapshotIndex(const struct raft_trail *t)
{
    return t->snapshot.index;
}
/* Get the term of the last entry in the most recent snapshot, or 0 if there
 * is no snapshot. */
raft_term TrailSnapshotTerm(const struct raft_trail *t)
{
    return t->snapshot.term;
}
/* To be called when installing a snapshot: discard all outstanding entries
 * and reset the snapshot metadata and offset to the snapshot's last entry. */
void TrailRestore(struct raft_trail *t,
                  raft_index last_index,
                  raft_term last_term)
{
    size_t n = TrailNumEntries(t);
    assert(last_index > 0);
    assert(last_term > 0);
    /* Truncating at the first entry's index empties the whole trail. */
    if (n > 0) {
        TrailTruncate(t, TrailLastIndex(t) - n + 1);
    }
    t->snapshot.index = last_index;
    t->snapshot.term = last_term;
    t->offset = last_index;
}
/* Return true if there is an entry at the given index, i.e. the index lies
 * strictly above the trail's offset and at or below the last record's
 * index. */
bool TrailHasEntry(const struct raft_trail *t, raft_index index)
{
    unsigned n = trailNumRecords(t);
    unsigned last;
    /* No records means no entries; anything at or below the offset has been
     * compacted away. */
    if (n == 0 || index <= t->offset) {
        return false;
    }
    last = trailPositionAt(t, n - 1);
    assert(t->records[last].index > 0);
    return index <= t->records[last].index;
}
#ifndef RAFT_TRAIL_H_
#define RAFT_TRAIL_H_

#include "../include/raft.h"

/* Initialize an empty trail of raft log entries. */
void TrailInit(struct raft_trail *t);

/* Release all memory used by the given trail object. */
void TrailClose(struct raft_trail *t);

/* Called at startup when populating the trail with data of entries loaded from
 * disk. It sets the starting state of the trail. The start index must be lower
 * or equal than snapshot_index + 1. */
void TrailStart(struct raft_trail *t,
                raft_index snapshot_index,
                raft_term snapshot_term,
                raft_index start_index);

/* Get the current number of entries in the log (either already persisted or
 * being persisted). */
unsigned TrailNumEntries(const struct raft_trail *t);

/* Get the index of the last entry in the log. Return 0 if the log is empty. */
raft_index TrailLastIndex(const struct raft_trail *t);

/* Get the term of the last entry in the log. Return 0 if the log is empty. */
raft_term TrailLastTerm(const struct raft_trail *t);

/* Get the term of the entry with the given index. Return 0 if index is greater
 * than the last index of the log, or if it's lower than oldest index we know
 * the term of (either because it's outstanding or because it's the last entry
 * in the most recent snapshot). */
raft_term TrailTermOf(const struct raft_trail *t, raft_index index);

/* Record a new entry at TrailLastIndex() + 1, with the given term.
 *
 * Errors:
 *
 * RAFT_NOMEM
 *     Memory for the records array could not be allocated.
 */
int TrailAppend(struct raft_trail *t, raft_term term);

/* Delete all entries from the given index (included) onwards. If the log is
 * empty this is a no-op. If @index is lower than or equal to the index of the
 * first entry in the log, then the log will become empty. */
void TrailTruncate(struct raft_trail *t, raft_index index);

/* To be called when taking a new snapshot. The log must contain an entry at
 * last_index, which is the index of the last entry included in the
 * snapshot. The function will update the last snapshot information and delete
 * all entries up to last_index - trailing (included). If the log contains no
 * entry at last_index - trailing, then no entry will be deleted. */
void TrailSnapshot(struct raft_trail *t,
                   raft_index last_index,
                   unsigned trailing);

/* Get the index of the last entry in the most recent snapshot. Return #0 if
 * there are no snapshots. */
raft_index TrailSnapshotIndex(const struct raft_trail *t);

/* Get the term of the last entry of the most recent snapshot. Return #0 if
 * there are no snapshots. */
raft_term TrailSnapshotTerm(const struct raft_trail *t);

/* To be called when installing a snapshot.
 *
 * The log can be in any state. All outstanding entries will be discarded, the
 * last index and last term of the most recent snapshot will be set to the given
 * values, and the offset adjusted accordingly. */
void TrailRestore(struct raft_trail *t,
                  raft_index last_index,
                  raft_term last_term);

/* Return true if there is an entry at the given index. */
bool TrailHasEntry(const struct raft_trail *t, raft_index index);

/* NOTE(review): in the original the #endif closing this include guard sat
 * right after the #include, leaving every declaration outside the guard; it
 * has been moved to the end of the file where it belongs. */
#endif /* RAFT_TRAIL_H_ */
raft-0.22.1/src/utils.h 0000664 0000000 0000000 00000000444 14601504142 0014627 0 ustar 00root root 0000000 0000000 #ifndef RAFT_UTILS_H_
#define RAFT_UTILS_H_
#include
/* Various utility functions and macros */
#define LIKELY(x) __builtin_expect(!!(x), 1)
#define UNLIKELY(x) __builtin_expect(!!(x), 0)
#define DBG() fprintf(stderr, "%s:%d\n", __func__, __LINE__)
#endif /* RAFT_UTILS_H_ */
raft-0.22.1/src/uv.c 0000664 0000000 0000000 00000061314 14601504142 0014117 0 ustar 00root root 0000000 0000000 #include "../include/raft/uv.h"
#include
#include
#include
#include
#include
#include
#include
#include "../include/raft.h"
#include "assert.h"
#include "byte.h"
#include "configuration.h"
#include "entry.h"
#include "heap.h"
#include "legacy.h"
#include "snapshot.h"
#include "tracing.h"
#include "uv.h"
#include "uv_encoding.h"
#include "uv_os.h"
#define tracef(...) Tracef(uv->tracer, __VA_ARGS__)
/* Retry to connect to peer servers every second.
*
* TODO: implement an exponential backoff instead. */
#define CONNECT_RETRY_DELAY 1000
/* Cleans up files that are no longer used by the system.
 *
 * Scans the data directory and removes leftover temporary files and orphaned
 * snapshot (and snapshot metadata) files. Individual removal errors are
 * ignored; only a failure to scan the directory is reported (RAFT_IOERR). */
static int uvMaintenance(const char *dir, char *errmsg)
{
    struct uv_fs_s req;
    struct uv_dirent_s entry;
    int n;
    int i;
    int rv;
    int rv2;
    n = uv_fs_scandir(NULL, &req, dir, 0, NULL);
    if (n < 0) {
        ErrMsgPrintf(errmsg, "scan data directory: %s", uv_strerror(n));
        return RAFT_IOERR;
    }
    rv = 0;
    for (i = 0; i < n; i++) {
        const char *filename;
        rv = uv_fs_scandir_next(&req, &entry);
        assert(rv == 0); /* Can't fail in libuv */
        filename = entry.name;
        /* Remove leftover tmp-files */
        if (strncmp(filename, TMP_FILE_PREFIX, strlen(TMP_FILE_PREFIX)) == 0) {
            UvFsRemoveFile(dir, filename, errmsg); /* Ignore errors */
            continue;
        }
        /* Remove orphaned snapshot files */
        bool orphan = false;
        if ((UvSnapshotIsOrphan(dir, filename, &orphan) == 0) && orphan) {
            UvFsRemoveFile(dir, filename, errmsg); /* Ignore errors */
            continue;
        }
        /* Remove orphaned snapshot metadata files */
        if ((UvSnapshotMetaIsOrphan(dir, filename, &orphan) == 0) && orphan) {
            UvFsRemoveFile(dir, filename, errmsg); /* Ignore errors */
        }
    }
    /* Draining the iterator past the last entry releases its resources. */
    rv2 = uv_fs_scandir_next(&req, &entry);
    assert(rv2 == UV_EOF);
    return rv;
}
/* Implementation of raft_io->init. */
/* Initialize the raft_io backend: validate the data directory, probe file
 * system capabilities, run maintenance, load the metadata, initialize the
 * network transport and set up all the libuv handles used later.
 *
 * Returns 0 on success or an error code from any of the above steps. */
static int uvInit(struct raft_io *io, raft_id id, const char *address)
{
    struct uv *uv;
    size_t direct_io;
    struct uvMetadata metadata;
    int rv;
    uv = io->impl;
    uv->id = id;
    rv = UvFsCheckDir(uv->dir, io->errmsg);
    if (rv != 0) {
        return rv;
    }
    /* Probe file system capabilities */
    rv = UvFsProbeCapabilities(uv->dir, &direct_io, &uv->async_io, io->errmsg);
    if (rv != 0) {
        return rv;
    }
    /* direct_io is both a flag (non-zero means supported) and, when
     * supported, the block size to use; fall back to 4096 otherwise. */
    uv->direct_io = direct_io != 0;
    uv->block_size = direct_io != 0 ? direct_io : 4096;
    rv = uvMaintenance(uv->dir, io->errmsg);
    if (rv != 0) {
        return rv;
    }
    rv = uvMetadataLoad(uv->dir, &metadata, io->errmsg);
    if (rv != 0) {
        return rv;
    }
    uv->metadata = metadata;
    rv = uv->transport->init(uv->transport, id, address);
    if (rv != 0) {
        ErrMsgTransfer(uv->transport->errmsg, io->errmsg, "transport");
        return rv;
    }
    uv->transport->data = uv;
    rv = uv_timer_init(uv->loop, &uv->timer);
    assert(rv == 0); /* This should never fail */
    uv->timer.data = uv;
    rv = uv_prepare_init(uv->loop, &uv->prepare);
    assert(rv == 0);
    uv->prepare.data = uv;
    rv = uv_check_init(uv->loop, &uv->check);
    assert(rv == 0);
    uv->check.data = uv;
    rv = uv_timer_init(uv->loop, &uv->prepare_retry);
    assert(rv == 0); /* This should never fail */
    uv->prepare_retry.data = uv;
    rv = uv_timer_init(uv->loop, &uv->append_retry);
    assert(rv == 0); /* This should never fail */
    uv->append_retry.data = uv;
    rv = uv_timer_init(uv->loop, &uv->snapshot_put_retry);
    assert(rv == 0); /* This should never fail */
    uv->snapshot_put_retry.data = uv;
    return 0;
}
/* Periodic timer callback: forward the tick to the user-provided tick
 * callback, when one has been registered via uvStart(). */
static void uvTickTimerCb(uv_timer_t *timer)
{
    struct uv *uv = timer->data;
    if (uv->tick_cb == NULL) {
        return;
    }
    uv->tick_cb(uv->io);
}
/* Refresh io->capacity with the current writable space, in KiB: prepared
 * segments plus the remaining capacity of the append segment.
 *
 * NOTE(review): the result is truncated to unsigned short, which caps the
 * reported capacity at 65535 KiB — presumably acceptable for the consumers
 * of this field, but worth confirming. */
static void uvUpdateCapacity(struct uv *uv)
{
    size_t bytes = UvPrepareCount(uv) * uv->segment_size;
    bytes += UvAppendCapacity(uv);
    uv->io->capacity = (unsigned short)(bytes / 1024);
}
/* Prepare-phase loop callback: refresh the reported capacity and, when the
 * legacy compatibility layer is attached (io->data set and a non-zero
 * version), fire any completed legacy requests. */
static void uvPrepareLoopCb(struct uv_prepare_s *prepare)
{
    struct uv *uv = prepare->data;
    bool has_legacy;
    uvUpdateCapacity(uv);
    has_legacy = uv->io->data != NULL && uv->io->version != 0;
    if (has_legacy) {
        LegacyFireCompletedRequests(uv->io->data);
    }
}
/* Check-phase loop callback: same work as uvPrepareLoopCb(), run after the
 * loop has polled for I/O. */
static void uvCheckLoopCb(struct uv_check_s *check)
{
    struct uv *uv = check->data;
    bool has_legacy;
    uvUpdateCapacity(uv);
    has_legacy = uv->io->data != NULL && uv->io->version != 0;
    if (has_legacy) {
        LegacyFireCompletedRequests(uv->io->data);
    }
}
/* Implementation of raft_io->start.
 *
 * Activate the backend: start receiving messages, arm the periodic tick
 * timer with the given interval, start the prepare/check loop hooks and
 * kick off open-segment preparation. */
static int uvStart(struct raft_io *io,
                   unsigned msecs,
                   raft_io_tick_cb tick_cb,
                   raft_io_recv_cb recv_cb)
{
    struct uv *uv;
    int rv;
    uv = io->impl;
    uv->state = UV__ACTIVE;
    uv->tick_cb = tick_cb;
    uv->recv_cb = recv_cb;
    rv = UvRecvStart(uv);
    if (rv != 0) {
        return rv;
    }
    rv = uv_timer_start(&uv->timer, uvTickTimerCb, msecs, msecs);
    assert(rv == 0);
    rv = uv_prepare_start(&uv->prepare, uvPrepareLoopCb);
    assert(rv == 0);
    rv = uv_check_start(&uv->check, uvCheckLoopCb);
    assert(rv == 0);
    UvPrepareStart(uv);
    return 0;
}
/* Fire the close callback registered by uvClose(), but only once every
 * handle has been released and every in-flight operation has drained. Called
 * from each close/teardown path; it is a no-op until the very last one. */
void uvMaybeFireCloseCb(struct uv *uv)
{
    tracef("uv maybe fire close cb");

    /* Nothing to do unless a close was actually requested. */
    if (!uv->closing) {
        return;
    }

    /* Every libuv handle we own signals full closure by resetting its data
     * pointer; the transport does the same. Short-circuit order matches the
     * original one-by-one checks. */
    if (uv->transport->data != NULL || uv->timer.data != NULL ||
        uv->prepare.data != NULL || uv->check.data != NULL ||
        uv->prepare_retry.data != NULL || uv->append_retry.data != NULL ||
        uv->snapshot_put_retry.data != NULL) {
        return;
    }

    /* All queued and in-flight work must have drained as well. */
    if (!QUEUE_IS_EMPTY(&uv->append_segments) ||
        !QUEUE_IS_EMPTY(&uv->finalize_reqs) ||
        uv->finalize_work.data != NULL || uv->prepare_inflight != NULL ||
        uv->barrier != NULL || uv->snapshot_put_work.data != NULL ||
        !QUEUE_IS_EMPTY(&uv->snapshot_get_reqs) ||
        !QUEUE_IS_EMPTY(&uv->async_work_reqs) ||
        !QUEUE_IS_EMPTY(&uv->aborting)) {
        return;
    }

    assert(uv->truncate_work.data == NULL);

    if (uv->close_cb != NULL) {
        uv->close_cb(uv->io);
    }
}
/* Invoked by libuv once the tick timer handle has fully closed. */
static void uvTickTimerCloseCb(uv_handle_t *handle)
{
    struct uv *uv = handle->data;

    assert(uv->closing);
    uv->timer.data = NULL; /* Mark the handle as released. */
    uvMaybeFireCloseCb(uv);
}
/* Invoked by the network transport once it has finished shutting down. */
static void uvTransportCloseCb(struct raft_uv_transport *transport)
{
    struct uv *uv = transport->data;

    assert(uv->closing);
    uv->transport->data = NULL; /* Mark the transport as released. */
    uvMaybeFireCloseCb(uv);
}
/* Invoked by libuv once the prepare handle has fully closed. */
static void uvPrepareCloseCb(uv_handle_t *handle)
{
    struct uv *uv = handle->data;

    assert(uv->closing);
    uv->prepare.data = NULL; /* Mark the handle as released. */
    uvMaybeFireCloseCb(uv);
}
/* Invoked by libuv once the check handle has fully closed. */
static void uvCheckCloseCb(uv_handle_t *handle)
{
    struct uv *uv = handle->data;

    assert(uv->closing);
    uv->check.data = NULL; /* Mark the handle as released. */
    uvMaybeFireCloseCb(uv);
}
/* Invoked by libuv once the prepare-retry timer handle has fully closed. */
static void uvPrepareRetryCloseCb(uv_handle_t *handle)
{
    struct uv *uv = handle->data;

    assert(uv->closing);
    uv->prepare_retry.data = NULL; /* Mark the handle as released. */
    uvMaybeFireCloseCb(uv);
}
/* Implementation of raft_io->close.
 *
 * Marks the instance as closing, then asks every subsystem (send, recv,
 * append, transport, timers, snapshot) to shut down. Actual destruction is
 * deferred: each subsystem eventually calls uvMaybeFireCloseCb(), and the
 * user callback `cb` fires only once everything has drained.
 *
 * NOTE(review): the append_retry and snapshot_put_retry timers are not closed
 * here — presumably uvAppendClose()/UvSnapshotClose() take care of them, since
 * uvMaybeFireCloseCb() waits on their data pointers; confirm. */
static void uvClose(struct raft_io *io, raft_io_close_cb cb)
{
    struct uv *uv;
    uv = io->impl;
    assert(uv != NULL);
    assert(!uv->closing); /* Double-close is a programming error. */
    uv->close_cb = cb;
    uv->closing = true;
    /* Stop outbound sends, inbound receives and pending appends first. */
    UvSendClose(uv);
    UvRecvClose(uv);
    uvAppendClose(uv);
    if (uv->transport->data != NULL) {
        uv->transport->close(uv->transport, uvTransportCloseCb);
    }
    if (uv->timer.data != NULL) {
        uv_close((uv_handle_t *)&uv->timer, uvTickTimerCloseCb);
    }
    if (uv->prepare.data != NULL) {
        uv_close((uv_handle_t *)&uv->prepare, uvPrepareCloseCb);
    }
    if (uv->check.data != NULL) {
        uv_close((uv_handle_t *)&uv->check, uvCheckCloseCb);
    }
    if (uv->prepare_retry.data != NULL) {
        /* A data pointer different from `uv` means a prepare request is
         * in flight and was parked on the retry timer: free it before
         * closing the handle. */
        if (uv->prepare_retry.data != uv) {
            assert(uv->prepare_inflight == uv->prepare_retry.data);
            RaftHeapFree(uv->prepare_retry.data);
            uv->prepare_inflight = NULL;
            uv->prepare_retry.data = uv;
        }
        uv_timer_stop(&uv->prepare_retry);
        uv_close((uv_handle_t *)&uv->prepare_retry, uvPrepareRetryCloseCb);
    }
    UvSnapshotClose(uv);
    /* In case everything already drained synchronously. */
    uvMaybeFireCloseCb(uv);
}
/* Filter the given segment list to find the most recent contiguous chunk of
 * closed segments that overlaps with the given snapshot last index.
 *
 * On success the array pointed to by `segments` is shrunk in place (possibly
 * reallocated or freed entirely) so it contains only segments worth loading.
 * Returns RAFT_CORRUPT if the surviving closed segments would leave a gap
 * immediately after the snapshot. */
static int uvFilterSegments(struct uv *uv,
                            raft_index last_index,
                            const char *snapshot_filename,
                            struct uvSegmentInfo **segments,
                            size_t *n)
{
    struct uvSegmentInfo *segment;
    size_t i; /* First valid closed segment. */
    size_t j; /* Last valid closed segment. */
    /* If there are not segments at all, or only open segments, there's nothing
     * to do. (Closed segments sort before open ones, so a leading open
     * segment means there are no closed ones.) */
    if (*segments == NULL || (*segments)[0].is_open) {
        return 0;
    }
    /* Find the index of the most recent closed segment. */
    for (j = 0; j < *n; j++) {
        segment = &(*segments)[j];
        if (segment->is_open) {
            break;
        }
    }
    assert(j > 0); /* At least one closed segment exists (checked above). */
    j--;
    segment = &(*segments)[j];
    tracef("most recent closed segment is %s", segment->filename);
    /* If the end index of the last closed segment is lower than the last
     * snapshot index, there might be no entry that we can keep. We return an
     * empty segment list, unless there is at least one open segment, in that
     * case we keep everything hoping that they contain all the entries since
     * the last closed segment (TODO: we should encode the starting entry in the
     * open segment). */
    if (segment->end_index < last_index) {
        if (!(*segments)[*n - 1].is_open) {
            tracef(
                "discarding all closed segments, since most recent is behind "
                "last snapshot");
            raft_free(*segments);
            *segments = NULL;
            *n = 0;
            return 0;
        }
        tracef(
            "most recent closed segment %s is behind last snapshot, "
            "yet there are open segments",
            segment->filename);
    }
    /* Now scan the segments backwards, searching for the longest list of
     * contiguous closed segments. If the loop runs to completion, i ends at
     * 0 and every closed segment is contiguous; if it breaks, i is the first
     * segment of the contiguous tail. */
    if (j >= 1) {
        for (i = j; i > 0; i--) {
            struct uvSegmentInfo *newer;
            struct uvSegmentInfo *older;
            newer = &(*segments)[i];
            older = &(*segments)[i - 1];
            if (older->end_index != newer->first_index - 1) {
                tracef("discarding non contiguous segment %s", older->filename);
                break;
            }
        }
    } else {
        i = j;
    }
    /* Make sure that the first index of the first valid closed segment is not
     * greater than the snapshot's last index plus one (so there are no
     * missing entries). */
    segment = &(*segments)[i];
    if (segment->first_index > last_index + 1) {
        ErrMsgPrintf(uv->io->errmsg,
                     "closed segment %s is past last snapshot %s",
                     segment->filename, snapshot_filename);
        return RAFT_CORRUPT;
    }
    /* Compact the array, dropping the i discarded leading segments. */
    if (i != 0) {
        size_t new_n = *n - i;
        struct uvSegmentInfo *new_segments;
        new_segments = raft_malloc(new_n * sizeof *new_segments);
        if (new_segments == NULL) {
            return RAFT_NOMEM;
        }
        memcpy(new_segments, &(*segments)[i], new_n * sizeof *new_segments);
        raft_free(*segments);
        *segments = new_segments;
        *n = new_n;
    }
    return 0;
}
/* Load the last snapshot (if any) and all entries contained in all segment
 * files of the data directory. This function can be called recursively, `depth`
 * is there to ensure we don't get stuck in a recursive loop.
 *
 * Outputs: `snapshot` (heap-allocated, or NULL if none), `start_index` (index
 * of the first loaded entry, defaults to 1), `entries`/`n` (loaded log). On
 * failure all outputs are released/reset; if auto-recovery is enabled and the
 * failure was RAFT_CORRUPT, a single second pass is attempted. */
static int uvLoadSnapshotAndEntries(struct uv *uv,
                                    struct raft_snapshot **snapshot,
                                    raft_index *start_index,
                                    struct raft_entry *entries[],
                                    size_t *n,
                                    int depth)
{
    struct uvSnapshotInfo *snapshots;
    struct uvSegmentInfo *segments;
    size_t n_snapshots;
    size_t n_segments;
    int rv;
    *snapshot = NULL;
    *start_index = 1;
    *entries = NULL;
    *n = 0;
    /* List available snapshots and segments. */
    rv = UvList(uv, &snapshots, &n_snapshots, &segments, &n_segments,
                uv->io->errmsg);
    if (rv != 0) {
        goto err;
    }
    /* Load the most recent snapshot, if any. Snapshots are sorted oldest
     * first, so iterate from the back, skipping invalid ones. */
    if (snapshots != NULL) {
        char snapshot_filename[UV__FILENAME_LEN];
        size_t i;
        *snapshot = RaftHeapMalloc(sizeof **snapshot);
        if (*snapshot == NULL) {
            rv = RAFT_NOMEM;
            goto err;
        }
        for (i = 1; i <= n_snapshots; i++) {
            uvSnapshotFilenameOf(&snapshots[n_snapshots - i],
                                 snapshot_filename);
            rv = UvSnapshotLoad(uv, &snapshots[n_snapshots - i], *snapshot,
                                uv->io->errmsg);
            if (rv == 0) {
                break;
            }
            tracef("skip invalid snapshot %s", snapshot_filename);
        }
        if (rv != 0) {
            /* Every snapshot on disk failed to load. */
            RaftHeapFree(*snapshot);
            *snapshot = NULL;
            goto err;
        }
        tracef("most recent snapshot at %lld", (*snapshot)->index);
        RaftHeapFree(snapshots);
        snapshots = NULL;
        /* Update the start index. If there are closed segments on disk let's
         * make sure that the first index of the first closed segment is not
         * greater than the snapshot's last index plus one (so there are no
         * missing entries), and update the start index accordingly. */
        rv = uvFilterSegments(uv, (*snapshot)->index, snapshot_filename,
                              &segments, &n_segments);
        if (rv != 0) {
            goto err;
        }
        if (segments != NULL) {
            if (segments[0].is_open) {
                *start_index = (*snapshot)->index + 1;
            } else {
                *start_index = segments[0].first_index;
            }
        } else {
            *start_index = (*snapshot)->index + 1;
        }
    }
    /* Read data from segments, closing any open segments. */
    if (segments != NULL) {
        raft_index last_index;
        rv = uvSegmentLoadAll(uv, *start_index, segments, n_segments, entries,
                              n);
        if (rv != 0) {
            goto err;
        }
        /* Check if all entries that we loaded are actually behind the last
         * snapshot. This can happen if the last closed segment was behind the
         * last snapshot and there were open segments, but the entries in the
         * open segments turned out to be behind the snapshot as well. */
        last_index = *start_index + *n - 1;
        if (*snapshot != NULL && last_index < (*snapshot)->index) {
            ErrMsgPrintf(uv->io->errmsg,
                         "last entry on disk has index %llu, which is behind "
                         "last snapshot's index %llu",
                         last_index, (*snapshot)->index);
            rv = RAFT_CORRUPT;
            goto err;
        }
        raft_free(segments);
        segments = NULL;
    }
    return 0;
err:
    assert(rv != 0);
    /* Release everything allocated so far, leaving the outputs reset. */
    if (*snapshot != NULL) {
        snapshotDestroy(*snapshot);
        *snapshot = NULL;
    }
    if (snapshots != NULL) {
        raft_free(snapshots);
    }
    if (segments != NULL) {
        raft_free(segments);
    }
    if (*entries != NULL) {
        entryBatchesDestroy(*entries, *n);
        *entries = NULL;
        *n = 0;
    }
    /* Try to recover exactly once when corruption is detected, the first pass
     * might have cleaned up corrupt data. Most of the arguments are already
     * reset after the `err` label, except for `start_index`. */
    if (rv == RAFT_CORRUPT && uv->auto_recovery && depth == 0) {
        *start_index = 1;
        return uvLoadSnapshotAndEntries(uv, snapshot, start_index, entries, n,
                                        depth + 1);
    }
    return rv;
}
/* Implementation of raft_io->load.
 *
 * Returns the persisted term/vote from the metadata cache, plus the most
 * recent snapshot (if any) and all log entries found on disk. Also primes
 * append_next_index for subsequent appends. */
static int uvLoad(struct raft_io *io,
                  raft_term *term,
                  raft_id *voted_for,
                  struct raft_snapshot **snapshot,
                  raft_index *start_index,
                  struct raft_entry **entries,
                  size_t *n_entries)
{
    struct uv *uv;
    int rv;
    uv = io->impl;
    /* Term and vote come from the metadata cache populated at init time. */
    *term = uv->metadata.term;
    *voted_for = uv->metadata.voted_for;
    *snapshot = NULL;
    rv = uvLoadSnapshotAndEntries(uv, snapshot, start_index, entries, n_entries,
                                  0);
    if (rv != 0) {
        return rv;
    }
    tracef("start index %lld, %zu entries", *start_index, *n_entries);
    if (*snapshot == NULL) {
        tracef("no snapshot");
    }
    /* Set the index of the next entry that will be appended. */
    uv->append_next_index = *start_index + *n_entries;
    return 0;
}
/* Implementation of raft_io->set_term.
 *
 * Persists the new current term and clears the vote, per the raft rule that
 * a vote belongs to a single term. */
static int uvSetTerm(struct raft_io *io, const raft_term term)
{
    struct uv *uv = io->impl;

    /* Bump the version so the store alternates between the two on-disk
     * metadata files. */
    uv->metadata.version++;
    uv->metadata.term = term;
    uv->metadata.voted_for = 0;

    return uvMetadataStore(uv, &uv->metadata);
}
/* Implementation of raft_io->set_vote.
 *
 * Persists the ID of the server voted for in the current term. The version
 * bump makes the store alternate between the two on-disk metadata files. */
static int uvSetVote(struct raft_io *io, const raft_id server_id)
{
    struct uv *uv;
    int rv;
    uv = io->impl;
    uv->metadata.version++;
    uv->metadata.voted_for = server_id;
    rv = uvMetadataStore(uv, &uv->metadata);
    if (rv != 0) {
        return rv;
    }
    return 0;
}
/* Implementation of raft_io->bootstrap.
 *
 * Initializes a pristine store: persists term 1 and writes a first closed
 * segment containing a single entry with the given configuration. Fails with
 * RAFT_CANTBOOTSTRAP if any term was already persisted. */
static int uvBootstrap(struct raft_io *io,
                       const struct raft_configuration *configuration)
{
    struct uv *uv = io->impl;
    int rv;

    /* Refuse to bootstrap a server that already persisted state. */
    if (uv->metadata.term != 0) {
        ErrMsgPrintf(io->errmsg, "metadata contains term %lld",
                     uv->metadata.term);
        return RAFT_CANTBOOTSTRAP;
    }

    /* Write the term */
    rv = uvSetTerm(io, 1);
    if (rv != 0) {
        return rv;
    }

    /* Create the first closed segment file, containing just one entry. */
    return uvSegmentCreateFirstClosed(uv, configuration);
}
/* Implementation of raft_io->recover.
 *
 * Rewrites the log so that the next entry is a configuration entry with the
 * given configuration, preserving the term of the last persisted entry (or of
 * the snapshot when the log is empty). */
static int uvRecover(struct raft_io *io, const struct raft_configuration *conf)
{
    struct uv *uv = io->impl;
    struct raft_snapshot *snapshot;
    raft_index start_index;
    raft_index next_index;
    raft_term last_term = 0; /* Term of last entry. */
    struct raft_entry *entries;
    size_t n_entries;
    int rv;
    /* Load the current state. This also closes any leftover open segment. */
    rv = uvLoadSnapshotAndEntries(uv, &snapshot, &start_index, &entries,
                                  &n_entries, 0);
    if (rv != 0) {
        return rv;
    }
    /* We don't care about the actual data, just index of the last entry. */
    if (snapshot != NULL) {
        last_term = snapshot->term;
        snapshotDestroy(snapshot);
    }
    if (entries != NULL) {
        last_term = entries[n_entries - 1].term;
        entryBatchesDestroy(entries, n_entries);
    }
    assert(start_index > 0);
    /* NOTE(review): this assert fires if the store has neither a snapshot
     * nor any entries — presumably recover is only called on a previously
     * bootstrapped store; confirm with callers. */
    assert(last_term > 0);
    next_index = start_index + n_entries;
    rv =
        uvSegmentCreateClosedWithConfiguration(uv, next_index, conf, last_term);
    if (rv != 0) {
        return rv;
    }
    return 0;
}
/* Implementation of raft_io->time: report the event loop's cached
 * monotonic clock, in milliseconds. */
static raft_time uvTime(struct raft_io *io)
{
    struct uv *uv = io->impl;
    return uv_now(uv->loop);
}
/* Implementation of raft_io->random.
 *
 * Return a pseudo-random integer in the half-open interval [min, max), or
 * `min` itself when the range is empty (max <= min). This only randomizes the
 * election timeout, so the modulo bias of rand() is acceptable. */
static int uvRandom(struct raft_io *io, int min, int max)
{
    (void)io;
    /* Guard against a degenerate range: the original `% (max - min)` is
     * undefined behavior when max == min. */
    if (max <= min) {
        return min;
    }
    /* rand() is non-negative by definition, so no abs() is needed. */
    return min + (rand() % (max - min));
}
/* Seed rand() by mixing the server ID, the loop's monotonic clock and the
 * wall clock.
 *
 * Given that we use rand() only to generate the randomized election timeout
 * and not to, say, perform some secure-sensitive cryptographic task, it's
 * ok to not use more sophisticated methods to create the seed (for
 * example. getrandom()). */
static void uvSeedRand(struct uv *uv)
{
    struct timeval now = {0};
    unsigned seed;

    gettimeofday(&now, NULL);
    seed = (unsigned)uv->id;
    seed ^= (unsigned)uv_now(uv->loop);
    seed ^= (unsigned)((now.tv_sec * 1000) + (now.tv_usec / 1000));
    srand(seed);
}
/* Public constructor: set up a libuv-backed raft_io instance.
 *
 * Validates the arguments, allocates and zero-initializes the internal
 * `struct uv` state, seeds the PRNG and installs every raft_io method
 * pointer. The caller's io->data is preserved across the memset. Returns
 * RAFT_INVALID, RAFT_NAMETOOLONG or RAFT_NOMEM on failure. */
int raft_uv_init(struct raft_io *io,
                 struct uv_loop_s *loop,
                 const char *dir,
                 struct raft_uv_transport *transport)
{
    struct uv *uv;
    void *data;
    int rv;
    assert(io != NULL);
    assert(loop != NULL);
    assert(dir != NULL);
    assert(transport != NULL);
    /* Preserve the caller-owned data pointer across the wipe of *io. */
    data = io->data;
    memset(io, 0, sizeof *io);
    io->data = data;
    if (transport->version == 0) {
        ErrMsgPrintf(io->errmsg, "transport->version must be set");
        return RAFT_INVALID;
    }
    /* Ensure that the given path doesn't exceed our static buffer limit. */
    if (!UV__DIR_HAS_VALID_LEN(dir)) {
        ErrMsgPrintf(io->errmsg, "directory path too long");
        return RAFT_NAMETOOLONG;
    }
    /* Allocate the raft_io_uv object */
    uv = raft_malloc(sizeof *uv);
    if (uv == NULL) {
        rv = RAFT_NOMEM;
        goto err;
    }
    memset(uv, 0, sizeof(struct uv));
    uv->io = io;
    uv->loop = loop;
    /* Copy the directory path, guaranteeing NUL termination. */
    strncpy(uv->dir, dir, sizeof(uv->dir) - 1);
    uv->dir[sizeof(uv->dir) - 1] = '\0';
    uv->transport = transport;
    uv->transport->data = NULL;
    uv->tracer = &StderrTracer;
    uv->id = 0; /* Set by raft_io->config() */
    uv->state = UV__PRISTINE;
    uv->errored = false;
    uv->direct_io = false;
    uv->async_io = false;
    uv->segment_size = UV__MAX_SEGMENT_SIZE;
    uv->disk_retry = UV__DISK_RETRY_RATE;
    uv->block_size = 0; /* Probed later, or overridden via setter. */
    QUEUE_INIT(&uv->clients);
    QUEUE_INIT(&uv->servers);
    uv->connect_retry_delay = CONNECT_RETRY_DELAY;
    uv->prepare_inflight = NULL;
    QUEUE_INIT(&uv->prepare_reqs);
    QUEUE_INIT(&uv->prepare_pool);
    uv->prepare_next_counter = 1;
    uv->append_next_index = 1;
    QUEUE_INIT(&uv->append_segments);
    QUEUE_INIT(&uv->append_pending_reqs);
    QUEUE_INIT(&uv->append_writing_reqs);
    uv->barrier = NULL;
    QUEUE_INIT(&uv->finalize_reqs);
    uv->finalize_work.data = NULL;
    uv->truncate_work.data = NULL;
    QUEUE_INIT(&uv->snapshot_get_reqs);
    QUEUE_INIT(&uv->async_work_reqs);
    uv->snapshot_put_work.data = NULL;
    uv->timer.data = NULL;
    uv->tick_cb = NULL; /* Set by raft_io->start() */
    uv->recv_cb = NULL; /* Set by raft_io->start() */
    QUEUE_INIT(&uv->aborting);
    uv->closing = false;
    uv->close_cb = NULL;
    uv->auto_recovery = true;
    uvSeedRand(uv);
    /* Set the raft_io implementation. */
    io->version = 2; /* future-proof'ing */
    io->capacity = 0;
    io->impl = uv;
    io->init = uvInit;
    io->close = uvClose;
    io->start = uvStart;
    io->load = uvLoad;
    io->bootstrap = uvBootstrap;
    io->recover = uvRecover;
    io->set_term = uvSetTerm;
    io->set_vote = uvSetVote;
    io->append = UvAppend;
    io->truncate = UvTruncate;
    io->send = UvSend;
    io->snapshot_put = UvSnapshotPut;
    io->snapshot_get = UvSnapshotGet;
    io->time = uvTime;
    io->random = uvRandom;
    return 0;
err:
    assert(rv != 0);
    if (rv == RAFT_NOMEM) {
        ErrMsgOom(io->errmsg);
    }
    return rv;
}
/* Public destructor: release the internal state allocated by raft_uv_init().
 * Must only be called after the raft_io->close() callback has fired. */
void raft_uv_close(struct raft_io *io)
{
    struct uv *uv = io->impl;
    io->impl = NULL;
    raft_free(uv);
}
/* Override the initial size used for newly prepared open segment files. */
void raft_uv_set_segment_size(struct raft_io *io, size_t size)
{
    struct uv *uv = io->impl;
    uv->segment_size = size;
}
/* Override the retry rate, in milliseconds, for failed disk operations. */
void raft_uv_set_disk_retry(struct raft_io *io, unsigned msecs)
{
    struct uv *uv = io->impl;
    uv->disk_retry = msecs;
}
/* Override the block size used for direct I/O against the data directory. */
void raft_uv_set_block_size(struct raft_io *io, size_t size)
{
    struct uv *uv = io->impl;
    uv->block_size = size;
}
/* Accept (and ignore) a snapshot compression preference.
 *
 * NOTE(review): this is a stub — both arguments are discarded and 0 is always
 * returned, so callers succeed but the setting has no effect here. Confirm
 * whether compression support was intentionally compiled out. */
int raft_uv_set_snapshot_compression(struct raft_io *io, bool compressed)
{
    (void)io;
    (void)compressed;
    return 0;
}
/* Override the delay, in milliseconds, between client connection attempts. */
void raft_uv_set_connect_retry_delay(struct raft_io *io, unsigned msecs)
{
    struct uv *uv = io->impl;
    uv->connect_retry_delay = msecs;
}
/* Install a custom tracer, replacing the default stderr tracer. */
void raft_uv_set_tracer(struct raft_io *io, struct raft_tracer *tracer)
{
    struct uv *uv = io->impl;
    uv->tracer = tracer;
}
/* Enable or disable the one-shot recovery retry performed when loading the
 * log detects corruption. */
void raft_uv_set_auto_recovery(struct raft_io *io, bool flag)
{
    struct uv *uv = io->impl;
    uv->auto_recovery = flag;
}
#undef tracef
raft-0.22.1/src/uv.h 0000664 0000000 0000000 00000043053 14601504142 0014124 0 ustar 00root root 0000000 0000000 /* Implementation of the @raft_io interface based on libuv. */
#ifndef UV_H_
#define UV_H_
#include "../include/raft.h"
#include "err.h"
#include "queue.h"
#include "tracing.h"
#include "uv_fs.h"
#include "uv_os.h"
/* 8 Megabytes */
#define UV__MAX_SEGMENT_SIZE (8 * 1024 * 1024)
/* Template string for closed segment filenames: start index (inclusive), end
* index (inclusive). */
#define UV__CLOSED_TEMPLATE "%016llu-%016llu"
/* Template string for open segment filenames: incrementing counter. */
#define UV__OPEN_TEMPLATE "open-%llu"
/* Enough to hold a segment filename (either open or closed) */
#define UV__SEGMENT_FILENAME_BUF_SIZE 34
/* Retry failed disk operations every 5 seconds by default. Parenthesized so
 * the macro behaves correctly inside larger expressions (e.g. division). */
#define UV__DISK_RETRY_RATE (1000 * 5)
/* Template string for snapshot filenames: snapshot term, snapshot index,
* creation timestamp (milliseconds since epoch). */
#define UV__SNAPSHOT_TEMPLATE "snapshot-%llu-%llu-%llu"
#define UV__SNAPSHOT_META_SUFFIX ".meta"
/* Template string for snapshot metadata filenames: snapshot term, snapshot
* index, creation timestamp (milliseconds since epoch). */
#define UV__SNAPSHOT_META_TEMPLATE \
UV__SNAPSHOT_TEMPLATE UV__SNAPSHOT_META_SUFFIX
/* State codes. */
enum {
UV__PRISTINE, /* Metadata cache populated and I/O capabilities probed */
UV__ACTIVE,
UV__CLOSED
};
/* Open segment counter type */
typedef unsigned long long uvCounter;
/* Information persisted in a single metadata file. */
struct uvMetadata
{
unsigned long long version; /* Monotonically increasing version */
raft_term term; /* Current term */
raft_id voted_for; /* Server ID of last vote, or 0 */
};
/* Hold state of a libuv-based raft_io implementation. */
struct uv
{
struct raft_io *io; /* I/O object we're implementing */
struct uv_loop_s *loop; /* UV event loop */
char dir[UV__DIR_LEN]; /* Data directory */
struct raft_uv_transport *transport; /* Network transport */
struct raft_tracer *tracer; /* Debug tracing */
raft_id id; /* Server ID */
int state; /* Current state */
bool errored; /* If a disk I/O error was hit */
bool direct_io; /* Whether direct I/O is supported */
bool async_io; /* Whether async I/O is supported */
size_t segment_size; /* Initial size of open segments. */
unsigned disk_retry; /* Disk operations retry rate */
size_t block_size; /* Block size of the data dir */
queue clients; /* Outbound connections */
queue servers; /* Inbound connections */
unsigned connect_retry_delay; /* Client connection retry delay */
void *prepare_inflight; /* Segment being prepared */
queue prepare_reqs; /* Pending prepare requests. */
queue prepare_pool; /* Prepared open segments */
struct uv_timer_s prepare_retry; /* Timer for prepare retries */
uvCounter prepare_next_counter; /* Counter of next open segment */
raft_index append_next_index; /* Index of next entry to append */
queue append_segments; /* Open segments in use. */
queue append_pending_reqs; /* Pending append requests. */
queue append_writing_reqs; /* Append requests in flight */
struct uv_timer_s append_retry; /* Timer for append retries */
struct UvBarrier *barrier; /* Inflight barrier request */
queue finalize_reqs; /* Segments waiting to be closed */
struct uv_work_s finalize_work; /* Resize and rename segments */
struct uv_work_s truncate_work; /* Execute truncate log requests */
queue snapshot_get_reqs; /* Inflight get snapshot requests */
queue async_work_reqs; /* Inflight async work requests */
struct uv_work_s snapshot_put_work; /* Execute snapshot put requests */
struct uv_timer_s snapshot_put_retry; /* Timer for snapshot put retries */
struct uvMetadata metadata; /* Cache of metadata on disk */
struct uv_timer_s timer; /* Timer for periodic ticks */
raft_io_tick_cb tick_cb; /* Invoked when the timer expires */
raft_io_recv_cb recv_cb; /* Invoked when upon RPC messages */
queue aborting; /* Cleanups upon errors or shutdown */
bool closing; /* True if we are closing */
raft_io_close_cb close_cb; /* Invoked when finishing closing */
bool auto_recovery; /* Try to recover from corrupt segments */
struct uv_prepare_s prepare;
struct uv_check_s check;
};
/* Implementation of raft_io->truncate. */
int UvTruncate(struct raft_io *io, raft_index index);
/* Load Raft metadata from disk, choosing the most recent version (either the
* metadata1 or metadata2 file). */
int uvMetadataLoad(const char *dir, struct uvMetadata *metadata, char *errmsg);
/* Store the given metadata to disk, writing the appropriate metadata file
* according to the metadata version (if the version is odd, write metadata1,
* otherwise write metadata2). */
int uvMetadataStore(struct uv *uv, const struct uvMetadata *metadata);
/* Metadata about a segment file. */
struct uvSegmentInfo
{
bool is_open; /* Whether the segment is open */
union {
struct
{
raft_index first_index; /* First index in a closed segment */
raft_index end_index; /* Last index in a closed segment */
};
struct
{
unsigned long long counter; /* Open segment counter */
};
};
char filename[UV__SEGMENT_FILENAME_BUF_SIZE]; /* Segment filename */
};
/* Append a new item to the given segment info list if the given filename
* matches either the one of a closed segment (xxx-yyy) or the one of an open
* segment (open-xxx). */
int uvSegmentInfoAppendIfMatch(const char *filename,
struct uvSegmentInfo *infos[],
size_t *n_infos,
bool *appended);
/* Sort the given list of segments by comparing their filenames. Closed segments
* come before open segments. */
void uvSegmentSort(struct uvSegmentInfo *infos, size_t n_infos);
/* Keep only the closed segments whose entries are within the given trailing
* amount past the given snapshot last index. If the given trailing amount is 0,
* unconditionally delete all closed segments. */
int uvSegmentKeepTrailing(struct uv *uv,
struct uvSegmentInfo *segments,
size_t n,
raft_index last_index,
size_t trailing,
char *errmsg);
/* Load all entries contained in the given closed segment. */
int uvSegmentLoadClosed(struct uv *uv,
struct uvSegmentInfo *segment,
struct raft_entry *entries[],
size_t *n);
/* Load raft entries from the given segments. The @start_index is the expected
* index of the first entry of the first segment. */
int uvSegmentLoadAll(struct uv *uv,
const raft_index start_index,
struct uvSegmentInfo *segments,
size_t n_segments,
struct raft_entry **entries,
size_t *n_entries);
/* Return the number of blocks in a segment. The parameter is parenthesized so
 * the macro also works with non-trivial arguments (e.g. uvSegmentBlocks(&s)). */
#define uvSegmentBlocks(UV) ((UV)->segment_size / (UV)->block_size)
/* A dynamically allocated buffer holding data to be written into a segment
* file.
*
* The memory is aligned at disk block boundary, to allow for direct I/O. */
struct uvSegmentBuffer
{
size_t block_size; /* Disk block size for direct I/O */
uv_buf_t arena; /* Previously allocated memory that can be re-used */
size_t n; /* Write offset */
};
/* Initialize an empty buffer. */
void uvSegmentBufferInit(struct uvSegmentBuffer *b, size_t block_size);
/* Release all memory used by the buffer. */
void uvSegmentBufferClose(struct uvSegmentBuffer *b);
/* Encode the format version at the very beginning of the buffer. This function
* must be called when the buffer is empty. */
int uvSegmentBufferFormat(struct uvSegmentBuffer *b);
/* Extend the segment's buffer by encoding the given entries.
*
* Previous data in the buffer will be retained, and data for these new entries
* will be appended. */
int uvSegmentBufferAppend(struct uvSegmentBuffer *b,
const struct raft_entry entries[],
unsigned n_entries);
/* After all entries to write have been encoded, finalize the buffer by zeroing
* the unused memory of the last block. The out parameter will point to the
* memory to write. */
void uvSegmentBufferFinalize(struct uvSegmentBuffer *b, uv_buf_t *out);
/* Reset the buffer preparing it for the next segment write.
*
* If the retain parameter is greater than zero, then the data of the retain'th
* block will be copied at the beginning of the buffer and the write offset will
* be set accordingly. */
void uvSegmentBufferReset(struct uvSegmentBuffer *b, unsigned retain);
/* Write a closed segment, containing just one entry at the given index
* for the given configuration. */
int uvSegmentCreateClosedWithConfiguration(
struct uv *uv,
raft_index index,
const struct raft_configuration *conf,
raft_term conf_term);
/* Write the first closed segment, containing just one entry for the given
* configuration. */
int uvSegmentCreateFirstClosed(struct uv *uv,
const struct raft_configuration *configuration);
/* Truncate a segment that was already closed. */
int uvSegmentTruncate(struct uv *uv,
struct uvSegmentInfo *segment,
raft_index index);
/* Info about a persisted snapshot stored in snapshot metadata file. */
struct uvSnapshotInfo
{
raft_term term;
raft_index index;
unsigned long long timestamp;
char filename[UV__FILENAME_LEN];
};
/* Render the filename of the data file of a snapshot */
void uvSnapshotFilenameOf(struct uvSnapshotInfo *info, char *filename);
/* Upon success `orphan` will be true if filename is a snapshot file without a
* sibling .meta file */
int UvSnapshotIsOrphan(const char *dir, const char *filename, bool *orphan);
/* Upon success `orphan` will be true if filename is a snapshot .meta file
* without a sibling snapshot file */
int UvSnapshotMetaIsOrphan(const char *dir, const char *filename, bool *orphan);
/* Append a new item to the given snapshot info list if the given filename
* matches the pattern of a snapshot metadata file (snapshot-xxx-yyy-zzz.meta)
* and there is actually a matching non-empty snapshot file on disk. */
int UvSnapshotInfoAppendIfMatch(struct uv *uv,
const char *filename,
struct uvSnapshotInfo *infos[],
size_t *n_infos,
bool *appended);
/* Sort the given list of snapshots by comparing their filenames. Older
* snapshots will come first. */
void UvSnapshotSort(struct uvSnapshotInfo *infos, size_t n_infos);
/* Load the snapshot associated with the given metadata. */
int UvSnapshotLoad(struct uv *uv,
struct uvSnapshotInfo *meta,
struct raft_snapshot *snapshot,
char *errmsg);
/* Implementation raft_io->snapshot_put (defined in uv_snapshot.c). */
int UvSnapshotPut(struct raft_io *io,
unsigned trailing,
struct raft_io_snapshot_put *req,
const struct raft_snapshot *snapshot,
raft_io_snapshot_put_cb cb);
/* Implementation of raft_io->snapshot_get (defined in uv_snapshot.c). */
int UvSnapshotGet(struct raft_io *io,
struct raft_io_snapshot_get *req,
raft_io_snapshot_get_cb cb);
/* Cancel any pending snapshot operation. */
void UvSnapshotClose(struct uv *uv);
/* Return a list of all snapshots and segments found in the data directory. Both
* snapshots and segments are ordered by filename (closed segments come before
* open ones). */
int UvList(struct uv *uv,
struct uvSnapshotInfo *snapshots[],
size_t *n_snapshots,
struct uvSegmentInfo *segments[],
size_t *n_segments,
char *errmsg);
/* Request to obtain a newly prepared open segment. */
struct uvPrepare;
typedef void (*uvPrepareCb)(struct uvPrepare *req, int status);
struct uvPrepare
{
void *data; /* User data */
uv_file fd; /* Resulting segment file descriptor */
unsigned long long counter; /* Resulting segment counter */
uvPrepareCb cb; /* Completion callback */
queue queue; /* Links in uv_io->prepare_reqs */
};
/* Get a prepared open segment ready for writing. If a prepared open segment is
* already available in the pool, it will be returned immediately using the fd
* and counter pointers and the request callback won't be invoked. Otherwise the
* request will be queued and its callback invoked once a newly prepared segment
* is available. */
int UvPrepare(struct uv *uv,
uv_file *fd,
uvCounter *counter,
struct uvPrepare *req,
uvPrepareCb cb);
/* Try to allocate open segments at startup. */
void UvPrepareStart(struct uv *uv);
/* Return the number of ready prepared open segments in the pool. */
unsigned UvPrepareCount(struct uv *uv);
/* Cancel all pending prepare requests and start removing all unused prepared
* open segments. If a segment currently being created, wait for it to complete
* and then remove it immediately. */
void UvPrepareClose(struct uv *uv);
/* Implementation of raft_io->append. All the raft_buffers of the raft_entry
* structs in the entries array are required to have a len that is a multiple
* of 8. */
int UvAppend(struct raft_io *io,
struct raft_io_append *req,
const struct raft_entry entries[],
unsigned n,
raft_io_append_cb cb);
/* Return the remaining capacity of segments currently being written. */
size_t UvAppendCapacity(struct uv *uv);
/* Pause request object and callback. */
struct UvBarrierReq;
/* A barrier cb that plans to perform work on the threadpool MUST exit early
* and cleanup resources when it detects uv->closing, this is to allow forced
* closing on shutdown. */
typedef void (*UvBarrierCb)(struct UvBarrierReq *req);
struct UvBarrierReq
{
bool blocking; /* Whether this barrier should block future writes */
void *data; /* User data */
UvBarrierCb cb; /* Completion callback */
queue queue; /* Queue of reqs triggered by a UvBarrier */
};
struct UvBarrier
{
bool blocking; /* Whether this barrier should block future writes */
queue reqs; /* Queue of UvBarrierReq */
};
/* Submit a barrier request to interrupt the normal flow of append
* operations.
*
* The following will happen:
*
* - Replace uv->append_next_index with the given next_index, so the next entry
* that will be appended will have the new index.
*
* - Execution of new writes for subsequent append requests will be blocked
* until UvUnblock is called when the barrier is blocking.
*
* - Wait for all currently pending and inflight append requests against all
* open segments to complete, and for those open segments to be finalized,
* then invoke the barrier callback.
*
* This API is used to implement truncate and snapshot install operations, which
* need to wait until all pending writes have settled and modify the log state,
* changing the next index. */
int UvBarrier(struct uv *uv, raft_index next_index, struct UvBarrierReq *req);
/* Trigger a callback for a barrier request in this @barrier. Returns true if a
* callback was triggered, false if there are no more requests to trigger.
* A barrier callback will call UvUnblock, which in turn will try to run the
* next callback, if any, from a barrier request in this barrier. */
bool UvBarrierMaybeTrigger(struct UvBarrier *barrier);
/* Add a Barrier @req to an existing @barrier. */
void UvBarrierAddReq(struct UvBarrier *barrier, struct UvBarrierReq *req);
/* Returns @true if there are no more segments referencing uv->barrier */
bool UvBarrierReady(struct uv *uv);
/* Resume writing append requests after UvBarrier has been called. */
void UvUnblock(struct uv *uv);
/* Cancel all pending write requests and request the current segment to be
* finalized. Must be invoked at closing time. */
void uvAppendClose(struct uv *uv);
/* Submit a request to finalize the open segment with the given counter.
*
* Requests are processed one at a time, to avoid ending up closing open segment
* N + 1 before closing open segment N. */
int UvFinalize(struct uv *uv,
unsigned long long counter,
size_t used,
raft_index first_index,
raft_index last_index);
/* Implementation of raft_io->send. */
int UvSend(struct raft_io *io,
struct raft_io_send *req,
const struct raft_message *message,
raft_io_send_cb cb);
/* Stop all clients by closing the outbound stream handles and canceling all
* pending send requests. */
void UvSendClose(struct uv *uv);
/* Start receiving messages from new incoming connections. */
int UvRecvStart(struct uv *uv);
/* Stop all servers by closing the inbound stream handles and aborting all
* requests being received. */
void UvRecvClose(struct uv *uv);
void uvMaybeFireCloseCb(struct uv *uv);
#endif /* UV_H_ */
raft-0.22.1/src/uv_append.c 0000664 0000000 0000000 00000103372 14601504142 0015447 0 ustar 00root root 0000000 0000000 #include "assert.h"
#include "byte.h"
#include "heap.h"
#include "queue.h"
#include "uv.h"
#include "uv_encoding.h"
#include "uv_writer.h"
#define tracef(...) Tracef(uv->tracer, __VA_ARGS__)
/* The happy path for an append request is:
*
 * - If there is a current segment and it has enough spare capacity to hold
* the entries in the request, then queue the request, linking it to the
* current segment.
*
* - If there is no current segment, or it hasn't enough spare capacity to hold
* the entries in the request, then request a new open segment to be prepared,
* queue the request and link it to the newly requested segment.
*
* - Wait for any pending write against the current segment to complete, and
* also for the prepare request if we asked for a new segment. Also wait for
* any in progress barrier to be removed.
*
* - Submit a write request for the entries in this append request. The write
* request might contain other append requests targeted to the current segment
* that might have accumulated in the meantime, if we have been waiting for a
* segment to be prepared, or for the previous write to complete or for a
* barrier to be removed.
*
* - Wait for the write request to finish and fire the append request's
* callback.
*
* Possible failure modes are:
*
* - The request to prepare a new segment fails.
* - The write request fails.
* - The request to finalize a new segment fails to be submitted.
*
* In all these cases we mark the instance as errored and fire the relevant
* callbacks.
**/
/* An open segment being written or waiting to be written.
 *
 * Segments live in the uv->append_segments queue; the head of the queue is the
 * segment currently being written. */
struct uvAliveSegment
{
    struct uv *uv;                  /* Our writer */
    struct uvPrepare prepare;       /* Prepare segment file request */
    struct UvWriter writer;         /* Writer to perform async I/O */
    struct UvWriterReq write;       /* Write request */
    unsigned long long counter;     /* Open segment counter (0 until the
                                       prepare request has completed) */
    raft_index first_index;         /* Index of the first entry written */
    raft_index pending_last_index;  /* Index of the last entry submitted for
                                       writing (not necessarily durable yet) */
    size_t size;                    /* Total number of bytes used */
    unsigned next_block;            /* Next segment block to write */
    struct uvSegmentBuffer pending; /* Buffer for data yet to be written */
    uv_buf_t buf;                   /* Write buffer for current write */
    raft_index last_index;          /* Last entry actually written */
    size_t written;                 /* Number of bytes actually written */
    queue queue;                    /* Segment queue */
    struct UvBarrier *barrier;      /* Barrier waiting on this segment */
    bool finalize;                  /* Finalize the segment after writing */
};
/* A single in-flight append request submitted through UvAppend. */
struct uvAppend
{
    struct raft_io_append *req;       /* User request */
    const struct raft_entry *entries; /* Entries to write */
    unsigned n;                       /* Number of entries */
    struct uvAliveSegment *segment;   /* Segment to write to */
    queue queue;                      /* Link in uv->append_pending_reqs or
                                         uv->append_writing_reqs */
};
/* Invoked after the writer of a finalized segment has been closed: release the
 * segment's pending buffer and the segment object itself. */
static void uvAliveSegmentWriterCloseCb(struct UvWriter *writer)
{
    struct uvAliveSegment *segment = writer->data;
    uvSegmentBufferClose(&segment->pending);
    RaftHeapFree(segment);
}
/* Submit a request to close the current open segment, then remove the segment
 * from the append_segments queue and close its writer. The segment memory is
 * released in the writer's close callback. */
static void uvAliveSegmentFinalize(struct uvAliveSegment *s)
{
    struct uv *uv = s->uv;
    int rv;
    rv = UvFinalize(uv, s->counter, s->written, s->first_index, s->last_index);
    if (rv != 0) {
        uv->errored = true;
        /* We failed to submit the finalize request, but let's still close the
         * file handle and release the segment memory. */
    }
    QUEUE_REMOVE(&s->queue);
    UvWriterClose(&s->writer, uvAliveSegmentWriterCloseCb);
}
/* Flush the append requests in the given queue, firing their callbacks with the
 * given status.
 *
 * The flush happens in two phases: first all requests are detached from @q into
 * a local queue (rolling back the append next index on failure), and only then
 * are the callbacks fired. Presumably this is so that callbacks, which may
 * submit new requests onto @q, don't interfere with the queue being flushed --
 * TODO confirm. */
static void uvAppendFinishRequestsInQueue(struct uv *uv, queue *q, int status)
{
    queue queue_copy;
    struct uvAppend *append;
    QUEUE_INIT(&queue_copy);
    while (!QUEUE_IS_EMPTY(q)) {
        queue *head;
        head = QUEUE_HEAD(q);
        append = QUEUE_DATA(head, struct uvAppend, queue);
        /* Rollback the append next index if the result was unsuccessful. */
        if (status != 0) {
            tracef("rollback uv->append_next_index was:%llu",
                   uv->append_next_index);
            uv->append_next_index -= append->n;
            tracef("rollback uv->append_next_index now:%llu",
                   uv->append_next_index);
        }
        QUEUE_REMOVE(head);
        QUEUE_PUSH(&queue_copy, head);
    }
    while (!QUEUE_IS_EMPTY(&queue_copy)) {
        queue *head;
        struct raft_io_append *req;
        head = QUEUE_HEAD(&queue_copy);
        append = QUEUE_DATA(head, struct uvAppend, queue);
        QUEUE_REMOVE(head);
        req = append->req;
        /* Free the request object before invoking the callback, since the
         * callback might submit new requests. */
        RaftHeapFree(append);
        req->cb(req, status);
    }
}
/* Flush the append requests in the writing queue (requests whose write has been
 * submitted to the kernel), firing their callbacks with the given status. */
static void uvAppendFinishWritingRequests(struct uv *uv, int status)
{
    uvAppendFinishRequestsInQueue(uv, &uv->append_writing_reqs, status);
}
/* Flush the append requests in the pending queue (requests not yet submitted
 * for writing), firing their callbacks with the given status. */
static void uvAppendFinishPendingRequests(struct uv *uv, int status)
{
    uvAppendFinishRequestsInQueue(uv, &uv->append_pending_reqs, status);
}
/* Return the segment currently being written, i.e. the head of the
 * append_segments queue, or NULL when no segment has been written yet. */
static struct uvAliveSegment *uvGetCurrentAliveSegment(struct uv *uv)
{
    struct uvAliveSegment *current = NULL;
    if (!QUEUE_IS_EMPTY(&uv->append_segments)) {
        queue *first = QUEUE_HEAD(&uv->append_segments);
        current = QUEUE_DATA(first, struct uvAliveSegment, queue);
    }
    return current;
}
/* Extend the segment's write buffer by encoding the entries in the given
 * request into it. IOW, previous data in the write buffer will be retained, and
 * data for these new entries will be appended.
 *
 * On success, advances the segment's pending_last_index by the number of
 * encoded entries. Returns 0 on success, or a non-zero error code if encoding
 * failed. */
static int uvAliveSegmentEncodeEntriesToWriteBuf(struct uvAliveSegment *segment,
                                                 struct uvAppend *append)
{
    int rv;
    assert(append->segment == segment);
    /* If this is the very first write to the segment, we need to include the
     * format version */
    if (segment->pending.n == 0 && segment->next_block == 0) {
        rv = uvSegmentBufferFormat(&segment->pending);
        if (rv != 0) {
            return rv;
        }
    }
    rv = uvSegmentBufferAppend(&segment->pending, append->entries, append->n);
    if (rv != 0) {
        return rv;
    }
    segment->pending_last_index += append->n;
    return 0;
}
static void uvAliveSegmentWriteCb(struct UvWriterReq *write, const int status);
/* Timer callback that retries a previously failed segment write. The segment
 * to retry is stashed in the append_retry timer's data field (set by
 * uvAliveSegmentWriteCb when it schedules the retry). */
static void uvAppendRetryTimerCb(uv_timer_t *timer)
{
    struct uvAliveSegment *s = timer->data;
    struct uv *uv = s->uv;
    int rv;
    /* Point the timer data back at the uv object itself, signaling that no
     * retry is currently pending (uvAppendClose relies on this to tell the
     * two states apart). */
    uv->append_retry.data = uv;
    rv = UvWriterSubmit(&s->writer, &s->write, &s->buf, 1,
                        s->next_block * s->uv->block_size,
                        uvAliveSegmentWriteCb);
    if (rv != 0) {
        /* Submission failed again: stash the segment and re-arm the append
         * retry timer. It must be append_retry that is (re)started here, not
         * prepare_retry: the segment pointer is stored in append_retry.data,
         * and uvAppendClose only stops/closes append_retry. */
        uv->append_retry.data = s;
        rv = uv_timer_start(&uv->append_retry, uvAppendRetryTimerCb,
                            uv->disk_retry, 0);
        assert(rv == 0);
    }
}
static int uvAppendMaybeStart(struct uv *uv);
static void uvAliveSegmentWriteCb(struct UvWriterReq *write, const int status)
{
struct uvAliveSegment *s = write->data;
struct uv *uv = s->uv;
unsigned n_blocks;
int rv;
assert(uv->state != UV__CLOSED);
assert(s->buf.len % uv->block_size == 0);
assert(s->buf.len >= uv->block_size);
/* If the write was unsuccessful, retry it after a delay. */
if (status != 0) {
Tracef(uv->tracer, "retry failed write (%s)", uv->io->errmsg);
uv->append_retry.data = s;
rv = uv_timer_start(&uv->prepare_retry, uvAppendRetryTimerCb,
uv->disk_retry, 0);
assert(rv == 0);
return;
}
s->written = s->next_block * uv->block_size + s->pending.n;
s->last_index = s->pending_last_index;
/* Update our write markers.
*
* We have four cases:
*
* - The data fit completely in the leftover space of the first block that
* we wrote and there is more space left. In this case we just keep the
* scheduled marker unchanged.
*
* - The data fit completely in the leftover space of the first block that
* we wrote and there is no space left. In this case we advance the
* current block counter, reset the first write block and set the
* scheduled marker to 0.
*
* - The data did not fit completely in the leftover space of the first
* block that we wrote, so we wrote more than one block. The last block
* that we wrote was not filled completely and has leftover space. In this
* case we advance the current block counter and copy the memory used for
* the last block to the head of the write arena list, updating the
* scheduled marker accordingly.
*
* - The data did not fit completely in the leftover space of the first
* block that we wrote, so we wrote more than one block. The last block
* that we wrote was filled exactly and has no leftover space. In this
* case we advance the current block counter, reset the first buffer and
* set the scheduled marker to 0.
*/
n_blocks =
(unsigned)(s->buf.len / uv->block_size); /* Number of blocks written. */
if (s->pending.n < uv->block_size) {
/* Nothing to do */
assert(n_blocks == 1);
} else if (s->pending.n == uv->block_size) {
assert(n_blocks == 1);
s->next_block++;
uvSegmentBufferReset(&s->pending, 0);
} else {
assert(s->pending.n > uv->block_size);
assert(s->buf.len > uv->block_size);
if (s->pending.n % uv->block_size > 0) {
s->next_block += n_blocks - 1;
uvSegmentBufferReset(&s->pending, n_blocks - 1);
} else {
s->next_block += n_blocks;
uvSegmentBufferReset(&s->pending, 0);
}
}
/* Fire the callbacks of all requests that were fulfilled with this
* write. */
uvAppendFinishWritingRequests(uv, status);
/* During the closing sequence we should have already canceled all pending
* request. */
if (uv->closing) {
assert(QUEUE_IS_EMPTY(&uv->append_pending_reqs));
assert(s->finalize);
uvAliveSegmentFinalize(s);
return;
}
/* Possibly process waiting requests. */
if (!QUEUE_IS_EMPTY(&uv->append_pending_reqs)) {
rv = uvAppendMaybeStart(uv);
if (rv != 0) {
uv->errored = true;
}
} else if (s->finalize && (s->pending_last_index == s->last_index) &&
!s->writer.closing) {
/* If there are no more append_pending_reqs or write requests in flight,
* this segment must be finalized here in case we don't receive
* AppendEntries RPCs anymore (could happen during a Snapshot install,
* causing the BarrierCb to never fire), but check that the callbacks
* that fired after completion of this write didn't already close the
* segment. */
uvAliveSegmentFinalize(s);
}
}
/* Submit a file write request to append the entries encoded in the write buffer
* of the given segment. */
static int uvAliveSegmentWrite(struct uvAliveSegment *s)
{
int rv;
assert(s->counter != 0);
assert(s->pending.n > 0);
uvSegmentBufferFinalize(&s->pending, &s->buf);
rv = UvWriterSubmit(&s->writer, &s->write, &s->buf, 1,
s->next_block * s->uv->block_size,
uvAliveSegmentWriteCb);
if (rv != 0) {
return rv;
}
return 0;
}
/* Start writing all pending append requests for the current segment, unless we
 * are already writing, or the segment itself has not yet been prepared or we
 * are blocked on a barrier. If there are no more requests targeted at the
 * current segment, make sure it's marked to be finalized and try with the next
 * segment.
 *
 * Returns 0 on success (including the "nothing to do yet" cases) or a non-zero
 * error code if encoding or submitting the write failed. */
static int uvAppendMaybeStart(struct uv *uv)
{
    struct uvAliveSegment *segment;
    struct uvAppend *append;
    unsigned n_reqs;
    queue *head;
    queue q;
    int rv;
    assert(!uv->closing);
    assert(!QUEUE_IS_EMPTY(&uv->append_pending_reqs));
    /* If we are already writing, let's wait. */
    if (!QUEUE_IS_EMPTY(&uv->append_writing_reqs)) {
        return 0;
    }
start:
    segment = uvGetCurrentAliveSegment(uv);
    assert(segment != NULL);
    /* If the preparer isn't done yet, let's wait. */
    if (segment->counter == 0) {
        return 0;
    }
    /* If there's a blocking barrier in progress, and it's not waiting for this
     * segment to be finalized, let's wait.
     *
     * FIXME shouldn't we wait even if segment->barrier == uv->barrier, if there
     * are other open segments associated with the same barrier? */
    if (uv->barrier != NULL && segment->barrier != uv->barrier &&
        uv->barrier->blocking) {
        return 0;
    }
    /* If there's no barrier in progress and this segment is marked with a
     * barrier, it means that this was a pending barrier, which we can become
     * the current barrier now. */
    if (uv->barrier == NULL && segment->barrier != NULL) {
        uv->barrier = segment->barrier;
    }
    /* Let's add to the segment's write buffer all pending requests targeted to
     * this segment. */
    QUEUE_INIT(&q);
    n_reqs = 0;
    while (!QUEUE_IS_EMPTY(&uv->append_pending_reqs)) {
        head = QUEUE_HEAD(&uv->append_pending_reqs);
        append = QUEUE_DATA(head, struct uvAppend, queue);
        assert(append->segment != NULL);
        if (append->segment != segment) {
            break; /* Not targeted to this segment */
        }
        QUEUE_REMOVE(head);
        QUEUE_PUSH(&q, head);
        n_reqs++;
        rv = uvAliveSegmentEncodeEntriesToWriteBuf(segment, append);
        if (rv != 0) {
            goto err;
        }
    }
    /* If we have no more requests for this segment, let's check if it has been
     * marked for closing, and in that case finalize it and possibly trigger a
     * write against the next segment (unless there is a truncate request, in
     * that case we need to wait for it). Otherwise it must mean we have
     * exhausted the queue of pending append requests. */
    if (n_reqs == 0) {
        assert(QUEUE_IS_EMPTY(&uv->append_writing_reqs));
        if (segment->finalize) {
            uvAliveSegmentFinalize(segment);
            if (!QUEUE_IS_EMPTY(&uv->append_pending_reqs)) {
                goto start;
            }
        }
        assert(QUEUE_IS_EMPTY(&uv->append_pending_reqs));
        return 0;
    }
    /* Move the batched requests to the writing queue and submit the write. */
    while (!QUEUE_IS_EMPTY(&q)) {
        head = QUEUE_HEAD(&q);
        QUEUE_REMOVE(head);
        QUEUE_PUSH(&uv->append_writing_reqs, head);
    }
    rv = uvAliveSegmentWrite(segment);
    if (rv != 0) {
        goto err;
    }
    return 0;
err:
    /* NOTE(review): requests already moved to the local queue q are neither
     * re-queued nor completed here; the caller marks the instance as errored.
     * Confirm that they are flushed/released on the error path. */
    assert(rv != 0);
    return rv;
}
/* Invoked when a newly added open segment becomes ready for writing, after the
 * associated UvPrepare request completes (either synchronously or
 * asynchronously).
 *
 * Initializes the segment's async writer on the prepared file descriptor and
 * records the segment counter (which also marks the segment as prepared, see
 * the counter == 0 checks elsewhere). Returns 0 on success. */
static int uvAliveSegmentReady(struct uv *uv,
                               uv_file fd,
                               uvCounter counter,
                               struct uvAliveSegment *segment)
{
    int rv;
    rv = UvWriterInit(&segment->writer, uv->loop, fd, uv->direct_io,
                      uv->async_io, 1, uv->io->errmsg);
    if (rv != 0) {
        ErrMsgWrapf(uv->io->errmsg, "setup writer for open-%llu", counter);
        return rv;
    }
    UvWriterSetTracer(&segment->writer, uv->tracer);
    segment->counter = counter;
    return 0;
}
/* Completion callback for the UvPrepare request of a new open segment. On
 * success the segment becomes ready for writing and any pending append
 * requests are started; on failure the segment is discarded and all pending
 * requests are completed with an error. */
static void uvAliveSegmentPrepareCb(struct uvPrepare *req, int status)
{
    struct uvAliveSegment *segment = req->data;
    struct uv *uv = segment->uv;
    int rv;
    assert(segment->counter == 0);
    assert(segment->written == 0);
    /* If we have been closed, let's discard the segment. */
    if (uv->closing) {
        QUEUE_REMOVE(&segment->queue);
        assert(status == RAFT_CANCELED); /* UvPrepare cancels pending reqs */
        uvSegmentBufferClose(&segment->pending);
        RaftHeapFree(segment);
        return;
    }
    if (status != 0) {
        tracef("prepare segment failed (%d)", status);
        rv = status;
        goto err;
    }
    assert(req->counter > 0);
    assert(req->fd >= 0);
    /* There must be pending appends that were waiting for this prepare
     * requests. */
    assert(!QUEUE_IS_EMPTY(&uv->append_pending_reqs));
    rv = uvAliveSegmentReady(uv, req->fd, req->counter, segment);
    if (rv != 0) {
        tracef("prepare segment ready failed (%d)", rv);
        goto err;
    }
    rv = uvAppendMaybeStart(uv);
    if (rv != 0) {
        tracef("prepare segment start failed (%d)", rv);
        goto err;
    }
    return;
err:
    /* Discard the segment and fail all pending append requests. */
    QUEUE_REMOVE(&segment->queue);
    RaftHeapFree(segment);
    uv->errored = true;
    uvAppendFinishPendingRequests(uv, rv);
}
/* Initialize a new open segment object. The segment starts unprepared
 * (counter == 0), with its first index at the current append next index and
 * its size accounting only for the format version word. */
static void uvAliveSegmentInit(struct uvAliveSegment *s, struct uv *uv)
{
    s->uv = uv;
    s->prepare.data = s;
    s->writer.data = s;
    s->write.data = s;
    s->counter = 0;
    s->first_index = uv->append_next_index;
    s->pending_last_index = s->first_index - 1;
    s->last_index = 0;
    s->size = sizeof(uint64_t) /* Format version */;
    s->next_block = 0;
    uvSegmentBufferInit(&s->pending, uv->block_size);
    s->written = 0;
    s->barrier = NULL;
    s->finalize = false;
}
/* Add a new active open segment, since the append request being submitted does
 * not fit in the last segment we scheduled writes for, or no segment had been
 * previously requested at all.
 *
 * The segment is pushed to the append_segments queue and a prepared file is
 * requested for it; if UvPrepare returns a ready file synchronously the
 * segment becomes writable immediately. Returns 0 on success. */
static int uvAppendPushAliveSegment(struct uv *uv)
{
    struct uvAliveSegment *segment;
    uv_file fd;
    uvCounter counter;
    int rv;
    segment = RaftHeapMalloc(sizeof *segment);
    if (segment == NULL) {
        rv = RAFT_NOMEM;
        goto err;
    }
    uvAliveSegmentInit(segment, uv);
    QUEUE_PUSH(&uv->append_segments, &segment->queue);
    rv = UvPrepare(uv, &fd, &counter, &segment->prepare,
                   uvAliveSegmentPrepareCb);
    if (rv != 0) {
        goto err_after_alloc;
    }
    /* If we've been returned a ready prepared segment right away, start writing
     * to it immediately. */
    if (fd != -1) {
        rv = uvAliveSegmentReady(uv, fd, counter, segment);
        if (rv != 0) {
            goto err_after_prepare;
        }
    }
    return 0;
err_after_prepare:
    /* Give back the prepared file: close its descriptor and ask for the unused
     * open segment to be finalized. */
    UvOsClose(fd);
    UvFinalize(uv, counter, 0, 0, 0);
err_after_alloc:
    QUEUE_REMOVE(&segment->queue);
    RaftHeapFree(segment);
err:
    assert(rv != 0);
    return rv;
}
/* Return the last segment that we have requested to prepare, i.e. the tail of
 * the append_segments queue, or NULL if the queue is empty. */
static struct uvAliveSegment *uvGetLastAliveSegment(struct uv *uv)
{
    struct uvAliveSegment *last = NULL;
    if (!QUEUE_IS_EMPTY(&uv->append_segments)) {
        queue *end = QUEUE_TAIL(&uv->append_segments);
        last = QUEUE_DATA(end, struct uvAliveSegment, queue);
    }
    return last;
}
/* Return #true if the remaining capacity of the given segment is equal or
 * greater than @size. */
static bool uvAliveSegmentHasEnoughSpareCapacity(struct uvAliveSegment *s,
                                                 size_t size)
{
    size_t needed = s->size + size;
    return needed <= s->uv->segment_size;
}
/* Add @size bytes to the number of bytes that the segment will hold. The actual
 * write will happen when the previous write completes, if any.
 *
 * This is pure bookkeeping: no buffer space is allocated here. */
static void uvAliveSegmentReserveSegmentCapacity(struct uvAliveSegment *s,
                                                 size_t size)
{
    s->size += size;
}
/* Return the number of bytes needed to store the batch of entries of this
 * append request on disk: two CRC checksums, the batch header, plus each
 * entry's data padded to a multiple of 8 bytes. */
static size_t uvAppendSize(struct uvAppend *a)
{
    size_t total;
    unsigned i;
    total = sizeof(uint32_t) * 2;       /* CRC checksums */
    total += uvSizeofBatchHeader(a->n); /* Batch header */
    for (i = 0; i < a->n; i++) {
        total += bytePad64(a->entries[i].buf.len); /* Padded entry data */
    }
    return total;
}
/* Enqueue an append entries request, assigning it to the appropriate active
 * open segment.
 *
 * If the last segment has no room for the batch (or there is no segment at
 * all) it is marked for finalization and a new one is requested. On success
 * the request is linked to its target segment, pushed to the pending queue and
 * the append next index is advanced. */
static int uvAppendEnqueueRequest(struct uv *uv, struct uvAppend *append)
{
    struct uvAliveSegment *segment;
    size_t size;
    bool fits;
    int rv;
    assert(append->entries != NULL);
    assert(append->n > 0);
    assert(uv->append_next_index > 0);
    tracef("enqueue %u entries", append->n);
    size = uvAppendSize(append);
    /* If we have no segments yet, it means this is the very first append, and
     * we need to add a new segment. Otherwise we check if the last segment has
     * enough room for this batch of entries. */
    segment = uvGetLastAliveSegment(uv);
    if (segment == NULL || segment->finalize) {
        fits = false;
    } else {
        fits = uvAliveSegmentHasEnoughSpareCapacity(segment, size);
        if (!fits) {
            segment->finalize = true; /* Finalize when all writes are done */
        }
    }
    /* If there's no segment or if this batch does not fit in this segment, we
     * need to add a new one. */
    if (!fits) {
        rv = uvAppendPushAliveSegment(uv);
        if (rv != 0) {
            goto err;
        }
    }
    segment = uvGetLastAliveSegment(uv); /* Get the last added segment */
    uvAliveSegmentReserveSegmentCapacity(segment, size);
    append->segment = segment;
    QUEUE_PUSH(&uv->append_pending_reqs, &append->queue);
    uv->append_next_index += append->n;
    tracef("set uv->append_next_index %llu", uv->append_next_index);
    return 0;
err:
    assert(rv != 0);
    return rv;
}
/* Check that all entry buffers are 8-byte aligned */
static int uvCheckEntryBuffersAligned(struct uv *uv,
const struct raft_entry entries[],
unsigned n)
{
unsigned i;
for (i = 0; i < n; i++) {
if (entries[i].buf.len % 8) {
ErrMsgPrintf(uv->io->errmsg,
"entry buffers must be 8-byte aligned");
Tracef(uv->tracer, "%s", uv->io->errmsg);
return RAFT_INVALID;
}
}
return 0;
}
/* Implementation of raft_io->append: validate the entries, enqueue the request
 * against an open segment and try to start writing immediately. The request's
 * callback is fired once the write completes (or fails). */
int UvAppend(struct raft_io *io,
             struct raft_io_append *req,
             const struct raft_entry entries[],
             unsigned n,
             raft_io_append_cb cb)
{
    struct uv *uv;
    struct uvAppend *append;
    int rv;
    uv = io->impl;
    assert(!uv->closing);
    append = RaftHeapCalloc(1, sizeof *append);
    if (append == NULL) {
        rv = RAFT_NOMEM;
        goto err;
    }
    append->req = req;
    append->entries = entries;
    append->n = n;
    req->cb = cb;
    rv = uvCheckEntryBuffersAligned(uv, entries, n);
    if (rv != 0) {
        goto err_after_req_alloc;
    }
    rv = uvAppendEnqueueRequest(uv, append);
    if (rv != 0) {
        goto err_after_req_alloc;
    }
    assert(append->segment != NULL);
    assert(!QUEUE_IS_EMPTY(&uv->append_pending_reqs));
    /* Try to write immediately. */
    rv = uvAppendMaybeStart(uv);
    if (rv != 0) {
        /* NOTE(review): at this point the request is already enqueued, so it
         * is not freed here; presumably it gets completed when the pending
         * queue is flushed -- confirm. */
        return rv;
    }
    return 0;
err_after_req_alloc:
    RaftHeapFree(append);
err:
    assert(rv != 0);
    return rv;
}
/* Finalize the current segment as soon as all its pending or inflight append
 * requests get completed: if the segment is already idle it is finalized
 * immediately, otherwise the finalize flag is set so the write completion path
 * does it. No-op when there is no current segment. */
static void uvFinalizeCurrentAliveSegmentOnceIdle(struct uv *uv)
{
    struct uvAliveSegment *s;
    queue *head;
    bool has_pending_reqs;
    bool has_writing_reqs;
    s = uvGetCurrentAliveSegment(uv);
    if (s == NULL) {
        return;
    }
    /* Check if there are pending append requests targeted to the current
     * segment. */
    has_pending_reqs = false;
    QUEUE_FOREACH (head, &uv->append_pending_reqs) {
        struct uvAppend *r = QUEUE_DATA(head, struct uvAppend, queue);
        if (r->segment == s) {
            has_pending_reqs = true;
            break;
        }
    }
    has_writing_reqs = !QUEUE_IS_EMPTY(&uv->append_writing_reqs);
    /* If there is no pending append request or inflight write against the
     * current segment, we can submit a request for it to be closed
     * immediately. Otherwise, we set the finalize flag.
     *
     * TODO: is it actually possible to have pending requests with no writing
     * requests? Probably no. */
    if (!has_pending_reqs && !has_writing_reqs) {
        uvAliveSegmentFinalize(s);
    } else {
        s->finalize = true;
    }
}
/* Returns true if there are no more open segments referencing the current
 * barrier (or no barrier is set at all), i.e. the barrier callback may fire. */
bool UvBarrierReady(struct uv *uv)
{
    queue *head;
    bool ready = true;
    if (uv->barrier != NULL) {
        QUEUE_FOREACH (head, &uv->append_segments) {
            struct uvAliveSegment *s;
            s = QUEUE_DATA(head, struct uvAliveSegment, queue);
            if (s->barrier == uv->barrier) {
                /* A segment is still attached to the barrier. */
                ready = false;
                break;
            }
        }
    }
    return ready;
}
bool UvBarrierMaybeTrigger(struct UvBarrier *barrier)
{
if (!barrier) {
return false;
}
if (!QUEUE_IS_EMPTY(&barrier->reqs)) {
queue *head;
struct UvBarrierReq *r;
head = QUEUE_HEAD(&barrier->reqs);
QUEUE_REMOVE(head);
r = QUEUE_DATA(head, struct UvBarrierReq, queue);
r->cb(r);
return true;
}
return false;
}
/* Used during cleanup: drain the barrier by firing every queued barrier
 * request callback, one at a time, until none is left. */
static void uvBarrierTriggerAll(struct UvBarrier *barrier)
{
    while (UvBarrierMaybeTrigger(barrier)) {
        ;
    }
}
/* Allocate and initialize a new non-blocking barrier with an empty request
 * queue. Returns NULL on out-of-memory. */
static struct UvBarrier *uvBarrierCreate(void)
{
    struct UvBarrier *b = RaftHeapCalloc(1, sizeof(*b));
    if (b != NULL) {
        b->blocking = false;
        QUEUE_INIT(&b->reqs);
    }
    return b;
}
/* Install a barrier at @next_index, attaching @req either to a newly created
 * barrier, to the barrier of an existing open segment, or to the current
 * uv->barrier. See the declaration in uv.h for the overall semantics. Returns
 * 0 on success or RAFT_NOMEM. */
int UvBarrier(struct uv *uv, raft_index next_index, struct UvBarrierReq *req)
{
    /* The barrier to attach to. */
    struct UvBarrier *barrier = NULL;
    struct uvAliveSegment *segment = NULL;
    queue *head;
    assert(!uv->closing);
    /* The next entry will be appended at this index. */
    uv->append_next_index = next_index;
    tracef("UvBarrier uv->append_next_index:%llu", uv->append_next_index);
    /* Arrange for all open segments not already involved in other barriers to
     * be finalized as soon as their append requests get completed and mark them
     * as involved in this specific barrier request. */
    QUEUE_FOREACH (head, &uv->append_segments) {
        segment = QUEUE_DATA(head, struct uvAliveSegment, queue);
        if (segment->barrier != NULL) {
            /* If a non-blocking barrier precedes this blocking request, we want
             * to also block all future writes. */
            if (req->blocking) {
                segment->barrier->blocking = true;
            }
            continue;
        }
        /* Lazily create the barrier on the first unattached segment. */
        if (!barrier) {
            barrier = uvBarrierCreate();
            if (!barrier) {
                return RAFT_NOMEM;
            }
            /* And add the request to the barrier. */
            UvBarrierAddReq(barrier, req);
        }
        segment->barrier = barrier;
        if (segment == uvGetCurrentAliveSegment(uv)) {
            uvFinalizeCurrentAliveSegmentOnceIdle(uv);
            continue;
        }
        segment->finalize = true;
    }
    /* Unable to attach to a segment, because all segments are involved in a
     * barrier, or there are no segments. */
    if (barrier == NULL) {
        /* Attach req to last segment barrier. */
        if (segment != NULL) {
            barrier = segment->barrier;
            /* There is no segment, attach to uv->barrier. */
        } else if (uv->barrier != NULL) {
            barrier = uv->barrier;
            /* There is no uv->barrier, make new one. */
        } else {
            barrier = uvBarrierCreate();
            if (!barrier) {
                return RAFT_NOMEM;
            }
        }
        UvBarrierAddReq(barrier, req);
    }
    /* Let's not continue writing new entries if something down the line
     * asked us to stop writing. */
    if (uv->barrier != NULL && req->blocking) {
        uv->barrier->blocking = true;
    }
    assert(barrier != NULL);
    if (uv->barrier == NULL) {
        uv->barrier = barrier;
        /* If there's no pending append-related activity, we can fire the
         * callback immediately.
         *
         * TODO: find a way to avoid invoking this synchronously. */
        if (QUEUE_IS_EMPTY(&uv->append_segments) &&
            QUEUE_IS_EMPTY(&uv->finalize_reqs) &&
            uv->finalize_work.data == NULL) {
            /* Not interested in return value. */
            UvBarrierMaybeTrigger(barrier);
        }
    }
    return 0;
}
/* Resume writing append requests after UvBarrier has been called. If the
 * current barrier still has queued requests the next one is triggered (and it
 * will call UvUnblock again when done); otherwise the barrier is released and
 * pending appends are restarted. */
void UvUnblock(struct uv *uv)
{
    /* First fire all pending barrier requests. Unblock will be called again
     * when that request's callback is fired. */
    if (UvBarrierMaybeTrigger(uv->barrier)) {
        tracef("UvUnblock triggered barrier request callback.");
        return;
    }
    /* All requests in barrier are finished. */
    tracef("UvUnblock queue empty");
    RaftHeapFree(uv->barrier);
    uv->barrier = NULL;
    if (uv->closing) {
        uvMaybeFireCloseCb(uv);
        return;
    }
    /* Restart any append requests that were waiting on the barrier. */
    if (!QUEUE_IS_EMPTY(&uv->append_pending_reqs)) {
        int rv;
        rv = uvAppendMaybeStart(uv);
        if (rv != 0) {
            uv->errored = true;
        }
    }
}
/* Add a barrier @req to an existing @barrier, queueing it after any requests
 * already attached. */
void UvBarrierAddReq(struct UvBarrier *barrier, struct UvBarrierReq *req)
{
    assert(barrier != NULL);
    assert(req != NULL);
    /* Once there's a blocking req, this barrier becomes blocking. */
    barrier->blocking |= req->blocking;
    QUEUE_PUSH(&barrier->reqs, &req->queue);
}
/* Fire all pending barrier requests, the barrier callback will notice that
 * we're closing and abort there. Barriers attached to open segments (other
 * than the current uv->barrier) are drained and freed; the current barrier is
 * drained too and freed unless thread-pool work is still in flight, in which
 * case UvUnblock releases it later. */
static void uvBarrierClose(struct uv *uv)
{
    tracef("uv barrier close");
    struct UvBarrier *barrier = NULL;
    queue *head;
    assert(uv->closing);
    QUEUE_FOREACH (head, &uv->append_segments) {
        struct uvAliveSegment *segment;
        segment = QUEUE_DATA(head, struct uvAliveSegment, queue);
        /* Skip segments sharing a barrier we already drained, and skip the
         * current uv->barrier which is handled below. */
        if (segment->barrier != NULL && segment->barrier != barrier &&
            segment->barrier != uv->barrier) {
            barrier = segment->barrier;
            /* Fire all barrier cb's, this is safe because the barrier cb exits
             * early when uv->closing is true. */
            uvBarrierTriggerAll(barrier);
            RaftHeapFree(barrier);
        }
        /* The segment->barrier field is used:
         *
         * - by UvBarrierReady, to check whether it's time to invoke the barrier
         *   callback after successfully finalizing a segment
         * - by uvAppendMaybeStart, to see whether we should go ahead with
         *   writing to a segment even though a barrier is active because the
         *   barrier is waiting on that same segment to be finalized (but see
         *   the FIXME in that function)
         * - to save a barrier for later, if UvBarrier was called when
         *   uv->barrier was already set
         *
         * If we're cancelling the barrier, we don't need to save it for later;
         * the callback will not be invoked a second time in any case; and
         * uvAppendMaybeStart won't be called while closing. So it's fine to
         * clear segment->barrier here. */
        segment->barrier = NULL;
    }
    /* There might still be a current barrier set on uv->barrier, meaning
     * that the open segment it was associated with has started to be finalized
     * and is not anymore in the append_segments queue. Let's cancel all
     * untriggered barrier request callbacks too. */
    if (uv->barrier != NULL) {
        uvBarrierTriggerAll(uv->barrier);
        /* Clear uv->barrier if there's no active work on the thread pool. When
         * the work on the threadpool finishes, UvUnblock will notice
         * we're closing, clear and free uv->barrier and call
         * uvMaybeFireCloseCb. UnUnblock will not try to fire anymore barrier
         * request callbacks because they were triggered in the line above. */
        if (uv->snapshot_put_work.data == NULL &&
            uv->truncate_work.data == NULL) {
            RaftHeapFree(uv->barrier);
            uv->barrier = NULL;
        }
    }
}
/* Invoked once the append retry timer handle has been fully closed during
 * shutdown: clear its data marker and possibly fire the overall close
 * callback. */
static void uvAppendRetryCloseCb(uv_handle_t *handle)
{
    struct uv *uv = handle->data;
    assert(uv->closing);
    uv->append_retry.data = NULL;
    uvMaybeFireCloseCb(uv);
}
/* Cancel all pending write requests and request the current segment to be
 * finalized. Must be invoked at closing time. Also cancels any pending retry
 * of a failed write and finalizes segments that never got written. */
void uvAppendClose(struct uv *uv)
{
    struct uvAliveSegment *segment;
    assert(uv->closing);
    uvBarrierClose(uv);
    UvPrepareClose(uv);
    uvAppendFinishPendingRequests(uv, RAFT_CANCELED);
    uvFinalizeCurrentAliveSegmentOnceIdle(uv);
    if (uv->append_retry.data != NULL) {
        /* If the timer data is not uv, then it's set to an alive segment, and
         * we're currently waiting to retry a failed disk write. In that case,
         * cancel all inflight requests. */
        if (uv->append_retry.data != uv) {
            uvAppendFinishWritingRequests(uv, RAFT_CANCELED);
            uvAliveSegmentFinalize(uv->append_retry.data);
            uv->append_retry.data = uv;
        }
        uv_timer_stop(&uv->append_retry);
        uv_close((uv_handle_t *)&uv->append_retry, uvAppendRetryCloseCb);
    }
    /* Also finalize the segments that we didn't write at all and are just
     * sitting in the append_segments queue waiting for writes against the
     * current segment to complete. */
    while (!QUEUE_IS_EMPTY(&uv->append_segments)) {
        segment = uvGetLastAliveSegment(uv);
        assert(segment != NULL);
        if (segment == uvGetCurrentAliveSegment(uv)) {
            break; /* We reached the head of the queue */
        }
        assert(segment->written == 0);
        uvAliveSegmentFinalize(segment);
    }
}
/* Return the total spare capacity, in bytes, across all prepared open
 * segments. Segments that have not been assigned a prepared file yet are not
 * counted. */
size_t UvAppendCapacity(struct uv *uv)
{
    queue *head;
    size_t total = 0;
    QUEUE_FOREACH (head, &uv->append_segments) {
        struct uvAliveSegment *s;
        size_t used;
        s = QUEUE_DATA(head, struct uvAliveSegment, queue);
        /* Skip segments that were not assigned a prepared segment yet. */
        if (s->counter == 0) {
            continue;
        }
        /* Clamp the used size to the configured segment size. */
        used = s->size > uv->segment_size ? uv->segment_size : s->size;
        total += uv->segment_size - used;
    }
    return total;
}
raft-0.22.1/src/uv_encoding.c 0000664 0000000 0000000 00000044464 14601504142 0015774 0 ustar 00root root 0000000 0000000 #include "uv_encoding.h"
#include
#include
#include "../include/raft/uv.h"
#include "assert.h"
#include "byte.h"
#include "configuration.h"
/**
* Size of the request preamble.
*/
#define RAFT_IO_UV__PREAMBLE_SIZE \
(sizeof(uint64_t) /* Message type. */ + \
sizeof(uint64_t) /* Message size. */)
/* On-wire size of a version 1 RequestVote message: term, candidate ID, last
 * log index and last log term -- four 64-bit words. */
static size_t sizeofRequestVoteV1(void)
{
    return 4 * sizeof(uint64_t);
}
/* On-wire size of the current RequestVote message: the version 1 layout plus a
 * trailing 64-bit flags word. */
static size_t sizeofRequestVote(void)
{
    return sizeofRequestVoteV1() + sizeof(uint64_t) /* Flags. */;
}
/* On-wire size of a version 1 RequestVote result: term and vote-granted flag
 * -- two 64-bit words. */
static size_t sizeofRequestVoteResultV1(void)
{
    return 2 * sizeof(uint64_t);
}
/* On-wire size of the current RequestVote result: the version 1 layout plus an
 * 8-byte trailer holding flags, features and capacity. */
static size_t sizeofRequestVoteResult(void)
{
    return sizeofRequestVoteResultV1() + /* Size of older version 1 message */
           sizeof(uint8_t) +             /* Flags */
           sizeof(uint8_t) +             /* Unused */
           sizeof(uint16_t) +            /* Features */
           sizeof(uint16_t) +            /* Capacity */
           sizeof(uint16_t);             /* Unused */
}
/* On-wire size of an AppendEntries message header (excluding the entry data
 * itself): four 64-bit fields, the batch header for p->n_entries entries, and
 * a trailing unused word. */
static size_t sizeofAppendEntries(const struct raft_append_entries *p)
{
    return sizeof(uint64_t) +                 /* Leader's term. */
           sizeof(uint64_t) +                 /* Previous log entry index */
           sizeof(uint64_t) +                 /* Previous log entry term */
           sizeof(uint64_t) +                 /* Leader's commit index */
           uvSizeofBatchHeader(p->n_entries) + /* Batch header */
           sizeof(uint64_t);                  /* XXX: currently unused */
}
/* On-wire size of a version 0 AppendEntries result: term, success flag and
 * last log index -- three 64-bit words. */
static size_t sizeofAppendEntriesResultV0(void)
{
    return 3 * sizeof(uint64_t);
}
/* On-wire size of the current AppendEntries result: the version 0 layout plus
 * an 8-byte trailer holding features and capacity. */
static size_t sizeofAppendEntriesResult(void)
{
    return sizeofAppendEntriesResultV0() + /* Size of older version 0 message */
           sizeof(uint16_t) +              /* Server features. */
           sizeof(uint16_t) +              /* Capacity. */
           sizeof(uint32_t);               /* Unused */
}
/* On-wire size of an InstallSnapshot message header, including the encoded
 * configuration but excluding the snapshot data itself. */
static size_t sizeofInstallSnapshot(const struct raft_install_snapshot *p)
{
    size_t conf_size = configurationEncodedSize(&p->conf);
    return sizeof(uint64_t) + /* Leader's term. */
           sizeof(uint64_t) + /* Snapshot's last index */
           sizeof(uint64_t) + /* Term of last index */
           sizeof(uint64_t) + /* Configuration's index */
           sizeof(uint64_t) + /* Length of configuration */
           conf_size +        /* Configuration data */
           sizeof(uint64_t) + /* Length of snapshot data */
           sizeof(uint64_t);  /* XXX: currently unused */
}
/* On-wire size of a TimeoutNow message: term, last log index and last log term
 * -- three 64-bit words. */
static size_t sizeofTimeoutNow(void)
{
    return 3 * sizeof(uint64_t);
}
/* Size of the batch header for @n entries: one 64-bit word holding the number
 * of entries (little endian), followed by a 16-byte header per entry. */
size_t uvSizeofBatchHeader(size_t n)
{
    return sizeof(uint64_t) + n * 16;
}
/* Serialize a RequestVote message body into buf (version 2 wire format:
 * four 64-bit fields followed by a 64-bit flags word). */
static void encodeRequestVote(const struct raft_request_vote *p, void *buf)
{
    uint8_t *out = buf;
    uint64_t flags = (p->disrupt_leader ? 1 << 0 : 0) |
                     (p->pre_vote ? 1 << 1 : 0);
    bytePut64(&out, p->term);
    bytePut64(&out, p->candidate_id);
    bytePut64(&out, p->last_log_index);
    bytePut64(&out, p->last_log_term);
    bytePut64(&out, flags);
}
/* Serialize a RequestVote result body into buf (version 2 wire format). */
static void encodeRequestVoteResult(const struct raft_request_vote_result *p,
                                    void *buf)
{
    uint8_t *out = buf;
    uint8_t flags = p->pre_vote ? (1 << 0) : 0;
    bytePut64(&out, p->term);
    bytePut64(&out, p->vote_granted);
    bytePut8(&out, flags);
    bytePut8(&out, 0);  /* Unused. */
    bytePut16(&out, p->features);
    bytePut16(&out, p->capacity);
    bytePut16(&out, 0); /* Unused. */
}
/* Serialize an AppendEntries header into buf. Only the batch header is
 * written here; the entry payloads are sent as separate buffers. */
static void encodeAppendEntries(const struct raft_append_entries *p, void *buf)
{
    uint8_t *out = buf;
    bytePut64(&out, p->term);           /* Leader's term. */
    bytePut64(&out, p->prev_log_index); /* Previous log entry index. */
    bytePut64(&out, p->prev_log_term);  /* Previous log entry term. */
    bytePut64(&out, p->leader_commit);  /* Leader's commit index. */
    uvEncodeBatchHeader(p->entries, p->n_entries, out); /* Batch header */
    out += uvSizeofBatchHeader(p->n_entries);
    bytePut64(&out, 0); /* XXX: currently unused */
}
/* Serialize an AppendEntries result body into buf. */
static void encodeAppendEntriesResult(
    const struct raft_append_entries_result *p,
    void *buf)
{
    uint8_t *out = buf;
    bytePut64(&out, p->term);
    bytePut64(&out, p->rejected);
    bytePut64(&out, p->last_log_index);
    bytePut16(&out, p->features);
    bytePut16(&out, p->capacity);
    bytePut32(&out, 0 /* Unused */);
}
/* Serialize an InstallSnapshot header into buf, embedding the encoded
 * configuration. The snapshot data itself is sent as a separate buffer. */
static void encodeInstallSnapshot(const struct raft_install_snapshot *p,
                                  void *buf)
{
    uint8_t *out = buf;
    size_t conf_size = configurationEncodedSize(&p->conf);
    bytePut64(&out, p->term);       /* Leader's term */
    bytePut64(&out, p->last_index); /* Snapshot's last index */
    bytePut64(&out, p->last_term);  /* Term of last index */
    bytePut64(&out, p->conf_index); /* Configuration's index */
    bytePut64(&out, conf_size);     /* Length of configuration */
    configurationEncodeToBuf(&p->conf, out, conf_size); /* Configuration */
    out += conf_size;
    bytePut64(&out, p->data.len); /* Length of snapshot data */
    bytePut64(&out, 0);           /* XXX: currently unused */
}
/* Serialize a TimeoutNow message body into buf. */
static void encodeTimeoutNow(const struct raft_timeout_now *p, void *buf)
{
    uint8_t *out = buf;
    bytePut64(&out, p->term);           /* Term. */
    bytePut64(&out, p->last_log_index); /* Last log index. */
    bytePut64(&out, p->last_log_term);  /* Last log term. */
}
/* Encode the given message into a newly allocated array of libuv buffers,
 * stored in *bufs with its length in *n_bufs.
 *
 * The first buffer holds the fixed-size preamble (type, version, header
 * size) followed by the message-specific header. For AppendEntries the
 * entry payloads are appended as extra buffers that alias the caller's
 * memory (not copied); for InstallSnapshot the snapshot data is aliased as
 * one extra buffer.
 *
 * Returns 0 on success, RAFT_MALFORMED for an unknown message type, or
 * RAFT_NOMEM on allocation failure. On success the caller owns *bufs: it
 * must release (*bufs)[0].base and the array itself. */
int uvEncodeMessage(const struct raft_message *message,
                    uv_buf_t **bufs,
                    unsigned *n_bufs)
{
    uv_buf_t header;
    uint8_t *cursor;
    int version;
    /* Figure out the length of the header for this request and allocate a
     * buffer for it. */
    header.len = RAFT_IO_UV__PREAMBLE_SIZE;
    switch (message->type) {
        case RAFT_REQUEST_VOTE:
            header.len += sizeofRequestVote();
            version = message->request_vote.version;
            break;
        case RAFT_REQUEST_VOTE_RESULT:
            header.len += sizeofRequestVoteResult();
            version = message->request_vote_result.version;
            break;
        case RAFT_APPEND_ENTRIES:
            header.len += sizeofAppendEntries(&message->append_entries);
            version = message->append_entries.version;
            break;
        case RAFT_APPEND_ENTRIES_RESULT:
            header.len += sizeofAppendEntriesResult();
            version = message->append_entries_result.version;
            break;
        case RAFT_INSTALL_SNAPSHOT:
            header.len += sizeofInstallSnapshot(&message->install_snapshot);
            version = message->install_snapshot.version;
            break;
        case RAFT_TIMEOUT_NOW:
            header.len += sizeofTimeoutNow();
            version = message->timeout_now.version;
            break;
        default:
            return RAFT_MALFORMED;
    };
    header.base = raft_malloc(header.len);
    if (header.base == NULL) {
        goto oom;
    }
    cursor = (uint8_t *)header.base;
    /* Encode the request preamble, with message type, version and size. */
    bytePut8(&cursor, (uint8_t)message->type);
    bytePut8(&cursor, 0);
    bytePut8(&cursor, (uint8_t)version);
    bytePut8(&cursor, 0);
    bytePut32(&cursor, 0);
    bytePut64(&cursor, header.len - RAFT_IO_UV__PREAMBLE_SIZE);
    /* Encode the request header. */
    switch (message->type) {
        case RAFT_REQUEST_VOTE:
            encodeRequestVote(&message->request_vote, cursor);
            break;
        case RAFT_REQUEST_VOTE_RESULT:
            encodeRequestVoteResult(&message->request_vote_result, cursor);
            break;
        case RAFT_APPEND_ENTRIES:
            encodeAppendEntries(&message->append_entries, cursor);
            break;
        case RAFT_APPEND_ENTRIES_RESULT:
            encodeAppendEntriesResult(&message->append_entries_result, cursor);
            break;
        case RAFT_INSTALL_SNAPSHOT:
            encodeInstallSnapshot(&message->install_snapshot, cursor);
            break;
        case RAFT_TIMEOUT_NOW:
            encodeTimeoutNow(&message->timeout_now, cursor);
            break;
    };
    *n_bufs = 1;
    /* For AppendEntries request we also send the entries payload. */
    if (message->type == RAFT_APPEND_ENTRIES) {
        *n_bufs += message->append_entries.n_entries;
    }
    /* For InstallSnapshot request we also send the snapshot payload. */
    if (message->type == RAFT_INSTALL_SNAPSHOT) {
        *n_bufs += 1;
    }
    *bufs = raft_calloc(*n_bufs, sizeof **bufs);
    if (*bufs == NULL) {
        goto oom_after_header_alloc;
    }
    (*bufs)[0] = header;
    if (message->type == RAFT_APPEND_ENTRIES) {
        unsigned i;
        for (i = 0; i < message->append_entries.n_entries; i++) {
            const struct raft_entry *entry =
                &message->append_entries.entries[i];
            /* Entry data is aliased, not copied. */
            (*bufs)[i + 1].base = entry->buf.base;
            (*bufs)[i + 1].len = entry->buf.len;
        }
    }
    if (message->type == RAFT_INSTALL_SNAPSHOT) {
        /* Snapshot data is aliased, not copied. */
        (*bufs)[1].base = message->install_snapshot.data.base;
        (*bufs)[1].len = message->install_snapshot.data.len;
    }
    return 0;
oom_after_header_alloc:
    raft_free(header.base);
oom:
    return RAFT_NOMEM;
}
/* Serialize the batch header for the given entries: an 8-byte little-endian
 * entry count followed by one 16-byte header per entry. */
void uvEncodeBatchHeader(const struct raft_entry *entries,
                         unsigned n,
                         void *buf)
{
    uint8_t *out = buf;
    unsigned i;
    bytePut64(&out, n); /* Number of entries in the batch, little endian */
    for (i = 0; i < n; i++) {
        const struct raft_entry *entry = &entries[i];
        /* Term in which the entry was created, little endian. */
        bytePut64(&out, entry->term);
        /* Message type (Either RAFT_COMMAND or RAFT_CHANGE) */
        bytePut8(&out, (uint8_t)entry->type);
        bytePut8(&out, 0); /* Unused */
        bytePut8(&out, 0); /* Unused */
        bytePut8(&out, 0); /* Unused */
        /* Size of the log entry data, little endian. */
        bytePut32(&out, (uint32_t)entry->buf.len);
    }
}
/* Decode a RequestVote message body.
 *
 * A version byte of 0 means the sender pre-dates the versioned preamble: in
 * that case the actual version is inferred from the body length (version 1
 * bodies lack the trailing flags word). */
static void decodeRequestVote(unsigned char version,
                              const uv_buf_t *buf,
                              struct raft_request_vote *p)
{
    const uint8_t *cursor;
    cursor = (void *)buf->base;
    /* If the version is 0 the message was sent by a server that does not
     * encode the version byte in the preamble. */
    if (version == 0) {
        if (buf->len == sizeofRequestVoteV1()) {
            version = 1;
        } else {
            version = 2;
        }
    }
    p->version = version;
    p->term = byteGet64(&cursor);
    p->candidate_id = byteGet64(&cursor);
    p->last_log_index = byteGet64(&cursor);
    p->last_log_term = byteGet64(&cursor);
    p->disrupt_leader = false;
    p->pre_vote = false;
    /* The flags word is only present from version 2 onwards: reading it from
     * a version 1 body would run 8 bytes past the end of the buffer. */
    if (p->version >= 2) {
        uint64_t flags = byteGet64(&cursor);
        p->disrupt_leader = (bool)(flags & 1 << 0);
        p->pre_vote = (bool)(flags & 1 << 1);
    }
}
/* Decode a RequestVote result message body.
 *
 * A version byte of 0 means the sender pre-dates the versioned preamble: in
 * that case the actual version is inferred from the body length. Fields
 * introduced in version 2 (pre_vote flag, features, capacity) are defaulted
 * for older messages. */
static void decodeRequestVoteResult(unsigned char version,
                                    const uv_buf_t *buf,
                                    struct raft_request_vote_result *p)
{
    const uint8_t *cursor;
    cursor = (void *)buf->base;
    /* If the version is 0 the message was sent by a server that does not
     * encode the version byte in the preamble. */
    if (version == 0) {
        if (buf->len == sizeofRequestVoteResultV1()) {
            version = 1;
        } else {
            version = 2;
        }
    }
    p->version = version;
    p->term = byteGet64(&cursor);
    p->vote_granted = byteGet64(&cursor);
    /* Default all version 2 fields, so that decoding an older message never
     * leaves them unset. */
    p->pre_vote = false;
    p->features = 0;
    p->capacity = 0;
    if (p->version >= 2) {
        uint8_t flags = byteGet8(&cursor);
        uint8_t unused = byteGet8(&cursor);
        uint16_t features = byteGet16(&cursor);
        uint16_t capacity = byteGet16(&cursor);
        p->version = 2; /* Normalize: fields beyond v2 are not decoded. */
        p->pre_vote = (flags & (1 << 0));
        (void)unused;
        p->features = features;
        p->capacity = capacity;
    }
}
/* Decode the batch header at batch, allocating *entries and setting *n.
 * Only term/type/length are filled in here; the data pointers are fixed up
 * later by uvDecodeEntriesBatch. Returns RAFT_NOMEM on allocation failure
 * and RAFT_MALFORMED on an unknown entry type. */
int uvDecodeBatchHeader(const void *batch,
                        struct raft_entry **entries,
                        unsigned *n)
{
    const uint8_t *cursor = batch;
    unsigned i;
    *n = (unsigned)byteGet64(&cursor);
    if (*n == 0) {
        *entries = NULL;
        return 0;
    }
    *entries = raft_malloc(*n * sizeof **entries);
    if (*entries == NULL) {
        return RAFT_NOMEM;
    }
    for (i = 0; i < *n; i++) {
        struct raft_entry *entry = &(*entries)[i];
        entry->term = byteGet64(&cursor);
        entry->type = byteGet8(&cursor);
        if (entry->type != RAFT_COMMAND && entry->type != RAFT_BARRIER &&
            entry->type != RAFT_CHANGE) {
            /* Unknown entry type: release everything decoded so far. */
            raft_free(*entries);
            *entries = NULL;
            return RAFT_MALFORMED;
        }
        cursor += 3; /* Unused bytes. */
        /* Size of the log entry data, little endian. */
        entry->buf.len = byteGet32(&cursor);
    }
    return 0;
}
/* Decode an AppendEntries header into args, allocating args->entries via
 * uvDecodeBatchHeader. Returns 0 on success or an error from the batch
 * header decoding. */
static int decodeAppendEntries(unsigned char version,
                               const uv_buf_t *buf,
                               struct raft_append_entries *args)
{
    const uint8_t *cursor;
    assert(buf != NULL);
    assert(args != NULL);
    cursor = (void *)buf->base;
    args->version = version;
    args->term = byteGet64(&cursor);
    args->prev_log_index = byteGet64(&cursor);
    args->prev_log_term = byteGet64(&cursor);
    args->leader_commit = byteGet64(&cursor);
    return uvDecodeBatchHeader(cursor, &args->entries, &args->n_entries);
}
/* Decode an AppendEntries result message body.
 *
 * A version byte of 0 means the sender pre-dates the versioned preamble: a
 * body longer than the version 0 layout is then treated as version 2. The
 * features field appears from version 1, capacity from version 2; both are
 * defaulted to 0 for older messages. */
static void decodeAppendEntriesResult(unsigned char version,
                                      const uv_buf_t *buf,
                                      struct raft_append_entries_result *p)
{
    const uint8_t *cursor = (void *)buf->base;
    if (version == 0 && buf->len > sizeofAppendEntriesResultV0()) {
        version = 2;
    }
    p->version = version;
    p->term = byteGet64(&cursor);
    p->rejected = byteGet64(&cursor);
    p->last_log_index = byteGet64(&cursor);
    p->features = 0;
    p->capacity = 0;
    if (p->version >= 1) {
        p->features = byteGet16(&cursor);
    }
    if (p->version >= 2) {
        p->capacity = byteGet16(&cursor);
    }
}
/* Decode an InstallSnapshot header into args, decoding the embedded
 * configuration. Only the length of the snapshot data is in the header;
 * the data itself arrives separately. Returns an error from the
 * configuration decoding on failure. */
static int decodeInstallSnapshot(unsigned char version,
                                 const uv_buf_t *buf,
                                 struct raft_install_snapshot *args)
{
    const uint8_t *cursor;
    struct raft_buffer conf;
    int rv;
    assert(buf != NULL);
    assert(args != NULL);
    cursor = (void *)buf->base;
    args->version = version;
    args->term = byteGet64(&cursor);
    args->last_index = byteGet64(&cursor);
    args->last_term = byteGet64(&cursor);
    args->conf_index = byteGet64(&cursor);
    /* The configuration is embedded in the header, length-prefixed. */
    conf.len = (size_t)byteGet64(&cursor);
    conf.base = (void *)cursor;
    rv = configurationDecode(&conf, &args->conf);
    if (rv != 0) {
        return rv;
    }
    cursor += conf.len;
    args->data.len = (size_t)byteGet64(&cursor);
    return 0;
}
/* Decode a TimeoutNow message body (single, unversioned layout). */
static void decodeTimeoutNow(const uv_buf_t *buf, struct raft_timeout_now *p)
{
    const uint8_t *cursor = (void *)buf->base;
    p->version = 0;
    p->term = byteGet64(&cursor);
    p->last_log_index = byteGet64(&cursor);
    p->last_log_term = byteGet64(&cursor);
}
/* Decode the header of a message of the given type and version into
 * message, and store in *payload_len how many payload bytes follow the
 * header (entry data for AppendEntries, snapshot data for InstallSnapshot,
 * zero otherwise).
 *
 * Returns 0 on success, RAFT_IOERR for an unknown message type, or an error
 * from the type-specific decoder. */
int uvDecodeMessage(uint8_t type,
                    uint8_t version,
                    const uv_buf_t *header,
                    struct raft_message *message,
                    size_t *payload_len)
{
    unsigned i;
    int rv = 0;
    /* Zero the whole message so union fields not touched by the decoder are
     * in a defined state. */
    memset(message, 0, sizeof(*message));
    message->type = (unsigned short)type;
    *payload_len = 0;
    /* Decode the header. */
    switch (type) {
        case RAFT_REQUEST_VOTE:
            decodeRequestVote(version, header, &message->request_vote);
            break;
        case RAFT_REQUEST_VOTE_RESULT:
            decodeRequestVoteResult(version, header,
                                    &message->request_vote_result);
            break;
        case RAFT_APPEND_ENTRIES:
            rv = decodeAppendEntries(version, header, &message->append_entries);
            /* Total size of the entry payloads that follow the header. */
            for (i = 0; i < message->append_entries.n_entries; i++) {
                *payload_len += message->append_entries.entries[i].buf.len;
            }
            break;
        case RAFT_APPEND_ENTRIES_RESULT:
            decodeAppendEntriesResult(version, header,
                                      &message->append_entries_result);
            break;
        case RAFT_INSTALL_SNAPSHOT:
            rv = decodeInstallSnapshot(version, header,
                                       &message->install_snapshot);
            *payload_len += message->install_snapshot.data.len;
            break;
        case RAFT_TIMEOUT_NOW:
            decodeTimeoutNow(header, &message->timeout_now);
            break;
        default:
            rv = RAFT_IOERR;
            break;
    };
    return rv;
}
/* Fix up the buf pointers of the given entries to point into the batch
 * memory, starting at the given offset. Payloads are 8-byte aligned, so a
 * padding gap may follow each one. */
void uvDecodeEntriesBatch(uint8_t *batch,
                          size_t offset,
                          struct raft_entry *entries,
                          unsigned n)
{
    uint8_t *data;
    size_t i;
    assert(batch != NULL);
    data = batch + offset;
    for (i = 0; i < n; i++) {
        struct raft_entry *entry = &entries[i];
        size_t len = entry->buf.len;
        entry->batch = batch;
        if (len == 0) {
            entry->buf.base = NULL;
            continue;
        }
        entry->buf.base = data;
        data += len;
        if (len % 8 != 0) {
            data += 8 - (len % 8); /* Skip padding up to 8-byte boundary. */
        }
    }
}
raft-0.22.1/src/uv_encoding.h 0000664 0000000 0000000 00000003735 14601504142 0015775 0 ustar 00root root 0000000 0000000 /* Encoding routines for the the libuv-based @raft_io backend. */
#ifndef UV_ENCODING_H_
#define UV_ENCODING_H_
#include
#include "../include/raft.h"
/* Current disk format version. */
#define UV__DISK_FORMAT 1
int uvEncodeMessage(const struct raft_message *message,
uv_buf_t **bufs,
unsigned *n_bufs);
int uvDecodeMessage(uint8_t type,
uint8_t version,
const uv_buf_t *header,
struct raft_message *message,
size_t *payload_len);
int uvDecodeBatchHeader(const void *batch,
struct raft_entry **entries,
unsigned *n);
void uvDecodeEntriesBatch(uint8_t *batch,
size_t offset,
struct raft_entry *entries,
unsigned n);
/**
* The layout of the memory pointed at by a @batch pointer is the following:
*
* [8 bytes] Number of entries in the batch, little endian.
* [header1] Header data of the first entry of the batch.
* [ ... ] More headers
* [headerN] Header data of the last entry of the batch.
* [data1 ] Payload data of the first entry of the batch.
* [ ... ] More data
* [dataN ] Payload data of the last entry of the batch.
*
* An entry header is 16-byte long and has the following layout:
*
* [8 bytes] Term in which the entry was created, little endian.
* [1 byte ] Message type (Either RAFT_COMMAND or RAFT_CHANGE)
* [3 bytes] Currently unused.
* [4 bytes] Size of the log entry data, little endian.
*
* A payload data section for an entry is simply a sequence of bytes of
* arbitrary lengths, possibly padded with extra bytes to reach 8-byte boundary
* (which means that all entry data pointers are 8-byte aligned).
*/
size_t uvSizeofBatchHeader(size_t n);
void uvEncodeBatchHeader(const struct raft_entry *entries,
unsigned n,
void *buf);
#endif /* UV_ENCODING_H_ */
raft-0.22.1/src/uv_finalize.c 0000664 0000000 0000000 00000011433 14601504142 0015775 0 ustar 00root root 0000000 0000000 #include "assert.h"
#include "heap.h"
#include "queue.h"
#include "uv.h"
#include "uv_os.h"
#define tracef(...) Tracef(uv->tracer, __VA_ARGS__)
/* Metadata about an open segment not used anymore and that should be closed
 * or removed (if not written at all). Segments are finalized one at a time
 * by a threadpool worker; pending ones wait on uv->finalize_reqs. */
struct uvDyingSegment
{
    struct uv *uv;          /* Owning libuv-based raft_io backend */
    uvCounter counter;      /* Segment counter */
    size_t used;            /* Number of used bytes */
    raft_index first_index; /* Index of first entry */
    raft_index last_index;  /* Index of last entry */
    int status;             /* Status code of blocking syscalls */
    queue queue;            /* Link to finalize queue */
};
/* Run all blocking syscalls involved in closing a used open segment.
 *
 * An open segment is closed by truncating its length to the number of bytes
 * that were actually written into it and then renaming it.
 *
 * Runs in the threadpool (scheduled via uv_queue_work): it only touches the
 * segment descriptor and stores the outcome in segment->status, which
 * uvFinalizeAfterWorkCb inspects on the loop thread. */
static void uvFinalizeWorkCb(uv_work_t *work)
{
    struct uvDyingSegment *segment = work->data;
    struct uv *uv = segment->uv;
    char filename1[UV__FILENAME_LEN];
    char filename2[UV__FILENAME_LEN];
    char errmsg[RAFT_ERRMSG_BUF_SIZE];
    int rv;
    /* Use snprintf rather than sprintf so unexpectedly large counters or
     * indexes can never overflow the fixed-size filename buffers. */
    snprintf(filename1, sizeof filename1, UV__OPEN_TEMPLATE, segment->counter);
    snprintf(filename2, sizeof filename2, UV__CLOSED_TEMPLATE,
             segment->first_index, segment->last_index);
    tracef("finalize %s into %s", filename1, filename2);
    /* If the segment hasn't actually been used (because the writer has been
     * closed or aborted before making any write), just remove it. */
    if (segment->used == 0) {
        rv = UvFsRemoveFile(uv->dir, filename1, errmsg);
        if (rv != 0) {
            goto err;
        }
        goto sync;
    }
    /* Truncate and rename the segment.*/
    rv = UvFsTruncateAndRenameFile(uv->dir, segment->used, filename1, filename2,
                                   errmsg);
    if (rv != 0) {
        goto err;
    }
sync:
    /* Persist the directory entry changes (removal or rename). */
    rv = UvFsSyncDir(uv->dir, errmsg);
    if (rv != 0) {
        goto err;
    }
    segment->status = 0;
    return;
err:
    tracef("truncate segment %s: %s", filename1, errmsg);
    assert(rv != 0);
    segment->status = rv;
}
static int uvFinalizeStart(struct uvDyingSegment *segment);
/* Threadpool completion callback, running on the loop thread: record the
 * outcome of the blocking work, then either start finalizing the next queued
 * segment or — when the queue is empty — unblock a pending barrier and the
 * close sequence. */
static void uvFinalizeAfterWorkCb(uv_work_t *work, int status)
{
    struct uvDyingSegment *segment = work->data;
    struct uv *uv = segment->uv;
    tracef("uv finalize after work segment %p cb status:%d", (void *)segment,
           status);
    queue *head;
    int rv;
    assert(status == 0); /* We don't cancel worker requests */
    /* Free the single worker slot before possibly reusing it below. */
    uv->finalize_work.data = NULL;
    if (segment->status != 0) {
        uv->errored = true;
    }
    RaftHeapFree(segment);
    /* If we have no more dismissed segments to close, check if there's a
     * barrier to unblock or if we are done closing. */
    if (QUEUE_IS_EMPTY(&uv->finalize_reqs)) {
        tracef("unblock barrier or close");
        if (uv->barrier != NULL && UvBarrierReady(uv)) {
            UvBarrierMaybeTrigger(uv->barrier);
        }
        uvMaybeFireCloseCb(uv);
        return;
    }
    /* Grab a new dismissed segment to close. */
    head = QUEUE_HEAD(&uv->finalize_reqs);
    segment = QUEUE_DATA(head, struct uvDyingSegment, queue);
    QUEUE_REMOVE(&segment->queue);
    rv = uvFinalizeStart(segment);
    if (rv != 0) {
        /* Could not schedule the work: drop the segment and flag the error. */
        RaftHeapFree(segment);
        uv->errored = true;
    }
}
/* Start finalizing an open segment, handing it to the threadpool worker.
 * Returns RAFT_IOERR if the work can't be queued. */
static int uvFinalizeStart(struct uvDyingSegment *segment)
{
    struct uv *uv = segment->uv;
    int rv;
    assert(uv->finalize_work.data == NULL);
    assert(segment->counter > 0);
    uv->finalize_work.data = segment;
    rv = uv_queue_work(uv->loop, &uv->finalize_work, uvFinalizeWorkCb,
                       uvFinalizeAfterWorkCb);
    if (rv == 0) {
        return 0;
    }
    ErrMsgPrintf(uv->io->errmsg, "start to truncate segment file %llu: %s",
                 segment->counter, uv_strerror(rv));
    return RAFT_IOERR;
}
/* Schedule closing (or removing, if never written) the open segment with the
 * given counter. Returns RAFT_NOMEM on allocation failure or an error from
 * starting the threadpool work. */
int UvFinalize(struct uv *uv,
               unsigned long long counter,
               size_t used,
               raft_index first_index,
               raft_index last_index)
{
    struct uvDyingSegment *segment;
    int rv;
    if (used > 0) {
        assert(first_index > 0);
        assert(last_index >= first_index);
    }
    segment = RaftHeapMalloc(sizeof *segment);
    if (segment == NULL) {
        return RAFT_NOMEM;
    }
    segment->uv = uv;
    segment->counter = counter;
    segment->used = used;
    segment->first_index = first_index;
    segment->last_index = last_index;
    /* Only one segment is finalized at a time: queue this request if a
     * worker is already in flight. */
    if (uv->finalize_work.data != NULL) {
        QUEUE_PUSH(&uv->finalize_reqs, &segment->queue);
        return 0;
    }
    rv = uvFinalizeStart(segment);
    if (rv != 0) {
        RaftHeapFree(segment);
    }
    return rv;
}
#undef tracef
raft-0.22.1/src/uv_fs.c 0000664 0000000 0000000 00000056330 14601504142 0014611 0 ustar 00root root 0000000 0000000 #include "uv_fs.h"
#include
#include
#include
#include
#include "assert.h"
#include "err.h"
#include "heap.h"
#include "uv_os.h"
/* Verify that dir exists, is a directory and is writable, mapping each
 * failure mode to a specific RAFT_* error code. */
int UvFsCheckDir(const char *dir, char *errmsg)
{
    struct uv_fs_s req;
    int rv = uv_fs_stat(NULL, &req, dir, NULL);
    if (rv != 0) {
        if (rv == UV_ENOENT) {
            ErrMsgPrintf((char *)errmsg, "directory '%s' does not exist", dir);
            return RAFT_NOTFOUND;
        }
        if (rv == UV_EACCES) {
            ErrMsgPrintf((char *)errmsg, "can't access directory '%s'", dir);
            return RAFT_UNAUTHORIZED;
        }
        if (rv == UV_ENOTDIR) {
            ErrMsgPrintf((char *)errmsg, "path '%s' is not a directory", dir);
            return RAFT_INVALID;
        }
        ErrMsgPrintf((char *)errmsg, "can't stat '%s': %s", dir,
                     uv_strerror(rv));
        return RAFT_IOERR;
    }
    if (!(req.statbuf.st_mode & S_IFDIR)) {
        ErrMsgPrintf((char *)errmsg, "path '%s' is not a directory", dir);
        return RAFT_INVALID;
    }
    if (!(req.statbuf.st_mode & S_IWRITE)) {
        ErrMsgPrintf((char *)errmsg, "directory '%s' is not writable", dir);
        return RAFT_INVALID;
    }
    return 0;
}
/* Open the given directory and fsync it, persisting directory entries. */
int UvFsSyncDir(const char *dir, char *errmsg)
{
    uv_file fd;
    int rv = UvOsOpen(dir, UV_FS_O_RDONLY | UV_FS_O_DIRECTORY, 0, &fd);
    if (rv != 0) {
        UvOsErrMsg(errmsg, "open directory", rv);
        return RAFT_IOERR;
    }
    rv = UvOsFsync(fd);
    UvOsClose(fd); /* Close regardless of the fsync outcome. */
    if (rv == 0) {
        return 0;
    }
    UvOsErrMsg(errmsg, "fsync directory", rv);
    return RAFT_IOERR;
}
/* Check whether the given file in dir exists, storing the answer in
 * *exists. A missing file is not an error. */
int UvFsFileExists(const char *dir,
                   const char *filename,
                   bool *exists,
                   char *errmsg)
{
    uv_stat_t sb;
    char path[UV__PATH_SZ];
    int rv = UvOsJoin(dir, filename, path);
    if (rv != 0) {
        return RAFT_INVALID;
    }
    rv = UvOsStat(path, &sb);
    if (rv == 0) {
        *exists = true;
        return 0;
    }
    if (rv == UV_ENOENT) {
        *exists = false;
        return 0;
    }
    UvOsErrMsg(errmsg, "stat", rv);
    return RAFT_IOERR;
}
/* Get the size of the given file, storing it in *size. */
int UvFsFileSize(const char *dir,
                 const char *filename,
                 off_t *size,
                 char *errmsg)
{
    uv_stat_t sb;
    char path[UV__PATH_SZ];
    int rv = UvOsJoin(dir, filename, path);
    if (rv != 0) {
        return RAFT_INVALID;
    }
    rv = UvOsStat(path, &sb);
    if (rv == 0) {
        *size = (off_t)sb.st_size;
        return 0;
    }
    UvOsErrMsg(errmsg, "stat", rv);
    return RAFT_IOERR;
}
/* Check whether the given file has zero size, storing the answer in *empty. */
int UvFsFileIsEmpty(const char *dir,
                    const char *filename,
                    bool *empty,
                    char *errmsg)
{
    off_t size;
    int rv = UvFsFileSize(dir, filename, &size, errmsg);
    if (rv != 0) {
        return rv;
    }
    *empty = (size == 0);
    return 0;
}
/* Open a file in a directory, mapping failures to RAFT_* error codes. */
static int uvFsOpenFile(const char *dir,
                        const char *filename,
                        int flags,
                        int mode,
                        uv_file *fd,
                        char *errmsg)
{
    char path[UV__PATH_SZ];
    int rv = UvOsJoin(dir, filename, path);
    if (rv != 0) {
        return RAFT_INVALID;
    }
    rv = UvOsOpen(path, flags, mode, fd);
    if (rv == 0) {
        return 0;
    }
    UvOsErrMsg(errmsg, "open", rv);
    return RAFT_IOERR;
}
/* Open the given file in dir for reading.
 *
 * uvFsOpenFile performs (and validates) the dir/filename join itself, so
 * the previous duplicate UvOsJoin into an unused local buffer was dead code
 * and has been dropped — the error mapping is unchanged. */
int UvFsOpenFileForReading(const char *dir,
                           const char *filename,
                           uv_file *fd,
                           char *errmsg)
{
    return uvFsOpenFile(dir, filename, O_RDONLY, 0, fd, errmsg);
}
/* Reserve size bytes for the given fd, mapping an out-of-space condition to
 * RAFT_NOSPACE and any other failure to RAFT_IOERR. */
static int uvFsAllocate(uv_file fd, size_t size, char *errmsg)
{
    int rv = UvOsFallocate(fd, 0, (off_t)size);
    if (rv == 0) {
        return 0;
    }
    if (rv == UV_ENOSPC) {
        ErrMsgPrintf(errmsg, "not enough space to allocate %zu bytes", size);
        return RAFT_NOSPACE;
    }
    UvOsErrMsg(errmsg, "posix_allocate", rv);
    return RAFT_IOERR;
}
/* Create the given file in dir (exclusively: it must not already exist) and
 * reserve size bytes for it via fallocate. On success *fd is open for
 * writing; on failure the file is unlinked and errmsg describes the error. */
int UvFsAllocateFile(const char *dir,
                     const char *filename,
                     size_t size,
                     uv_file *fd,
                     char *errmsg)
{
    char path[UV__PATH_SZ];
    int flags = O_WRONLY | O_CREAT | O_EXCL; /* Common open flags */
    int rv = 0;
    rv = UvOsJoin(dir, filename, path);
    if (rv != 0) {
        return RAFT_INVALID;
    }
    rv = uvFsOpenFile(dir, filename, flags, S_IRUSR | S_IWUSR, fd, errmsg);
    if (rv != 0) {
        goto err;
    }
    /* Allocate the desired size. */
    rv = uvFsAllocate(*fd, size, errmsg);
    if (rv != 0) {
        /* Don't leave a partially allocated file behind: close it first,
         * then remove it. */
        goto err_after_open;
    }
    return 0;
err_after_open:
    UvOsClose(*fd);
    UvOsUnlink(path);
err:
    assert(rv != 0);
    return rv;
}
/* Create an anonymous temporary file in dir (O_TMPFILE: the file has no
 * name until linked into place, e.g. by UvFsFinalizeTempFile), preallocate
 * space for the given buffers, write and fsync them. On success *fd is open
 * for writing; on failure the anonymous file is closed (and thus discarded
 * by the kernel). */
int UvFsCreateTempFile(const char *dir,
                       struct raft_buffer *bufs,
                       unsigned n_bufs,
                       uv_file *fd,
                       char *errmsg)
{
    unsigned i;
    size_t size = 0;
    int rv;
    /* Total number of bytes to preallocate and write. */
    for (i = 0; i < n_bufs; i++) {
        size += bufs[i].len;
    }
    /* O_TMPFILE: the filename argument is unused, only dir matters. */
    rv = uvFsOpenFile(dir, "", O_TMPFILE | O_WRONLY, S_IRUSR | S_IWUSR, fd,
                      errmsg);
    if (rv != 0) {
        goto err;
    }
    rv = uvFsAllocate(*fd, size, errmsg);
    if (rv != 0) {
        goto err_after_open;
    }
    rv = UvOsWrite(*fd, (const uv_buf_t *)bufs, n_bufs, 0);
    if (rv != (int)(size)) {
        /* A negative value is a libuv error; a short positive count is a
         * partial write. Both are I/O errors here. */
        if (rv < 0) {
            UvOsErrMsg(errmsg, "write", rv);
        } else {
            ErrMsgPrintf(errmsg, "short write: only %d bytes written", rv);
        }
        rv = RAFT_IOERR;
        goto err_after_open;
    }
    rv = UvOsFsync(*fd);
    if (rv != 0) {
        UvOsErrMsg(errmsg, "fsync", rv);
        rv = RAFT_IOERR;
        goto err_after_open;
    }
    return 0;
err_after_open:
    UvOsClose(*fd);
err:
    assert(rv != 0);
    return rv;
}
/* Give a name to an anonymous (O_TMPFILE) temporary file, linking it into
 * dir as filename via linkat() on its /proc/self/fd entry, then close the
 * descriptor. The fd is closed on every path, including failures. */
int UvFsFinalizeTempFile(uv_file fd,
                         const char *dir,
                         const char *filename,
                         char *errmsg)
{
    char path[UV__PATH_SZ];
    char procpath[PATH_MAX];
    int rv;
    rv = UvOsJoin(dir, filename, path);
    if (rv != 0) {
        rv = RAFT_INVALID;
        goto err_before_close;
    }
    /* Standard trick for naming an O_TMPFILE file: link its /proc fd entry
     * (AT_SYMLINK_FOLLOW is required for this to work). */
    snprintf(procpath, PATH_MAX, "/proc/self/fd/%d", fd);
    rv = UvOsLinkat(AT_FDCWD, procpath, AT_FDCWD, path, AT_SYMLINK_FOLLOW);
    if (rv != 0) {
        UvOsErrMsg(errmsg, "linkat", rv);
        rv = RAFT_IOERR;
        goto err_before_close;
    }
    rv = UvOsClose(fd);
    if (rv != 0) {
        UvOsErrMsg(errmsg, "close", rv);
        rv = RAFT_IOERR;
        goto err;
    }
    return 0;
err_before_close:
    /* The link failed, but we still own the fd: close it so the anonymous
     * file is discarded. */
    UvOsClose(fd);
err:
    assert(rv != 0);
    return rv;
}
static int uvFsWriteFile(const char *dir,
const char *filename,
int flags,
struct raft_buffer *bufs,
unsigned n_bufs,
char *errmsg)
{
uv_file fd;
int rv;
size_t size;
unsigned i;
size = 0;
for (i = 0; i < n_bufs; i++) {
size += bufs[i].len;
}
rv = uvFsOpenFile(dir, filename, flags, S_IRUSR | S_IWUSR, &fd, errmsg);
if (rv != 0) {
goto err;
}
rv = UvOsWrite(fd, (const uv_buf_t *)bufs, n_bufs, 0);
if (rv != (int)(size)) {
if (rv < 0) {
UvOsErrMsg(errmsg, "write", rv);
} else {
ErrMsgPrintf(errmsg, "short write: %d only bytes written", rv);
}
goto err_after_file_open;
}
rv = UvOsFsync(fd);
if (rv != 0) {
UvOsErrMsg(errmsg, "fsync", rv);
goto err_after_file_open;
}
rv = UvOsClose(fd);
if (rv != 0) {
UvOsErrMsg(errmsg, "close", rv);
goto err;
}
return 0;
err_after_file_open:
UvOsClose(fd);
err:
return rv;
}
int UvFsMakeFile(const char *dir,
const char *filename,
struct raft_buffer *bufs,
unsigned n_bufs,
char *errmsg)
{
int rv;
char tmp_filename[UV__FILENAME_LEN + 1] = {0};
char path[UV__PATH_SZ] = {0};
char tmp_path[UV__PATH_SZ] = {0};
/* Create a temp file with the given content
* TODO as of libuv 1.34.0, use `uv_fs_mkstemp` */
size_t sz = sizeof(tmp_filename);
rv = snprintf(tmp_filename, sz, TMP_FILE_FMT, filename);
if (rv < 0 || rv >= (int)sz) {
return rv;
}
int flags = UV_FS_O_WRONLY | UV_FS_O_CREAT | UV_FS_O_EXCL;
rv = uvFsWriteFile(dir, tmp_filename, flags, bufs, n_bufs, errmsg);
if (rv != 0) {
goto err_after_tmp_create;
}
/* Check if the file exists */
bool exists = false;
rv = UvFsFileExists(dir, filename, &exists, errmsg);
if (rv != 0) {
goto err_after_tmp_create;
}
if (exists) {
rv = -1;
goto err_after_tmp_create;
}
/* Rename the temp file. Remark that there is a race between the existence
* check and the rename, there is no `renameat2` equivalent in libuv.
* However, in the current implementation this should pose no problems.*/
rv = UvOsJoin(dir, tmp_filename, tmp_path);
if (rv != 0) {
return RAFT_INVALID;
}
rv = UvOsJoin(dir, filename, path);
if (rv != 0) {
return RAFT_INVALID;
}
rv = UvOsRename(tmp_path, path);
if (rv != 0) {
UvOsErrMsg(errmsg, "rename", rv);
goto err_after_tmp_create;
}
rv = UvFsSyncDir(dir, errmsg);
if (rv != 0) {
char ignored[RAFT_ERRMSG_BUF_SIZE];
UvFsRemoveFile(dir, filename, ignored);
return rv;
}
return 0;
err_after_tmp_create:
UvFsRemoveFile(dir, tmp_filename, errmsg);
return rv;
}
/* Write buf to dir/filename, creating the file if it does not exist and
 * overwriting it in place otherwise. An overwrite uses fdatasync (only data
 * blocks change); a fresh file needs a full fsync plus a directory sync.
 *
 * Bug fix: errmsg was never populated when the open failed for a reason
 * other than ENOENT, leaving callers with a stale/empty message. */
int UvFsMakeOrOverwriteFile(const char *dir,
                            const char *filename,
                            const struct raft_buffer *buf,
                            char *errmsg)
{
    char path[UV__PATH_SZ];
    int flags = UV_FS_O_WRONLY;
    int mode = 0;
    bool exists = true;
    uv_file fd;
    int rv;
    rv = UvOsJoin(dir, filename, path);
    if (rv != 0) {
        return RAFT_INVALID;
    }
open:
    rv = UvOsOpen(path, flags, mode, &fd);
    if (rv != 0) {
        if (rv == UV_ENOENT && !(flags & UV_FS_O_CREAT)) {
            /* The file doesn't exist yet: retry in create mode. */
            exists = false;
            flags |= UV_FS_O_CREAT;
            mode = S_IRUSR | S_IWUSR;
            goto open;
        }
        UvOsErrMsg(errmsg, "open", rv); /* Report why the open failed. */
        goto err;
    }
    rv = UvOsWrite(fd, (const uv_buf_t *)buf, 1, 0);
    if (rv != (int)(buf->len)) {
        if (rv < 0) {
            UvOsErrMsg(errmsg, "write", rv);
        } else {
            ErrMsgPrintf(errmsg, "short write: %d only bytes written", rv);
        }
        goto err_after_file_open;
    }
    if (exists) {
        /* Overwrite: only file data changed, fdatasync is sufficient. */
        rv = UvOsFdatasync(fd);
        if (rv != 0) {
            UvOsErrMsg(errmsg, "fsync", rv);
            goto err_after_file_open;
        }
    } else {
        rv = UvOsFsync(fd);
        if (rv != 0) {
            UvOsErrMsg(errmsg, "fsync", rv);
            goto err_after_file_open;
        }
    }
    rv = UvOsClose(fd);
    if (rv != 0) {
        UvOsErrMsg(errmsg, "close", rv);
        goto err;
    }
    if (!exists) {
        /* A new directory entry was created: persist it too. */
        rv = UvFsSyncDir(dir, errmsg);
        if (rv != 0) {
            goto err;
        }
    }
    return 0;
err_after_file_open:
    UvOsClose(fd);
err:
    return RAFT_IOERR;
}
/* Read exactly buf->len bytes from fd into buf->base, failing with
 * RAFT_IOERR on a read error or a short read (early EOF). */
int UvFsReadInto(uv_file fd, struct raft_buffer *buf, char *errmsg)
{
    size_t got = 0;
    /* TODO: use uv_fs_read() */
    while (got < buf->len) {
        ssize_t n = read(fd, (char *)buf->base + got, buf->len - got);
        if (n == -1) {
            UvOsErrMsg(errmsg, "read", -errno);
            return RAFT_IOERR;
        }
        /* EOF. Don't think this is reachable, but just make very sure we
         * don't loop forever. */
        if (n == 0) {
            break;
        }
        assert(n > 0);
        got += (size_t)n;
    }
    if (got < buf->len) {
        ErrMsgPrintf(errmsg, "short read: %zu bytes instead of %zu", got,
                     buf->len);
        return RAFT_IOERR;
    }
    return 0;
}
/* Read the whole content of the given file into a newly allocated buffer,
 * stored in *buf. On success the caller owns buf->base and must release it
 * with RaftHeapFree. */
int UvFsReadFile(const char *dir,
                 const char *filename,
                 struct raft_buffer *buf,
                 char *errmsg)
{
    uv_stat_t sb;
    char path[UV__PATH_SZ];
    uv_file fd;
    int rv;
    rv = UvOsJoin(dir, filename, path);
    if (rv != 0) {
        return RAFT_INVALID;
    }
    /* Stat first to learn how big a buffer to allocate. */
    rv = UvOsStat(path, &sb);
    if (rv != 0) {
        UvOsErrMsg(errmsg, "stat", rv);
        rv = RAFT_IOERR;
        goto err;
    }
    rv = uvFsOpenFile(dir, filename, O_RDONLY, 0, &fd, errmsg);
    if (rv != 0) {
        goto err;
    }
    buf->len = (size_t)sb.st_size;
    buf->base = RaftHeapMalloc(buf->len);
    if (buf->base == NULL) {
        ErrMsgOom(errmsg);
        rv = RAFT_NOMEM;
        goto err_after_open;
    }
    rv = UvFsReadInto(fd, buf, errmsg);
    if (rv != 0) {
        goto err_after_buf_alloc;
    }
    UvOsClose(fd);
    return 0;
err_after_buf_alloc:
    RaftHeapFree(buf->base);
err_after_open:
    UvOsClose(fd);
err:
    return rv;
}
/* Read exactly buf->len bytes of the given file into the caller-provided
 * buf->base.
 *
 * uvFsOpenFile performs (and validates) the dir/filename join itself, so
 * the previous duplicate UvOsJoin into an unused local buffer was dead code
 * and has been dropped — the error mapping is unchanged. */
int UvFsReadFileInto(const char *dir,
                     const char *filename,
                     struct raft_buffer *buf,
                     char *errmsg)
{
    uv_file fd;
    int rv;
    rv = uvFsOpenFile(dir, filename, O_RDONLY, 0, &fd, errmsg);
    if (rv != 0) {
        goto err;
    }
    rv = UvFsReadInto(fd, buf, errmsg);
    if (rv != 0) {
        goto err_after_open;
    }
    UvOsClose(fd);
    return 0;
err_after_open:
    UvOsClose(fd);
err:
    return rv;
}
/* Remove the given file from dir. */
int UvFsRemoveFile(const char *dir, const char *filename, char *errmsg)
{
    char path[UV__PATH_SZ];
    int rv = UvOsJoin(dir, filename, path);
    if (rv != 0) {
        return RAFT_INVALID;
    }
    rv = UvOsUnlink(path);
    if (rv == 0) {
        return 0;
    }
    UvOsErrMsg(errmsg, "unlink", rv);
    return RAFT_IOERR;
}
int UvFsRenameFile(const char *dir,
const char *filename1,
const char *filename2,
char *errmsg)
{
char path1[UV__PATH_SZ];
char path2[UV__PATH_SZ];
int rv;
rv = UvOsJoin(dir, filename1, path1);
if (rv != 0) {
return RAFT_INVALID;
}
rv = UvOsJoin(dir, filename2, path2);
if (rv != 0) {
return RAFT_INVALID;
}
rv = UvOsRename(path1, path2);
if (rv != 0) {
UvOsErrMsg(errmsg, "rename", rv);
return rv;
}
return 0;
}
/* Truncate filename1 in dir to the given size, fsync it, then rename it to
 * filename2. Used to turn a preallocated open segment into its final closed
 * form. Returns RAFT_INVALID for bad paths and RAFT_IOERR for any
 * truncate/sync/rename failure. */
int UvFsTruncateAndRenameFile(const char *dir,
                              size_t size,
                              const char *filename1,
                              const char *filename2,
                              char *errmsg)
{
    char path1[UV__PATH_SZ];
    char path2[UV__PATH_SZ];
    uv_file fd;
    int rv;
    rv = UvOsJoin(dir, filename1, path1);
    if (rv != 0) {
        return RAFT_INVALID;
    }
    rv = UvOsJoin(dir, filename2, path2);
    if (rv != 0) {
        return RAFT_INVALID;
    }
    /* Truncate and rename. */
    rv = UvOsOpen(path1, UV_FS_O_RDWR, 0, &fd);
    if (rv != 0) {
        UvOsErrMsg(errmsg, "open", rv);
        goto err;
    }
    rv = UvOsTruncate(fd, (off_t)size);
    if (rv != 0) {
        UvOsErrMsg(errmsg, "truncate", rv);
        goto err_after_open;
    }
    /* Persist the truncation before exposing the file under its new name. */
    rv = UvOsFsync(fd);
    if (rv != 0) {
        UvOsErrMsg(errmsg, "fsync", rv);
        goto err_after_open;
    }
    UvOsClose(fd);
    rv = UvOsRename(path1, path2);
    if (rv != 0) {
        UvOsErrMsg(errmsg, "rename", rv);
        goto err;
    }
    return 0;
err_after_open:
    UvOsClose(fd);
err:
    return RAFT_IOERR;
}
/* Check if direct I/O is possible on the given fd.
 *
 * On success *size is the largest buffer size (4096 down to 512, halving)
 * for which an O_DIRECT write succeeded, or 0 if direct I/O is unavailable
 * (e.g. on tmpfs/ZFS/UBIFS). Returns RAFT_IOERR or RAFT_NOMEM on genuine
 * failures. The fd is expected to refer to a preallocated file, so probe
 * writes should not fail for lack of disk space. */
static int probeDirectIO(int fd, size_t *size, char *errmsg)
{
    struct statfs fs_info; /* To check the file system type. */
    void *buf;             /* Buffer to use for the probe write. */
    int rv;
    rv = UvOsSetDirectIo(fd);
    if (rv != 0) {
        if (rv != UV_EINVAL) {
            /* UNTESTED: the parameters are ok, so this should never happen. */
            UvOsErrMsg(errmsg, "fnctl", rv);
            return RAFT_IOERR;
        }
        /* O_DIRECT rejected: decide by file system type whether that's a
         * known-unsupported file system or an unexpected one. */
        rv = fstatfs(fd, &fs_info);
        if (rv == -1) {
            /* UNTESTED: in practice ENOMEM should be the only failure mode */
            UvOsErrMsg(errmsg, "fstatfs", -errno);
            return RAFT_IOERR;
        }
        switch (fs_info.f_type) {
            case 0x01021994: /* TMPFS_MAGIC */
            case 0x2fc12fc1: /* ZFS magic */
            case 0x24051905: /* UBIFS Support magic */
                *size = 0;
                return 0;
            default:
                /* UNTESTED: this is an unsupported file system. */
#if defined(__s390x__)
                ErrMsgPrintf(errmsg, "unsupported file system: %ux",
                             fs_info.f_type);
#else
                ErrMsgPrintf(errmsg, "unsupported file system: %zx",
                             fs_info.f_type);
#endif
                return RAFT_IOERR;
        }
    }
    /* Try to perform direct I/O, using various buffer size. */
    *size = 4096;
    while (*size >= 512) {
        /* O_DIRECT requires an aligned buffer. */
        buf = raft_aligned_alloc(*size, *size);
        if (buf == NULL) {
            ErrMsgOom(errmsg);
            return RAFT_NOMEM;
        }
        memset(buf, 0, *size);
        rv = (int)write(fd, buf, *size);
        raft_aligned_free(*size, buf);
        if (rv > 0) {
            /* Since we fallocate'ed the file, we should never fail because of
             * lack of disk space, and all bytes should have been written. */
            assert(rv == (int)(*size));
            return 0;
        }
        assert(rv == -1);
        if (errno != EIO && errno != EOPNOTSUPP) {
            /* UNTESTED: this should basically fail only because of disk errors,
             * since we allocated the file with posix_fallocate. */
            /* FIXME: this is a workaround because shiftfs doesn't return EINVAL
             * in the fnctl call above, for example when the underlying fs is
             * ZFS. */
            if (errno == EINVAL && *size == 4096) {
                *size = 0;
                return 0;
            }
            UvOsErrMsg(errmsg, "write", -errno);
            return RAFT_IOERR;
        }
        *size = *size / 2;
    }
    *size = 0;
    return 0;
}
/* Check if fully non-blocking async I/O is possible on the given fd.
 *
 * Submit a single direct KAIO write with RWF_NOWAIT|RWF_DSYNC. On success set
 * *ok to true if the write went through asynchronously, and to false if the
 * file system does not support it. Return non-zero only on unexpected system
 * errors. */
static int probeAsyncIO(int fd, size_t size, bool *ok, char *errmsg)
{
    void *buf;                  /* Buffer to use for the probe write */
    aio_context_t ctx = 0;      /* KAIO context handle */
    struct iocb iocb;           /* KAIO request object */
    struct iocb *iocbs = &iocb; /* Because the io_submit() API sucks */
    struct io_event event;      /* KAIO response object */
    int n_events;
    int rv;
    /* Setup the KAIO context handle */
    rv = UvOsIoSetup(1, &ctx);
    if (rv != 0) {
        UvOsErrMsg(errmsg, "io_setup", rv);
        /* UNTESTED: in practice this should fail only with ENOMEM */
        return RAFT_IOERR;
    }
    /* Allocate the write buffer */
    buf = raft_aligned_alloc(size, size);
    if (buf == NULL) {
        /* Release the KAIO context: it used to be leaked on this path. */
        UvOsIoDestroy(ctx);
        ErrMsgOom(errmsg);
        return RAFT_NOMEM;
    }
    memset(buf, 0, size);
    /* Prepare the KAIO request object */
    memset(&iocb, 0, sizeof iocb);
    iocb.aio_lio_opcode = IOCB_CMD_PWRITE;
    iocb.aio_buf = (uintptr_t)buf;
    iocb.aio_nbytes = size;
    iocb.aio_offset = 0;
    iocb.aio_fildes = (uint32_t)fd;
    iocb.aio_reqprio = 0;
    iocb.aio_rw_flags |= RWF_NOWAIT | RWF_DSYNC;
    /* Submit the KAIO request */
    rv = UvOsIoSubmit(ctx, 1, &iocbs);
    if (rv != 0) {
        /* UNTESTED: in practice this should fail only with ENOMEM */
        raft_aligned_free(size, buf);
        UvOsIoDestroy(ctx);
        /* On ZFS 0.8 this is not properly supported yet. Also, when running on
         * older kernels a binary compiled on a kernel with RWF_NOWAIT support,
         * we might get EINVAL.
         *
         * Check the negated-errno return value rather than errno itself: the
         * free() and io_destroy() calls above may clobber errno. */
        if (rv == -EOPNOTSUPP || rv == -EINVAL) {
            *ok = false;
            return 0;
        }
        UvOsErrMsg(errmsg, "io_submit", rv);
        return RAFT_IOERR;
    }
    /* Fetch the response: will block until done. */
    n_events = UvOsIoGetevents(ctx, 1, 1, &event, NULL);
    assert(n_events == 1);
    if (n_events != 1) {
        /* UNTESTED */
        UvOsErrMsg(errmsg, "UvOsIoGetevents", n_events);
        /* Release the buffer and the context: both used to leak here. */
        raft_aligned_free(size, buf);
        UvOsIoDestroy(ctx);
        return RAFT_IOERR;
    }
    /* Release the write buffer. */
    raft_aligned_free(size, buf);
    /* Release the KAIO context handle. */
    rv = UvOsIoDestroy(ctx);
    if (rv != 0) {
        UvOsErrMsg(errmsg, "io_destroy", rv);
        return RAFT_IOERR;
    }
    if (event.res > 0) {
        assert(event.res == (int)size);
        *ok = true;
    } else if (event.res == -EAGAIN) {
        /* UNTESTED: starting from around kernel version 6, xfs has started
         * to occasionally fail with EAGAIN, presumably because the write
         * would block in some way. We still want to try submitting writes
         * asynchronously in that case, and we'll deal with retries in the
         * writer. */
        *ok = true;
    } else {
        /* UNTESTED: this should basically fail only because of disk errors,
         * since we allocated the file with posix_fallocate and the block
         * size is supposed to be correct. */
        *ok = false;
    }
    return 0;
}
#define UV__FS_PROBE_FILE ".probe"
#define UV__FS_PROBE_FILE_SIZE 4096
/* Probe the file system backing @dir: set *direct to the direct-I/O block size
 * (0 when direct I/O is unsupported) and *async to whether fully asynchronous
 * KAIO writes are possible. */
int UvFsProbeCapabilities(const char *dir,
                          size_t *direct,
                          bool *async,
                          char *errmsg)
{
    int fd; /* File descriptor of the probe file */
    int rv;
    char ignored[RAFT_ERRMSG_BUF_SIZE];
    /* Create a temporary probe file (removing any stale one first,
     * best-effort). */
    UvFsRemoveFile(dir, UV__FS_PROBE_FILE, ignored);
    rv = UvFsAllocateFile(dir, UV__FS_PROBE_FILE, UV__FS_PROBE_FILE_SIZE, &fd,
                          errmsg);
    if (rv != 0) {
        ErrMsgWrapf(errmsg, "create I/O capabilities probe file");
        goto err;
    }
    /* Unlink right away: the open descriptor keeps the file alive while the
     * probes below run. */
    UvFsRemoveFile(dir, UV__FS_PROBE_FILE, ignored);
    /* Check if we can use direct I/O. */
    rv = probeDirectIO(fd, direct, errmsg);
    if (rv != 0) {
        ErrMsgWrapf(errmsg, "probe Direct I/O");
        goto err_after_file_open;
    }
    /* If direct I/O is not possible, we can't perform fully asynchronous
     * I/O, because io_submit might potentially block. */
    if (*direct == 0) {
        *async = false;
        goto out;
    }
    rv = probeAsyncIO(fd, *direct, async, errmsg);
    if (rv != 0) {
        ErrMsgWrapf(errmsg, "probe Async I/O");
        goto err_after_file_open;
    }
out:
    close(fd);
    return 0;
err_after_file_open:
    close(fd);
err:
    return rv;
}
raft-0.22.1/src/uv_fs.h 0000664 0000000 0000000 00000011227 14601504142 0014612 0 ustar 00root root 0000000 0000000 /* File system related utilities. */
#ifndef UV_FS_H_
#define UV_FS_H_
#include
#include "../include/raft.h"
#include "err.h"
#define TMP_FILE_PREFIX "tmp-"
#define TMP_FILE_FMT TMP_FILE_PREFIX "%s"
/* Check that the given directory can be used. */
int UvFsCheckDir(const char *dir, char *errmsg);
/* Sync the given directory by calling fsync(). */
int UvFsSyncDir(const char *dir, char *errmsg);
/* Check whether a the given file exists. */
int UvFsFileExists(const char *dir,
const char *filename,
bool *exists,
char *errmsg);
/* Get the size of the given file. */
int UvFsFileSize(const char *dir,
const char *filename,
off_t *size,
char *errmsg);
/* Check whether the given file in the given directory is empty. */
int UvFsFileIsEmpty(const char *dir,
const char *filename,
bool *empty,
char *errmsg);
/* Create the given file in the given directory and allocate the given size to
* it, returning its file descriptor. The file must not exist yet. */
int UvFsAllocateFile(const char *dir,
const char *filename,
size_t size,
uv_file *fd,
char *errmsg);
/* Allocate and write an invisible temporary file of the given size within the
* given directory, returning its file descriptor. */
int UvFsCreateTempFile(const char *dir,
struct raft_buffer *bufs,
unsigned n_bufs,
uv_file *fd,
char *errmsg);
/* Finalize an invisible tempfile renaming it to the given name. */
int UvFsFinalizeTempFile(uv_file fd,
const char *dir,
const char *filename,
char *errmsg);
/* Create a file and write the given content into it. */
int UvFsMakeFile(const char *dir,
const char *filename,
struct raft_buffer *bufs,
unsigned n_bufs,
char *errmsg);
/* Create or overwrite a file.
*
* If the file does not exists yet, it gets created, the given content written
* to it, and then fully persisted to disk by fsync()'ing the file and the
* dir.
*
* If the file already exists, it gets overwritten. The assumption is that the
* file size will stay the same and its content will change, so only fdatasync()
* will be used */
int UvFsMakeOrOverwriteFile(const char *dir,
const char *filename,
const struct raft_buffer *buf,
char *errmsg);
/* Open a file for reading. */
int UvFsOpenFileForReading(const char *dir,
const char *filename,
uv_file *fd,
char *errmsg);
/* Read exactly buf->len bytes from the given file descriptor into
buf->base. Fail if less than buf->len bytes are read. */
int UvFsReadInto(uv_file fd, struct raft_buffer *buf, char *errmsg);
/* Read all the content of the given file. */
int UvFsReadFile(const char *dir,
const char *filename,
struct raft_buffer *buf,
char *errmsg);
/* Read exactly buf->len bytes from the given file into buf->base. Fail if less
* than buf->len bytes are read. */
int UvFsReadFileInto(const char *dir,
const char *filename,
struct raft_buffer *buf,
char *errmsg);
/* Synchronously remove a file, calling the unlink() system call. */
int UvFsRemoveFile(const char *dir, const char *filename, char *errmsg);
/* Synchronously truncate a file to the given size and then rename it. */
int UvFsTruncateAndRenameFile(const char *dir,
size_t size,
const char *filename1,
const char *filename2,
char *errmsg);
/* Synchronously rename a file. */
int UvFsRenameFile(const char *dir,
const char *filename1,
const char *filename2,
char *errmsg);
/* Return information about the I/O capabilities of the underlying file
* system.
*
* The @direct parameter will be set to zero if direct I/O is not possible, or
* to the block size to use for direct I/O otherwise.
*
* The @async parameter will be set to true if fully asynchronous I/O is
* possible using the KAIO API. */
int UvFsProbeCapabilities(const char *dir,
size_t *direct,
bool *async,
char *errmsg);
#endif /* UV_FS_H_ */
raft-0.22.1/src/uv_ip.c 0000664 0000000 0000000 00000004173 14601504142 0014607 0 ustar 00root root 0000000 0000000 #include
#include
#include
#include "../include/raft.h"
#include "uv_ip.h"
/* Copy characters from @source into @target until @separator or the string
 * terminator is found, NUL-terminating @target.
 *
 * Return a pointer into @source at the character that stopped the copy, or
 * NULL if more than @target_size characters would be needed. */
static const char *strCpyUntil(char *target,
                               const char *source,
                               size_t target_size,
                               char separator)
{
    size_t pos = 0;
    while (pos < target_size) {
        char c = source[pos];
        if (c == '\0' || c == separator) {
            target[pos] = '\0';
            return &source[pos];
        }
        target[pos] = c;
        pos++;
    }
    /* Ran out of room before reaching the separator or the terminator. */
    return NULL;
}
/* Split @address of the form "host:port" into @host and @service. Either
 * output may be NULL to skip it. When no port is present (or nothing follows
 * the colon), the service defaults to "8080". Returns RAFT_NAMETOOLONG when a
 * component does not fit its buffer. */
int uvIpAddrSplit(const char *address,
                  char *host,
                  size_t host_size,
                  char *service,
                  size_t service_size)
{
    const char sep = ':';
    const char *rest = NULL;
    if (host != NULL) {
        rest = strCpyUntil(host, address, host_size, sep);
        if (rest == NULL) {
            return RAFT_NAMETOOLONG;
        }
    }
    if (service != NULL) {
        if (rest == NULL) {
            rest = strchr(address, sep);
        }
        /* No colon at all, colon at the very end, or empty remainder: fall
         * back to the default port. */
        if (rest == NULL || rest[0] == '\0' || rest[1] == '\0') {
            rest = "8080";
        } else {
            rest++; /* Skip past the colon. */
        }
        if (strCpyUntil(service, rest, service_size, '\0') == NULL) {
            return RAFT_NAMETOOLONG;
        }
    }
    return 0;
}
/* Synchronously resolve a bind address of the form "host:port" into a list of
 * socket addresses suitable for bind(). On success the caller owns *ai_result
 * and must release it with freeaddrinfo(). */
int uvIpResolveBindAddresses(const char *address, struct addrinfo **ai_result)
{
    /* AI_PASSIVE: a NULL host yields a wildcard address for listening.
     * AI_NUMERICSERV: the service is always a numeric port string. */
    static struct addrinfo hints = {.ai_flags = AI_PASSIVE | AI_NUMERICSERV,
                                    .ai_family = AF_INET,
                                    .ai_socktype = SOCK_STREAM,
                                    .ai_protocol = 0};
    char hostname[NI_MAXHOST];
    char service[NI_MAXSERV];
    int rv;
    rv = uvIpAddrSplit(address, hostname, sizeof(hostname), service,
                       sizeof(service));
    if (rv != 0) {
        return rv;
    }
    /* An empty hostname means "bind to all interfaces". */
    if (hostname[0]) {
        rv = getaddrinfo(hostname, service, &hints, ai_result);
    } else {
        rv = getaddrinfo(NULL, service, &hints, ai_result);
    }
    if (rv != 0) {
        /* NOTE(review): the specific getaddrinfo() error code is discarded
         * here and collapsed into RAFT_IOERR. */
        return RAFT_IOERR;
    }
    return 0;
}
raft-0.22.1/src/uv_ip.h 0000664 0000000 0000000 00000000737 14601504142 0014616 0 ustar 00root root 0000000 0000000 /* IP-related utils. */
#ifndef UV_IP_H_
#define UV_IP_H_
#include
/* Split @address into @host and @service. */
int uvIpAddrSplit(const char *address,
char *host,
size_t host_size,
char *service,
size_t service_size);
struct addrinfo;
/* Synchronous resolve hostname to IP address */
int uvIpResolveBindAddresses(const char *address, struct addrinfo **ai_result);
#endif /* UV_IP_H */
raft-0.22.1/src/uv_list.c 0000664 0000000 0000000 00000005672 14601504142 0015157 0 ustar 00root root 0000000 0000000 #include
#include "assert.h"
#include "uv.h"
#define tracef(...) Tracef(uv->tracer, __VA_ARGS__)
static const char *uvListIgnored[] = {".", "..", "metadata1", "metadata2",
NULL};
/* Return true if the given filename should be ignored when scanning the data
 * directory: dot entries, the metadata files, or over-long names. */
static bool uvListShouldIgnore(const char *filename)
{
    const char **ignored;
    /* A name longer than the filename limit can never be a valid segment or
     * snapshot file. */
    if (strlen(filename) >= UV__FILENAME_LEN) {
        return true;
    }
    for (ignored = uvListIgnored; *ignored != NULL; ignored++) {
        if (strcmp(filename, *ignored) == 0) {
            return true;
        }
    }
    return false;
}
/* Scan the data directory of @uv, filling @snapshots with the snapshots found
 * and @segments with the segment files found. Both output arrays are
 * heap-allocated, owned by the caller and sorted before returning. */
int UvList(struct uv *uv,
           struct uvSnapshotInfo *snapshots[],
           size_t *n_snapshots,
           struct uvSegmentInfo *segments[],
           size_t *n_segments,
           char *errmsg)
{
    struct uv_fs_s req;
    struct uv_dirent_s entry;
    int n;
    int i;
    int rv;
    int rv2;
    n = uv_fs_scandir(NULL, &req, uv->dir, 0, NULL);
    if (n < 0) {
        ErrMsgPrintf(errmsg, "scan data directory: %s", uv_strerror(n));
        return RAFT_IOERR;
    }
    *snapshots = NULL;
    *n_snapshots = 0;
    *segments = NULL;
    *n_segments = 0;
    rv = 0;
    for (i = 0; i < n; i++) {
        const char *filename;
        bool appended;
        rv = uv_fs_scandir_next(&req, &entry);
        assert(rv == 0); /* Can't fail in libuv */
        filename = entry.name;
        /* If an error occurred while processing a preceding entry or if we
         * know that this is not a segment filename, just free it and skip to
         * the next one.
         *
         * NOTE(review): rv was just overwritten by uv_fs_scandir_next() and
         * asserted to be zero, so an error from a preceding iteration's
         * Append call cannot actually reach this check — confirm whether the
         * skip-on-error intent still holds. */
        if (rv != 0 || uvListShouldIgnore(filename)) {
            if (rv == 0) {
                tracef("ignore %s", filename);
            }
            continue;
        }
        /* Append to the snapshot list if it's a snapshot metadata filename and
         * a valid associated snapshot file exists. */
        rv = UvSnapshotInfoAppendIfMatch(uv, filename, snapshots, n_snapshots,
                                         &appended);
        if (appended || rv != 0) {
            if (rv == 0) {
                tracef("snapshot %s", filename);
            }
            continue;
        }
        /* Append to the segment list if it's a segment filename */
        rv = uvSegmentInfoAppendIfMatch(entry.name, segments, n_segments,
                                        &appended);
        if (appended || rv != 0) {
            if (rv == 0) {
                tracef("segment %s", filename);
            }
            continue;
        }
        tracef("ignore %s", filename);
    }
    /* The loop above must have consumed every directory entry. */
    rv2 = uv_fs_scandir_next(&req, &entry);
    assert(rv2 == UV_EOF);
    /* On error release the segments array; the snapshots array is not freed
     * here. */
    if (rv != 0 && *segments != NULL) {
        raft_free(*segments);
    }
    if (*snapshots != NULL) {
        UvSnapshotSort(*snapshots, *n_snapshots);
    }
    if (*segments != NULL) {
        uvSegmentSort(*segments, *n_segments);
    }
    return rv;
}
#undef tracef
raft-0.22.1/src/uv_metadata.c 0000664 0000000 0000000 00000013645 14601504142 0015763 0 ustar 00root root 0000000 0000000 #include "assert.h"
#include "byte.h"
#include "uv.h"
#include "uv_encoding.h"
/* We have metadata1 and metadata2. */
#define METADATA_FILENAME_PREFIX "metadata"
#define METADATA_FILENAME_SIZE (sizeof(METADATA_FILENAME_PREFIX) + 2)
/* Format, version, term, vote */
#define METADATA_CONTENT_SIZE (8 * 4)
/* Encode the content of a metadata file.
 *
 * Layout: four 64-bit words written with bytePut64 — disk format marker,
 * version, term and voted_for. @buf must hold at least
 * METADATA_CONTENT_SIZE bytes. */
static void uvMetadataEncode(const struct uvMetadata *metadata, void *buf)
{
    uint8_t *cursor = buf;
    bytePut64(&cursor, UV__DISK_FORMAT);
    bytePut64(&cursor, metadata->version);
    bytePut64(&cursor, metadata->term);
    bytePut64(&cursor, metadata->voted_for);
}
/* Decode the content of a metadata file into @metadata.
 *
 * Return RAFT_MALFORMED when the on-disk format word is unknown and
 * RAFT_CORRUPT when the stored version is zero. */
static int uvMetadataDecode(const void *buf,
                            struct uvMetadata *metadata,
                            char *errmsg)
{
    const uint8_t *p = buf;
    uint64_t format = byteGet64(&p);
    if (format != UV__DISK_FORMAT) {
        ErrMsgPrintf(errmsg, "bad format version %ju", format);
        return RAFT_MALFORMED;
    }
    metadata->version = byteGet64(&p);
    metadata->term = byteGet64(&p);
    metadata->voted_for = byteGet64(&p);
    /* A valid store never writes a version of zero. */
    if (metadata->version == 0) {
        ErrMsgPrintf(errmsg, "version is set to zero");
        return RAFT_CORRUPT;
    }
    return 0;
}
/* Render the filename of the metadata file with index @n (1 or 2) into
 * @filename, which must hold at least METADATA_FILENAME_SIZE bytes. */
static void uvMetadataFilename(const unsigned short n, char *filename)
{
    sprintf(filename, METADATA_FILENAME_PREFIX "%d", n);
}
/* Read the n'th metadata file (with n equal to 1 or 2) and decode the content
 * of the file, populating the given metadata buffer accordingly. A missing
 * (or partially-written) file is not an error: @metadata is left zeroed. */
static int uvMetadataLoadN(const char *dir,
                           const unsigned short n,
                           struct uvMetadata *metadata,
                           char *errmsg)
{
    char filename[METADATA_FILENAME_SIZE]; /* Filename of the metadata file */
    uint8_t content[METADATA_CONTENT_SIZE]; /* Content of metadata file */
    off_t size;
    struct raft_buffer buf;
    bool exists;
    int rv;
    assert(n == 1 || n == 2);
    /* Render the metadata path */
    uvMetadataFilename(n, filename);
    rv = UvFsFileExists(dir, filename, &exists, errmsg);
    if (rv != 0) {
        ErrMsgWrapf(errmsg, "check if %s exists", filename);
        return rv;
    }
    /* Zero the output first: a missing file reads back as version 0. */
    memset(metadata, 0, sizeof *metadata);
    /* If the file does not exist, just return. */
    if (!exists) {
        return 0;
    }
    /* If the file exists but has less bytes than expected assume that the
     * server crashed while writing this metadata file, and pretend it has not
     * been written at all. If it has more bytes than expected, return an
     * error. */
    rv = UvFsFileSize(dir, filename, &size, errmsg);
    if (rv != 0) {
        ErrMsgWrapf(errmsg, "check size of %s", filename);
        return rv;
    }
    if (size != sizeof content) {
        if ((size_t)size < sizeof content) {
            /* Short file: treat as a torn write and discard it. */
            rv = UvFsRemoveFile(dir, filename, errmsg);
            if (rv != 0) {
                return rv;
            }
            return 0;
        }
        ErrMsgPrintf(errmsg, "%s has size %jd instead of %zu", filename,
                     (intmax_t)size, sizeof content);
        return RAFT_CORRUPT;
    }
    /* Read the content of the metadata file. */
    buf.base = content;
    buf.len = sizeof content;
    rv = UvFsReadFileInto(dir, filename, &buf, errmsg);
    if (rv != 0) {
        ErrMsgWrapf(errmsg, "read content of %s", filename);
        return rv;
    };
    /* Decode the content of the metadata file. */
    rv = uvMetadataDecode(content, metadata, errmsg);
    if (rv != 0) {
        ErrMsgWrapf(errmsg, "decode content of %s", filename);
        return rv;
    }
    return 0;
}
int uvMetadataLoad(const char *dir, struct uvMetadata *metadata, char *errmsg)
{
struct uvMetadata metadata1;
struct uvMetadata metadata2;
int rv;
/* Read the two metadata files (if available). */
rv = uvMetadataLoadN(dir, 1, &metadata1, errmsg);
if (rv != 0) {
return rv;
}
rv = uvMetadataLoadN(dir, 2, &metadata2, errmsg);
if (rv != 0) {
return rv;
}
/* Check the versions. */
if (metadata1.version == 0 && metadata2.version == 0) {
/* Neither metadata file exists: have a brand new server. */
metadata->version = 0;
metadata->term = 0;
metadata->voted_for = 0;
} else if (metadata1.version == metadata2.version) {
/* The two metadata files can't have the same version. */
ErrMsgPrintf(errmsg, "metadata1 and metadata2 are both at version %llu",
metadata1.version);
return RAFT_CORRUPT;
} else {
/* Pick the metadata with the grater version. */
if (metadata1.version > metadata2.version) {
*metadata = metadata1;
} else {
*metadata = metadata2;
}
}
return 0;
}
/* Return the metadata file index associated with the given version: odd
 * versions go to metadata1, even versions to metadata2. */
static unsigned short uvMetadataFileIndex(unsigned long long version)
{
    return (version & 1) ? 1 : 2;
}
/* Persist @metadata to disk, writing it into the slot file selected by its
 * version (odd -> metadata1, even -> metadata2). On failure the error is
 * recorded in uv->io->errmsg. */
int uvMetadataStore(struct uv *uv, const struct uvMetadata *metadata)
{
    char filename[METADATA_FILENAME_SIZE]; /* Target metadata filename */
    uint8_t content[METADATA_CONTENT_SIZE]; /* Serialized metadata */
    struct raft_buffer buf;
    int rv;
    assert(metadata->version > 0);
    /* Serialize the metadata and render the slot filename for this version. */
    uvMetadataEncode(metadata, content);
    uvMetadataFilename(uvMetadataFileIndex(metadata->version), filename);
    /* Write the metadata file, creating it if it does not exist. */
    buf.base = content;
    buf.len = sizeof content;
    rv = UvFsMakeOrOverwriteFile(uv->dir, filename, &buf, uv->io->errmsg);
    if (rv != 0) {
        ErrMsgWrapf(uv->io->errmsg, "persist %s", filename);
    }
    return rv;
}
raft-0.22.1/src/uv_os.c 0000664 0000000 0000000 00000012207 14601504142 0014615 0 ustar 00root root 0000000 0000000 #include "uv_os.h"
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "assert.h"
#include "err.h"
#include "syscall.h"
/* Default permissions when creating a directory. */
#define DEFAULT_DIR_PERM 0700
/* Portable open(): on success store the new descriptor in @fd and return 0,
 * otherwise return the negative libuv error code. */
int UvOsOpen(const char *path, int flags, int mode, uv_file *fd)
{
    struct uv_fs_s req;
    int rv;
    rv = uv_fs_open(NULL, &req, path, flags, mode, NULL);
    if (rv < 0) {
        return rv;
    }
    /* On success uv_fs_open() returns the new file descriptor. */
    *fd = rv;
    return 0;
}
/* Portable close(): return 0 or a negative libuv error code. */
int UvOsClose(uv_file fd)
{
    struct uv_fs_s req;
    return uv_fs_close(NULL, &req, fd, NULL);
}
/* Emulate fallocate(). Mostly taken from glibc's implementation. Returns 0 on
 * success or a positive errno value on failure. */
static int uvOsFallocateEmulation(int fd, off_t offset, off_t len)
{
    ssize_t increment;
    struct statfs f;
    int rv;
    rv = fstatfs(fd, &f);
    if (rv != 0) {
        return errno;
    }
    /* Pick a write stride: one byte per file system block, capped at 4096. */
    if (f.f_bsize == 0) {
        increment = 512;
    } else if (f.f_bsize < 4096) {
        increment = f.f_bsize;
    } else {
        increment = 4096;
    }
    /* Touch the last byte of every block in the range so the file system
     * allocates it. */
    for (offset += (len - 1) % increment; len > 0; offset += increment) {
        len -= increment;
        rv = (int)pwrite(fd, "", 1, offset);
        if (rv != 1)
            return errno;
    }
    return 0;
}
/* Allocate @len bytes at @offset of @fd, falling back to a write-based
 * emulation when the libc/file system does not support fallocate. Returns 0
 * or a negative errno value. */
int UvOsFallocate(uv_file fd, off_t offset, off_t len)
{
    int rv;
    rv = posix_fallocate(fd, offset, len);
    if (rv != 0) {
        /* From the manual page:
         *
         * posix_fallocate() returns zero on success, or an error number on
         * failure. Note that errno is not set.
         */
        if (rv != EOPNOTSUPP) {
            return -rv;
        }
        /* This might be a libc implementation (e.g. musl) that doesn't
         * implement a transparent fallback if fallocate() is not supported
         * by the underlying file system. */
        rv = uvOsFallocateEmulation(fd, offset, len);
        if (rv != 0) {
            /* NOTE(review): the emulation's actual errno is discarded and
             * -EOPNOTSUPP reported instead — confirm this is intended. */
            return -EOPNOTSUPP;
        }
    }
    return 0;
}
/* Portable ftruncate(). */
int UvOsTruncate(uv_file fd, off_t offset)
{
    struct uv_fs_s req;
    return uv_fs_ftruncate(NULL, &req, fd, offset, NULL);
}
/* Portable fsync(). */
int UvOsFsync(uv_file fd)
{
    struct uv_fs_s req;
    return uv_fs_fsync(NULL, &req, fd, NULL);
}
/* Portable fdatasync(). */
int UvOsFdatasync(uv_file fd)
{
    struct uv_fs_s req;
    return uv_fs_fdatasync(NULL, &req, fd, NULL);
}
/* Portable stat(): on success copy the stat buffer into @sb. */
int UvOsStat(const char *path, uv_stat_t *sb)
{
    struct uv_fs_s req;
    int rv;
    rv = uv_fs_stat(NULL, &req, path, NULL);
    if (rv != 0) {
        return rv;
    }
    memcpy(sb, &req.statbuf, sizeof *sb);
    return 0;
}
/* Portable positional, vectored write(). */
int UvOsWrite(uv_file fd,
              const uv_buf_t bufs[],
              unsigned int nbufs,
              int64_t offset)
{
    struct uv_fs_s req;
    return uv_fs_write(NULL, &req, fd, bufs, nbufs, offset, NULL);
}
/* Portable unlink(). */
int UvOsUnlink(const char *path)
{
    struct uv_fs_s req;
    return uv_fs_unlink(NULL, &req, path, NULL);
}
/* Portable rename(). */
int UvOsRename(const char *path1, const char *path2)
{
    struct uv_fs_s req;
    return uv_fs_rename(NULL, &req, path1, path2, NULL);
}
/* Join @dir and @filename into a full OS path, returning -1 if either
 * component exceeds its maximum length. The length checks guarantee the
 * concatenation below fits in a UV__PATH_SZ buffer. */
int UvOsJoin(const char *dir, const char *filename, char *path)
{
    if (!UV__DIR_HAS_VALID_LEN(dir) || !UV__FILENAME_HAS_VALID_LEN(filename)) {
        return -1;
    }
    strcpy(path, dir);
    strcat(path, "/");
    strcat(path, filename);
    return 0;
}
/* Wrap io_setup(2): return 0 or the negated errno. */
int UvOsIoSetup(unsigned nr, aio_context_t *ctxp)
{
    int rv;
    rv = io_setup(nr, ctxp);
    if (rv == -1) {
        return -errno;
    }
    return 0;
}
/* Wrap io_destroy(2): return 0 or the negated errno. */
int UvOsIoDestroy(aio_context_t ctx)
{
    int rv;
    rv = io_destroy(ctx);
    if (rv == -1) {
        return -errno;
    }
    return 0;
}
/* Wrap io_submit(2): return 0 or the negated errno. */
int UvOsIoSubmit(aio_context_t ctx, long nr, struct iocb **iocbpp)
{
    int rv;
    rv = io_submit(ctx, nr, iocbpp);
    if (rv == -1) {
        return -errno;
    }
    assert(rv == nr); /* TODO: can something else be returned? */
    return 0;
}
/* Wrap io_getevents(2), retrying on EINTR: return the number of events
 * fetched or the negated errno. */
int UvOsIoGetevents(aio_context_t ctx,
                    long min_nr,
                    long max_nr,
                    struct io_event *events,
                    struct timespec *timeout)
{
    int rv;
    do {
        rv = io_getevents(ctx, min_nr, max_nr, events, timeout);
    } while (rv == -1 && errno == EINTR);
    if (rv == -1) {
        return -errno;
    }
    assert(rv >= min_nr);
    assert(rv <= max_nr);
    return rv;
}
/* Create an eventfd descriptor: return the descriptor or the negated errno.
 * Only UV_FS_O_NONBLOCK is accepted as @flags; it is translated to the
 * native EFD_NONBLOCK|EFD_CLOEXEC combination. */
int UvOsEventfd(unsigned int initval, int flags)
{
    int rv;
    /* At the moment only UV_FS_O_NONBLOCK is supported */
    assert(flags == UV_FS_O_NONBLOCK);
    flags = EFD_NONBLOCK | EFD_CLOEXEC;
    rv = eventfd(initval, flags);
    if (rv == -1) {
        return -errno;
    }
    return rv;
}
/* Enable direct I/O on the given descriptor by adding O_DIRECT to its status
 * flags. Return 0 on success or the negated errno on failure. */
int UvOsSetDirectIo(uv_file fd)
{
    int flags; /* Current fcntl flags */
    int rv;
    flags = fcntl(fd, F_GETFL);
    if (flags == -1) {
        /* Previously an F_GETFL failure was ignored and -1 was OR-ed into
         * the new flag mask; report the error instead. */
        return -errno;
    }
    rv = fcntl(fd, F_SETFL, flags | UV_FS_O_DIRECT);
    if (rv == -1) {
        return -errno;
    }
    return 0;
}
/* Wrap linkat(2): return 0 or the negated errno. */
int UvOsLinkat(uv_file olddirfd,
               const char *oldpath,
               int newdirfd,
               const char *newpath,
               int flags)
{
    int rv;
    rv = linkat(olddirfd, oldpath, newdirfd, newpath, flags);
    if (rv == -1) {
        return -errno;
    }
    assert(rv == 0);
    return 0;
}
raft-0.22.1/src/uv_os.h 0000664 0000000 0000000 00000005411 14601504142 0014621 0 ustar 00root root 0000000 0000000 /* Operating system related utilities. */
#ifndef UV_OS_H_
#define UV_OS_H_
#include
#include
#include
#include
#include
/* Maximum size of a full file system path string. */
#define UV__PATH_SZ 1024
/* Maximum length of a filename string. */
#define UV__FILENAME_LEN 128
/* Length of path separator. */
#define UV__SEP_LEN 1 /* strlen("/") */
/* True if STR's length is at most LEN. */
#define LEN_AT_MOST_(STR, LEN) (strnlen(STR, LEN + 1) <= LEN)
/* Maximum length of a directory path string. */
#define UV__DIR_LEN (UV__PATH_SZ - UV__SEP_LEN - UV__FILENAME_LEN - 1)
/* True if the given DIR string has at most UV__DIR_LEN chars. */
#define UV__DIR_HAS_VALID_LEN(DIR) LEN_AT_MOST_(DIR, UV__DIR_LEN)
/* True if the given FILENAME string has at most UV__FILENAME_LEN chars. */
#define UV__FILENAME_HAS_VALID_LEN(FILENAME) \
LEN_AT_MOST_(FILENAME, UV__FILENAME_LEN)
/* Portable open() */
int UvOsOpen(const char *path, int flags, int mode, uv_file *fd);
/* Portable close() */
int UvOsClose(uv_file fd);
/* TODO: figure a portable abstraction. */
int UvOsFallocate(uv_file fd, off_t offset, off_t len);
/* Portable truncate() */
int UvOsTruncate(uv_file fd, off_t offset);
/* Portable fsync() */
int UvOsFsync(uv_file fd);
/* Portable fdatasync() */
int UvOsFdatasync(uv_file fd);
/* Portable stat() */
int UvOsStat(const char *path, uv_stat_t *sb);
/* Portable write() */
int UvOsWrite(uv_file fd,
const uv_buf_t bufs[],
unsigned int nbufs,
int64_t offset);
/* Portable unlink() */
int UvOsUnlink(const char *path);
/* Portable rename() */
int UvOsRename(const char *path1, const char *path2);
/* Join dir and filename into a full OS path. */
int UvOsJoin(const char *dir, const char *filename, char *path);
/* TODO: figure a portable abstraction. */
int UvOsIoSetup(unsigned nr, aio_context_t *ctxp);
int UvOsIoDestroy(aio_context_t ctx);
int UvOsIoSubmit(aio_context_t ctx, long nr, struct iocb **iocbpp);
int UvOsIoGetevents(aio_context_t ctx,
long min_nr,
long max_nr,
struct io_event *events,
struct timespec *timeout);
int UvOsEventfd(unsigned int initval, int flags);
int UvOsSetDirectIo(uv_file fd);
int UvOsLinkat(uv_file olddirfd,
const char *oldpath,
int newdirfd,
const char *newpath,
int flags);
/* Format an error message caused by a failed system call or stdlib function. */
#define UvOsErrMsg(ERRMSG, SYSCALL, ERRNUM) \
{ \
ErrMsgPrintf(ERRMSG, "%s", uv_strerror(ERRNUM)); \
ErrMsgWrapf(ERRMSG, SYSCALL); \
}
#endif /* UV_OS_H_ */
raft-0.22.1/src/uv_prepare.c 0000664 0000000 0000000 00000024645 14601504142 0015643 0 ustar 00root root 0000000 0000000 #include
#include
#include "assert.h"
#include "heap.h"
#include "uv.h"
#include "uv_os.h"
#define tracef(...) Tracef(uv->tracer, __VA_ARGS__)
#define trace(TYPE, INFO) Trace(uv->tracer, TYPE, INFO)
/* The happy path for UvPrepare is:
*
* - If there is an unused open segment available, return its fd and counter
* immediately.
*
* - Otherwise, wait for the creation of a new open segment to complete,
* possibly kicking off the creation logic if no segment is being created
* currently.
*
* Possible failure modes are:
*
* - The create file request fails, in that case we fail all pending prepare
* requests and we mark the uv instance as errored.
*
* On close:
*
* - Cancel all pending prepare requests.
* - Remove unused prepared open segments.
* - Wait for any pending internal segment creation and then discard the newly
* created segment.
*/
/* Number of open segments that we try to keep ready for writing. */
#define UV__TARGET_POOL_SIZE 2
/* An open segment being prepared or sitting in the pool */
struct uvIdleSegment
{
struct uv *uv; /* Open segment file */
size_t size; /* Segment size */
struct uv_work_s work; /* To execute logic in the threadpool */
int status; /* Result of threadpool callback */
char errmsg[RAFT_ERRMSG_BUF_SIZE]; /* Error of threadpool callback */
uvCounter counter; /* Segment counter */
char filename[UV__FILENAME_LEN]; /* Filename of the segment */
uv_file fd; /* File descriptor of prepared file */
queue queue; /* Pool */
};
/* Synchronously allocate the open segment file for @segment and fsync the
 * data directory so the new entry survives a crash. Returns 0 or an error
 * code, recording the message in segment->errmsg. */
static int uvPrepareCreateSegment(struct uvIdleSegment *segment)
{
    struct uv *uv = segment->uv;
    int rv;
    rv = UvFsAllocateFile(uv->dir, segment->filename, segment->size,
                          &segment->fd, segment->errmsg);
    if (rv != 0) {
        goto err;
    }
    rv = UvFsSyncDir(uv->dir, segment->errmsg);
    if (rv != 0) {
        goto err_after_allocate;
    }
    return 0;
err_after_allocate:
    UvOsClose(segment->fd);
err:
    assert(rv != 0);
    return rv;
}
/* Threadpool entry point: create the segment file and record the outcome in
 * segment->status for the after-work callback. */
static void uvPrepareWorkCb(uv_work_t *work)
{
    struct uvIdleSegment *segment = work->data;
    segment->status = uvPrepareCreateSegment(segment);
}
/* Flush all pending requests, invoking their callbacks with the given
 * status. */
static void uvPrepareFinishAllRequests(struct uv *uv, int status)
{
    while (!QUEUE_IS_EMPTY(&uv->prepare_reqs)) {
        queue *head;
        struct uvPrepare *req;
        head = QUEUE_HEAD(&uv->prepare_reqs);
        req = QUEUE_DATA(head, struct uvPrepare, queue);
        QUEUE_REMOVE(&req->queue);
        req->cb(req, status);
    }
}
/* Pop the oldest prepared segment in the pool and return its fd and counter
 * through the given pointers. Ownership of the fd moves to the caller; only
 * the bookkeeping struct is released here. */
static void uvPrepareConsume(struct uv *uv, uv_file *fd, uvCounter *counter)
{
    queue *head;
    struct uvIdleSegment *segment;
    /* Pop a segment from the pool. */
    head = QUEUE_HEAD(&uv->prepare_pool);
    segment = QUEUE_DATA(head, struct uvIdleSegment, queue);
    assert(segment->fd >= 0);
    QUEUE_REMOVE(&segment->queue);
    *fd = segment->fd;
    *counter = segment->counter;
    RaftHeapFree(segment);
}
/* Finish the oldest pending prepare request using the next available prepared
 * segment. */
static void uvPrepareFinishOldestRequest(struct uv *uv)
{
    queue *head;
    struct uvPrepare *req;
    assert(!uv->closing);
    assert(!QUEUE_IS_EMPTY(&uv->prepare_reqs));
    assert(!QUEUE_IS_EMPTY(&uv->prepare_pool));
    /* Pop the head of the prepare requests queue. */
    head = QUEUE_HEAD(&uv->prepare_reqs);
    req = QUEUE_DATA(head, struct uvPrepare, queue);
    QUEUE_REMOVE(&req->queue);
    /* Finish the request */
    uvPrepareConsume(uv, &req->fd, &req->counter);
    req->cb(req, 0);
}
/* Return the number of idle prepared segments currently sitting in the
 * pool. */
unsigned UvPrepareCount(struct uv *uv)
{
    queue *head;
    unsigned n;
    n = 0;
    QUEUE_FOREACH (head, &uv->prepare_pool) {
        n++;
    }
    return n;
}
static void uvPrepareAfterWorkCb(uv_work_t *work, int status);
/* Allocate and initialize a new idle segment descriptor, assigning it the
 * next open-segment counter. Returns NULL on out-of-memory. */
static struct uvIdleSegment *uvIdleSegmentCreate(struct uv *uv)
{
    struct uvIdleSegment *s;
    s = RaftHeapMalloc(sizeof *s);
    if (s == NULL) {
        return NULL;
    }
    memset(s, 0, sizeof *s);
    s->uv = uv;
    s->counter = uv->prepare_next_counter;
    s->work.data = s;
    s->fd = -1; /* Marks the segment as not yet created on disk. */
    s->size = uv->block_size * uvSegmentBlocks(uv);
    sprintf(s->filename, UV__OPEN_TEMPLATE, s->counter);
    return s;
}
/* Start creating a new segment file on the threadpool. Must only be called
 * when no creation is currently in flight. */
static int uvPrepareStart(struct uv *uv)
{
    struct uvIdleSegment *segment;
    int rv;
    assert(uv->prepare_inflight == NULL);
    assert(UvPrepareCount(uv) < UV__TARGET_POOL_SIZE);
    segment = uvIdleSegmentCreate(uv);
    if (segment == NULL) {
        rv = RAFT_NOMEM;
        goto err;
    }
    tracef("create open segment %s", segment->filename);
    rv = uv_queue_work(uv->loop, &segment->work, uvPrepareWorkCb,
                       uvPrepareAfterWorkCb);
    if (rv != 0) {
        /* UNTESTED: with the current libuv implementation this can't fail. */
        tracef("can't create segment %s: %s", segment->filename,
               uv_strerror(rv));
        rv = RAFT_IOERR;
        goto err_after_segment_alloc;
    }
    uv->prepare_inflight = segment;
    uv->prepare_next_counter++;
    return 0;
err_after_segment_alloc:
    RaftHeapFree(segment);
err:
    assert(rv != 0);
    return rv;
}
/* Retry segment creation after a failed attempt, triggered by the retry
 * timer. */
static void uvPrepareRetryTimerCb(uv_timer_t *timer)
{
    struct uvIdleSegment *segment = timer->data;
    struct uv *uv = segment->uv;
    int rv;
    assert(uv->prepare_inflight == segment);
    /* Restore the timer's data pointer, which uvPrepareAfterWorkCb() had
     * re-pointed at the failing segment in order to carry it here. */
    uv->prepare_retry.data = uv;
    tracef("retry creating segment %s", segment->filename);
    rv = uv_queue_work(uv->loop, &segment->work, uvPrepareWorkCb,
                       uvPrepareAfterWorkCb);
    assert(rv == 0);
}
/* Loop-side completion of a segment creation: hand the segment to the pool,
 * serve pending requests, and keep the pool topped up — or clean up if the
 * instance is closing or the creation failed. */
static void uvPrepareAfterWorkCb(uv_work_t *work, int status)
{
    struct uvIdleSegment *segment = work->data;
    struct uv *uv = segment->uv;
    int rv;
    assert(status == 0);
    /* If we are closing, let's discard the segment. All pending requests have
     * already been fired with RAFT_CANCELED. */
    if (uv->closing) {
        uv->prepare_inflight = NULL;
        assert(QUEUE_IS_EMPTY(&uv->prepare_pool));
        assert(QUEUE_IS_EMPTY(&uv->prepare_reqs));
        if (segment->status == 0) {
            char errmsg[RAFT_ERRMSG_BUF_SIZE];
            UvOsClose(segment->fd);
            UvFsRemoveFile(uv->dir, segment->filename, errmsg);
        }
        tracef("canceled creation of %s", segment->filename);
        RaftHeapFree(segment);
        uvMaybeFireCloseCb(uv);
        return;
    }
    /* If the request has failed, retry again after a while. */
    if (segment->status != 0) {
        assert(uv->prepare_retry.data == uv);
        /* Temporarily stash the segment in the timer's data pointer; the
         * timer callback restores it to uv. */
        uv->prepare_retry.data = segment;
        rv = uv_timer_start(&uv->prepare_retry, uvPrepareRetryTimerCb,
                            uv->disk_retry, 0);
        assert(rv == 0);
        return;
    }
    uv->prepare_inflight = NULL; /* Reset the creation in-progress marker. */
    assert(segment->fd >= 0);
    tracef("completed creation of %s", segment->filename);
    QUEUE_PUSH(&uv->prepare_pool, &segment->queue);
    /* Let's process any pending request. Note that the request's callback may
     * itself kick off a new creation, setting prepare_inflight again. */
    if (!QUEUE_IS_EMPTY(&uv->prepare_reqs)) {
        uvPrepareFinishOldestRequest(uv);
    }
    /* If we are already creating a segment, we're done. */
    if (uv->prepare_inflight != NULL) {
        return;
    }
    /* If we have already enough prepared open segments, we're done. There can't
     * be any outstanding prepare requests, since if the request queue was not
     * empty, we would have called uvPrepareFinishOldestRequest() above, thus
     * reducing the pool size and making it smaller than the target size. */
    if (UvPrepareCount(uv) >= UV__TARGET_POOL_SIZE) {
        assert(QUEUE_IS_EMPTY(&uv->prepare_reqs));
        return;
    }
    /* Let's start preparing a new open segment. */
    rv = uvPrepareStart(uv);
    if (rv != 0) {
        uvPrepareFinishAllRequests(uv, rv);
        uv->errored = true;
    }
}
/* Discard a prepared open segment, closing its file descriptor and removing the
 * underlying file. */
static void uvPrepareDiscard(struct uv *uv, uv_file fd, uvCounter counter)
{
    char errmsg[RAFT_ERRMSG_BUF_SIZE];
    char filename[UV__FILENAME_LEN];
    assert(counter > 0);
    assert(fd >= 0);
    sprintf(filename, UV__OPEN_TEMPLATE, counter);
    UvOsClose(fd);
    UvFsRemoveFile(uv->dir, filename, errmsg);
}
/* Acquire a prepared open segment. If one is available in the pool, *fd and
 * *counter are filled immediately and cb is not retained; otherwise the
 * request is queued and cb will be invoked once a segment is ready. In either
 * case a new segment preparation is started if none is in flight. */
int UvPrepare(struct uv *uv,
              uv_file *fd,
              uvCounter *counter,
              struct uvPrepare *req,
              uvPrepareCb cb)
{
    int rv;
    assert(!uv->closing);
    /* Fast path: consume a segment already sitting in the pool. */
    if (!QUEUE_IS_EMPTY(&uv->prepare_pool)) {
        uvPrepareConsume(uv, fd, counter);
        goto maybe_start;
    }
    /* Slow path: no pooled segment, register the request. */
    *fd = -1;
    *counter = 0;
    req->cb = cb;
    QUEUE_PUSH(&uv->prepare_reqs, &req->queue);
maybe_start:
    /* If we are already creating a segment, let's just wait. */
    if (uv->prepare_inflight != NULL) {
        return 0;
    }
    rv = uvPrepareStart(uv);
    if (rv != 0) {
        goto err;
    }
    return 0;
err:
    /* Undo whichever path was taken above: give back the consumed segment, or
     * de-queue the pending request. */
    if (*fd != -1) {
        uvPrepareDiscard(uv, *fd, *counter);
    } else {
        QUEUE_REMOVE(&req->queue);
    }
    assert(rv != 0);
    return rv;
}
/* Best-effort pre-allocation of open segments up to the target pool size.
 * Stops silently at the first allocation or creation failure. */
void UvPrepareStart(struct uv *uv)
{
    unsigned created;
    for (created = 0; created < UV__TARGET_POOL_SIZE; created++) {
        int rv;
        struct uvIdleSegment *segment = uvIdleSegmentCreate(uv);
        if (segment == NULL) {
            break;
        }
        rv = uvPrepareCreateSegment(segment);
        if (rv != 0) {
            RaftHeapFree(segment);
            break;
        }
        /* Account for the new segment's capacity (in KiB) and pool it. */
        uv->io->capacity += (unsigned short)(segment->size / 1024);
        uv->prepare_next_counter++;
        QUEUE_PUSH(&uv->prepare_pool, &segment->queue);
    }
}
/* Tear down the prepare sub-system: cancel every pending request and discard
 * all pooled open segments. */
void UvPrepareClose(struct uv *uv)
{
    assert(uv->closing);
    /* Cancel all pending prepare requests. */
    uvPrepareFinishAllRequests(uv, RAFT_CANCELED);
    /* Drain the pool, discarding each unused prepared segment. */
    for (;;) {
        queue *head;
        struct uvIdleSegment *segment;
        if (QUEUE_IS_EMPTY(&uv->prepare_pool)) {
            break;
        }
        head = QUEUE_HEAD(&uv->prepare_pool);
        segment = QUEUE_DATA(head, struct uvIdleSegment, queue);
        QUEUE_REMOVE(&segment->queue);
        uvPrepareDiscard(uv, segment->fd, segment->counter);
        RaftHeapFree(segment);
    }
}
#undef tracef
#undef trace
raft-0.22.1/src/uv_recv.c 0000664 0000000 0000000 00000031335 14601504142 0015136 0 ustar 00root root 0000000 0000000 #include
#include "../include/raft/uv.h"
#include "assert.h"
#include "byte.h"
#include "configuration.h"
#include "err.h"
#include "heap.h"
#include "uv.h"
#include "uv_encoding.h"
#define tracef(...) Tracef(uv->tracer, __VA_ARGS__)
/* The happy path for a receiving an RPC message is:
*
* - When a peer server successfully establishes a new connection with us, the
* transport invokes our accept callback.
*
* - A new server object is created and added to the servers array. It starts
* reading from the stream handle of the new connection.
*
* - The RPC message preamble is read, which contains the message type and the
* message length.
*
* - The RPC message header is read, whose content depends on the message type.
*
* - Optionally, the RPC message payload is read (for AppendEntries requests).
*
* - The recv callback passed to raft_io->start() gets fired with the received
* message.
*
* Possible failure modes are:
*
* - The peer server disconnects. In this case the read callback will fire with
* UV_EOF, we'll close the stream handle and then release all memory
* associated with the server object.
*
* - The peer server sends us invalid data. In this case we close the stream
* handle and act like above.
*/
/* State of a single incoming connection from a peer server, tracking the
 * progressive read of one RPC message at a time (preamble, then header, then
 * optional payload). */
struct uvServer
{
    struct uv *uv;               /* libuv I/O implementation object */
    raft_id id;                  /* ID of the remote server */
    char *address;               /* Address of the other server */
    struct uv_stream_s *stream;  /* Connection handle */
    uv_buf_t buf;                /* Sliding buffer for reading incoming data */
    uint64_t preamble[2];        /* Static buffer with the request preamble */
    uv_buf_t header;             /* Dynamic buffer with the request header */
    uv_buf_t payload;            /* Dynamic buffer with the request payload */
    struct raft_message message; /* The message being received */
    queue queue;                 /* Servers queue */
};
/* Set up a server object tracking an incoming connection, taking a private
 * copy of the peer's address and registering the object in the servers
 * queue. Returns RAFT_NOMEM if the address copy cannot be allocated. */
static int uvServerInit(struct uvServer *s,
                        struct uv *uv,
                        const raft_id id,
                        const char *address,
                        struct uv_stream_s *stream)
{
    size_t address_size = strlen(address) + 1;
    s->uv = uv;
    s->id = id;
    s->address = RaftHeapMalloc(address_size);
    if (s->address == NULL) {
        return RAFT_NOMEM;
    }
    memcpy(s->address, address, address_size);
    s->stream = stream;
    s->stream->data = s;
    /* Start with an empty read state: the first alloc callback will point the
     * sliding buffer at the preamble. */
    s->buf.base = NULL;
    s->buf.len = 0;
    s->preamble[0] = 0;
    s->preamble[1] = 0;
    s->header.base = NULL;
    s->header.len = 0;
    s->payload.base = NULL;
    s->payload.len = 0;
    s->message.type = 0;
    QUEUE_PUSH(&uv->servers, &s->queue);
    return 0;
}
/* Release all resources held by a server object: its queue slot, any
 * partially received message, the address copy and the stream handle. */
static void uvServerDestroy(struct uvServer *s)
{
    QUEUE_REMOVE(&s->queue);
    if (s->header.base != NULL) {
        /* This means we were interrupted while reading the header. */
        RaftHeapFree(s->header.base);
        if (s->message.type == RAFT_APPEND_ENTRIES) {
            RaftHeapFree(s->message.append_entries.entries);
        } else if (s->message.type == RAFT_INSTALL_SNAPSHOT) {
            configurationClose(&s->message.install_snapshot.conf);
        }
    }
    if (s->payload.base != NULL) {
        /* This means we were interrupted while reading the payload. */
        RaftHeapFree(s->payload.base);
    }
    RaftHeapFree(s->address);
    RaftHeapFree(s->stream);
}
/* Invoked to initialize the read buffer for the next asynchronous read on the
 * socket. The sliding s->buf records which chunk (preamble, header or payload)
 * is currently expected and how much of it is still missing; a zero-length
 * s->buf means a new chunk must be selected here. */
static void uvServerAllocCb(uv_handle_t *handle,
                            size_t suggested_size,
                            uv_buf_t *buf)
{
    struct uvServer *s = handle->data;
    (void)suggested_size;
    assert(!s->uv->closing);
    /* If this is the first read of the preamble, or of the header, or of the
     * payload, then initialize the read buffer, according to the chunk of data
     * that we expect next. */
    if (s->buf.len == 0) {
        assert(s->buf.base == NULL);
        /* Check if we expect the preamble. */
        if (s->header.len == 0) {
            assert(s->preamble[0] == 0);
            assert(s->preamble[1] == 0);
            s->buf.base = (char *)s->preamble;
            s->buf.len = sizeof s->preamble;
            goto out;
        }
        /* Check if we expect the header. */
        if (s->payload.len == 0) {
            assert(s->header.len > 0);
            assert(s->header.base == NULL);
            s->header.base = RaftHeapMalloc(s->header.len);
            if (s->header.base == NULL) {
                /* Setting all buffer fields to 0 will make read_cb fail with
                 * ENOBUFS. */
                memset(buf, 0, sizeof *buf);
                return;
            }
            s->buf = s->header;
            goto out;
        }
        /* If we get here we should be expecting the payload. */
        assert(s->payload.len > 0);
        s->payload.base = RaftHeapMalloc(s->payload.len);
        if (s->payload.base == NULL) {
            /* Setting all buffer fields to 0 will make read_cb fail with
             * ENOBUFS. */
            memset(buf, 0, sizeof *buf);
            return;
        }
        s->buf = s->payload;
    }
out:
    *buf = s->buf;
}
/* Callback invoked after the stream handle of this server connection has been
 * closed: all resources associated with the server object can be released,
 * and the overall close sequence may complete. */
static void uvServerStreamCloseCb(uv_handle_t *handle)
{
    struct uvServer *server = handle->data;
    struct uv *uv = server->uv;
    uvServerDestroy(server);
    RaftHeapFree(server);
    uvMaybeFireCloseCb(uv);
}
/* Stop serving this connection: move the server from the active queue to the
 * aborting queue and close its stream; final cleanup happens in the stream
 * close callback. */
static void uvServerAbort(struct uvServer *server)
{
    struct uv *uv = server->uv;
    QUEUE_REMOVE(&server->queue);
    QUEUE_PUSH(&uv->aborting, &server->queue);
    uv_close((struct uv_handle_s *)server->stream, uvServerStreamCloseCb);
}
/* Invoke the receive callback with the fully received message, then reset the
 * per-connection read state so the next preamble can be read. */
static void uvFireRecvCb(struct uvServer *s)
{
    s->uv->recv_cb(s->uv->io, &s->message);
    /* Reset our state as we'll start reading a new message. We don't need to
     * release the payload buffer, since ownership was transferred to the
     * user. */
    memset(s->preamble, 0, sizeof s->preamble);
    /* The header buffer is no longer needed once the message was delivered. */
    raft_free(s->header.base);
    s->message.type = 0;
    s->header.base = NULL;
    s->header.len = 0;
    s->payload.base = NULL;
    s->payload.len = 0;
}
/* Callback invoked when data has been read from the socket. Drives the
 * preamble -> header -> payload state machine: each completed chunk decides
 * what to read next, and a complete message fires the receive callback. Any
 * protocol error or read failure (other than EOF) aborts the connection. */
static void uvServerReadCb(uv_stream_t *stream,
                           ssize_t nread,
                           const uv_buf_t *buf)
{
    struct uvServer *s = stream->data;
    int rv;
    (void)buf;
    assert(!s->uv->closing);
    /* If the read was successful, let's check if we have received all the data
     * we expected. */
    if (nread > 0) {
        size_t n = (size_t)nread;
        /* We shouldn't have read more data than the pending amount. */
        assert(n <= s->buf.len);
        /* Advance the read window */
        s->buf.base += n;
        s->buf.len -= n;
        /* If there's more data to read in order to fill the current
         * read buffer, just return, we'll be invoked again. */
        if (s->buf.len > 0) {
            return;
        }
        if (s->header.len == 0) {
            /* If the header buffer is not set, it means that we've just
             * completed reading the preamble. */
            assert(s->header.base == NULL);
            s->header.len = (size_t)byteFlip64(s->preamble[1]);
            /* The length of the header must be greater than zero. */
            if (s->header.len == 0) {
                Tracef(s->uv->tracer, "message has zero length");
                goto abort;
            }
        } else if (s->payload.len == 0) {
            /* If the payload buffer is not set, it means we just completed
             * reading the message header. */
            uint64_t preamble0;
            uint8_t type;
            uint8_t version;
            assert(s->header.base != NULL);
            preamble0 = byteFlip64(s->preamble[0]);
            /* Use only the first byte of the type. Normally we would check if
             * type doesn't overflow UINT8_MAX, but we don't do this to allow
             * future legacy nodes to still handle messages that include extra
             * information in the next byte of the preamble.
             *
             * Once this change has been active for sufficiently long time, we
             * can start using the second byte of the preamble if needed. */
            type = (uint8_t)preamble0;            /* Byte 0 */
            version = (uint8_t)(preamble0 >> 16); /* Byte 2 */
            rv = uvDecodeMessage(type, version, &s->header, &s->message,
                                 &s->payload.len);
            if (rv != 0) {
                Tracef(s->uv->tracer, "decode message: %s",
                       errCodeToString(rv));
                goto abort;
            }
            s->message.server_id = s->id;
            s->message.server_address = s->address;
            /* If the message has no payload, we're done. */
            if (s->payload.len == 0) {
                uvFireRecvCb(s);
            }
        } else {
            /* If we get here it means that we've just completed reading the
             * payload. TODO: avoid converting from uv_buf_t */
            struct raft_buffer payload;
            assert(s->payload.base != NULL);
            assert(s->payload.len > 0);
            switch (s->message.type) {
                case RAFT_APPEND_ENTRIES:
                    payload.base = s->payload.base;
                    payload.len = s->payload.len;
                    /* Decode the entries batch in place within the payload
                     * buffer. */
                    uvDecodeEntriesBatch(payload.base, 0,
                                         s->message.append_entries.entries,
                                         s->message.append_entries.n_entries);
                    break;
                case RAFT_INSTALL_SNAPSHOT:
                    s->message.install_snapshot.data.base = s->payload.base;
                    break;
                default:
                    /* We should never have read a payload in the first place */
                    assert(0);
            }
            uvFireRecvCb(s);
        }
        /* Mark that we're done with this chunk. When the alloc callback will
         * trigger again it will notice that it needs to change the read
         * buffer. */
        assert(s->buf.len == 0);
        s->buf.base = NULL;
        return;
    }
    /* The if nread>0 condition above should always exit the function with a
     * goto abort or a return. */
    assert(nread <= 0);
    if (nread == 0) {
        /* Empty read */
        return;
    }
    /* EOF means the peer disconnected cleanly; anything else is logged. */
    if (nread != UV_EOF) {
        Tracef(s->uv->tracer, "receive data: %s", uv_strerror((int)nread));
    }
abort:
    uvServerAbort(s);
}
/* Start reading incoming requests from this server's connection. Returns
 * RAFT_IOERR if the read cannot be started. */
static int uvServerStart(struct uvServer *s)
{
    int rv = uv_read_start(s->stream, uvServerAllocCb, uvServerReadCb);
    if (rv == 0) {
        return 0;
    }
    Tracef(s->uv->tracer, "start reading: %s", uv_strerror(rv));
    return RAFT_IOERR;
}
/* Allocate and initialize a server object for a new incoming connection and
 * start reading requests from it. On failure everything allocated here is
 * released and the error code is returned. */
static int uvAddServer(struct uv *uv,
                       raft_id id,
                       const char *address,
                       struct uv_stream_s *stream)
{
    struct uvServer *server;
    int rv;
    /* Allocate the connection tracking object. */
    server = RaftHeapMalloc(sizeof *server);
    if (server == NULL) {
        return RAFT_NOMEM;
    }
    rv = uvServerInit(server, uv, id, address, stream);
    if (rv != 0) {
        raft_free(server);
        return rv;
    }
    /* This will start reading requests. */
    rv = uvServerStart(server);
    if (rv != 0) {
        uvServerDestroy(server);
        raft_free(server);
        return rv;
    }
    return 0;
}
/* Invoked by the transport when a peer has established a new connection with
 * us. If the connection cannot be tracked, it is closed and the stream memory
 * is released by the close callback. */
static void uvRecvAcceptCb(struct raft_uv_transport *transport,
                           raft_id id,
                           const char *address,
                           struct uv_stream_s *stream)
{
    struct uv *uv = transport->data;
    int rv;
    assert(!uv->closing);
    rv = uvAddServer(uv, id, address, stream);
    if (rv == 0) {
        return;
    }
    tracef("add server: %s", errCodeToString(rv));
    uv_close((struct uv_handle_s *)stream, (uv_close_cb)RaftHeapFree);
}
/* Start accepting connections from peer servers, forwarding each new one to
 * uvRecvAcceptCb(). Returns whatever error the transport reports. */
int UvRecvStart(struct uv *uv)
{
    return uv->transport->listen(uv->transport, uvRecvAcceptCb);
}
/* Abort every tracked incoming connection. uvServerAbort() removes each
 * server from the servers queue, so this loop terminates. */
void UvRecvClose(struct uv *uv)
{
    while (!QUEUE_IS_EMPTY(&uv->servers)) {
        struct uvServer *server =
            QUEUE_DATA(QUEUE_HEAD(&uv->servers), struct uvServer, queue);
        uvServerAbort(server);
    }
}
#undef tracef
raft-0.22.1/src/uv_segment.c 0000664 0000000 0000000 00000102031 14601504142 0015631 0 ustar 00root root 0000000 0000000 #include
#include
#include
#include
#include
#include
#include
#include "array.h"
#include "assert.h"
#include "byte.h"
#include "configuration.h"
#include "entry.h"
#include "heap.h"
#include "uv.h"
#include "uv_encoding.h"
#define tracef(...) Tracef(uv->tracer, __VA_ARGS__)
/* Check if the given filename matches the one of a closed segment (xxx-yyy), or
 * of an open segment (open-xxx), and fill the given info structure if so.
 *
 * Return true if the filename matched, false otherwise. */
static bool uvSegmentInfoMatch(const char *filename, struct uvSegmentInfo *info)
{
    int consumed; /* Characters consumed by sscanf(), captured via %n */
    int matched;  /* Number of conversions performed by sscanf() */
    size_t n;
    size_t filename_len = strnlen(filename, UV__FILENAME_LEN + 1);
    assert(filename_len < UV__FILENAME_LEN);
    /* A template matches only if it consumed the entire filename. */
    matched = sscanf(filename, UV__CLOSED_TEMPLATE "%n", &info->first_index,
                     &info->end_index, &consumed);
    if (matched == 2 && consumed == (int)filename_len) {
        info->is_open = false;
        goto match;
    }
    matched =
        sscanf(filename, UV__OPEN_TEMPLATE "%n", &info->counter, &consumed);
    if (matched == 1 && consumed == (int)filename_len) {
        info->is_open = true;
        goto match;
    }
    return false;
match:
    /* Copy with explicit termination, since strncpy() alone does not
     * guarantee a trailing NUL. */
    n = sizeof(info->filename) - 1;
    strncpy(info->filename, filename, n);
    info->filename[n] = '\0';
    return true;
}
/* If the given filename is a closed or open segment, append its parsed info
 * to the infos array and set *appended accordingly. Returns RAFT_NOMEM if
 * growing the array fails. */
int uvSegmentInfoAppendIfMatch(const char *filename,
                               struct uvSegmentInfo *infos[],
                               size_t *n_infos,
                               bool *appended)
{
    struct uvSegmentInfo info;
    int rv;
    /* Neither a closed nor an open segment filename: nothing to do. */
    if (!uvSegmentInfoMatch(filename, &info)) {
        *appended = false;
        return 0;
    }
    ARRAY__APPEND(struct uvSegmentInfo, info, infos, n_infos, rv);
    if (rv == -1) {
        return RAFT_NOMEM;
    }
    *appended = true;
    return 0;
}
/* Compare two segments to decide which one is more recent. */
static int uvSegmentInfoCompare(const void *p1, const void *p2)
{
struct uvSegmentInfo *s1 = (struct uvSegmentInfo *)p1;
struct uvSegmentInfo *s2 = (struct uvSegmentInfo *)p2;
/* Closed segments are less recent than open segments. */
if (s1->is_open && !s2->is_open) {
return 1;
}
if (!s1->is_open && s2->is_open) {
return -1;
}
/* If the segments are open, compare the counter. */
if (s1->is_open) {
assert(s2->is_open);
assert(s1->counter != s2->counter);
return s1->counter < s2->counter ? -1 : 1;
}
/* If the segments are closed, compare the first index. The index ranges
* must be disjoint. */
if (s2->first_index > s1->end_index) {
return -1;
}
return 1;
}
/* Sort the given segment infos in place, from least to most recent, using
 * uvSegmentInfoCompare(). */
void uvSegmentSort(struct uvSegmentInfo *infos, size_t n_infos)
{
    qsort(infos, n_infos, sizeof *infos, uvSegmentInfoCompare);
}
/* Delete closed segments whose entries all fall before the trailing window
 * [last_index - trailing + 1, last_index]. Open segments and closed segments
 * overlapping the window are left untouched. Returns an error if a file
 * removal fails, with errmsg filled in. */
int uvSegmentKeepTrailing(struct uv *uv,
                          struct uvSegmentInfo *segments,
                          size_t n,
                          raft_index last_index,
                          size_t trailing,
                          char *errmsg)
{
    raft_index retain_index;
    size_t i;
    int rv;
    assert(last_index > 0);
    assert(n > 0);
    /* The whole log fits in the trailing window: nothing to delete. */
    if (last_index <= trailing) {
        return 0;
    }
    /* Index of the oldest entry we want to retain. */
    retain_index = last_index - trailing + 1;
    for (i = 0; i < n; i++) {
        struct uvSegmentInfo *segment = &segments[i];
        /* Open segments are never deleted here. */
        if (segment->is_open) {
            break;
        }
        /* With trailing == 0 every closed segment is deleted. */
        if (trailing == 0 || segment->end_index < retain_index) {
            rv = UvFsRemoveFile(uv->dir, segment->filename, errmsg);
            if (rv != 0) {
                ErrMsgWrapf(errmsg, "delete closed segment %s",
                            segment->filename);
                return rv;
            }
        } else {
            /* This segment reaches into the retained window: stop here. */
            break;
        }
    }
    return 0;
}
/* Read the full content of a segment file into buf and extract its format
 * version from the first 8 bytes (big-endian). On failure the buffer is
 * released and RAFT_IOERR is returned. */
static int uvReadSegmentFile(struct uv *uv,
                             const char *filename,
                             struct raft_buffer *buf,
                             uint64_t *format)
{
    char errmsg[RAFT_ERRMSG_BUF_SIZE];
    int rv = UvFsReadFile(uv->dir, filename, buf, errmsg);
    if (rv != 0) {
        ErrMsgTransfer(errmsg, uv->io->errmsg, "read file");
        return RAFT_IOERR;
    }
    /* The file must contain at least the 8-byte format version. */
    if (buf->len < 8) {
        ErrMsgPrintf(uv->io->errmsg, "file has only %zu bytes", buf->len);
        RaftHeapFree(buf->base);
        return RAFT_IOERR;
    }
    *format = byteFlip64(*(uint64_t *)buf->base);
    return 0;
}
/* Consume the content buffer, returning a pointer to the current position and
 * advancing the offset of n bytes. Return an error if not enough bytes are
 * available. */
static int uvConsumeContent(const struct raft_buffer *content,
                            size_t *offset,
                            size_t n,
                            void **data,
                            char *errmsg)
{
    size_t remaining;
    /* The cursor never moves past the end of the buffer. */
    assert(*offset <= content->len);
    remaining = content->len - *offset;
    /* Compare against the remaining bytes instead of checking
     * `*offset + n > content->len`: n can come from untrusted file content,
     * and the addition could wrap around, defeating the bounds check. */
    if (n > remaining) {
        ErrMsgPrintf(errmsg, "short read: %zu bytes instead of %zu", remaining,
                     n);
        return RAFT_IOERR;
    }
    if (data != NULL) {
        *data = &((uint8_t *)content->base)[*offset];
    }
    *offset += n;
    return 0;
}
/* Load a single batch of entries from a segment.
 *
 * Set @last to #true if the loaded batch is the last one. On failure @offset
 * is restored to its value on entry and *entries is set to NULL. */
static int uvLoadEntriesBatch(struct uv *uv,
                              const struct raft_buffer *content,
                              struct raft_entry **entries,
                              unsigned *n_entries,
                              size_t *offset, /* Offset of last batch */
                              bool *last)
{
    void *checksums;           /* CRC32 checksums */
    void *batch;               /* Entries batch */
    unsigned long n;           /* Number of entries in the batch */
    unsigned max_n;            /* Maximum number of entries we expect */
    unsigned i;                /* Iterate through the entries */
    struct raft_buffer header; /* Batch header */
    struct raft_buffer data;   /* Batch data */
    uint32_t crc1;             /* Target checksum */
    uint32_t crc2;             /* Actual checksum */
    char errmsg[RAFT_ERRMSG_BUF_SIZE];
    size_t start;
    int rv;
    /* Save the current offset, to provide more information when logging. */
    start = *offset;
    /* Read the checksums. */
    rv = uvConsumeContent(content, offset, sizeof(uint32_t) * 2, &checksums,
                          errmsg);
    if (rv != 0) {
        ErrMsgTransfer(errmsg, uv->io->errmsg, "read preamble");
        return RAFT_IOERR;
    }
    /* Read the first 8 bytes of the batch, which contains the number of entries
     * in the batch. */
    rv = uvConsumeContent(content, offset, sizeof(uint64_t), &batch, errmsg);
    if (rv != 0) {
        ErrMsgTransfer(errmsg, uv->io->errmsg, "read preamble");
        return RAFT_IOERR;
    }
    n = (size_t)byteFlip64(*(uint64_t *)batch);
    if (n == 0) {
        ErrMsgPrintf(uv->io->errmsg, "entries count in preamble is zero");
        rv = RAFT_CORRUPT;
        goto err;
    }
    /* Very optimistic upper bound of the number of entries we should
     * expect. This is mainly a protection against allocating too much
     * memory. Each entry will consume at least 4 words (for term, type, size
     * and payload). */
    max_n = UV__MAX_SEGMENT_SIZE / (sizeof(uint64_t) * 4);
    if (n > max_n) {
        ErrMsgPrintf(uv->io->errmsg,
                     "entries count %lu in preamble is too high", n);
        rv = RAFT_CORRUPT;
        goto err;
    }
    /* Consume the batch header, excluding the first 8 bytes containing the
     * number of entries, which we have already read. */
    header.len = uvSizeofBatchHeader(n);
    header.base = batch;
    rv = uvConsumeContent(content, offset,
                          uvSizeofBatchHeader(n) - sizeof(uint64_t), NULL,
                          errmsg);
    if (rv != 0) {
        ErrMsgTransfer(errmsg, uv->io->errmsg, "read header");
        rv = RAFT_IOERR;
        goto err;
    }
    /* Check batch header integrity. */
    crc1 = byteFlip32(((uint32_t *)checksums)[0]);
    crc2 = byteCrc32(header.base, header.len, 0);
    if (crc1 != crc2) {
        ErrMsgPrintf(uv->io->errmsg, "header checksum mismatch");
        rv = RAFT_CORRUPT;
        goto err;
    }
    /* Decode the batch header, allocating the entries array. */
    rv = uvDecodeBatchHeader(header.base, entries, n_entries);
    if (rv != 0) {
        goto err;
    }
    /* Calculate the total size of the batch data */
    data.len = 0;
    for (i = 0; i < n; i++) {
        data.len += (*entries)[i].buf.len;
    }
    data.base = (uint8_t *)content->base + *offset;
    /* Consume the batch data */
    rv = uvConsumeContent(content, offset, data.len, NULL, errmsg);
    if (rv != 0) {
        ErrMsgTransfer(errmsg, uv->io->errmsg, "read data");
        rv = RAFT_IOERR;
        goto err_after_header_decode;
    }
    /* Check batch data integrity. */
    crc1 = byteFlip32(((uint32_t *)checksums)[1]);
    crc2 = byteCrc32(data.base, data.len, 0);
    if (crc1 != crc2) {
        ErrMsgPrintf(uv->io->errmsg, "data checksum mismatch");
        rv = RAFT_CORRUPT;
        goto err_after_header_decode;
    }
    uvDecodeEntriesBatch(content->base, *offset - data.len, *entries,
                         *n_entries);
    /* This was the last batch if we reached the end of the content. */
    *last = *offset == content->len;
    return 0;
err_after_header_decode:
    RaftHeapFree(*entries);
err:
    *entries = NULL;
    *n_entries = 0;
    assert(rv != 0);
    /* Restore the cursor so the caller can report where the batch started. */
    *offset = start;
    return rv;
}
/* Append to @entries2 all entries in @entries1, growing the target array as
 * needed. Returns RAFT_NOMEM if reallocation fails (leaving the original
 * array intact). */
static int extendEntries(const struct raft_entry *entries1,
                         const size_t n_entries1,
                         struct raft_entry **entries2,
                         size_t *n_entries2)
{
    struct raft_entry *extended;
    extended =
        raft_realloc(*entries2, (*n_entries2 + n_entries1) * sizeof *extended);
    if (extended == NULL) {
        return RAFT_NOMEM;
    }
    /* Bulk-copy the new entries after the existing ones. */
    memcpy(&extended[*n_entries2], entries1, n_entries1 * sizeof *extended);
    *entries2 = extended;
    *n_entries2 += n_entries1;
    return 0;
}
/* Load all entries of a closed segment, verifying that the number of loaded
 * entries matches the index range encoded in the filename. On success the
 * caller owns *entries (and the batch buffers it references). */
int uvSegmentLoadClosed(struct uv *uv,
                        struct uvSegmentInfo *info,
                        struct raft_entry *entries[],
                        size_t *n)
{
    bool empty;                     /* Whether the file is empty */
    uint64_t format;                /* Format version */
    bool last;                      /* Whether the last batch was reached */
    struct raft_entry *tmp_entries; /* Entries in current batch */
    struct raft_buffer buf;         /* Segment file content */
    size_t offset;                  /* Content read cursor */
    unsigned tmp_n;                 /* Number of entries in current batch */
    unsigned expected_n; /* Number of entries that we expect to find */
    int i;
    char errmsg[RAFT_ERRMSG_BUF_SIZE];
    int rv;
    /* The filename encodes the segment's index range. */
    expected_n = (unsigned)(info->end_index - info->first_index + 1);
    /* If the segment is completely empty, just bail out. */
    rv = UvFsFileIsEmpty(uv->dir, info->filename, &empty, errmsg);
    if (rv != 0) {
        tracef("stat %s: %s", info->filename, errmsg);
        rv = RAFT_IOERR;
        goto err;
    }
    if (empty) {
        /* A closed segment must contain data: treat this as corruption. */
        ErrMsgPrintf(uv->io->errmsg, "file is empty");
        rv = RAFT_CORRUPT;
        goto err;
    }
    /* Open the segment file. */
    rv = uvReadSegmentFile(uv, info->filename, &buf, &format);
    if (rv != 0) {
        goto err;
    }
    if (format != UV__DISK_FORMAT) {
        ErrMsgPrintf(uv->io->errmsg, "unexpected format version %ju", format);
        rv = RAFT_CORRUPT;
        goto err_after_read;
    }
    /* Load all batches in the segment. */
    *entries = NULL;
    *n = 0;
    last = false;
    offset = sizeof format;
    for (i = 1; !last; i++) {
        rv = uvLoadEntriesBatch(uv, &buf, &tmp_entries, &tmp_n, &offset, &last);
        if (rv != 0) {
            ErrMsgWrapf(uv->io->errmsg, "entries batch %u starting at byte %zu",
                        i, offset);
            /* Clean up the last allocation from extendEntries. */
            goto err_after_extend_entries;
        }
        rv = extendEntries(tmp_entries, tmp_n, entries, n);
        if (rv != 0) {
            goto err_after_batch_load;
        }
        raft_free(tmp_entries);
    }
    if (*n != expected_n) {
        ErrMsgPrintf(uv->io->errmsg, "found %zu entries (expected %u)", *n,
                     expected_n);
        rv = RAFT_CORRUPT;
        goto err_after_extend_entries;
    }
    assert(i > 1);  /* At least one batch was loaded. */
    assert(*n > 0); /* At least one entry was loaded. */
    return 0;
err_after_batch_load:
    raft_free(tmp_entries[0].batch);
    raft_free(tmp_entries);
err_after_extend_entries:
    if (*entries != NULL) {
        RaftHeapFree(*entries);
    }
err_after_read:
    RaftHeapFree(buf.base);
err:
    assert(rv != 0);
    return rv;
}
/* Check if the content of the segment file contains all zeros from the given
 * offset onward (an empty range trivially qualifies). */
static bool uvContentHasOnlyTrailingZeros(const struct raft_buffer *buf,
                                          size_t offset)
{
    const char *bytes = buf->base;
    size_t pos;
    for (pos = offset; pos < buf->len; pos++) {
        if (bytes[pos] != 0) {
            return false;
        }
    }
    return true;
}
/* Load all entries contained in an open segment. Empty, fully zeroed or
 * batch-less segments are removed; otherwise the file is truncated at the
 * last valid batch and renamed into a closed segment, *next_index is advanced
 * past the loaded entries, and @info is updated to describe the new closed
 * segment. */
static int uvSegmentLoadOpen(struct uv *uv,
                             struct uvSegmentInfo *info,
                             struct raft_entry *entries[],
                             size_t *n,
                             raft_index *next_index)
{
    raft_index first_index;         /* Index of first entry in segment */
    bool all_zeros;                 /* Whether the file is zero'ed */
    bool empty;                     /* Whether the segment file is empty */
    bool remove = false;            /* Whether to remove this segment */
    bool last = false;              /* Whether the last batch was reached */
    uint64_t format;                /* Format version */
    size_t n_batches = 0;           /* Number of loaded batches */
    struct raft_entry *tmp_entries; /* Entries in current batch */
    struct raft_buffer buf = {0};   /* Segment file content */
    size_t offset;                  /* Content read cursor */
    unsigned tmp_n_entries;         /* Number of entries in current batch */
    int i;
    char errmsg[RAFT_ERRMSG_BUF_SIZE];
    int rv;
    first_index = *next_index;
    rv = UvFsFileIsEmpty(uv->dir, info->filename, &empty, errmsg);
    if (rv != 0) {
        tracef("check if %s is empty: %s", info->filename, errmsg);
        rv = RAFT_IOERR;
        goto err;
    }
    if (empty) {
        /* Empty segment, let's discard it. */
        tracef("remove empty open segment %s", info->filename);
        remove = true;
        goto done;
    }
    rv = uvReadSegmentFile(uv, info->filename, &buf, &format);
    if (rv != 0) {
        goto err;
    }
    /* Check that the format is the expected one, or perhaps 0, indicating that
     * the segment was allocated but never written. */
    offset = sizeof format;
    if (format != UV__DISK_FORMAT) {
        if (format == 0) {
            all_zeros = uvContentHasOnlyTrailingZeros(&buf, offset);
            if (all_zeros) {
                /* This is equivalent to the empty case, let's remove the
                 * segment. */
                tracef("remove zeroed open segment %s", info->filename);
                remove = true;
                RaftHeapFree(buf.base);
                buf.base = NULL;
                goto done;
            }
        }
        ErrMsgPrintf(uv->io->errmsg, "unexpected format version %ju", format);
        rv = RAFT_CORRUPT;
        goto err_after_read;
    }
    /* Load all batches in the segment. */
    for (i = 1; !last; i++) {
        rv = uvLoadEntriesBatch(uv, &buf, &tmp_entries, &tmp_n_entries, &offset,
                                &last);
        if (rv != 0) {
            /* If this isn't a decoding error, just bail out. */
            if (rv != RAFT_CORRUPT) {
                ErrMsgWrapf(uv->io->errmsg,
                            "entries batch %u starting at byte %zu", i, offset);
                goto err_after_read;
            }
            /* If this is a decoding error, and not an OS error, check if the
             * rest of the file is filled with zeros. In that case we assume
             * that the server shutdown uncleanly and we just truncate this
             * incomplete data. */
            all_zeros = uvContentHasOnlyTrailingZeros(&buf, offset);
            if (!all_zeros) {
                tracef("%s has non-zero trail", info->filename);
            }
            Tracef(uv->tracer,
                   "truncate open segment %s at %zu (batch %d), since it has "
                   "corrupted "
                   "entries",
                   info->filename, offset, i);
            break;
        }
        rv = extendEntries(tmp_entries, tmp_n_entries, entries, n);
        if (rv != 0) {
            goto err_after_batch_load;
        }
        raft_free(tmp_entries);
        n_batches++;
        *next_index += tmp_n_entries;
    }
    if (n_batches == 0) {
        /* No valid batch could be loaded: discard the segment. */
        RaftHeapFree(buf.base);
        buf.base = NULL;
        remove = true;
    }
done:
    /* If the segment has no valid entries in it, we remove it. Otherwise we
     * rename it and keep it. */
    if (remove) {
        rv = UvFsRemoveFile(uv->dir, info->filename, errmsg);
        if (rv != 0) {
            tracef("unlink %s: %s", info->filename, errmsg);
            rv = RAFT_IOERR;
            goto err_after_read;
        }
    } else {
        char filename[UV__SEGMENT_FILENAME_BUF_SIZE];
        raft_index end_index = *next_index - 1;
        /* At least one entry was loaded */
        assert(end_index >= first_index);
        int nb = snprintf(filename, sizeof(filename), UV__CLOSED_TEMPLATE,
                          first_index, end_index);
        if ((nb < 0) || ((size_t)nb >= sizeof(filename))) {
            tracef("snprintf failed: %d", nb);
            rv = RAFT_IOERR;
            goto err;
        }
        tracef("finalize %s into %s", info->filename, filename);
        /* Truncate at the end of the last valid batch and rename into a
         * closed segment. */
        rv = UvFsTruncateAndRenameFile(uv->dir, (size_t)offset, info->filename,
                                       filename, errmsg);
        if (rv != 0) {
            tracef("finalize %s: %s", info->filename, errmsg);
            rv = RAFT_IOERR;
            goto err;
        }
        info->is_open = false;
        info->first_index = first_index;
        info->end_index = end_index;
        memset(info->filename, '\0', sizeof(info->filename));
        _Static_assert(sizeof(info->filename) >= sizeof(filename),
                       "Destination buffer too small");
        /* info->filename is zeroed out, info->filename is at least as large as
         * filename and we checked that nb < sizeof(filename) -> we won't
         * overflow and the result will be zero terminated. */
        memcpy(info->filename, filename, (size_t)nb);
    }
    return 0;
err_after_batch_load:
    raft_free(tmp_entries[0].batch);
    raft_free(tmp_entries);
err_after_read:
    if (buf.base != NULL) {
        RaftHeapFree(buf.base);
    }
err:
    assert(rv != 0);
    return rv;
}
/* Ensure that the write buffer of the given segment is large enough to hold
 * the given number of bytes, growing it in whole aligned blocks and
 * preserving any data already written. */
static int uvEnsureSegmentBufferIsLargeEnough(struct uvSegmentBuffer *b,
                                              size_t size)
{
    unsigned n_blocks;
    void *fresh;
    size_t fresh_len;
    /* Fast path: the current arena already fits the requested size. */
    if (b->arena.len >= size) {
        assert(b->arena.base != NULL);
        return 0;
    }
    /* Round the requested size up to a whole number of blocks. */
    n_blocks = (unsigned)(size / b->block_size);
    if (size % b->block_size != 0) {
        n_blocks++;
    }
    fresh_len = b->block_size * n_blocks;
    fresh = raft_aligned_alloc(b->block_size, fresh_len);
    if (fresh == NULL) {
        return RAFT_NOMEM;
    }
    memset(fresh, 0, fresh_len);
    /* If the current arena is initialized, we need to copy its content, since
     * it might have data that we want to retain in the next write. */
    if (b->arena.base != NULL) {
        assert(b->arena.len >= b->block_size);
        memcpy(fresh, b->arena.base, b->arena.len);
        raft_aligned_free(b->block_size, b->arena.base);
    }
    b->arena.base = fresh;
    b->arena.len = fresh_len;
    return 0;
}
/* Initialize a fresh segment write buffer: no arena is allocated yet and no
 * bytes have been written. */
void uvSegmentBufferInit(struct uvSegmentBuffer *b, size_t block_size)
{
    b->block_size = block_size;
    b->n = 0;
    b->arena.base = NULL;
    b->arena.len = 0;
}
/* Release the buffer's arena, if one was ever allocated. */
void uvSegmentBufferClose(struct uvSegmentBuffer *b)
{
    if (b->arena.base == NULL) {
        return;
    }
    raft_aligned_free(b->block_size, b->arena.base);
}
/* Write the 8-byte on-disk format version at the start of an empty segment
 * buffer. Must be the first write into the buffer. */
int uvSegmentBufferFormat(struct uvSegmentBuffer *b)
{
    size_t version_size = sizeof(uint64_t);
    uint8_t *cursor;
    int rv;
    assert(b->n == 0);
    rv = uvEnsureSegmentBufferIsLargeEnough(b, version_size);
    if (rv != 0) {
        return rv;
    }
    b->n = version_size;
    cursor = (uint8_t *)b->arena.base;
    bytePut64(&cursor, UV__DISK_FORMAT);
    return 0;
}
/* Encode a batch of entries into the segment buffer: two CRC32 checksums
 * (header then data), followed by the batch header and the entries' payloads.
 * The checksum slots are written as placeholders first and filled in once
 * both checksums have been computed. */
int uvSegmentBufferAppend(struct uvSegmentBuffer *b,
                          const struct raft_entry entries[],
                          unsigned n_entries)
{
    size_t size;     /* Total size of the batch */
    uint32_t crc1;   /* Header checksum */
    uint32_t crc2;   /* Data checksum */
    uint8_t *crc1_p; /* Pointer to header checksum slot */
    uint8_t *crc2_p; /* Pointer to data checksum slot */
    void *header;    /* Pointer to the header section */
    uint8_t *cursor;
    unsigned i;
    int rv;
    size = sizeof(uint32_t) * 2;            /* CRC checksums */
    size += uvSizeofBatchHeader(n_entries); /* Batch header */
    for (i = 0; i < n_entries; i++) {       /* Entries data */
        size += bytePad64(entries[i].buf.len);
    }
    rv = uvEnsureSegmentBufferIsLargeEnough(b, b->n + size);
    if (rv != 0) {
        return rv;
    }
    cursor = (uint8_t *)b->arena.base + b->n;
    /* Placeholder of the checksums */
    crc1_p = cursor;
    bytePut32(&cursor, 0);
    crc2_p = cursor;
    bytePut32(&cursor, 0);
    /* Batch header */
    header = cursor;
    uvEncodeBatchHeader(entries, n_entries, cursor);
    crc1 = byteCrc32(header, uvSizeofBatchHeader(n_entries), 0);
    cursor = (uint8_t *)cursor + uvSizeofBatchHeader(n_entries);
    /* Batch data */
    crc2 = 0;
    for (i = 0; i < n_entries; i++) {
        const struct raft_entry *entry = &entries[i];
        /* Entry buffers are expected to be padded to 64-bit words. */
        assert(entry->buf.len % sizeof(uint64_t) == 0);
        memcpy(cursor, entry->buf.base, entry->buf.len);
        crc2 = byteCrc32(cursor, entry->buf.len, crc2);
        cursor = (uint8_t *)cursor + entry->buf.len;
    }
    /* Fill in the checksum placeholders now that both values are known. */
    bytePut32(&crc1_p, crc1);
    bytePut32(&crc2_p, crc2);
    b->n += size;
    return 0;
}
/* Produce a write-ready view of the buffer, rounded up to a whole number of
 * blocks with the tail of the last block zeroed. */
void uvSegmentBufferFinalize(struct uvSegmentBuffer *b, uv_buf_t *out)
{
    unsigned n_blocks = (unsigned)(b->n / b->block_size);
    unsigned tail = (unsigned)(b->n % b->block_size);
    if (tail != 0) {
        /* Partially filled last block: count it and zero its remainder. */
        n_blocks++;
        memset(b->arena.base + b->n, 0, b->block_size - tail);
    }
    out->base = b->arena.base;
    out->len = n_blocks * b->block_size;
}
/* Reset the buffer after a write. With retain == 0 the buffer is emptied and
 * its first block zeroed; otherwise the block at index @retain is copied to
 * the front and b->n is reduced to the partial tail within that block. */
void uvSegmentBufferReset(struct uvSegmentBuffer *b, unsigned retain)
{
    assert(b->n > 0);
    assert(b->arena.base != NULL);
    if (retain == 0) {
        b->n = 0;
        memset(b->arena.base, 0, b->block_size);
        return;
    }
    /* Source and destination can't overlap here since retain >= 1, so plain
     * memcpy() is fine. */
    memcpy(b->arena.base, b->arena.base + retain * b->block_size,
           b->block_size);
    b->n = b->n % b->block_size;
}
/* When a corrupted segment is detected, the segment is renamed.
* Upon a restart, raft will not detect the segment anymore and will try
* to start without it.
* */
#define CORRUPT_FILE_FMT "corrupt-%" PRId64 "-%s"
/* Rename the given corrupt segment to corrupt-<timestamp>-<original-name> so
 * that it won't be detected (and loaded) again at the next startup. This is
 * best-effort: failures are logged and otherwise ignored. */
static void uvMoveCorruptSegment(struct uv *uv, struct uvSegmentInfo *info)
{
    char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0};
    char new_filename[UV__FILENAME_LEN + 1] = {0};
    size_t sz = sizeof(new_filename);
    int rv;
    struct timespec ts = {0};
    /* Ignore errors: on failure ts stays zeroed. */
    clock_gettime(CLOCK_REALTIME, &ts);
    /* Widen before multiplying: tv_sec * 1e9 can overflow a 32-bit time_t. */
    int64_t ns = (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
    rv = snprintf(new_filename, sz, CORRUPT_FILE_FMT, ns, info->filename);
    if (rv < 0 || rv >= (int)sz) {
        tracef("snprintf %d", rv);
        return;
    }
    /* BUGFIX: the return value of UvFsRenameFile() was previously discarded,
     * so the check below always tested the stale snprintf() result and rename
     * failures went unreported. */
    rv = UvFsRenameFile(uv->dir, info->filename, new_filename, errmsg);
    if (rv != 0) {
        tracef("%s", errmsg);
        return;
    }
}
/*
* On startup, raft will try to recover when a corrupt segment is detected.
*
* When a corrupt open segment is encountered, it, and all subsequent open
* segments, are renamed. Not renaming newer, possible non-corrupt, open
* segments could lead to loading inconsistent data.
*
* When a corrupt closed segment is encountered, it will be renamed when
* it is the last closed segment, in that case all open-segments are renamed
* too.
*/
/* Handle a corrupt segment found at position i_corrupt during startup.
 *
 * A corrupt open segment is renamed together with every segment that follows
 * it. A corrupt closed segment is renamed (again, together with everything
 * after it) only when no closed segment follows it, i.e. it is the last
 * closed segment. */
static void uvRecoverFromCorruptSegment(struct uv *uv,
                                        size_t i_corrupt,
                                        struct uvSegmentInfo *infos,
                                        size_t n_infos)
{
    bool rename_from_here;

    if (infos[i_corrupt].is_open) {
        rename_from_here = true;
    } else {
        /* Closed segment: only move it aside when it's the last one, or when
         * only open segments follow. */
        size_t i_next = i_corrupt + 1;
        rename_from_here = (i_next == n_infos || infos[i_next].is_open);
    }

    if (rename_from_here) {
        for (size_t i = i_corrupt; i < n_infos; ++i) {
            uvMoveCorruptSegment(uv, &infos[i]);
        }
    }
}
/* Load all entries from the given list of segments, both open and closed.
 *
 * Entries are appended to *entries (initialized to NULL here) and *n_entries
 * is updated accordingly. On error every loaded batch is released and both
 * output parameters are reset.
 *
 * Returns 0 on success, RAFT_CORRUPT when a segment is corrupt or a gap in
 * the index sequence is detected, or another error code. */
int uvSegmentLoadAll(struct uv *uv,
                     const raft_index start_index,
                     struct uvSegmentInfo *infos,
                     size_t n_infos,
                     struct raft_entry **entries,
                     size_t *n_entries)
{
    raft_index next_index;          /* Next entry to load from disk */
    struct raft_entry *tmp_entries; /* Entries in current segment */
    size_t tmp_n;                   /* Number of entries in current segment */
    size_t i;
    int rv;
    assert(start_index >= 1);
    assert(n_infos > 0);
    *entries = NULL;
    *n_entries = 0;
    next_index = start_index;
    for (i = 0; i < n_infos; i++) {
        struct uvSegmentInfo *info = &infos[i];
        tracef("load segment %s", info->filename);
        if (info->is_open) {
            rv = uvSegmentLoadOpen(uv, info, entries, n_entries, &next_index);
            /* NOTE(review): the error message is wrapped unconditionally,
             * even when rv == 0 -- confirm this is intended. */
            ErrMsgWrapf(uv->io->errmsg, "load open segment %s", info->filename);
            if (rv != 0) {
                if (rv == RAFT_CORRUPT && uv->auto_recovery) {
                    /* Rename the corrupt segment (and everything after it)
                     * so the next startup can proceed without it. */
                    uvRecoverFromCorruptSegment(uv, i, infos, n_infos);
                }
                goto err;
            }
        } else {
            assert(info->first_index >= start_index);
            assert(info->first_index <= info->end_index);
            /* Check that the start index encoded in the name of the segment
             * matches what we expect and there are no gaps in the sequence. */
            if (info->first_index != next_index) {
                ErrMsgPrintf(uv->io->errmsg,
                             "unexpected closed segment %s: first index should "
                             "have been %llu",
                             info->filename, next_index);
                rv = RAFT_CORRUPT;
                goto err;
            }
            rv = uvSegmentLoadClosed(uv, info, &tmp_entries, &tmp_n);
            if (rv != 0) {
                ErrMsgWrapf(uv->io->errmsg, "load closed segment %s",
                            info->filename);
                if (rv == RAFT_CORRUPT && uv->auto_recovery) {
                    uvRecoverFromCorruptSegment(uv, i, infos, n_infos);
                }
                goto err;
            }
            assert(tmp_n > 0);
            /* Append this segment's entries to the output array. */
            rv = extendEntries(tmp_entries, tmp_n, entries, n_entries);
            if (rv != 0) {
                /* TODO: release memory of entries in tmp_entries */
                goto err;
            }
            raft_free(tmp_entries);
            next_index += tmp_n;
        }
    }
    return 0;

err:
    assert(rv != 0);
    /* Free any batch that we might have allocated and the entries array as
     * well. */
    if (*entries != NULL) {
        void *batch = NULL;
        for (i = 0; i < *n_entries; i++) {
            struct raft_entry *entry = &(*entries)[i];
            /* Consecutive entries may share the same batch buffer; free each
             * batch only once. */
            if (entry->batch != batch) {
                batch = entry->batch;
                raft_free(batch);
            }
        }
        raft_free(*entries);
        *entries = NULL;
        *n_entries = 0;
    }
    return rv;
}
/* Write a closed segment spanning [first_index, last_index], containing a
 * single RAFT_CHANGE entry that holds the given encoded configuration at the
 * given term.
 *
 * Returns RAFT_TOOBIG if the encoded configuration does not fit in the first
 * block, RAFT_IOERR on write failure, or another error code. */
static int uvWriteClosedSegment(struct uv *uv,
                                raft_index first_index,
                                raft_index last_index,
                                const struct raft_buffer *conf,
                                raft_term conf_term)
{
    char filename[UV__FILENAME_LEN];
    struct uvSegmentBuffer buf = {0};
    struct raft_buffer data;
    struct raft_entry entry = {0};
    size_t cap;
    char errmsg[RAFT_ERRMSG_BUF_SIZE];
    int rv;
    assert(first_index <= last_index);
    /* Render the path */
    sprintf(filename, UV__CLOSED_TEMPLATE, first_index, last_index);
    /* Make sure that the given encoded configuration fits in the first
     * block */
    cap = uv->block_size -
          (sizeof(uint64_t) /* Format version */ +
           sizeof(uint64_t) /* Checksums */ + uvSizeofBatchHeader(1));
    if (conf->len > cap) {
        return RAFT_TOOBIG;
    }
    uvSegmentBufferInit(&buf, uv->block_size);
    /* Write the format version header. */
    rv = uvSegmentBufferFormat(&buf);
    if (rv != 0) {
        return rv;
    }
    /* The configuration is stored as a single RAFT_CHANGE entry. */
    entry.term = conf_term;
    entry.type = RAFT_CHANGE;
    entry.buf = *conf;
    rv = uvSegmentBufferAppend(&buf, &entry, 1);
    if (rv != 0) {
        uvSegmentBufferClose(&buf);
        return rv;
    }
    data.base = buf.arena.base;
    data.len = buf.n;
    /* Create the segment file; the buffer is released whether or not the
     * write succeeded. */
    rv = UvFsMakeFile(uv->dir, filename, &data, 1, errmsg);
    uvSegmentBufferClose(&buf);
    if (rv != 0) {
        tracef("write segment %s: %s", filename, errmsg);
        return RAFT_IOERR;
    }
    return 0;
}
/* Create the very first closed segment (spanning just index 1), containing
 * the given initial configuration with term 1. */
int uvSegmentCreateFirstClosed(struct uv *uv,
                               const struct raft_configuration *configuration)
{
    return uvSegmentCreateClosedWithConfiguration(uv, 1, configuration, 1);
}
/* Create a closed segment at the given index containing only the given
 * configuration, encoded and written at the given term, then sync the data
 * directory. */
int uvSegmentCreateClosedWithConfiguration(
    struct uv *uv,
    raft_index index,
    const struct raft_configuration *configuration,
    raft_term conf_term)
{
    struct raft_buffer encoded;
    char filename[UV__FILENAME_LEN];
    int rv;

    /* Render the path (uvWriteClosedSegment renders it again internally). */
    sprintf(filename, UV__CLOSED_TEMPLATE, index, index);

    /* Encode the given configuration. */
    rv = configurationEncode(configuration, &encoded);
    if (rv != 0) {
        assert(rv != 0);
        return rv;
    }

    /* Write the file; the encoded buffer is no longer needed afterwards,
     * whether the write succeeded or not. */
    rv = uvWriteClosedSegment(uv, index, index, &encoded, conf_term);
    raft_free(encoded.base);
    if (rv != 0) {
        return rv;
    }

    /* Make the new file durably visible in the directory. */
    rv = UvFsSyncDir(uv->dir, uv->io->errmsg);
    if (rv != 0) {
        return RAFT_IOERR;
    }
    return 0;
}
/* Truncate the given closed segment at the given index: entries from `index`
 * (included) onwards are discarded and a new closed segment named
 * <first_index>-<index - 1> is written containing the retained entries.
 * The original segment file is not removed here. */
int uvSegmentTruncate(struct uv *uv,
                      struct uvSegmentInfo *segment,
                      raft_index index)
{
    char filename[UV__FILENAME_LEN];
    struct raft_entry *entries;
    struct uvSegmentBuffer buf;
    struct raft_buffer data;
    size_t n;   /* Number of entries loaded from the segment */
    unsigned m; /* Number of entries to retain */
    char errmsg[RAFT_ERRMSG_BUF_SIZE];
    int rv;
    assert(!segment->is_open);
    tracef("truncate %llu-%llu at %llu", segment->first_index,
           segment->end_index, index);
    rv = uvSegmentLoadClosed(uv, segment, &entries, &n);
    if (rv != 0) {
        ErrMsgWrapf(uv->io->errmsg, "load closed segment %s",
                    segment->filename);
        goto out;
    }
    /* Discard all entries after the truncate index (included) */
    assert(index - segment->first_index < n);
    m = (unsigned)(index - segment->first_index);
    uvSegmentBufferInit(&buf, uv->block_size);
    rv = uvSegmentBufferFormat(&buf);
    if (rv != 0) {
        goto out_after_buffer_init;
    }
    rv = uvSegmentBufferAppend(&buf, entries, m);
    if (rv != 0) {
        goto out_after_buffer_init;
    }
    /* Render the path.
     *
     * TODO: we should use a temporary file name so in case of crash we don't
     * consider this segment as corrupted.
     */
    sprintf(filename, UV__CLOSED_TEMPLATE, segment->first_index, index - 1);
    data.base = buf.arena.base;
    data.len = buf.n;
    rv = UvFsMakeFile(uv->dir, filename, &data, 1, errmsg);
    if (rv != 0) {
        tracef("write %s: %s", filename, errmsg);
        rv = RAFT_IOERR;
        goto out_after_buffer_init;
    }
    /* Success also flows through the cleanup labels below with rv == 0. */
out_after_buffer_init:
    uvSegmentBufferClose(&buf);
    entryBatchesDestroy(entries, n);
out:
    return rv;
}
#undef tracef
raft-0.22.1/src/uv_send.c 0000664 0000000 0000000 00000035546 14601504142 0015140 0 ustar 00root root 0000000 0000000 #include
#include "../include/raft/uv.h"
#include "assert.h"
#include "heap.h"
#include "uv.h"
#include "uv_encoding.h"
#define tracef(...) Tracef(c->uv->tracer, __VA_ARGS__)
/* The happy path for an raft_io_send request is:
*
* - Get the uvClient object whose address matches the one of target server.
* - Encode the message and write it using the uvClient's TCP handle.
* - Once the write completes, fire the send request callback.
*
* Possible failure modes are:
*
* - The uv->clients queue has no client object with a matching address. In this
* case add a new client object to the array, add the send request to the
* queue of pending requests and submit a connection request. Once the
* connection request succeeds, try to write the encoded request to the
* connected stream handle. If the connection request fails, schedule another
* attempt.
*
* - The uv->clients queue has a client object which is not connected. Add the
* send request to the pending queue, and, if there's no connection attempt
* already in progress, start a new one.
*
* - The write request fails (either synchronously or asynchronously). In this
* case we fire the request callback with an error, close the connection
* stream, and start a re-connection attempt.
*/
/* Maximum number of requests that can be buffered. */
#define UV__CLIENT_MAX_PENDING 3
struct uvClient
{
struct uv *uv; /* libuv I/O implementation object */
struct uv_timer_s timer; /* Schedule connection attempts */
struct raft_uv_connect connect; /* Connection request */
struct uv_stream_s *stream; /* Current connection handle */
struct uv_stream_s *old_stream; /* Connection handle being closed */
unsigned n_connect_attempt; /* Consecutive connection attempts */
raft_id id; /* ID of the other server */
char *address; /* Address of the other server */
queue pending; /* Pending send message requests */
queue queue; /* Clients queue */
bool closing; /* True after calling uvClientAbort */
};
/* Hold state for a single send RPC message request. */
struct uvSend
{
struct uvClient *client; /* Client connected to the target server */
struct raft_io_send *req; /* User request */
uv_buf_t *bufs; /* Encoded raft RPC message to send */
unsigned n_bufs; /* Number of buffers */
uv_write_t write; /* Stream write request */
queue queue; /* Pending send requests queue */
};
/* Release all memory associated with the given send request object, the
 * object itself included. */
static void uvSendDestroy(struct uvSend *s)
{
    if (s->bufs == NULL) {
        RaftHeapFree(s);
        return;
    }
    /* Only the first buffer is ours: it holds the encoded message header.
     * Further buffers are entry or snapshot payloads, which we were passed
     * but we don't own. */
    RaftHeapFree(s->bufs[0].base);
    RaftHeapFree(s->bufs);
    RaftHeapFree(s);
}
/* Initialize a new client associated with the given server. */
static int uvClientInit(struct uvClient *c,
struct uv *uv,
raft_id id,
const char *address)
{
int rv;
c->uv = uv;
c->timer.data = c;
c->connect.data = NULL; /* Set upon starting a connect request */
c->stream = NULL; /* Set upon successful connection */
c->old_stream = NULL; /* Set after closing the current connection */
c->n_connect_attempt = 0;
c->id = id;
c->address = RaftHeapMalloc(strlen(address) + 1);
if (c->address == NULL) {
return RAFT_NOMEM;
}
rv = uv_timer_init(c->uv->loop, &c->timer);
assert(rv == 0);
strcpy(c->address, address);
QUEUE_INIT(&c->pending);
c->closing = false;
QUEUE_PUSH(&uv->clients, &c->queue);
return 0;
}
/* If there's no more pending cleanup, remove the client from the abort queue
* and destroy it. */
static void uvClientMaybeDestroy(struct uvClient *c)
{
struct uv *uv = c->uv;
assert(c->stream == NULL);
if (c->connect.data != NULL) {
return;
}
if (c->timer.data != NULL) {
return;
}
if (c->old_stream != NULL) {
return;
}
while (!QUEUE_IS_EMPTY(&c->pending)) {
queue *head;
struct uvSend *send;
struct raft_io_send *req;
head = QUEUE_HEAD(&c->pending);
send = QUEUE_DATA(head, struct uvSend, queue);
QUEUE_REMOVE(head);
req = send->req;
uvSendDestroy(send);
if (req->cb != NULL) {
req->cb(req, RAFT_CANCELED);
}
}
QUEUE_REMOVE(&c->queue);
assert(c->address != NULL);
RaftHeapFree(c->address);
RaftHeapFree(c);
uvMaybeFireCloseCb(uv);
}
/* Forward declaration. */
static void uvClientConnect(struct uvClient *c);
static void uvClientDisconnectCloseCb(struct uv_handle_s *handle)
{
struct uvClient *c = handle->data;
assert(c->old_stream != NULL);
assert(c->stream == NULL);
assert(handle == (struct uv_handle_s *)c->old_stream);
RaftHeapFree(c->old_stream);
c->old_stream = NULL;
if (c->closing) {
uvClientMaybeDestroy(c);
} else {
uvClientConnect(c); /* Trigger a new connection attempt. */
}
}
/* Tear down the current connection: move the stream handle aside into
 * old_stream and start closing it. The close callback either destroys the
 * client (when closing) or triggers a fresh connection attempt. */
static void uvClientDisconnect(struct uvClient *c)
{
    struct uv_stream_s *stream = c->stream;
    assert(stream != NULL);
    assert(c->old_stream == NULL);
    c->stream = NULL;
    c->old_stream = stream;
    uv_close((struct uv_handle_s *)stream, uvClientDisconnectCloseCb);
}
/* Invoked once an encoded RPC message has been written out. */
static void uvSendWriteCb(struct uv_write_s *write, const int status)
{
struct uvSend *send = write->data;
struct uvClient *c = send->client;
struct raft_io_send *req = send->req;
int cb_status = 0;
/* If the write failed and we're not currently closing, let's consider the
* current stream handle as busted and start disconnecting (unless we're
* already doing so). We'll trigger a new connection attempt once the handle
* is closed. */
if (status != 0) {
cb_status = RAFT_IOERR;
if (!c->closing) {
if (c->stream != NULL) {
uvClientDisconnect(c);
}
} else if (status == UV_ECANCELED) {
cb_status = RAFT_CANCELED;
}
}
uvSendDestroy(send);
if (req->cb != NULL) {
req->cb(req, cb_status);
}
}
static int uvClientSend(struct uvClient *c, struct uvSend *send)
{
int rv;
assert(!c->closing);
send->client = c;
/* If there's no connection available, let's queue the request. */
if (c->stream == NULL) {
tracef("no connection available -> enqueue message");
QUEUE_PUSH(&c->pending, &send->queue);
return 0;
}
tracef("connection available -> write message");
send->write.data = send;
rv = uv_write(&send->write, c->stream, send->bufs, send->n_bufs,
uvSendWriteCb);
if (rv != 0) {
tracef("write message failed -> rv %d", rv);
/* UNTESTED: what are the error conditions? perhaps ENOMEM */
return RAFT_IOERR;
}
return 0;
}
/* Try to execute all send requests that were blocked in the queue waiting for a
* connection. */
static void uvClientSendPending(struct uvClient *c)
{
int rv;
assert(c->stream != NULL);
tracef("send pending messages");
while (!QUEUE_IS_EMPTY(&c->pending)) {
queue *head;
struct uvSend *send;
head = QUEUE_HEAD(&c->pending);
send = QUEUE_DATA(head, struct uvSend, queue);
QUEUE_REMOVE(head);
rv = uvClientSend(c, send);
if (rv != 0) {
if (send->req->cb != NULL) {
send->req->cb(send->req, rv);
}
uvSendDestroy(send);
}
}
}
static void uvClientTimerCb(uv_timer_t *timer)
{
struct uvClient *c = timer->data;
tracef("timer expired -> attempt to reconnect");
uvClientConnect(c); /* Retry to connect. */
}
/* Count the send requests currently parked in the pending queue because no
 * connection is available yet. */
static unsigned uvClientPendingCount(struct uvClient *c)
{
    unsigned count = 0;
    queue *item;
    QUEUE_FOREACH (item, &c->pending) {
        count++;
    }
    return count;
}
static void uvClientConnectCb(struct raft_uv_connect *req,
struct uv_stream_s *stream,
int status)
{
struct uvClient *c = req->data;
unsigned n_pending;
int rv;
tracef("connect attempt completed -> status %s", errCodeToString(status));
assert(c->connect.data != NULL);
assert(c->stream == NULL);
assert(c->old_stream == NULL);
assert(!uv_is_active((struct uv_handle_s *)&c->timer));
c->connect.data = NULL;
/* If we are closing, bail out, possibly discarding the new connection. */
if (c->closing) {
if (status == 0) {
assert(stream != NULL);
c->stream = stream;
c->stream->data = c;
uvClientDisconnect(c);
} else {
uvClientMaybeDestroy(c);
}
return;
}
/* If, the connection attempt was successful, we're good. If we have pending
* requests, let's try to execute them. */
if (status == 0) {
assert(stream != NULL);
c->stream = stream;
c->n_connect_attempt = 0;
c->stream->data = c;
uvClientSendPending(c);
return;
}
/* Shrink the queue of pending requests, by failing the oldest ones */
n_pending = uvClientPendingCount(c);
if (n_pending > UV__CLIENT_MAX_PENDING) {
unsigned i;
for (i = 0; i < n_pending - UV__CLIENT_MAX_PENDING; i++) {
tracef("queue full -> evict oldest message");
queue *head;
struct uvSend *old_send;
struct raft_io_send *old_req;
head = QUEUE_HEAD(&c->pending);
old_send = QUEUE_DATA(head, struct uvSend, queue);
QUEUE_REMOVE(head);
old_req = old_send->req;
uvSendDestroy(old_send);
if (old_req->cb != NULL) {
old_req->cb(old_req, RAFT_NOCONNECTION);
}
}
}
/* Let's schedule another attempt. */
rv = uv_timer_start(&c->timer, uvClientTimerCb, c->uv->connect_retry_delay,
0);
assert(rv == 0);
}
/* Perform a single connection attempt, scheduling a retry if it fails. */
static void uvClientConnect(struct uvClient *c)
{
int rv;
assert(!c->closing);
assert(c->stream == NULL);
assert(c->old_stream == NULL);
assert(!uv_is_active((struct uv_handle_s *)&c->timer));
assert(c->connect.data == NULL);
c->n_connect_attempt++;
c->connect.data = c;
rv = c->uv->transport->connect(c->uv->transport, &c->connect, c->id,
c->address, uvClientConnectCb);
if (rv != 0) {
/* Restart the timer, so we can retry. */
c->connect.data = NULL;
rv = uv_timer_start(&c->timer, uvClientTimerCb,
c->uv->connect_retry_delay, 0);
assert(rv == 0);
}
}
/* Final callback in the close chain of an io_uv__client object */
static void uvClientTimerCloseCb(struct uv_handle_s *handle)
{
struct uvClient *c = handle->data;
assert(handle == (struct uv_handle_s *)&c->timer);
c->timer.data = NULL;
uvClientMaybeDestroy(c);
}
/* Start shutting down a client. This happens when the `raft_io` instance
* has been closed or when the address of the client has changed. */
static void uvClientAbort(struct uvClient *c)
{
struct uv *uv = c->uv;
int rv;
assert(c->stream != NULL || c->old_stream != NULL ||
uv_is_active((struct uv_handle_s *)&c->timer) ||
c->connect.data != NULL);
QUEUE_REMOVE(&c->queue);
QUEUE_PUSH(&uv->aborting, &c->queue);
rv = uv_timer_stop(&c->timer);
assert(rv == 0);
/* If we are connected, let's close the outbound stream handle. This will
* eventually complete all inflight write requests, possibly with failing
* them with UV_ECANCELED. */
if (c->stream != NULL) {
uvClientDisconnect(c);
}
/* Closing the timer implicitly stop it, so the timeout callback won't be
* fired. */
uv_close((struct uv_handle_s *)&c->timer, uvClientTimerCloseCb);
c->closing = true;
}
/* Find the client object associated with the given server, or create one if
* there's none yet. */
static int uvGetClient(struct uv *uv,
const raft_id id,
const char *address,
struct uvClient **client)
{
queue *head;
int rv;
/* Check if we already have a client object for this peer server. */
QUEUE_FOREACH (head, &uv->clients) {
*client = QUEUE_DATA(head, struct uvClient, queue);
if ((*client)->id != id) {
continue;
}
/* Client address has changed, abort connection and create a new one. */
if (strcmp((*client)->address, address) != 0) {
uvClientAbort(*client);
break;
}
return 0;
}
/* Initialize the new connection */
*client = RaftHeapMalloc(sizeof **client);
if (*client == NULL) {
rv = RAFT_NOMEM;
goto err;
}
rv = uvClientInit(*client, uv, id, address);
if (rv != 0) {
goto err_after_client_alloc;
}
/* Make a first connection attempt right away.. */
uvClientConnect(*client);
return 0;
err_after_client_alloc:
RaftHeapFree(*client);
err:
assert(rv != 0);
return rv;
}
int UvSend(struct raft_io *io,
struct raft_io_send *req,
const struct raft_message *message,
raft_io_send_cb cb)
{
struct uv *uv = io->impl;
struct uvSend *send;
struct uvClient *client;
int rv;
assert(!uv->closing);
/* Allocate a new request object. */
send = RaftHeapMalloc(sizeof *send);
if (send == NULL) {
rv = RAFT_NOMEM;
goto err;
}
send->req = req;
req->cb = cb;
rv = uvEncodeMessage(message, &send->bufs, &send->n_bufs);
if (rv != 0) {
send->bufs = NULL;
goto err_after_send_alloc;
}
/* Get a client object connected to the target server, creating it if it
* doesn't exist yet. */
rv = uvGetClient(uv, message->server_id, message->server_address, &client);
if (rv != 0) {
goto err_after_send_alloc;
}
rv = uvClientSend(client, send);
if (rv != 0) {
goto err_after_send_alloc;
}
return 0;
err_after_send_alloc:
uvSendDestroy(send);
err:
assert(rv != 0);
return rv;
}
/* Abort every client in the clients queue. Each uvClientAbort call moves the
 * client off the queue (onto the aborting queue), so the loop terminates. */
void UvSendClose(struct uv *uv)
{
    assert(uv->closing);
    while (!QUEUE_IS_EMPTY(&uv->clients)) {
        queue *first = QUEUE_HEAD(&uv->clients);
        struct uvClient *client = QUEUE_DATA(first, struct uvClient, queue);
        uvClientAbort(client);
    }
}
#undef tracef
raft-0.22.1/src/uv_snapshot.c 0000664 0000000 0000000 00000061520 14601504142 0016035 0 ustar 00root root 0000000 0000000 #include
#include
#include "array.h"
#include "assert.h"
#include "byte.h"
#include "compress.h"
#include "configuration.h"
#include "heap.h"
#include "uv.h"
#include "uv_encoding.h"
#include "uv_os.h"
#define tracef(...) Tracef(uv->tracer, __VA_ARGS__)
/* Arbitrary maximum configuration size. Should be practically be enough */
#define UV__META_MAX_CONFIGURATION_SIZE 1024 * 1024
/* Returns true if the filename is a valid snapshot file or snapshot meta
* filename depending on the `meta` switch. If the parse is successful, the
* arguments will contain the parsed values. */
static bool uvSnapshotParseFilename(const char *filename,
bool meta,
raft_term *term,
raft_index *index,
raft_time *timestamp)
{
/* Check if it's a well-formed snapshot filename */
int consumed = 0;
int matched;
size_t filename_len = strlen(filename);
assert(filename_len < UV__FILENAME_LEN);
if (meta) {
matched = sscanf(filename, UV__SNAPSHOT_META_TEMPLATE "%n", term, index,
timestamp, &consumed);
} else {
matched = sscanf(filename, UV__SNAPSHOT_TEMPLATE "%n", term, index,
timestamp, &consumed);
}
if (matched != 3 || consumed != (int)filename_len) {
return false;
}
return true;
}
/* Check if the given filename matches the pattern of a snapshot metadata
* filename (snapshot-xxx-yyy-zzz.meta), and fill the given info structure if
* so.
*
* Return true if the filename matched, false otherwise. */
static bool uvSnapshotInfoMatch(const char *filename,
                                struct uvSnapshotInfo *info)
{
    /* Allow room for the '\0' terminator. */
    size_t capacity = sizeof(info->filename) - 1;
    bool parsed = uvSnapshotParseFilename(filename, true, &info->term,
                                          &info->index, &info->timestamp);
    if (!parsed) {
        return false;
    }
    strncpy(info->filename, filename, capacity);
    info->filename[capacity] = '\0';
    return true;
}
/* Derive the snapshot data filename from the metadata filename by chopping
 * off the trailing ".meta" suffix. */
void uvSnapshotFilenameOf(struct uvSnapshotInfo *info, char *filename)
{
    size_t len = strlen(info->filename) - strlen(".meta");
    assert(len < UV__FILENAME_LEN);
    memcpy(filename, info->filename, len);
    filename[len] = '\0';
}
int UvSnapshotInfoAppendIfMatch(struct uv *uv,
const char *filename,
struct uvSnapshotInfo *infos[],
size_t *n_infos,
bool *appended)
{
struct uvSnapshotInfo info;
bool matched;
char snapshot_filename[UV__FILENAME_LEN];
bool exists;
bool is_empty;
char errmsg[RAFT_ERRMSG_BUF_SIZE];
int rv;
/* Check if it's a snapshot metadata filename */
matched = uvSnapshotInfoMatch(filename, &info);
if (!matched) {
*appended = false;
return 0;
}
/* Check if there's actually a valid snapshot file for this snapshot
* metadata. If there's none or it's empty, it means that we aborted before
* finishing the snapshot, or that another thread is still busy writing the
* snapshot. */
uvSnapshotFilenameOf(&info, snapshot_filename);
rv = UvFsFileExists(uv->dir, snapshot_filename, &exists, errmsg);
if (rv != 0) {
tracef("stat %s: %s", snapshot_filename, errmsg);
rv = RAFT_IOERR;
return rv;
}
if (!exists) {
*appended = false;
return 0;
}
/* TODO This check is strictly not needed, snapshot files are created by
* renaming fully written and synced tmp-files. Leaving it here, just to be
* extra-safe. Can probably be removed once more data integrity checks are
* performed at startup. */
rv = UvFsFileIsEmpty(uv->dir, snapshot_filename, &is_empty, errmsg);
if (rv != 0) {
tracef("is_empty %s: %s", snapshot_filename, errmsg);
rv = RAFT_IOERR;
return rv;
}
if (is_empty) {
*appended = false;
return 0;
}
ARRAY__APPEND(struct uvSnapshotInfo, info, infos, n_infos, rv);
if (rv == -1) {
return RAFT_NOMEM;
}
*appended = true;
return 0;
}
static int uvSnapshotIsOrphanInternal(const char *dir,
const char *filename,
bool meta,
bool *orphan)
{
raft_term term;
raft_index index;
raft_time timestamp;
bool is_snapshot_filename;
int rv;
*orphan = false;
is_snapshot_filename =
uvSnapshotParseFilename(filename, meta, &term, &index, ×tamp);
if (!is_snapshot_filename) {
return 0;
}
/* filename is a well-formed snapshot filename, check if the sibling file
* exists. */
char sibling_filename[UV__FILENAME_LEN];
if (meta) {
rv = snprintf(sibling_filename, UV__FILENAME_LEN, UV__SNAPSHOT_TEMPLATE,
term, index, timestamp);
} else {
rv = snprintf(sibling_filename, UV__FILENAME_LEN,
UV__SNAPSHOT_META_TEMPLATE, term, index, timestamp);
}
if (rv >= UV__FILENAME_LEN) {
/* Output truncated */
return -1;
}
bool sibling_exists = false;
char ignored[RAFT_ERRMSG_BUF_SIZE];
rv = UvFsFileExists(dir, sibling_filename, &sibling_exists, ignored);
if (rv != 0) {
return rv;
}
*orphan = !sibling_exists;
return 0;
}
int UvSnapshotIsOrphan(const char *dir, const char *filename, bool *orphan)
{
return uvSnapshotIsOrphanInternal(dir, filename, false, orphan);
}
int UvSnapshotMetaIsOrphan(const char *dir, const char *filename, bool *orphan)
{
return uvSnapshotIsOrphanInternal(dir, filename, true, orphan);
}
/* Compare two snapshots to decide which one is more recent. */
static int uvSnapshotCompare(const void *p1, const void *p2)
{
struct uvSnapshotInfo *s1 = (struct uvSnapshotInfo *)p1;
struct uvSnapshotInfo *s2 = (struct uvSnapshotInfo *)p2;
/* If terms are different, the snapshot with the highest term is the most
* recent. */
if (s1->term != s2->term) {
return s1->term < s2->term ? -1 : 1;
}
/* If the term are identical and the index differ, the snapshot with the
* highest index is the most recent */
if (s1->index != s2->index) {
return s1->index < s2->index ? -1 : 1;
}
/* If term and index are identical, compare the timestamp. */
return s1->timestamp < s2->timestamp ? -1 : 1;
}
/* Sort the given snapshots. */
void UvSnapshotSort(struct uvSnapshotInfo *infos, size_t n_infos)
{
qsort(infos, n_infos, sizeof *infos, uvSnapshotCompare);
}
/* Parse the metadata file of a snapshot and populate the metadata portion of
* the given snapshot object accordingly. */
static int uvSnapshotLoadMeta(struct uv *uv,
struct uvSnapshotInfo *info,
struct raft_snapshot *snapshot,
char *errmsg)
{
uint64_t header[1 + /* Format version */
1 + /* CRC checksum */
1 + /* Configuration index */
1 /* Configuration length */];
struct raft_buffer buf;
uint64_t format;
uint32_t crc1;
uint32_t crc2;
uv_file fd;
int rv;
snapshot->term = info->term;
snapshot->index = info->index;
rv = UvFsOpenFileForReading(uv->dir, info->filename, &fd, errmsg);
if (rv != 0) {
tracef("open %s: %s", info->filename, errmsg);
rv = RAFT_IOERR;
goto err;
}
buf.base = header;
buf.len = sizeof header;
rv = UvFsReadInto(fd, &buf, errmsg);
if (rv != 0) {
tracef("read %s: %s", info->filename, errmsg);
rv = RAFT_IOERR;
goto err_after_open;
}
format = byteFlip64(header[0]);
if (format != UV__DISK_FORMAT) {
tracef("load %s: unsupported format %ju", info->filename, format);
rv = RAFT_MALFORMED;
goto err_after_open;
}
crc1 = (uint32_t)byteFlip64(header[1]);
snapshot->configuration_index = byteFlip64(header[2]);
buf.len = (size_t)byteFlip64(header[3]);
if (buf.len > UV__META_MAX_CONFIGURATION_SIZE) {
tracef("load %s: configuration data too big (%zd)", info->filename,
buf.len);
rv = RAFT_CORRUPT;
goto err_after_open;
}
if (buf.len == 0) {
tracef("load %s: no configuration data", info->filename);
rv = RAFT_CORRUPT;
goto err_after_open;
}
buf.base = RaftHeapMalloc(buf.len);
if (buf.base == NULL) {
rv = RAFT_NOMEM;
goto err_after_open;
}
rv = UvFsReadInto(fd, &buf, errmsg);
if (rv != 0) {
tracef("read %s: %s", info->filename, errmsg);
rv = RAFT_IOERR;
goto err_after_buf_malloc;
}
crc2 = byteCrc32(header + 2, sizeof header - sizeof(uint64_t) * 2, 0);
crc2 = byteCrc32(buf.base, buf.len, crc2);
if (crc1 != crc2) {
ErrMsgPrintf(errmsg, "read %s: checksum mismatch", info->filename);
rv = RAFT_CORRUPT;
goto err_after_buf_malloc;
}
rv = configurationDecode(&buf, &snapshot->configuration);
if (rv != 0) {
goto err_after_buf_malloc;
}
RaftHeapFree(buf.base);
UvOsClose(fd);
return 0;
err_after_buf_malloc:
RaftHeapFree(buf.base);
err_after_open:
close(fd);
err:
assert(rv != 0);
return rv;
}
/* Load the snapshot data file and populate the data portion of the given
* snapshot object accordingly. */
static int uvSnapshotLoadData(struct uv *uv,
struct uvSnapshotInfo *info,
struct raft_snapshot *snapshot,
char *errmsg)
{
char filename[UV__FILENAME_LEN];
struct raft_buffer buf;
int rv;
uvSnapshotFilenameOf(info, filename);
rv = UvFsReadFile(uv->dir, filename, &buf, errmsg);
if (rv != 0) {
tracef("stat %s: %s", filename, errmsg);
goto err;
}
if (IsCompressed(buf.base, buf.len)) {
struct raft_buffer decompressed = {0};
tracef("snapshot decompress start");
rv = Decompress(buf, &decompressed, errmsg);
tracef("snapshot decompress end %d", rv);
if (rv != 0) {
tracef("decompress failed rv:%d", rv);
goto err_after_read_file;
}
RaftHeapFree(buf.base);
buf = decompressed;
}
snapshot->bufs = RaftHeapMalloc(sizeof *snapshot->bufs);
snapshot->n_bufs = 1;
if (snapshot->bufs == NULL) {
rv = RAFT_NOMEM;
goto err_after_read_file;
}
snapshot->bufs[0] = buf;
return 0;
err_after_read_file:
RaftHeapFree(buf.base);
err:
assert(rv != 0);
return rv;
}
/* Load a full snapshot: first the metadata file described by `meta`, then the
 * corresponding data file, populating the given snapshot object. */
int UvSnapshotLoad(struct uv *uv,
                   struct uvSnapshotInfo *meta,
                   struct raft_snapshot *snapshot,
                   char *errmsg)
{
    int rv = uvSnapshotLoadMeta(uv, meta, snapshot, errmsg);
    if (rv == 0) {
        rv = uvSnapshotLoadData(uv, meta, snapshot, errmsg);
    }
    return rv;
}
struct uvSnapshotPut
{
struct uv *uv;
size_t trailing;
struct raft_io_snapshot_put *req;
const struct raft_snapshot *snapshot;
uv_file snapshot_fd; /* Pre-allocated snapshot file */
struct
{
unsigned long long timestamp;
uint64_t header[4]; /* Format, CRC, configuration index/len */
struct raft_buffer bufs[2]; /* Preamble and configuration */
uv_file fd; /* Pre-allocated metadata temp file */
} meta;
char errmsg[RAFT_ERRMSG_BUF_SIZE];
int status;
struct UvBarrierReq barrier;
};
struct uvSnapshotGet
{
struct uv *uv;
struct raft_io_snapshot_get *req;
struct raft_snapshot *snapshot;
struct uv_work_s work;
char errmsg[RAFT_ERRMSG_BUF_SIZE];
int status;
queue queue;
};
static int uvSnapshotKeepLastTwo(struct uv *uv,
struct uvSnapshotInfo *snapshots,
size_t n)
{
size_t i;
char errmsg[RAFT_ERRMSG_BUF_SIZE];
int rv;
/* Leave at least two snapshots, for safety. */
if (n <= 2) {
return 0;
}
for (i = 0; i < n - 2; i++) {
struct uvSnapshotInfo *snapshot = &snapshots[i];
char filename[UV__FILENAME_LEN];
rv = UvFsRemoveFile(uv->dir, snapshot->filename, errmsg);
if (rv != 0) {
tracef("unlink %s: %s", snapshot->filename, errmsg);
return RAFT_IOERR;
}
uvSnapshotFilenameOf(snapshot, filename);
rv = UvFsRemoveFile(uv->dir, filename, errmsg);
if (rv != 0) {
tracef("unlink %s: %s", filename, errmsg);
return RAFT_IOERR;
}
}
return 0;
}
/* Remove all segments and snapshots that are not needed anymore, because
 * they fall outside the trailing amount of entries to retain. */
static int uvRemoveOldSegmentsAndSnapshots(struct uv *uv,
raft_index last_index,
size_t trailing,
char *errmsg)
{
struct uvSnapshotInfo *snapshots;
struct uvSegmentInfo *segments;
size_t n_snapshots;
size_t n_segments;
int rv = 0;
rv = UvList(uv, &snapshots, &n_snapshots, &segments, &n_segments, errmsg);
if (rv != 0) {
goto out;
}
rv = uvSnapshotKeepLastTwo(uv, snapshots, n_snapshots);
if (rv != 0) {
goto out;
}
if (segments != NULL) {
rv = uvSegmentKeepTrailing(uv, segments, n_segments, last_index,
trailing, errmsg);
if (rv != 0) {
goto out;
}
}
rv = UvFsSyncDir(uv->dir, errmsg);
out:
if (snapshots != NULL) {
RaftHeapFree(snapshots);
}
if (segments != NULL) {
RaftHeapFree(segments);
}
return rv;
}
/* Threadpool work callback for a snapshot put: finalize the pre-written
 * temporary metadata and data files under their final names, sync the
 * directory, then purge segments/snapshots that are no longer needed. Runs
 * off the loop thread; the outcome is communicated through put->status. */
static void uvSnapshotPutWorkCb(uv_work_t *work)
{
    struct uvSnapshotPut *put = work->data;
    struct uv *uv = put->uv;
    char metadata[UV__FILENAME_LEN];
    char snapshot[UV__FILENAME_LEN];
    char errmsg[RAFT_ERRMSG_BUF_SIZE];
    int rv;
    /* Finalize the metadata file first. */
    sprintf(metadata, UV__SNAPSHOT_META_TEMPLATE, put->snapshot->term,
            put->snapshot->index, put->meta.timestamp);
    rv = UvFsFinalizeTempFile(put->meta.fd, uv->dir, metadata, put->errmsg);
    if (rv != 0) {
        ErrMsgWrapf(put->errmsg, "finalize %s", metadata);
        tracef("snapshot.meta creation failed: %s", put->errmsg);
        put->status = RAFT_IOERR;
        return;
    }
    /* Then finalize the snapshot data file. */
    sprintf(snapshot, UV__SNAPSHOT_TEMPLATE, put->snapshot->term,
            put->snapshot->index, put->meta.timestamp);
    rv = UvFsFinalizeTempFile(put->snapshot_fd, uv->dir, snapshot, put->errmsg);
    tracef("snapshot write end %d", rv);
    if (rv != 0) {
        tracef("snapshot creation failed %d", rv);
        ErrMsgWrapf(put->errmsg, "finalize %s", snapshot);
        /* Best-effort cleanup of both files; errors are ignored. */
        UvFsRemoveFile(uv->dir, metadata, errmsg);
        UvFsRemoveFile(uv->dir, snapshot, errmsg);
        put->status = RAFT_IOERR;
        return;
    }
    /* Make the new files durable before deleting anything old. */
    rv = UvFsSyncDir(uv->dir, put->errmsg);
    if (rv != 0) {
        put->status = RAFT_IOERR;
        return;
    }
    rv = uvRemoveOldSegmentsAndSnapshots(uv, put->snapshot->index,
                                         put->trailing, put->errmsg);
    if (rv != 0) {
        put->status = rv;
        return;
    }
    put->status = 0;
    return;
}
/* Finish the put request, releasing all associated memory and invoking its
 * callback. The work slot must have been cleared by the caller already. */
static void uvSnapshotPutFinish(struct uvSnapshotPut *put)
{
    struct raft_io_snapshot_put *req = put->req;
    int status = put->status;
    struct uv *uv = put->uv;
    assert(uv->snapshot_put_work.data == NULL);
    /* Free the encoded configuration buffer and the request state before
     * firing the callback. */
    RaftHeapFree(put->meta.bufs[1].base);
    RaftHeapFree(put);
    req->cb(req, status);
}
/* Loop-thread completion callback for uvSnapshotPutWorkCb: release the work
 * slot, finish the request and unblock the I/O backend. */
static void uvSnapshotPutAfterWorkCb(uv_work_t *work, int status)
{
    struct uvSnapshotPut *put = work->data;
    struct uv *uv = put->uv;
    assert(status == 0); /* uv_cancel() is never issued for this work item */
    uv->snapshot_put_work.data = NULL;
    uvSnapshotPutFinish(put);
    UvUnblock(uv);
}
/* Start processing the given put request: submit the finalize/cleanup work to
 * the threadpool. */
static void uvSnapshotPutStart(struct uvSnapshotPut *put)
{
    struct uv *uv = put->uv;
    int rv;
    /* If this is an install request, the barrier callback must have fired. */
    if (put->trailing == 0) {
        assert(put->barrier.data == NULL);
    }
    uv->snapshot_put_work.data = put;
    rv = uv_queue_work(uv->loop, &uv->snapshot_put_work, uvSnapshotPutWorkCb,
                       uvSnapshotPutAfterWorkCb);
    if (rv != 0) {
        /* NOTE(review): on this path the request is never finished and its
         * callback never fires; the backend is only flagged as errored. */
        tracef("store snapshot %lld: %s", put->snapshot->index,
               uv_strerror(rv));
        uv->errored = true;
    }
}
/* Barrier callback for a snapshot put: start the actual put work, or abort
 * the request if the backend is shutting down. */
static void uvSnapshotPutBarrierCb(struct UvBarrierReq *barrier)
{
    /* Ensure that we don't invoke this callback more than once. */
    barrier->cb = NULL;
    struct uvSnapshotPut *put = barrier->data;
    if (put == NULL) {
        /* The request was already detached from the barrier. */
        return;
    }
    struct uv *uv = put->uv;
    put->barrier.data = NULL;
    /* If we're closing, abort the request. */
    if (uv->closing) {
        put->status = RAFT_CANCELED;
        uvSnapshotPutFinish(put);
        uvMaybeFireCloseCb(uv);
        return;
    }
    uvSnapshotPutStart(put);
}
/* Threadpool work callback that creates the two temporary files (metadata and
 * snapshot data) a put request writes to, storing their descriptors on the
 * request. put->status records the outcome. */
static void uvSnapshotPutWorkAllocateCb(uv_work_t *work)
{
    struct uvSnapshotPut *put = work->data;
    struct uv *uv = put->uv;
    const struct raft_snapshot *snapshot = put->snapshot;
    int rv;
    /* Write the metadata (header + encoded configuration) to a temp file. */
    rv = UvFsCreateTempFile(uv->dir, put->meta.bufs, 2, &put->meta.fd,
                            put->errmsg);
    if (rv != 0) {
        goto abort;
    }
    /* Write the snapshot payload to a second temp file. */
    rv = UvFsCreateTempFile(uv->dir, snapshot->bufs, snapshot->n_bufs,
                            &put->snapshot_fd, put->errmsg);
    if (rv != 0) {
        goto abort_after_meta_open;
    }
    put->status = 0;
    return;
abort_after_meta_open:
    UvOsClose(put->meta.fd);
abort:
    put->status = rv;
}
static void uvSnapshotPutAfterWorkAllocateCb(uv_work_t *work, int status);
/* Retry timer callback: re-submit the temp-file allocation work after a
 * previous attempt failed. */
static void uvSnapshotPutRetryTimerCb(uv_timer_t *timer)
{
    struct uvSnapshotPut *put = timer->data;
    struct uv *uv = put->uv;
    int rv;
    /* Return the retry-timer slot to the backend and re-claim the work slot
     * for this request. */
    uv->snapshot_put_retry.data = uv;
    uv->snapshot_put_work.data = put;
    rv = uv_queue_work(uv->loop, &uv->snapshot_put_work,
                       uvSnapshotPutWorkAllocateCb,
                       uvSnapshotPutAfterWorkAllocateCb);
    assert(rv == 0);
}
/* Loop-thread completion callback for uvSnapshotPutWorkAllocateCb: on success
 * submit a barrier so the write proper can start, on failure schedule a retry
 * via the retry timer, and abort altogether if the backend is closing. */
static void uvSnapshotPutAfterWorkAllocateCb(uv_work_t *work, int status)
{
    struct uvSnapshotPut *put = work->data;
    const struct raft_snapshot *snapshot = put->snapshot;
    struct uv *uv = put->uv;
    raft_index next_index;
    int rv;
    assert(status == 0); /* uv_cancel() is never issued for this work item */
    uv->snapshot_put_work.data = NULL;
    if (uv->closing) {
        put->status = RAFT_CANCELED;
        goto abort;
    }
    if (put->status != 0) {
        /* Temp-file creation failed: park the request on the retry timer. */
        assert(uv->snapshot_put_retry.data == uv);
        uv->snapshot_put_retry.data = put;
        tracef("retry snapshot write");
        rv = uv_timer_start(&uv->snapshot_put_retry, uvSnapshotPutRetryTimerCb,
                            uv->disk_retry, 0);
        assert(rv == 0);
        return;
    }
    /* - If the trailing parameter is set to 0, it means that we're restoring a
     * snapshot. Submit a barrier request setting the next append index to the
     * snapshot's last index + 1.
     * - When we are only writing a snapshot during normal operation, we close
     * all current open segments. New writes can continue on newly opened
     * segments that will only contain entries that are newer than the snapshot,
     * and we don't change append_next_index. */
    next_index =
        (put->trailing == 0) ? (snapshot->index + 1) : uv->append_next_index;
    rv = UvBarrier(uv, next_index, &put->barrier);
    if (rv != 0) {
        put->status = rv;
        goto abort;
    }
    return;
    /* NOTE(review): on this path the temp-file descriptors created by the
     * work callback appear to leak -- confirm whether they are closed
     * elsewhere. */
abort:
    assert(put->status != 0);
    uvSnapshotPutFinish(put);
    UvUnblock(uv);
}
/* Implementation of raft_io->snapshot_put: persist the given snapshot to
 * disk, keeping `trailing` log entries behind it (0 means the snapshot is
 * being installed/restored). Completes asynchronously through cb. */
int UvSnapshotPut(struct raft_io *io,
                  unsigned trailing,
                  struct raft_io_snapshot_put *req,
                  const struct raft_snapshot *snapshot,
                  raft_io_snapshot_put_cb cb)
{
    struct uv *uv;
    struct uvSnapshotPut *put;
    uint8_t *cursor;
    unsigned crc;
    int rv;
    uv = io->impl;
    if (uv->closing) {
        return RAFT_CANCELED;
    }
    /* Only one snapshot put may be in flight at a time. */
    assert(uv->snapshot_put_work.data == NULL);
    /* NOTE(review): format specifiers don't match the unsigned types of
     * snapshot->index (%lld) and trailing (%d). */
    tracef("put snapshot at %lld, keeping %d", snapshot->index, trailing);
    put = RaftHeapMalloc(sizeof *put);
    if (put == NULL) {
        rv = RAFT_NOMEM;
        goto err;
    }
    put->uv = uv;
    put->req = req;
    put->snapshot = snapshot;
    put->meta.timestamp = uv_now(uv->loop);
    put->trailing = trailing;
    put->barrier.data = put;
    /* The barrier is made blocking for installs (trailing == 0). */
    put->barrier.blocking = trailing == 0;
    put->barrier.cb = uvSnapshotPutBarrierCb;
    req->cb = cb;
    /* Prepare the buffers for the metadata file. Header layout (4 64-bit
     * words): [0] disk format, [1] CRC (filled below), [2] configuration
     * index, [3] length of the encoded configuration. */
    put->meta.bufs[0].base = put->meta.header;
    put->meta.bufs[0].len = sizeof put->meta.header;
    rv = configurationEncode(&snapshot->configuration, &put->meta.bufs[1]);
    if (rv != 0) {
        goto err_after_req_alloc;
    }
    cursor = (uint8_t *)put->meta.header;
    bytePut64(&cursor, UV__DISK_FORMAT);
    bytePut64(&cursor, 0); /* Placeholder for the CRC */
    bytePut64(&cursor, snapshot->configuration_index);
    bytePut64(&cursor, put->meta.bufs[1].len);
    /* The CRC covers header words 2-3 plus the encoded configuration. */
    crc = byteCrc32(&put->meta.header[2], sizeof(uint64_t) * 2, 0);
    crc = byteCrc32(put->meta.bufs[1].base, put->meta.bufs[1].len, crc);
    cursor = (uint8_t *)&put->meta.header[1];
    bytePut64(&cursor, crc);
    /* Allocate two temporary files to hold the metadata and snapshot files. */
    uv->snapshot_put_work.data = put;
    rv = uv_queue_work(uv->loop, &uv->snapshot_put_work,
                       uvSnapshotPutWorkAllocateCb,
                       uvSnapshotPutAfterWorkAllocateCb);
    if (rv != 0) {
        goto err_after_configuration_encode;
    }
    return 0;
err_after_configuration_encode:
    RaftHeapFree(put->meta.bufs[1].base);
err_after_req_alloc:
    RaftHeapFree(put);
err:
    assert(rv != 0);
    return rv;
}
/* Threadpool work callback for a snapshot get request: list the data
 * directory and load the most recent snapshot, if any. */
static void uvSnapshotGetWorkCb(uv_work_t *work)
{
    struct uvSnapshotGet *get = work->data;
    struct uv *uv = get->uv;
    struct uvSnapshotInfo *snapshots;
    size_t n_snapshots;
    struct uvSegmentInfo *segments;
    size_t n_segments;
    int rv;
    get->status = 0;
    rv = UvList(uv, &snapshots, &n_snapshots, &segments, &n_segments,
                get->errmsg);
    if (rv != 0) {
        get->status = rv;
        goto out;
    }
    if (snapshots != NULL) {
        /* Load the newest snapshot, assumed to be the last array entry
         * (consistent with uvSnapshotKeepLastTwo() above). */
        rv = UvSnapshotLoad(uv, &snapshots[n_snapshots - 1], get->snapshot,
                            get->errmsg);
        if (rv != 0) {
            get->status = rv;
        }
        RaftHeapFree(snapshots);
    }
    if (segments != NULL) {
        RaftHeapFree(segments);
    }
out:
    return;
}
/* Loop-thread completion callback for uvSnapshotGetWorkCb: unlink the
 * request, fire the user callback and let a pending backend close proceed. */
static void uvSnapshotGetAfterWorkCb(uv_work_t *work, int status)
{
    struct uvSnapshotGet *get = work->data;
    struct raft_io_snapshot_get *req = get->req;
    struct raft_snapshot *snapshot = get->snapshot;
    int req_status = get->status;
    struct uv *uv = get->uv;
    assert(status == 0); /* uv_cancel() is never issued for this work item */
    QUEUE_REMOVE(&get->queue);
    RaftHeapFree(get);
    /* The snapshot object is handed to the callback and not freed here. */
    req->cb(req, snapshot, req_status);
    uvMaybeFireCloseCb(uv);
}
/* Implementation of raft_io->snapshot_get: load the most recent snapshot from
 * disk in the threadpool and deliver it through cb. */
int UvSnapshotGet(struct raft_io *io,
                  struct raft_io_snapshot_get *req,
                  raft_io_snapshot_get_cb cb)
{
    struct uv *uv;
    struct uvSnapshotGet *get;
    int rv;
    uv = io->impl;
    assert(!uv->closing);
    get = RaftHeapMalloc(sizeof *get);
    if (get == NULL) {
        rv = RAFT_NOMEM;
        goto err;
    }
    get->uv = uv;
    get->req = req;
    req->cb = cb;
    get->snapshot = RaftHeapMalloc(sizeof *get->snapshot);
    if (get->snapshot == NULL) {
        rv = RAFT_NOMEM;
        goto err_after_req_alloc;
    }
    get->work.data = get;
    /* Track the request on the backend; presumably so a close can wait for
     * it -- confirm against uv.c. */
    QUEUE_PUSH(&uv->snapshot_get_reqs, &get->queue);
    rv = uv_queue_work(uv->loop, &get->work, uvSnapshotGetWorkCb,
                       uvSnapshotGetAfterWorkCb);
    if (rv != 0) {
        QUEUE_REMOVE(&get->queue);
        tracef("get last snapshot: %s", uv_strerror(rv));
        rv = RAFT_IOERR;
        goto err_after_snapshot_alloc;
    }
    return 0;
err_after_snapshot_alloc:
    RaftHeapFree(get->snapshot);
err_after_req_alloc:
    RaftHeapFree(get);
err:
    assert(rv != 0);
    return rv;
}
/* Close callback for the snapshot-put retry timer: mark the timer slot as
 * released and let a pending backend close complete. */
static void uvSnapshotPutRetryCloseCb(uv_handle_t *handle)
{
    struct uv *uv = handle->data;
    assert(uv->closing);
    uv->snapshot_put_retry.data = NULL;
    uvMaybeFireCloseCb(uv);
}
/* Cancel any snapshot put parked on the retry timer and close the timer.
 * Called while shutting down the uv backend. */
void UvSnapshotClose(struct uv *uv)
{
    if (uv->snapshot_put_retry.data != NULL) {
        /* If the timer data is not the backend itself, a put request is
         * waiting for a retry: cancel it and return the slot to the backend
         * (see uvSnapshotPutAfterWorkAllocateCb). */
        if (uv->snapshot_put_retry.data != uv) {
            struct uvSnapshotPut *put = uv->snapshot_put_retry.data;
            put->status = RAFT_CANCELED;
            uvSnapshotPutFinish(put);
            UvUnblock(uv);
            uv->snapshot_put_retry.data = uv;
        }
        uv_timer_stop(&uv->snapshot_put_retry);
        uv_close((uv_handle_t *)&uv->snapshot_put_retry,
                 uvSnapshotPutRetryCloseCb);
    }
}
#undef tracef
raft-0.22.1/src/uv_tcp.c 0000664 0000000 0000000 00000005762 14601504142 0014772 0 ustar 00root root 0000000 0000000 #include "uv_tcp.h"
#include "uv_ip.h"
#include
#include "../include/raft.h"
#include "../include/raft/uv.h"
#include "assert.h"
#include "err.h"
#include "heap.h"
/* Implementation of raft_uv_transport->init: record the local server's ID and
 * address on the transport implementation object. */
static int uvTcpInit(struct raft_uv_transport *transport,
                     raft_id id,
                     const char *address)
{
    struct UvTcp *tcp = transport->impl;
    assert(id > 0);
    assert(address != NULL);
    tcp->id = id;
    tcp->address = address;
    return 0;
}
/* Implementation of raft_uv_transport->close: stop listening, abort pending
 * connection attempts, and fire cb once everything has been torn down. */
static void uvTcpClose(struct raft_uv_transport *transport,
                       raft_uv_transport_close_cb cb)
{
    struct UvTcp *t = transport->impl;
    assert(!t->closing); /* close() must be called at most once */
    t->closing = true;
    t->close_cb = cb;
    UvTcpListenClose(t);
    UvTcpConnectClose(t);
    /* If nothing is pending, this fires the callback right away. */
    UvTcpMaybeFireCloseCb(t);
}
/* Fire the transport close callback if the transport is closing and there is
 * no remaining abort activity or listener. Safe to call at any time; does
 * nothing unless all conditions are met. */
void UvTcpMaybeFireCloseCb(struct UvTcp *t)
{
    if (!t->closing) {
        return;
    }
    /* By now any accepting/connecting requests must have been moved to the
     * aborting queue. */
    assert(QUEUE_IS_EMPTY(&t->accepting));
    assert(QUEUE_IS_EMPTY(&t->connecting));
    if (!QUEUE_IS_EMPTY(&t->aborting)) {
        return;
    }
    if (t->listeners != NULL) {
        return;
    }
    if (t->close_cb != NULL) {
        t->close_cb(t->transport);
    }
}
/* Initialize the given transport object to use this TCP implementation,
 * attached to the given libuv loop. Only version 1 of the transport interface
 * is supported; returns RAFT_INVALID otherwise, RAFT_NOMEM on allocation
 * failure. */
int raft_uv_tcp_init(struct raft_uv_transport *transport,
                     struct uv_loop_s *loop)
{
    struct UvTcp *t;
    /* Preserve the caller-owned fields across the memset below. */
    void *data = transport->data;
    int version = transport->version;
    if (version != 1) {
        ErrMsgPrintf(transport->errmsg, "Invalid version: %d", version);
        return RAFT_INVALID;
    }
    memset(transport, 0, sizeof *transport);
    transport->data = data;
    transport->version = version;
    t = raft_malloc(sizeof *t);
    if (t == NULL) {
        ErrMsgOom(transport->errmsg);
        return RAFT_NOMEM;
    }
    t->transport = transport;
    t->loop = loop;
    t->id = 0;
    t->address = NULL;
    t->bind_address = NULL;
    t->listeners = NULL;
    t->n_listeners = 0;
    t->accept_cb = NULL;
    QUEUE_INIT(&t->accepting);
    QUEUE_INIT(&t->connecting);
    QUEUE_INIT(&t->aborting);
    t->closing = false;
    t->close_cb = NULL;
    /* Wire up the interface methods. */
    transport->impl = t;
    transport->init = uvTcpInit;
    transport->close = uvTcpClose;
    transport->listen = UvTcpListen;
    transport->connect = UvTcpConnect;
    return 0;
}
/* Release the memory allocated by raft_uv_tcp_init(). Must be called only
 * after the transport's close callback has fired. */
void raft_uv_tcp_close(struct raft_uv_transport *transport)
{
    struct UvTcp *t = transport->impl;
    /* bind_address may be NULL -- assumes raft_free() is NULL-safe like
     * free(); confirm. */
    raft_free(t->bind_address);
    raft_free(t);
}
/* Set an explicit address:port for the listening socket to bind to, instead
 * of the address passed to transport->init. Returns RAFT_INVALID if the
 * address can't be split into host and service, RAFT_NOMEM on allocation
 * failure. */
int raft_uv_tcp_set_bind_address(struct raft_uv_transport *transport,
                                 const char *address)
{
    struct UvTcp *t = transport->impl;
    char hostname[NI_MAXHOST];
    char service[NI_MAXSERV];
    int rv;
    /* Validate the address format only; the split results are discarded. */
    rv = uvIpAddrSplit(address, hostname, sizeof(hostname), service,
                       sizeof(service));
    if (rv != 0) {
        return RAFT_INVALID;
    }
    /* NOTE(review): calling this twice leaks the previous bind_address. */
    t->bind_address = raft_malloc(strlen(address) + 1);
    if (t->bind_address == NULL) {
        return RAFT_NOMEM;
    }
    strcpy(t->bind_address, address);
    return 0;
}
raft-0.22.1/src/uv_tcp.h 0000664 0000000 0000000 00000003552 14601504142 0014772 0 ustar 00root root 0000000 0000000 #ifndef UV_TCP_H_
#define UV_TCP_H_
#include "../include/raft/uv.h"
#include "queue.h"
/* Protocol version. */
#define UV__TCP_HANDSHAKE_PROTOCOL 1
struct UvTcp
{
struct raft_uv_transport *transport; /* Interface object we implement */
struct uv_loop_s *loop; /* Event loop */
raft_id id; /* ID of this raft server */
const char *address; /* Address of this raft server */
unsigned n_listeners; /* Number of listener sockets */
struct uv_tcp_s *listeners; /* Listener sockets */
raft_uv_accept_cb accept_cb; /* Call after accepting a connection */
queue accepting; /* Connections being accepted */
queue connecting; /* Pending connection requests */
queue aborting; /* Connections being aborted */
bool closing; /* True after close() is called */
raft_uv_transport_close_cb close_cb; /* Call when it's safe to free us */
char *bind_address; /* Optional address:port to bind to */
};
/* Implementation of raft_uv_transport->listen. */
int UvTcpListen(struct raft_uv_transport *transport, raft_uv_accept_cb cb);
/* Stop accepting new connection and close all connections being accepted. */
void UvTcpListenClose(struct UvTcp *t);
/* Implementation of raft_uv_transport->connect. */
int UvTcpConnect(struct raft_uv_transport *transport,
struct raft_uv_connect *req,
raft_id id,
const char *address,
raft_uv_connect_cb cb);
/* Abort all pending connection requests. */
void UvTcpConnectClose(struct UvTcp *t);
/* Fire the transport close callback if the transport is closing and there's no
* more pending callback. */
void UvTcpMaybeFireCloseCb(struct UvTcp *t);
#endif /* UV_TCP_H_ */
raft-0.22.1/src/uv_tcp_connect.c 0000664 0000000 0000000 00000027165 14601504142 0016504 0 ustar 00root root 0000000 0000000 #include
#include "assert.h"
#include "byte.h"
#include "err.h"
#include "heap.h"
#include "uv_ip.h"
#include "uv_tcp.h"
/* The happy path of a connection request is:
*
* - Create a TCP handle and submit a TCP connect request.
* - Initiate an asynchronous dns resolve request
 * - Once the name lookup was successful, connect to the first resolved IP
* - Once connected over TCP, submit a write request for the handshake.
* - Once the write completes, fire the connection request callback.
*
* Alternative happy path of a connection request, if hostname resolves to
* multiple IPs and first/second/... IP is reachable:
* - close the tcp handle and initiate a new connect with next IP in cb
*
* Possible failure modes are:
*
 * - The name resolve for the hostname is not successful: close the TCP handle
* and fire the request callback.
*
 * - The raft_uv_transport object gets closed, close the TCP handle and fire
* the request callback with RAFT_CANCELED.
*
* - Either the TCP connect or the write request fails: close the TCP handle and
* fire the request callback with RAFT_NOCONNECTION.
*/
/* Hold state for a single connection request. */
struct uvTcpConnect
{
struct UvTcp *t; /* Transport implementation */
struct raft_uv_connect *req; /* User request */
uv_buf_t handshake; /* Handshake data */
struct uv_tcp_s *tcp; /* TCP connection socket handle */
struct uv_getaddrinfo_s getaddrinfo; /* DNS resolve request */
const struct addrinfo *ai_current; /* The current sockaddr to connect to */
struct uv_connect_s connect; /* TCP connection request */
struct uv_write_s write; /* TCP handshake request */
int status; /* Returned to the request callback */
bool resolving; /* Indicate name resolving in progress */
bool retry; /* Indicate tcp connect failure handling */
queue queue; /* Pending connect queue */
};
/* Encode a handshake message into the given buffer. The message carries the
 * protocol version, the local server ID and the local address, padded to a
 * multiple of 8 bytes. On success the caller owns buf->base; returns
 * RAFT_NOMEM on allocation failure. */
static int uvTcpEncodeHandshake(raft_id id, const char *address, uv_buf_t *buf)
{
    uint8_t *cursor;
    size_t address_len = strlen(address) + 1; /* Include the terminator */
    size_t address_len_padded = bytePad64(address_len);
    buf->len = sizeof(uint64_t) + /* Protocol version. */
               sizeof(uint64_t) + /* Server ID. */
               sizeof(uint64_t) + /* Size of the address buffer */
               address_len_padded;
    buf->base = RaftHeapMalloc(buf->len);
    if (buf->base == NULL) {
        return RAFT_NOMEM;
    }
    cursor = (uint8_t *)buf->base;
    bytePut64(&cursor, UV__TCP_HANDSHAKE_PROTOCOL);
    bytePut64(&cursor, id);
    bytePut64(&cursor, address_len_padded);
    memcpy(cursor, address, address_len);
    cursor += address_len;
    /* Zero the padding bytes. */
    memset(cursor, 0, address_len_padded - address_len);
    return 0;
}
/* Finish the connect request, releasing its memory and firing the connect
 * callback. On success the connected stream is handed to the callback; on the
 * failure path connect->tcp has already been freed and NULLed (see
 * uvTcpConnectUvCloseCb), so the callback receives a NULL stream. */
static void uvTcpConnectFinish(struct uvTcpConnect *connect)
{
    struct uv_stream_s *stream = (struct uv_stream_s *)connect->tcp;
    struct raft_uv_connect *req = connect->req;
    int status = connect->status;
    QUEUE_REMOVE(&connect->queue);
    RaftHeapFree(connect->handshake.base);
    /* Also releases the addrinfo list produced by the resolve request. */
    uv_freeaddrinfo(connect->getaddrinfo.addrinfo);
    raft_free(connect);
    req->cb(req, stream, status);
}
/* The TCP connection handle has been closed in consequence of an error or
 * because the transport is closing. Free the handle and complete the request
 * with the failure status recorded earlier. */
static void uvTcpConnectUvCloseCb(struct uv_handle_s *handle)
{
    struct uvTcpConnect *connect = handle->data;
    struct UvTcp *t = connect->t;
    assert(connect->status != 0); /* Only taken on failure/cancellation */
    assert(handle == (struct uv_handle_s *)connect->tcp);
    RaftHeapFree(connect->tcp);
    connect->tcp = NULL; /* Finish() will pass a NULL stream to the callback */
    uvTcpConnectFinish(connect);
    UvTcpMaybeFireCloseCb(t);
}
/* Abort a connection request: move it to the aborting queue and, unless a
 * getaddrinfo request or a connect retry is in flight, close its TCP handle
 * (the close callback performs the actual cleanup). */
static void uvTcpConnectAbort(struct uvTcpConnect *connect)
{
    QUEUE_REMOVE(&connect->queue);
    QUEUE_PUSH(&connect->t->aborting, &connect->queue);
    /* Best-effort: cancellation fails if the resolve already completed. */
    uv_cancel((struct uv_req_s *)&connect->getaddrinfo);
    /* If there is no getaddrinfo request in flight, close the TCP handle now
     * (otherwise it will be closed after the getaddrinfo request completes). */
    if (!connect->resolving && !connect->retry) {
        uv_close((struct uv_handle_s *)connect->tcp, uvTcpConnectUvCloseCb);
    }
}
/* The handshake TCP write completes. Fire the connect callback. */
static void uvTcpConnectUvWriteCb(struct uv_write_s *write, int status)
{
    struct uvTcpConnect *connect = write->data;
    struct UvTcp *t = connect->t;
    if (t->closing) {
        /* Just record the status: the abort path triggered by the transport
         * close will close the handle and finish the request. */
        connect->status = RAFT_CANCELED;
        return;
    }
    if (status != 0) {
        assert(status != UV_ECANCELED); /* t->closing would have been true */
        connect->status = RAFT_NOCONNECTION;
        uvTcpConnectAbort(connect);
        return;
    }
    /* Success: hand the connected stream to the user. */
    uvTcpConnectFinish(connect);
}
/* Helper function to connect to the remote node */
static void uvTcpAsyncConnect(struct uvTcpConnect *connect);
/* The TCP connect failed, we closed the handle and want to try with next IP */
static void uvTcpTryNextConnectCb(struct uv_handle_s *handle)
{
    struct uvTcpConnect *connect = handle->data;
    struct UvTcp *t = connect->t;
    int rv;
    connect->retry = false; /* The retry's handle close has completed */
    if (t->closing) {
        connect->status = RAFT_CANCELED;
        /* We are already in close cb for the tcp handle, simply invoke final cb
         */
        uvTcpConnectUvCloseCb(handle);
        return;
    }
    /* Re-initialize the closed handle and try the next resolved address. */
    rv = uv_tcp_init(t->loop, connect->tcp);
    assert(rv == 0);
    uvTcpAsyncConnect(connect);
}
/* The TCP connection is established. Write the handshake data. On connect
 * failure, fall back to the next resolved address, if any. */
static void uvTcpConnectUvConnectCb(struct uv_connect_s *req, int status)
{
    struct uvTcpConnect *connect = req->data;
    struct UvTcp *t = connect->t;
    int rv;
    if (t->closing) {
        /* The abort path will close the handle and finish the request. */
        connect->status = RAFT_CANCELED;
        return;
    }
    if (status != 0) {
        assert(status != UV_ECANCELED); /* t->closing would have been true */
        connect->ai_current = connect->ai_current->ai_next;
        if (connect->ai_current) {
            /* For the next connect attempt we need to close the tcp handle. */
            /* To avoid interference with aborting we set a flag to indicate the
             * connect attempt */
            connect->retry = true;
            uv_close((struct uv_handle_s *)connect->tcp, uvTcpTryNextConnectCb);
            return;
        }
        /* All resolved addresses have been tried and failed. */
        connect->status = RAFT_NOCONNECTION;
        ErrMsgPrintf(t->transport->errmsg, "uv_tcp_connect(): %s",
                     uv_strerror(status));
        goto err;
    }
    rv = uv_write(&connect->write, (struct uv_stream_s *)connect->tcp,
                  &connect->handshake, 1, uvTcpConnectUvWriteCb);
    if (rv != 0) {
        /* UNTESTED: what are the error conditions? perhaps ENOMEM */
        connect->status = RAFT_NOCONNECTION;
        goto err;
    }
    return;
err:
    uvTcpConnectAbort(connect);
}
/* Helper function to connect to the remote node: submit a TCP connect request
 * for the current resolved address. On submission failure the request is
 * aborted immediately. */
static void uvTcpAsyncConnect(struct uvTcpConnect *connect)
{
    int rv;
    rv = uv_tcp_connect(&connect->connect, connect->tcp,
                        connect->ai_current->ai_addr, uvTcpConnectUvConnectCb);
    if (rv != 0) {
        /* UNTESTED: since parsing succeed, this should fail only because of
         * lack of system resources */
        ErrMsgPrintf(connect->t->transport->errmsg, "uv_tcp_connect(): %s",
                     uv_strerror(rv));
        connect->status = RAFT_NOCONNECTION;
        uvTcpConnectAbort(connect);
    }
}
/* The hostname resolve is finished: start connecting to the first resolved
 * address, or fail/cancel the request. */
static void uvTcpConnectGetAddrInfoCb(uv_getaddrinfo_t *req,
                                      int status,
                                      struct addrinfo *res)
{
    struct uvTcpConnect *connect = req->data;
    struct UvTcp *t = connect->t;
    connect->resolving =
        false; /* The name resolving phase is over */
    if (t->closing) {
        connect->status = RAFT_CANCELED;
        /* We need to close the tcp handle to abort connection attempt */
        uv_close((struct uv_handle_s *)connect->tcp, uvTcpConnectUvCloseCb);
        return;
    }
    if (status < 0) {
        ErrMsgPrintf(t->transport->errmsg, "uv_getaddrinfo(): %s",
                     uv_err_name(status));
        connect->status = RAFT_NOCONNECTION;
        uvTcpConnectAbort(connect);
        return;
    }
    /* Try the first address in the resolved list. */
    connect->ai_current = res;
    uvTcpAsyncConnect(connect);
}
/* Create a new TCP handle and submit a connection request to the event loop.
 * On success (return 0), processing continues asynchronously in
 * uvTcpConnectGetAddrInfoCb(). */
static int uvTcpConnectStart(struct uvTcpConnect *r, const char *address)
{
    struct UvTcp *t = r->t;
    /* Resolve IPv4 TCP addresses only. */
    static struct addrinfo hints = {.ai_family = AF_INET,
                                    .ai_socktype = SOCK_STREAM,
                                    .ai_protocol = 0,
                                    .ai_flags = 0};
    char hostname[NI_MAXHOST];
    char service[NI_MAXSERV];
    int rv;
    rv = uvIpAddrSplit(address, hostname, sizeof(hostname), service,
                       sizeof(service));
    if (rv) {
        ErrMsgPrintf(t->transport->errmsg,
                     "uv_tcp_connect(): Cannot split %s into host and service",
                     address);
        rv = RAFT_NOCONNECTION;
        goto err;
    }
    /* Initialize the handshake buffer. */
    rv = uvTcpEncodeHandshake(t->id, t->address, &r->handshake);
    if (rv != 0) {
        assert(rv == RAFT_NOMEM);
        ErrMsgOom(t->transport->errmsg);
        goto err;
    }
    r->tcp = RaftHeapMalloc(sizeof *r->tcp);
    if (r->tcp == NULL) {
        ErrMsgOom(t->transport->errmsg);
        rv = RAFT_NOMEM;
        goto err_after_encode_handshake;
    }
    rv = uv_tcp_init(r->t->loop, r->tcp);
    assert(rv == 0);
    r->tcp->data = r;
    /* Kick off the asynchronous name resolution. */
    rv = uv_getaddrinfo(r->t->loop, &r->getaddrinfo, &uvTcpConnectGetAddrInfoCb,
                        hostname, service, &hints);
    if (rv) {
        ErrMsgPrintf(t->transport->errmsg,
                     "uv_tcp_connect(): Cannot initiate getaddrinfo %s",
                     uv_strerror(rv));
        rv = RAFT_NOCONNECTION;
        goto err_after_tcp_init;
    }
    r->resolving = true; /* Indicate we are in the name resolving phase */
    return 0;
err_after_tcp_init:
    /* Free the handle memory directly from the close callback. */
    uv_close((uv_handle_t *)r->tcp, (uv_close_cb)RaftHeapFree);
err_after_encode_handshake:
    RaftHeapFree(r->handshake.base);
err:
    return rv;
}
/* Implementation of raft_uv_transport->connect: allocate the request state,
 * queue it and start the resolve/connect/handshake sequence. cb fires once
 * the handshake has been written or the attempt has failed. */
int UvTcpConnect(struct raft_uv_transport *transport,
                 struct raft_uv_connect *req,
                 raft_id id,
                 const char *address,
                 raft_uv_connect_cb cb)
{
    struct UvTcp *t = transport->impl;
    struct uvTcpConnect *r;
    int rv;
    (void)id; /* The remote ID is not needed to establish the connection */
    assert(!t->closing);
    /* Create and initialize a new TCP connection request object */
    r = RaftHeapMalloc(sizeof *r);
    if (r == NULL) {
        rv = RAFT_NOMEM;
        ErrMsgOom(transport->errmsg);
        goto err;
    }
    r->t = t;
    r->req = req;
    r->status = 0;
    r->write.data = r;
    r->getaddrinfo.data = r;
    r->resolving = false;
    r->retry = false;
    r->connect.data = r;
    req->cb = cb;
    /* Keep track of the pending request */
    QUEUE_PUSH(&t->connecting, &r->queue);
    /* Start connecting */
    rv = uvTcpConnectStart(r, address);
    if (rv != 0) {
        goto err_after_alloc;
    }
    return 0;
err_after_alloc:
    QUEUE_REMOVE(&r->queue);
    RaftHeapFree(r);
err:
    return rv;
}
/* Abort every pending outgoing connection attempt. uvTcpConnectAbort() moves
 * each request from the connecting queue to the aborting one, so the loop
 * terminates once the connecting queue has been drained. */
void UvTcpConnectClose(struct UvTcp *t)
{
    queue *head;
    while (!QUEUE_IS_EMPTY(&t->connecting)) {
        head = QUEUE_HEAD(&t->connecting);
        uvTcpConnectAbort(QUEUE_DATA(head, struct uvTcpConnect, queue));
    }
}
raft-0.22.1/src/uv_tcp_listen.c 0000664 0000000 0000000 00000030776 14601504142 0016353 0 ustar 00root root 0000000 0000000 #include
#include "assert.h"
#include "byte.h"
#include "heap.h"
#include "uv_ip.h"
#include "uv_tcp.h"
/* The happy path of an incoming connection is:
*
* - The connection callback is fired on the listener TCP handle, and the
* incoming connection is uv_accept()'ed. We call uv_read_start() to get
* notified about received handshake data.
*
* - Once the preamble is received, we start waiting for the server address.
*
* - Once the server address is received, we fire the receive callback.
*
* Possible failure modes are:
*
* - The accept process gets canceled in the transport->close() implementation,
* by calling tcp_accept_stop(): the incoming TCP connection handle gets
* closed, preventing any further handshake data notification, and all
* allocated memory gets released in the handle close callback.
*/
/* Hold state for a connection being accepted. */
struct uvTcpHandshake
{
uint64_t preamble[3]; /* Preamble buffer */
uv_buf_t address; /* Address buffer */
size_t nread; /* Number of bytes read */
};
/* Hold handshake data for a new connection being established. */
struct uvTcpIncoming
{
struct UvTcp *t; /* Transport implementation */
struct uv_tcp_s *listener; /* The tcp handle, which accepted this socket */
struct uv_tcp_s *tcp; /* TCP connection socket handle */
struct uvTcpHandshake handshake; /* Handshake data */
queue queue; /* Pending accept queue */
};
/* Decode the handshake preamble, containing the protocol version, the ID of the
 * connecting server and the length of its address. Also, allocate the buffer to
 * start reading the server address. Returns RAFT_MALFORMED on a protocol
 * mismatch, RAFT_NOMEM if the address buffer can't be allocated. */
static int uvTcpDecodePreamble(struct uvTcpHandshake *h)
{
    uint64_t protocol;
    protocol = byteFlip64(h->preamble[0]);
    if (protocol != UV__TCP_HANDSHAKE_PROTOCOL) {
        return RAFT_MALFORMED;
    }
    /* NOTE(review): the address length comes from untrusted network input and
     * is passed to the allocator with no upper-bound check -- a malicious
     * peer can request an arbitrarily large allocation. Consider capping. */
    h->address.len = (size_t)byteFlip64(h->preamble[2]);
    h->address.base = RaftHeapMalloc(h->address.len);
    if (h->address.base == NULL) {
        return RAFT_NOMEM;
    }
    /* Reset the read offset: the address is read from scratch. */
    h->nread = 0;
    return 0;
}
/* The accepted TCP client connection has been closed, release all memory
 * associated with accept object. We can get here only if an error occurred
 * during the handshake or if raft_uv_transport->close() has been invoked. */
static void uvTcpIncomingCloseCb(struct uv_handle_s *handle)
{
    struct uvTcpIncoming *incoming = handle->data;
    struct UvTcp *t = incoming->t;
    QUEUE_REMOVE(&incoming->queue);
    /* The address buffer exists only if the preamble was fully decoded. */
    if (incoming->handshake.address.base != NULL) {
        RaftHeapFree(incoming->handshake.address.base);
    }
    RaftHeapFree(incoming->tcp);
    RaftHeapFree(incoming);
    /* This may have been the last item blocking a transport close. */
    UvTcpMaybeFireCloseCb(t);
}
/* Close an incoming TCP connection which hasn't completed the handshake yet. */
static void uvTcpIncomingAbort(struct uvTcpIncoming *incoming)
{
    struct UvTcp *t = incoming->t;
    /* After uv_close() returns we are guaranteed that no more alloc_cb or
     * read_cb will be called. */
    QUEUE_REMOVE(&incoming->queue);
    QUEUE_PUSH(&t->aborting, &incoming->queue);
    /* Cleanup continues in uvTcpIncomingCloseCb(). */
    uv_close((struct uv_handle_s *)incoming->tcp, uvTcpIncomingCloseCb);
}
/* Read the address part of the handshake: point libuv at the portion of the
 * address buffer that hasn't been filled yet. */
static void uvTcpIncomingAllocCbAddress(struct uv_handle_s *handle,
                                        size_t suggested_size,
                                        uv_buf_t *buf)
{
    struct uvTcpIncoming *incoming = handle->data;
    (void)suggested_size;
    assert(!incoming->t->closing);
    buf->base = incoming->handshake.address.base + incoming->handshake.nread;
    buf->len = incoming->handshake.address.len - incoming->handshake.nread;
}
/* Read callback for the address part of the handshake. Once the whole address
 * has been received, hand the connection over to the accept callback. */
static void uvTcpIncomingReadCbAddress(uv_stream_t *stream,
                                       ssize_t nread,
                                       const uv_buf_t *buf)
{
    struct uvTcpIncoming *incoming = stream->data;
    char *address;
    raft_id id;
    size_t n;
    int rv;
    (void)buf;
    assert(!incoming->t->closing);
    if (nread == 0) {
        /* Empty read just ignore it. */
        return;
    }
    if (nread < 0) {
        /* Read error: drop the connection. */
        uvTcpIncomingAbort(incoming);
        return;
    }
    /* We shouldn't have read more data than the pending amount. */
    n = (size_t)nread;
    assert(n <= incoming->handshake.address.len - incoming->handshake.nread);
    /* Advance the read window */
    incoming->handshake.nread += n;
    /* If there's more data to read in order to fill the current
     * read buffer, just return, we'll be invoked again. */
    if (incoming->handshake.nread < incoming->handshake.address.len) {
        return;
    }
    /* If we have completed reading the address, let's fire the callback. */
    rv = uv_read_stop(stream);
    assert(rv == 0);
    id = byteFlip64(incoming->handshake.preamble[1]);
    address = incoming->handshake.address.base;
    QUEUE_REMOVE(&incoming->queue);
    /* NOTE(review): the address is assumed NUL-terminated by a well-behaved
     * peer (see uvTcpEncodeHandshake); a malicious peer could omit the
     * terminator -- confirm the accept callback copies it defensively. The
     * tcp handle passes to the callback; the address buffer is freed right
     * after the call. */
    incoming->t->accept_cb(incoming->t->transport, id, address,
                           (struct uv_stream_s *)incoming->tcp);
    RaftHeapFree(incoming->handshake.address.base);
    RaftHeapFree(incoming);
}
/* Read the preamble of the handshake: point libuv at the portion of the
 * fixed-size preamble buffer that hasn't been filled yet. */
static void uvTcpIncomingAllocCbPreamble(struct uv_handle_s *handle,
                                         size_t suggested_size,
                                         uv_buf_t *buf)
{
    struct uvTcpIncoming *incoming = handle->data;
    (void)suggested_size;
    buf->base =
        (char *)incoming->handshake.preamble + incoming->handshake.nread;
    buf->len = sizeof incoming->handshake.preamble - incoming->handshake.nread;
}
/* Read callback for the handshake preamble. Once complete, decode it and
 * switch to reading the variable-length address part. */
static void uvTcpIncomingReadCbPreamble(uv_stream_t *stream,
                                        ssize_t nread,
                                        const uv_buf_t *buf)
{
    struct uvTcpIncoming *incoming = stream->data;
    size_t n;
    int rv;
    (void)buf;
    if (nread == 0) {
        /* Empty read just ignore it. */
        return;
    }
    if (nread < 0) {
        /* Read error: drop the connection. */
        uvTcpIncomingAbort(incoming);
        return;
    }
    /* We shouldn't have read more data than the pending amount. */
    n = (size_t)nread;
    assert(n <=
           sizeof incoming->handshake.preamble - incoming->handshake.nread);
    /* Advance the read window */
    incoming->handshake.nread += n;
    /* If there's more data to read in order to fill the current
     * read buffer, just return, we'll be invoked again. */
    if (incoming->handshake.nread < sizeof incoming->handshake.preamble) {
        return;
    }
    /* If we have completed reading the preamble, let's parse it. */
    rv = uvTcpDecodePreamble(&incoming->handshake);
    if (rv != 0) {
        uvTcpIncomingAbort(incoming);
        return;
    }
    /* Restart reading with the address-sized buffer (nread was reset by
     * uvTcpDecodePreamble). */
    rv = uv_read_stop(stream);
    assert(rv == 0);
    rv = uv_read_start((uv_stream_t *)incoming->tcp,
                       uvTcpIncomingAllocCbAddress, uvTcpIncomingReadCbAddress);
    assert(rv == 0);
}
/* Start reading handshake data for a new incoming connection: allocate and
 * initialize the TCP handle, accept the pending connection and begin reading
 * the preamble. */
static int uvTcpIncomingStart(struct uvTcpIncoming *incoming)
{
    int rv;
    memset(&incoming->handshake, 0, sizeof incoming->handshake);
    incoming->tcp = RaftHeapMalloc(sizeof *incoming->tcp);
    if (incoming->tcp == NULL) {
        return RAFT_NOMEM;
    }
    incoming->tcp->data = incoming;
    rv = uv_tcp_init(incoming->t->loop, incoming->tcp);
    assert(rv == 0);
    /* Take the pending connection off the listener's backlog. */
    rv = uv_accept((struct uv_stream_s *)incoming->listener,
                   (struct uv_stream_s *)incoming->tcp);
    if (rv != 0) {
        rv = RAFT_IOERR;
        goto err_after_tcp_init;
    }
    rv = uv_read_start((uv_stream_t *)incoming->tcp,
                       uvTcpIncomingAllocCbPreamble,
                       uvTcpIncomingReadCbPreamble);
    assert(rv == 0);
    return 0;
err_after_tcp_init:
    /* Free the handle memory directly from the close callback. */
    uv_close((uv_handle_t *)incoming->tcp, (uv_close_cb)RaftHeapFree);
    return rv;
}
/* Evaluate to true iff pointer `elem` points inside `array`, which has
 * `array_size` elements. Fully parenthesized so that the macro expands safely
 * inside larger expressions (the original expansion was unparenthesized, so
 * e.g. `!IS_IN_ARRAY(...)` negated only the first comparison, and an
 * expression passed as `array_size` could mis-associate with `*`). */
#define IS_IN_ARRAY(elem, array, array_size)              \
    ((const char *)(elem) >= (const char *)(array) &&     \
     (const char *)(elem) <                               \
         (const char *)(array) + (array_size) * sizeof(*(array)))
/* Called when there's a new incoming connection: create a new tcp_accept object
 * and start receiving handshake data. Errors are swallowed: the connection is
 * simply not accepted. */
static void uvTcpListenCb(struct uv_stream_s *stream, int status)
{
    struct UvTcp *t = stream->data;
    struct uvTcpIncoming *incoming;
    int rv;
    /* The stream must be one of our listener sockets. */
    assert(IS_IN_ARRAY(stream, t->listeners, t->n_listeners));
    if (status != 0) {
        rv = RAFT_IOERR;
        goto err;
    }
    incoming = RaftHeapMalloc(sizeof *incoming);
    if (incoming == NULL) {
        rv = RAFT_NOMEM;
        goto err;
    }
    incoming->t = t;
    incoming->listener = (struct uv_tcp_s *)stream;
    incoming->tcp = NULL;
    QUEUE_PUSH(&t->accepting, &incoming->queue);
    rv = uvTcpIncomingStart(incoming);
    if (rv != 0) {
        goto err_after_accept_alloc;
    }
    return;
err_after_accept_alloc:
    QUEUE_REMOVE(&incoming->queue);
    RaftHeapFree(incoming);
err:
    assert(rv != 0);
}
/* Bind the given TCP handle to addr and start listening on it, returning
 * RAFT_IOERR if either libuv call fails. */
static int uvTcpBindListen(struct uv_tcp_s *listener, struct sockaddr *addr)
{
    int rv;
    rv = uv_tcp_bind(listener, addr, 0);
    if (rv == 0) {
        rv = uv_listen((uv_stream_t *)listener, 1, uvTcpListenCb);
    }
    return rv != 0 ? RAFT_IOERR : 0;
}
/* Create a tcp handle and do bind/listen for each IP in the addr_infos list.
 * On success the listeners are stored in t->listeners and 0 is returned. On
 * failure every successfully initialized listener is closed, the array is
 * released and an error code is returned. */
static int uvTcpListenOnMultipleIP(struct raft_uv_transport *transport,
                                   struct addrinfo *addr_infos)
{
    struct UvTcp *t;
    struct addrinfo *current;
    unsigned n_listeners;
    unsigned i = 0; /* Number of listeners successfully initialized */
    int rv;
    t = transport->impl;

    /* Count the addresses to listen on. */
    n_listeners = 0;
    for (current = addr_infos; current; current = current->ai_next) {
        ++n_listeners;
    }

    t->listeners = raft_malloc(n_listeners * sizeof(*t->listeners));
    if (t->listeners == NULL) {
        rv = RAFT_NOMEM;
        goto err;
    }
    t->n_listeners = n_listeners;

    current = addr_infos;
    for (i = 0; i < n_listeners; ++i) {
        struct uv_tcp_s *listener = &t->listeners[i];
        listener->data = t;
        if (uv_tcp_init(t->loop, listener)) {
            /* This handle was never initialized: it must not be closed. */
            rv = RAFT_IOERR;
            goto err;
        }
        if (uvTcpBindListen(listener, current->ai_addr)) {
            /* This handle was initialized, so close it along with the rest. */
            rv = RAFT_IOERR;
            uv_close((struct uv_handle_s *)listener, NULL);
            goto err;
        }
        /* BUGFIX: advance from the current entry. The previous code restarted
         * from addr_infos->ai_next every iteration, so with more than two
         * resolved addresses the second address was bound repeatedly and the
         * later addresses were never used. */
        current = current->ai_next;
    }
    return 0;

err:
    if (t->listeners != NULL) {
        /* Close only the listeners that were fully set up (the old code also
         * closed the slot that failed initialization, which is undefined
         * behaviour for an uninitialized handle).
         * NOTE(review): freeing the array while uv_close() is still pending
         * is dubious -- libuv requires handle memory to stay valid until the
         * close callback runs; kept as before for compatibility. */
        for (unsigned j = 0; j < i; ++j) {
            uv_close((struct uv_handle_s *)&t->listeners[j], NULL);
        }
        raft_free(t->listeners);
        t->listeners = NULL;
        t->n_listeners = 0;
    }
    return rv;
}
/* Ignore duplicate entries from glibc getaddrinfo due to
 * https://bugzilla.redhat.com/show_bug.cgi?id=496300
 * in case of resolving localhost.
 *
 * Return true only for the exact shape that bug produces: a two-element list
 * whose entries carry byte-identical socket addresses. */
static bool uvIsAddressDuplication(struct addrinfo *addr_info)
{
    struct addrinfo *next = addr_info->ai_next;
    /* Check, if we have a list of length 2 */
    if (!next || next->ai_next) {
        return false;
    }
    /* Compare lengths first, then the raw address bytes. memcmp() replaces
     * the legacy bcmp(), which was removed from POSIX.1-2008. */
    if (addr_info->ai_addrlen != next->ai_addrlen ||
        memcmp(addr_info->ai_addr, next->ai_addr, addr_info->ai_addrlen) !=
            0) {
        return false;
    }
    return true;
}
/* Start listening for raft peer connections, invoking cb for each accepted
 * one. Returns 0 on success or an error code from address resolution or
 * listener setup. */
int UvTcpListen(struct raft_uv_transport *transport, raft_uv_accept_cb cb)
{
    struct UvTcp *t;
    struct addrinfo *addr_infos;
    int rv;
    t = transport->impl;
    t->accept_cb = cb;
    /* Resolve the bind address: use the explicit bind_address when one was
     * configured, otherwise fall back to the transport's main address. */
    if (t->bind_address == NULL) {
        rv = uvIpResolveBindAddresses(t->address, &addr_infos);
    } else {
        rv = uvIpResolveBindAddresses(t->bind_address, &addr_infos);
    }
    if (rv != 0 || !addr_infos) {
        return rv;
    }
    /* Work around glibc returning the same localhost address twice (see
     * uvIsAddressDuplication): listen on the tail entry only. */
    if (addr_infos->ai_next && uvIsAddressDuplication(addr_infos)) {
        rv = uvTcpListenOnMultipleIP(transport, addr_infos->ai_next);
    } else {
        rv = uvTcpListenOnMultipleIP(transport, addr_infos);
    }
    /* The listeners copied what they need; the resolved list can go. */
    freeaddrinfo(addr_infos);
    return rv;
}
/* Close callback for uvTcp->listener: once the last listener handle has been
 * closed, release the listeners array and possibly fire the transport's close
 * callback. */
static void uvTcpListenCloseCbListener(struct uv_handle_s *handle)
{
    struct UvTcp *t = handle->data;
    assert(t->closing);
    assert(t->n_listeners);
    assert(t->listeners);
    t->n_listeners--;
    if (t->n_listeners > 0) {
        return; /* Other listeners are still closing. */
    }
    raft_free(t->listeners);
    t->listeners = NULL;
    UvTcpMaybeFireCloseCb(t);
}
/* Stop listening: abort every connection whose handshake is still in
 * progress, then close each listener handle. The listeners array itself is
 * released by uvTcpListenCloseCbListener once the last close completes. */
void UvTcpListenClose(struct UvTcp *t)
{
    unsigned i;
    assert(t->closing);
    while (!QUEUE_IS_EMPTY(&t->accepting)) {
        queue *head = QUEUE_HEAD(&t->accepting);
        struct uvTcpIncoming *pending =
            QUEUE_DATA(head, struct uvTcpIncoming, queue);
        uvTcpIncomingAbort(pending);
    }
    for (i = 0; i < t->n_listeners; ++i) {
        uv_close((struct uv_handle_s *)&t->listeners[i],
                 uvTcpListenCloseCbListener);
    }
}
raft-0.22.1/src/uv_truncate.c 0000664 0000000 0000000 00000012263 14601504142 0016023 0 ustar 00root root 0000000 0000000 #include
#include
#include "assert.h"
#include "byte.h"
#include "heap.h"
#include "uv.h"
#include "uv_encoding.h"
#define tracef(...) Tracef(uv->tracer, __VA_ARGS__)
/* Track a truncate request. */
struct uvTruncate
{
    struct uv *uv;               /* Originating I/O implementation */
    struct UvBarrierReq barrier; /* Waits for inflight writes to finish */
    raft_index index;            /* Truncate point passed to UvTruncate() */
    int status;                  /* Result set by uvTruncateWorkCb() */
};
/* Execute a truncate request in a thread.
 *
 * Runs on the libuv threadpool. The barrier taken in UvTruncate() guarantees
 * no write is in flight while this runs. The outcome is stored in
 * truncate->status and reported on the loop thread by
 * uvTruncateAfterWorkCb(). */
static void uvTruncateWorkCb(uv_work_t *work)
{
    struct uvTruncate *truncate = work->data;
    struct uv *uv = truncate->uv;
    tracef("uv truncate work cb");
    struct uvSnapshotInfo *snapshots;
    struct uvSegmentInfo *segments;
    struct uvSegmentInfo *segment;
    size_t n_snapshots;
    size_t n_segments;
    size_t i;
    size_t j;
    char errmsg[RAFT_ERRMSG_BUF_SIZE];
    int rv;
    /* Load all segments on disk. */
    rv = UvList(uv, &snapshots, &n_snapshots, &segments, &n_segments, errmsg);
    if (rv != 0) {
        goto err;
    }
    /* Only the segment list is needed here; discard the snapshot list. */
    if (snapshots != NULL) {
        RaftHeapFree(snapshots);
    }
    assert(segments != NULL);
    /* Find the segment that contains the truncate point. */
    segment = NULL; /* Suppress warnings. */
    for (i = 0; i < n_segments; i++) {
        segment = &segments[i];
        if (segment->is_open) {
            /* Open segments have no fixed index range to match against. */
            continue;
        }
        if (truncate->index >= segment->first_index &&
            truncate->index <= segment->end_index) {
            break;
        }
    }
    /* The truncate point must fall inside some closed segment. */
    assert(i < n_segments);
    /* If the truncate index is not the first of the segment, we need to
     * truncate it. */
    if (truncate->index > segment->first_index) {
        rv = uvSegmentTruncate(uv, segment, truncate->index);
        if (rv != 0) {
            goto err_after_list;
        }
    }
    /* Remove all closed segments past the one containing the truncate index. */
    for (j = i; j < n_segments; j++) {
        segment = &segments[j];
        if (segment->is_open) {
            continue;
        }
        rv = UvFsRemoveFile(uv->dir, segment->filename, errmsg);
        if (rv != 0) {
            tracef("unlink segment %s: %s", segment->filename, errmsg);
            rv = RAFT_IOERR;
            goto err_after_list;
        }
    }
    /* Make the removals durable before declaring success. */
    rv = UvFsSyncDir(uv->dir, errmsg);
    if (rv != 0) {
        tracef("sync data directory: %s", errmsg);
        rv = RAFT_IOERR;
        goto err_after_list;
    }
    RaftHeapFree(segments);
    truncate->status = 0;
    tracef("uv truncate work cb ok");
    return;
err_after_list:
    RaftHeapFree(segments);
err:
    assert(rv != 0);
    truncate->status = rv;
}
/* Loop-thread completion of a truncate: record failures, release the request
 * and unblock the I/O implementation. */
static void uvTruncateAfterWorkCb(uv_work_t *work, int status)
{
    struct uvTruncate *truncate;
    struct uv *uv;
    assert(work != NULL);
    truncate = work->data;
    assert(truncate != NULL);
    uv = truncate->uv;
    assert(uv != NULL);
    tracef("uv truncate after work cb status:%d", status);
    /* Worker requests are never canceled. */
    assert(status == 0);
    if (truncate->status != 0) {
        uv->errored = true;
    }
    tracef("clear truncate work");
    uv->truncate_work.data = NULL;
    RaftHeapFree(truncate);
    UvUnblock(uv);
}
/* Invoked once the barrier taken by UvTruncate() has been reached: no write
 * is in flight anymore, so queue the blocking truncate work on the
 * threadpool. */
static void uvTruncateBarrierCb(struct UvBarrierReq *barrier)
{
    struct uvTruncate *truncate = barrier->data;
    struct uv *uv = truncate->uv;
    tracef("uv truncate barrier cb");
    int rv;
    /* Ensure that we don't invoke this callback more than once. */
    barrier->cb = NULL;
    /* If we're closing, don't perform truncation at all and abort here. */
    if (uv->closing) {
        tracef("closing => don't truncate");
        RaftHeapFree(truncate);
        uvMaybeFireCloseCb(uv);
        return;
    }
    /* The barrier guarantees no write is in flight and no segment is being
     * finalized. */
    assert(QUEUE_IS_EMPTY(&uv->append_writing_reqs));
    assert(QUEUE_IS_EMPTY(&uv->finalize_reqs));
    assert(uv->finalize_work.data == NULL);
    assert(uv->truncate_work.data == NULL);
    tracef("set truncate work");
    uv->truncate_work.data = truncate;
    rv = uv_queue_work(uv->loop, &uv->truncate_work, uvTruncateWorkCb,
                       uvTruncateAfterWorkCb);
    if (rv != 0) {
        /* BUG FIX: raft_index is unsigned, so use %llu as done elsewhere in
         * this file (see UvTruncate), not %lld. */
        tracef("truncate index %llu: %s", truncate->index, uv_strerror(rv));
        tracef("clear truncate work");
        uv->truncate_work.data = NULL;
        uv->errored = true;
        /* BUG FIX: the request is no longer referenced at this point (the
         * closing path above frees it the same way), so release it instead of
         * leaking it. */
        RaftHeapFree(truncate);
    }
}
/* Submit a request to delete all log entries from the given index onwards.
 * Returns 0 on success (including the no-op case), RAFT_NOMEM on allocation
 * failure, or the error from UvBarrier(). */
int UvTruncate(struct raft_io *io, raft_index index)
{
    struct uv *uv = io->impl;
    struct uvTruncate *truncate;
    int rv;
    assert(!uv->closing);
    /* We should truncate only entries that we were requested to append in the
     * first place. If the truncation index equals the next append index, this
     * is a no-op. */
    assert(index > 0);
    if (index >= uv->append_next_index) {
        return 0;
    }
    tracef("uv truncate %llu", index);
    truncate = RaftHeapMalloc(sizeof *truncate);
    if (truncate == NULL) {
        return RAFT_NOMEM;
    }
    truncate->uv = uv;
    truncate->index = index;
    truncate->barrier.data = truncate;
    truncate->barrier.blocking = true;
    truncate->barrier.cb = uvTruncateBarrierCb;
    /* Make sure that we wait for any inflight writes to finish and then close
     * the current segment. */
    rv = UvBarrier(uv, index, &truncate->barrier);
    if (rv != 0) {
        RaftHeapFree(truncate);
        return rv;
    }
    return 0;
}
raft-0.22.1/src/uv_writer.c 0000664 0000000 0000000 00000036515 14601504142 0015520 0 ustar 00root root 0000000 0000000 #include "uv_writer.h"
#include
#include
#include "../include/raft.h"
#include "../include/raft/uv.h"
#include "assert.h"
#include "heap.h"
#include "tracing.h"
#define trace(TYPE, INFO) Trace(w->tracer, TYPE, INFO)
/* Copy the error message from the request object to the writer object.
 * Requests carry their own errmsg buffer for thread-safety; the message is
 * surfaced on the writer once the request completes on the loop thread. */
static void uvWriterReqTransferErrMsg(struct UvWriterReq *req)
{
    ErrMsgPrintf(req->writer->errmsg, "%s", req->errmsg);
}
/* Set the request status according the given result code: a negative result
 * is an I/O error, fewer bytes than requested is a short write (out of
 * space), and a full write is success. */
static void uvWriterReqSetStatus(struct UvWriterReq *req, int result)
{
    if (result < 0) {
        ErrMsgPrintf(req->errmsg, "write failed: %d", result);
        req->status = RAFT_IOERR;
        return;
    }
    if ((size_t)result < req->len) {
        ErrMsgPrintf(req->errmsg, "short write: %d bytes instead of %zu",
                     result, req->len);
        req->status = RAFT_NOSPACE;
        return;
    }
    req->status = 0;
}
/* Remove the request from the queue of inflight writes and invoke the request
 * callback if set. */
static void uvWriterReqFinish(struct UvWriterReq *req)
{
    struct UvWriter *w = req->writer;
    trace(RAFT_UV_TRACER_WRITE_COMPLETE, NULL);
    QUEUE_REMOVE(&req->queue);
    /* Surface the per-request error message on the writer on failure. */
    if (req->status != 0) {
        uvWriterReqTransferErrMsg(req);
    }
    /* NOTE(review): despite the comment above, cb is invoked unconditionally
     * here — callers apparently always set it. */
    req->cb(req, req->status);
}
/* Wrapper around the low-level io_setup syscall, mapping failures to raft
 * error codes and providing a better error message. */
static int uvWriterIoSetup(unsigned n, aio_context_t *ctx, char *errmsg)
{
    int rv = UvOsIoSetup(n, ctx);
    if (rv == 0) {
        return 0;
    }
    if (rv == UV_EAGAIN) {
        /* The per-user limit on outstanding AIO events was hit. */
        ErrMsgPrintf(errmsg, "AIO events user limit exceeded");
        return RAFT_TOOMANY;
    }
    UvOsErrMsg(errmsg, "io_setup", rv);
    return RAFT_IOERR;
}
/* Run blocking syscalls involved in a file write request.
 *
 * Perform a KAIO write request and synchronously wait for it to complete.
 * Runs on the libuv threadpool; the outcome is stored in req->status and
 * reported by uvWriterAfterWorkCb() on the loop thread. */
static void uvWriterWorkCb(uv_work_t *work)
{
    struct UvWriterReq *req; /* Writer request object */
    struct UvWriter *w;      /* Writer object */
    aio_context_t ctx;       /* KAIO handle */
    struct iocb *iocbs;      /* Pointer to KAIO request object */
    struct io_event event;   /* KAIO response object */
    int n_events;
    int rv;
    req = work->data;
    w = req->writer;
    iocbs = &req->iocb;
    /* If more than one write in parallel is allowed, submit the AIO request
     * using a dedicated context, to avoid synchronization issues between
     * threads when multiple writes are submitted in parallel. This is
     * suboptimal but in real-world users should use file systems and kernels
     * with proper async write support. */
    if (w->n_events > 1) {
        ctx = 0;
        rv = uvWriterIoSetup(1 /* Maximum concurrent requests */, &ctx,
                             req->errmsg);
        if (rv != 0) {
            goto out;
        }
    } else {
        ctx = w->ctx;
    }
    /* Submit the request */
    rv = UvOsIoSubmit(ctx, 1, &iocbs);
    if (rv != 0) {
        /* UNTESTED: since we're not using NOWAIT and the parameters are valid,
         * this shouldn't fail. */
        UvOsErrMsg(req->errmsg, "io_submit", rv);
        rv = RAFT_IOERR;
        goto out_after_io_setup;
    }
    /* Wait for the request to complete */
    n_events = UvOsIoGetevents(ctx, 1, 1, &event, NULL);
    assert(n_events == 1);
    if (n_events != 1) {
        /* UNTESTED */
        rv = n_events >= 0 ? -1 : n_events;
    }
out_after_io_setup:
    /* Destroy the context only when it was created by this call. */
    if (w->n_events > 1) {
        UvOsIoDestroy(ctx);
    }
out:
    if (rv != 0) {
        req->status = rv;
    } else {
        /* event.res holds the write result (bytes written, or a negative
         * error code), which uvWriterReqSetStatus() maps to a raft status. */
        uvWriterReqSetStatus(req, (int)event.res);
    }
    return;
}
/* Callback run on the loop thread after uvWriterWorkCb has returned; it
 * finishes the request, invoking the user's write callback. */
static void uvWriterAfterWorkCb(uv_work_t *work, int status)
{
    struct UvWriterReq *req = work->data;
    /* We never cancel worker requests, so status is always 0. */
    assert(status == 0);
    uvWriterReqFinish(req);
}
/* Callback fired when the event fd associated with AIO write requests should be
 * ready for reading (i.e. when a write has completed). Reaps completed KAIO
 * events and finishes the associated requests, falling back to the threadpool
 * for writes the kernel refused to do asynchronously. */
static void uvWriterPollCb(uv_poll_t *poller, int status, int events)
{
    struct UvWriter *w = poller->data;
    uint64_t completed; /* True if the write is complete */
    unsigned i;
    int n_events;
    int rv;
    assert(w->event_fd >= 0);
    assert(status == 0);
    if (status != 0) {
        /* UNTESTED libuv docs: If an error happens while polling, status will
         * be < 0 and corresponds with one of the UV_E* error codes. */
        goto fail_requests;
    }
    assert(events & UV_READABLE);
    /* Read the event file descriptor */
    rv = (int)read(w->event_fd, &completed, sizeof completed);
    if (rv != sizeof completed) {
        /* UNTESTED: According to eventfd(2) this is the only possible failure
         * mode, meaning that epoll has indicated that the event FD is not yet
         * ready. */
        assert(errno == EAGAIN);
        return;
    }
    /* TODO: this assertion fails in unit tests */
    /* assert(completed == 1); */
    /* Try to fetch the write responses.
     *
     * If we got here at least one write should have completed and io_events
     * should return immediately without blocking. */
    n_events =
        UvOsIoGetevents(w->ctx, 1, (long int)w->n_events, w->events, NULL);
    assert(n_events >= 1);
    if (n_events < 1) {
        /* UNTESTED */
        status = n_events == 0 ? -1 : n_events;
        goto fail_requests;
    }
    for (i = 0; i < (unsigned)n_events; i++) {
        struct io_event *event = &w->events[i];
        struct UvWriterReq *req = (void *)((uintptr_t)event->data);
        /* If we got EAGAIN, it means it was not possible to perform the write
         * asynchronously, so let's fall back to the threadpool. */
        if (event->res == -EAGAIN) {
            req->iocb.aio_flags &= (unsigned)~IOCB_FLAG_RESFD;
            req->iocb.aio_resfd = 0;
            req->iocb.aio_rw_flags &= ~RWF_NOWAIT;
            assert(req->work.data == NULL);
            req->work.data = req;
            rv = uv_queue_work(w->loop, &req->work, uvWriterWorkCb,
                               uvWriterAfterWorkCb);
            if (rv != 0) {
                /* UNTESTED: with the current libuv implementation this should
                 * never fail. */
                UvOsErrMsg(req->errmsg, "uv_queue_work", rv);
                req->status = RAFT_IOERR;
                goto finish;
            }
            /* NOTE(review): returning here abandons any remaining events in
             * this batch; presumably at most one event per wakeup occurs in
             * the single-writer configuration — confirm. */
            return;
        }
        uvWriterReqSetStatus(req, (int)event->res);
    finish:
        uvWriterReqFinish(req);
    }
    return;
fail_requests:
    /* Fail every request still waiting on the poller with the poll error. */
    while (!QUEUE_IS_EMPTY(&w->poll_queue)) {
        queue *head;
        struct UvWriterReq *req;
        head = QUEUE_HEAD(&w->poll_queue);
        req = QUEUE_DATA(head, struct UvWriterReq, queue);
        uvWriterReqSetStatus(req, status);
        uvWriterReqFinish(req);
    }
}
/* Initialize a file writer for the already-open file descriptor fd.
 *
 * Sets up the KAIO context sized for max_concurrent_writes, an eventfd plus a
 * libuv poll handle to learn about completed async writes, and a check handle
 * used during close to wait for threadpool requests. On failure errmsg is
 * filled and everything allocated so far is released. */
int UvWriterInit(struct UvWriter *w,
                 struct uv_loop_s *loop,
                 uv_file fd,
                 bool direct /* Whether to use direct I/O */,
                 bool async /* Whether async I/O is available */,
                 unsigned max_concurrent_writes,
                 char *errmsg)
{
    /* Preserve the caller's user data across the memset below. */
    void *data = w->data;
    int rv = 0;
    memset(w, 0, sizeof *w);
    w->data = data;
    w->loop = loop;
    w->fd = fd;
    w->async = async;
    w->ctx = 0;
    w->events = NULL;
    w->n_events = max_concurrent_writes;
    w->event_fd = -1;
    w->event_poller.data = NULL;
    w->check.data = NULL;
    w->close_cb = NULL;
    QUEUE_INIT(&w->poll_queue);
    QUEUE_INIT(&w->work_queue);
    w->closing = false;
    w->errmsg = errmsg; /* Caller-owned buffer for error descriptions */
    w->tracer = NULL;
    /* Set direct I/O if available. */
    if (direct) {
        rv = UvOsSetDirectIo(w->fd);
        if (rv != 0) {
            UvOsErrMsg(errmsg, "fcntl", rv);
            goto err;
        }
    }
    /* Setup the AIO context. */
    rv = uvWriterIoSetup(w->n_events, &w->ctx, errmsg);
    if (rv != 0) {
        goto err;
    }
    /* Initialize the array of re-usable event objects. */
    w->events = RaftHeapCalloc(w->n_events, sizeof *w->events);
    if (w->events == NULL) {
        /* UNTESTED: todo */
        ErrMsgOom(errmsg);
        rv = RAFT_NOMEM;
        goto err_after_io_setup;
    }
    /* Create an event file descriptor to get notified when a write has
     * completed. */
    rv = UvOsEventfd(0, UV_FS_O_NONBLOCK);
    if (rv < 0) {
        /* UNTESTED: should fail only with ENOMEM */
        UvOsErrMsg(errmsg, "eventfd", rv);
        rv = RAFT_IOERR;
        goto err_after_events_alloc;
    }
    w->event_fd = rv;
    /* Poll the eventfd so uvWriterPollCb runs when a write completes. */
    rv = uv_poll_init(loop, &w->event_poller, w->event_fd);
    if (rv != 0) {
        /* UNTESTED: with the current libuv implementation this should never
         * fail. */
        UvOsErrMsg(errmsg, "uv_poll_init", rv);
        rv = RAFT_IOERR;
        goto err_after_event_fd;
    }
    w->event_poller.data = w;
    /* The check handle is started only in UvWriterClose(), to wait for
     * threadpool requests still in flight. */
    rv = uv_check_init(loop, &w->check);
    if (rv != 0) {
        /* UNTESTED: with the current libuv implementation this should never
         * fail. */
        UvOsErrMsg(errmsg, "uv_check_init", rv);
        rv = RAFT_IOERR;
        goto err_after_event_fd;
    }
    w->check.data = w;
    rv = uv_poll_start(&w->event_poller, UV_READABLE, uvWriterPollCb);
    if (rv != 0) {
        /* UNTESTED: with the current libuv implementation this should never
         * fail. */
        UvOsErrMsg(errmsg, "uv_poll_start", rv);
        rv = RAFT_IOERR;
        goto err_after_event_fd;
    }
    return 0;
err_after_event_fd:
    UvOsClose(w->event_fd);
err_after_events_alloc:
    RaftHeapFree(w->events);
err_after_io_setup:
    UvOsIoDestroy(w->ctx);
err:
    assert(rv != 0);
    return rv;
}
/* Final teardown step: release the kernel and heap resources tied to this
 * writer, then invoke the close callback when one was registered. Called only
 * after both the poll and check handles have been closed. */
static void uvWriterCleanUpAndFireCloseCb(struct UvWriter *w)
{
    assert(w->closing);
    UvOsClose(w->fd);
    RaftHeapFree(w->events);
    UvOsIoDestroy(w->ctx);
    if (w->close_cb == NULL) {
        return;
    }
    w->close_cb(w);
}
/* Close callback for the eventfd poll handle: cancel writes still waiting on
 * the poller, then finish the teardown once the check handle is gone too. */
static void uvWriterPollerCloseCb(struct uv_handle_s *handle)
{
    struct UvWriter *w = handle->data;
    /* Mark the poller as torn down (see uvWriterCheckCloseCb). */
    w->event_poller.data = NULL;
    /* Cancel all pending requests. */
    while (!QUEUE_IS_EMPTY(&w->poll_queue)) {
        queue *head;
        struct UvWriterReq *req;
        head = QUEUE_HEAD(&w->poll_queue);
        req = QUEUE_DATA(head, struct UvWriterReq, queue);
        assert(req->work.data == NULL);
        req->status = RAFT_CANCELED;
        uvWriterReqFinish(req);
    }
    /* If the check handle hasn't been closed yet, its close callback will
     * perform the final cleanup instead. */
    if (w->check.data != NULL) {
        return;
    }
    uvWriterCleanUpAndFireCloseCb(w);
}
/* Close callback for the check handle: fire the final cleanup only once the
 * poll handle has been torn down as well. */
static void uvWriterCheckCloseCb(struct uv_handle_s *handle)
{
    struct UvWriter *w = handle->data;
    w->check.data = NULL;
    if (w->event_poller.data == NULL) {
        uvWriterCleanUpAndFireCloseCb(w);
    }
}
/* Check callback run while closing: once no threadpool write is in flight
 * anymore, close the check handle to proceed with the teardown. */
static void uvWriterCheckCb(struct uv_check_s *check)
{
    struct UvWriter *w = check->data;
    if (QUEUE_IS_EMPTY(&w->work_queue)) {
        uv_close((struct uv_handle_s *)&w->check, uvWriterCheckCloseCb);
    }
}
/* Start closing the writer; cb fires once all resources have been released.
 * Pending poll-based requests get canceled; threadpool requests are awaited
 * via the check handle before teardown completes. */
void UvWriterClose(struct UvWriter *w, UvWriterCloseCb cb)
{
    int rv;
    assert(!w->closing);
    w->closing = true;
    w->close_cb = cb;
    /* We can close the event file descriptor right away, but we shouldn't close
     * the main file descriptor or destroy the AIO context since there might be
     * threadpool requests in flight. */
    UvOsClose(w->event_fd);
    rv = uv_poll_stop(&w->event_poller);
    assert(rv == 0); /* Can this ever fail? */
    uv_close((struct uv_handle_s *)&w->event_poller, uvWriterPollerCloseCb);
    /* If we have requests executing in the threadpool, we need to wait for
     * them. That's done in the check callback. */
    if (!QUEUE_IS_EMPTY(&w->work_queue)) {
        uv_check_start(&w->check, uvWriterCheckCb);
    } else {
        uv_close((struct uv_handle_s *)&w->check, uvWriterCheckCloseCb);
    }
}
/* Set the tracer used by the trace() macro for this writer's events. */
void UvWriterSetTracer(struct UvWriter *w, struct raft_tracer *tracer)
{
    w->tracer = tracer;
}
/* Return the total lengths of the given buffers, i.e. the number of bytes a
 * fully successful vectored write would transfer. */
static size_t lenOfBufs(const uv_buf_t bufs[], unsigned n)
{
    size_t total = 0;
    const uv_buf_t *buf = bufs;
    const uv_buf_t *end = bufs + n;
    for (; buf != end; buf++) {
        total += buf->len;
    }
    return total;
}
/* Asynchronously write the given buffers at the given file offset, invoking
 * cb on completion. Tries a fully non-blocking KAIO submission first (when
 * async I/O is available) and falls back to the threadpool otherwise. */
int UvWriterSubmit(struct UvWriter *w,
                   struct UvWriterReq *req,
                   const uv_buf_t bufs[],
                   unsigned n,
                   size_t offset,
                   UvWriterReqCb cb)
{
    int rv = 0;
    struct iocb *iocbs = &req->iocb;
    assert(!w->closing);
    trace(RAFT_UV_TRACER_WRITE_SUBMIT, NULL);
    /* TODO: at the moment we are not leveraging the support for concurrent
     * writes, so ensure that we're getting write requests
     * sequentially. */
    if (w->n_events == 1) {
        assert(QUEUE_IS_EMPTY(&w->poll_queue));
        assert(QUEUE_IS_EMPTY(&w->work_queue));
    }
    assert(w->fd >= 0);
    assert(w->event_fd >= 0);
    assert(w->ctx != 0);
    assert(req != NULL);
    assert(bufs != NULL);
    assert(n > 0);
    /* Initialize the request state. */
    req->writer = w;
    req->len = lenOfBufs(bufs, n);
    req->status = -1;
    req->work.data = NULL;
    req->cb = cb;
    memset(&req->iocb, 0, sizeof req->iocb);
    memset(req->errmsg, 0, sizeof req->errmsg);
    /* Fill the KAIO control block: a positioned vectored write. */
    req->iocb.aio_fildes = (uint32_t)w->fd;
    req->iocb.aio_lio_opcode = IOCB_CMD_PWRITEV;
    req->iocb.aio_reqprio = 0;
    req->iocb.aio_buf = (uintptr_t)bufs;
    req->iocb.aio_nbytes = n;
    req->iocb.aio_offset = (int64_t)offset;
    /* Stash the request pointer so completion events can be matched back. */
    req->iocb.aio_data = (uintptr_t)req;
    /* Use per-request synchronous I/O if available. */
    req->iocb.aio_rw_flags |= RWF_DSYNC;
    /* If io_submit can be run in a 100% non-blocking way, we'll try to write
     * without using the threadpool. */
    if (w->async) {
        req->iocb.aio_flags |= IOCB_FLAG_RESFD;
        req->iocb.aio_resfd = (uint32_t)w->event_fd;
        req->iocb.aio_rw_flags |= RWF_NOWAIT;
    }
    /* Try to submit the write request asynchronously */
    if (w->async) {
        QUEUE_PUSH(&w->poll_queue, &req->queue);
        rv = UvOsIoSubmit(w->ctx, 1, &iocbs);
        /* If no error occurred, we're done, the write request was
         * submitted. */
        if (rv == 0) {
            goto done;
        }
        QUEUE_REMOVE(&req->queue);
        /* Check the reason of the error. */
        switch (rv) {
            case UV_EAGAIN:
                break;
            default:
                /* Unexpected error */
                UvOsErrMsg(w->errmsg, "io_submit", rv);
                rv = RAFT_IOERR;
                goto err;
        }
        /* Submitting the write would block, or NOWAIT is not
         * supported. Let's run this request in the threadpool. */
        req->iocb.aio_flags &= (unsigned)~IOCB_FLAG_RESFD;
        req->iocb.aio_resfd = 0;
        req->iocb.aio_rw_flags &= ~RWF_NOWAIT;
    }
    /* If we got here it means we need to run io_submit in the threadpool. */
    QUEUE_PUSH(&w->work_queue, &req->queue);
    req->work.data = req;
    rv =
        uv_queue_work(w->loop, &req->work, uvWriterWorkCb, uvWriterAfterWorkCb);
    if (rv != 0) {
        /* UNTESTED: with the current libuv implementation this can't fail. */
        req->work.data = NULL;
        QUEUE_REMOVE(&req->queue);
        UvOsErrMsg(w->errmsg, "uv_queue_work", rv);
        rv = RAFT_IOERR;
        goto err;
    }
done:
    return 0;
err:
    assert(rv != 0);
    return rv;
}
raft-0.22.1/src/uv_writer.h 0000664 0000000 0000000 00000005760 14601504142 0015523 0 ustar 00root root 0000000 0000000 /* Asynchronous API to write a file. */
#ifndef UV_WRITER_H_
#define UV_WRITER_H_
#include
#include "err.h"
#include "queue.h"
#include "uv_os.h"
/* Perform asynchronous writes to a single file. */
struct UvWriter;
/* Callback called after the memory associated with a file handle can be
* released. */
typedef void (*UvWriterCloseCb)(struct UvWriter *w);
struct UvWriter
{
    void *data;             /* User data */
    struct uv_loop_s *loop; /* Event loop */
    uv_file fd;             /* File handle */
    bool async;             /* Whether fully async I/O is supported */
    aio_context_t ctx;      /* KAIO handle */
    struct io_event *events; /* Array of KAIO response objects */
    unsigned n_events;       /* Length of the events array */
    int event_fd;            /* Poll'ed to check if write is finished */
    struct uv_poll_s
        event_poller; /* Poll event_fd for completed poll requests */
    struct uv_check_s check;  /* Check for completed threadpool requests */
    UvWriterCloseCb close_cb; /* Close callback */
    queue poll_queue;         /* Pollable write requests */
    queue work_queue;         /* Threadpool write requests */
    bool closing;             /* Whether we're closing or closed */
    char *errmsg;             /* Description of last error; caller-owned
                                 buffer passed to UvWriterInit() */
    struct raft_tracer *tracer; /* Tracer to use */
};
/* Initialize a file writer. */
int UvWriterInit(struct UvWriter *w,
struct uv_loop_s *loop,
uv_file fd,
bool direct /* Whether to use direct I/O */,
bool async /* Whether async I/O is available */,
unsigned max_concurrent_writes,
char *errmsg);
/* Close the given file and release all associated resources. */
void UvWriterClose(struct UvWriter *w, UvWriterCloseCb cb);
/* Set a tracer on this writer. */
void UvWriterSetTracer(struct UvWriter *w, struct raft_tracer *tracer);
/* Write request. */
struct UvWriterReq;
/* Callback called after a write request has been completed. */
typedef void (*UvWriterReqCb)(struct UvWriterReq *req, int status);
struct UvWriterReq
{
    void *data;              /* User data */
    struct UvWriter *writer; /* Originating writer */
    size_t len;              /* Total number of bytes to write */
    int status;              /* Request result code (-1 while in flight) */
    struct uv_work_s work;   /* To execute logic in the threadpool */
    UvWriterReqCb cb;        /* Callback to invoke upon request completion */
    struct iocb iocb;        /* KAIO request (for writing) */
    char errmsg[256];        /* Error description (for thread-safety) */
    queue queue;             /* Prev/next links in the inflight queue */
};
/* Asynchronously write data to the underlying file. */
int UvWriterSubmit(struct UvWriter *w,
struct UvWriterReq *req,
const uv_buf_t bufs[],
unsigned n,
size_t offset,
UvWriterReqCb cb);
#endif /* UV_WRITER_H_ */
raft-0.22.1/test/ 0000775 0000000 0000000 00000000000 14601504142 0013504 5 ustar 00root root 0000000 0000000 raft-0.22.1/test/fuzzy/ 0000775 0000000 0000000 00000000000 14601504142 0014673 5 ustar 00root root 0000000 0000000 raft-0.22.1/test/fuzzy/main_core.c 0000664 0000000 0000000 00000000606 14601504142 0016775 0 ustar 00root root 0000000 0000000 #include
#include "../lib/runner.h"
MunitSuite _main_suites[64];
int _main_suites_n = 0;
/* Test runner executable */
int main(int argc, char *argv[MUNIT_ARRAY_PARAM(argc)])
{
    /* Top-level suite aggregating all registered sub-suites. */
    MunitSuite suite = {(char *)"", NULL, _main_suites, 1, 0};
    /* Allow CI to skip the (slow) fuzzy tests entirely. */
    if (getenv("SKIP_FUZZY_TESTS") != NULL) {
        return 0;
    }
    return munit_suite_main(&suite, (void *)"unit", argc, argv);
}
raft-0.22.1/test/fuzzy/test_election.c 0000664 0000000 0000000 00000005166 14601504142 0017710 0 ustar 00root root 0000000 0000000 #include "../lib/legacy.h"
#include "../lib/runner.h"
/******************************************************************************
*
* Fixture
*
*****************************************************************************/
struct fixture
{
FIXTURE_CLUSTER;
};
static char *cluster_n[] = {"3", "4", "5", "7", NULL};
static char *cluster_pre_vote[] = {"0", "1", NULL};
static MunitParameterEnum _params[] = {
{CLUSTER_N_PARAM, cluster_n},
{CLUSTER_PRE_VOTE_PARAM, cluster_pre_vote},
{NULL, NULL},
};
/* Boot a cluster fixture; size and pre-vote mode come from _params. */
static void *setup(const MunitParameter params[], MUNIT_UNUSED void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SETUP_CLUSTER(0);
    CLUSTER_BOOTSTRAP;
    /* Randomize election timeouts so a leader can emerge. */
    CLUSTER_RANDOMIZE;
    CLUSTER_START();
    return f;
}
/* Release the cluster and the fixture allocated in setup(). */
static void tear_down(void *data)
{
    struct fixture *f = data;
    TEAR_DOWN_CLUSTER;
    free(f);
}
/******************************************************************************
*
* Tests
*
*****************************************************************************/
SUITE(election)
/* A leader is eventually elected */
TEST(election, win, setup, tear_down, 0, _params)
{
    struct fixture *f = data;
    /* 10 seconds of simulated time is ample for a first election. */
    CLUSTER_STEP_UNTIL_HAS_LEADER(10000);
    return MUNIT_OK;
}
/* A new leader is elected if the current one dies. */
TEST(election, change, setup, tear_down, 0, _params)
{
    struct fixture *f = data;
    CLUSTER_STEP_UNTIL_HAS_LEADER(10000);
    CLUSTER_KILL_LEADER;
    /* First observe leadership being lost, then regained by someone else. */
    CLUSTER_STEP_UNTIL_HAS_NO_LEADER(10000);
    CLUSTER_STEP_UNTIL_HAS_LEADER(20000);
    return MUNIT_OK;
}
/* A new leader is elected if the current one dies and a previously killed
 * server with an outdated log and outdated term is revived. */
TEST(election, changeReviveOutdated, setup, tear_down, 0, _params)
{
    struct fixture *f = data;
    unsigned i;
    /* Kill a random server */
    i = ((unsigned)rand()) % CLUSTER_N;
    CLUSTER_KILL(i);
    /* Server i's term will be lower than the term of the election. */
    CLUSTER_STEP_UNTIL_HAS_LEADER(20000);
    /* Add some entries to the log, so server i's log also falls behind. */
    CLUSTER_MAKE_PROGRESS;
    CLUSTER_MAKE_PROGRESS;
    CLUSTER_KILL_LEADER;
    CLUSTER_STEP_UNTIL_HAS_NO_LEADER(10000);
    /* Revive server i with an outdated log and term, the cluster
     * should be able to elect a new leader */
    CLUSTER_REVIVE(i);
    CLUSTER_STEP_UNTIL_HAS_LEADER(20000);
    return MUNIT_OK;
}
/* If no majority of servers is online, no leader is elected. */
TEST(election, noQuorum, setup, tear_down, 0, _params)
{
    struct fixture *f = data;
    CLUSTER_KILL_MAJORITY;
    /* Let 30 simulated seconds pass: no election should ever succeed. */
    CLUSTER_STEP_UNTIL_ELAPSED(30000);
    munit_assert_false(CLUSTER_HAS_LEADER);
    return MUNIT_OK;
}
raft-0.22.1/test/fuzzy/test_liveness.c 0000664 0000000 0000000 00000010102 14601504142 0017720 0 ustar 00root root 0000000 0000000 #include "../lib/legacy.h"
#include "../lib/runner.h"
/******************************************************************************
*
* Fixture
*
*****************************************************************************/
/* Maximum number of cluster loop iterations each test should perform. */
#define MAX_ITERATIONS 25000
/* Maximum number of cluster loop iterations a pair of servers should stay
* disconnected. */
#define MAX_DISCONNECT 150
/* A temporary network disconnection between a pair of servers. */
struct disconnection
{
    unsigned id1; /* ID of the first server of the pair */
    unsigned id2; /* ID of the second server of the pair */
    int start;    /* Iteration the pair was cut at (0 = connected) */
    int duration; /* Number of iterations the disconnection lasts */
};
struct fixture
{
FIXTURE_CLUSTER;
struct disconnection *disconnections;
};
static char *cluster_n[] = {"3", "4", NULL};
static char *cluster_pre_vote[] = {"0", "1", NULL};
static MunitParameterEnum _params[] = {
{CLUSTER_N_PARAM, cluster_n},
{CLUSTER_PRE_VOTE_PARAM, cluster_pre_vote},
{NULL, NULL},
};
/* Return the number of distinct server pairs in the cluster. */
static int __server_pairs(struct fixture *f)
{
    /* n choose 2 */
    int n = CLUSTER_N;
    return n * (n - 1) / 2;
}
/* Update the cluster connectivity for the given iteration: with a 10% chance
 * disconnect each currently-connected pair for a random duration, and
 * reconnect pairs whose disconnection interval has elapsed. */
static void __update_connectivity(struct fixture *f, int i)
{
    int p;
    int pairs = __server_pairs(f);
    for (p = 0; p < pairs; p++) {
        struct disconnection *disconnection = &f->disconnections[p];
        unsigned id1 = disconnection->id1;
        unsigned id2 = disconnection->id2;
        if (disconnection->start == 0) {
            /* Decide whether to disconnect this pair. */
            if (munit_rand_int_range(1, 10) <= 1) {
                disconnection->start = i;
                disconnection->duration =
                    munit_rand_int_range(50, MAX_DISCONNECT);
                /* Saturate both directions: IDs are 1-based, fixture
                 * indexes are 0-based. */
                raft_fixture_saturate(&f->cluster, id1 - 1, id2 - 1);
                raft_fixture_saturate(&f->cluster, id2 - 1, id1 - 1);
            }
        } else {
            /* Decide whether to reconnect this pair. */
            if (i - disconnection->start > disconnection->duration) {
                raft_fixture_desaturate(&f->cluster, id1 - 1, id2 - 1);
                raft_fixture_desaturate(&f->cluster, id2 - 1, id1 - 1);
                disconnection->start = 0;
            }
        }
    }
}
/* Boot a cluster fixture and pre-build the table of all server pairs used by
 * __update_connectivity(). */
static void *setup(const MunitParameter params[], MUNIT_UNUSED void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    int pairs;
    size_t i, j, k;
    SETUP_CLUSTER(0);
    CLUSTER_BOOTSTRAP;
    CLUSTER_RANDOMIZE;
    CLUSTER_START();
    /* Number of distinct pairs of servers. */
    pairs = __server_pairs(f);
    f->disconnections = munit_malloc(pairs * sizeof *f->disconnections);
    /* Enumerate every (i, j) pair with i < j; IDs are 1-based. */
    k = 0;
    for (i = 0; i < CLUSTER_N; i++) {
        for (j = i + 1; j < CLUSTER_N; j++) {
            struct disconnection *disconnection = &f->disconnections[k];
            disconnection->id1 = i + 1;
            disconnection->id2 = j + 1;
            disconnection->start = 0;
            disconnection->duration = 0;
            k++;
        }
    }
    return f;
}
/* Release the cluster, the pair table and the fixture. */
static void tear_down(void *data)
{
    struct fixture *f = data;
    TEAR_DOWN_CLUSTER;
    free(f->disconnections);
    free(f);
}
/******************************************************************************
*
* Tests
*
*****************************************************************************/
SUITE(liveness)
/* Fire-and-forget apply callback: the request was heap-allocated by the test
 * loop and is released here regardless of outcome. */
static void apply_cb(struct raft_apply *req, int status, void *result)
{
    (void)status;
    (void)result;
    free(req);
}
/* The system makes progress even in case of network disruptions. */
TEST(liveness, networkDisconnect, setup, tear_down, 0, _params)
{
    struct fixture *f = data;
    int i = 0;
    (void)params;
    for (i = 0; i < MAX_ITERATIONS; i++) {
        /* Randomly cut/heal links, then advance the simulation one step. */
        __update_connectivity(f, i);
        raft_fixture_step(&f->cluster);
        /* CLUSTER_LEADER == CLUSTER_N means "no leader". */
        if (CLUSTER_LEADER != CLUSTER_N) {
            struct raft_apply *req = munit_malloc(sizeof *req);
            /* NOTE(review): a request is submitted on every iteration with a
             * leader; requests never applied before the loop exits are
             * presumably freed by cluster teardown — confirm, otherwise they
             * leak. */
            CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req, 1, apply_cb);
            if (CLUSTER_LAST_APPLIED(CLUSTER_LEADER) >= 2) {
                break;
            }
        }
    }
    /* NOTE(review): the progress assertion is disabled, so this test only
     * checks that stepping under disruption doesn't crash. */
    // munit_assert_int(CLUSTER_LAST_APPLIED(CLUSTER_LEADER), >=, 2);
    return MUNIT_OK;
}
raft-0.22.1/test/fuzzy/test_membership.c 0000664 0000000 0000000 00000004776 14601504142 0020247 0 ustar 00root root 0000000 0000000 #include "../lib/legacy.h"
#include "../lib/runner.h"
/******************************************************************************
*
* Fixture
*
*****************************************************************************/
struct fixture
{
FIXTURE_CLUSTER;
struct raft_change req;
};
static char *cluster_n[] = {"3", "4", "5", NULL};
static MunitParameterEnum _params[] = {
{CLUSTER_N_PARAM, cluster_n},
{NULL, NULL},
};
/* Boot a cluster fixture and wait until it has elected a leader, so each test
 * can immediately submit configuration changes. */
static void *setup(const MunitParameter params[], MUNIT_UNUSED void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SETUP_CLUSTER(0);
    CLUSTER_BOOTSTRAP;
    CLUSTER_RANDOMIZE;
    CLUSTER_START();
    CLUSTER_STEP_UNTIL_HAS_LEADER(10000);
    return f;
}
/* Release the cluster and the fixture allocated in setup(). */
static void tear_down(void *data)
{
    struct fixture *f = data;
    TEAR_DOWN_CLUSTER;
    free(f);
}
/******************************************************************************
*
* Tests
*
*****************************************************************************/
SUITE(membership)
/* A new server can be added and promoted to stand-by. */
TEST(membership, addNonVoting, setup, tear_down, 0, _params)
{
    struct fixture *f = data;
    const struct raft_server *server;
    struct raft *raft;
    /* Consistency fix: suppress the unused-parameter warning, as the sibling
     * addVoting test does. */
    (void)params;
    /* Add a new server, wait for the configuration entry to commit. */
    CLUSTER_ADD(&f->req);
    CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, 2, 2000);
    /* Then promote it. */
    CLUSTER_ASSIGN(&f->req, RAFT_STANDBY);
    CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_N, 3, 2000);
    raft = CLUSTER_RAFT(CLUSTER_LEADER);
    server = &raft->configuration.servers[CLUSTER_N - 1];
    munit_assert_int(server->id, ==, CLUSTER_N);
    return MUNIT_OK;
}
/* A new server can be added and promoted to voter. */
TEST(membership, addVoting, setup, tear_down, 0, _params)
{
    struct fixture *f = data;
    const struct raft_server *server;
    struct raft *raft;
    (void)params;
    /* Add a new server, wait for the configuration entry to commit. */
    CLUSTER_ADD(&f->req);
    CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, 2, 2000);
    /* Then promote it. */
    CLUSTER_ASSIGN(&f->req, RAFT_VOTER);
    CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_N, 3, 2000);
    raft = CLUSTER_RAFT(CLUSTER_LEADER);
    server = &raft->configuration.servers[CLUSTER_N - 1];
    munit_assert_int(server->role, ==, RAFT_VOTER);
    return MUNIT_OK;
}
/* A voting server other than the leader can be removed. */
TEST(membership, removeVoting, setup, tear_down, 0, _params)
{
    struct fixture *f = data;
    struct raft *raft;
    int rv;
    (void)params;
    raft = CLUSTER_RAFT(CLUSTER_LEADER);
    /* Remove some server which is not the leader. */
    rv = raft_remove(raft, &f->req, CLUSTER_LEADER % CLUSTER_N + 1, NULL);
    munit_assert_int(rv, ==, 0);
    CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, 2, 2000);
    munit_assert_int(raft->configuration.n, ==, CLUSTER_N - 1);
    /* Consistency fix: return MUNIT_OK like every other test in this file
     * instead of a bare 0. */
    return MUNIT_OK;
}
raft-0.22.1/test/fuzzy/test_replication.c 0000664 0000000 0000000 00000010652 14601504142 0020413 0 ustar 00root root 0000000 0000000 #include "../lib/legacy.h"
#include "../lib/runner.h"
/******************************************************************************
*
* Fixture
*
*****************************************************************************/
/* Test fixture: a raft cluster whose size is taken from CLUSTER_N_PARAM. */
struct fixture
{
    FIXTURE_CLUSTER;
};
static char *cluster_n[] = {"3", "5", "7", NULL};
static MunitParameterEnum _params[] = {
{CLUSTER_N_PARAM, cluster_n},
{NULL, NULL},
};
/* Boot a randomized cluster and step it until a leader has been elected. */
static void *setup(const MunitParameter params[], MUNIT_UNUSED void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SETUP_CLUSTER(0);
    CLUSTER_BOOTSTRAP;
    CLUSTER_RANDOMIZE;
    CLUSTER_START();
    CLUSTER_STEP_UNTIL_HAS_LEADER(10000);
    return f;
}
/* Release the cluster resources and the fixture itself. */
static void tear_down(void *data)
{
    struct fixture *f = data;
    TEAR_DOWN_CLUSTER;
    free(f);
}
/******************************************************************************
*
* Helper macros
*
*****************************************************************************/
#define APPLY_ADD_ONE(REQ) CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, REQ, 1, NULL)
/******************************************************************************
*
* Tests
*
*****************************************************************************/
SUITE(replication)
/* New entries on the leader are eventually replicated to followers. */
TEST(replication, appendEntries, setup, tear_down, 0, _params)
{
    struct fixture *f = data;
    struct raft_apply *req = munit_malloc(sizeof *req);
    (void)params;
    APPLY_ADD_ONE(req);
    /* Wait until entry 2 (the submitted command) is applied everywhere. */
    CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_N, 2, 2000);
    free(req);
    return MUNIT_OK;
}
/* The cluster remains available even if the current leader dies and a new
* leader gets elected. */
TEST(replication, availability, setup, tear_down, 0, _params)
{
    struct fixture *f = data;
    struct raft_apply *req1 = munit_malloc(sizeof *req1);
    struct raft_apply *req2 = munit_malloc(sizeof *req2);
    (void)params;
    /* Commit a first entry while the initial leader is alive. */
    APPLY_ADD_ONE(req1);
    CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_N, 2, 2000);
    /* Kill the leader and wait for a new one to emerge. */
    CLUSTER_KILL_LEADER;
    CLUSTER_STEP_UNTIL_HAS_NO_LEADER(10000);
    CLUSTER_STEP_UNTIL_HAS_LEADER(10000);
    /* The cluster can still commit entries through the new leader. */
    APPLY_ADD_ONE(req2);
    CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, 3, 2000);
    free(req1);
    free(req2);
    return MUNIT_OK;
}
/* Apply-completion callback used by tests whose requests may outlive the test
 * body: it just releases the heap-allocated request. The outcome of the apply
 * operation is deliberately ignored. */
static void apply_cb(struct raft_apply *req, int status, void *result)
{
    (void)result;
    (void)status;
    free(req);
}
/* If no quorum is available, entries don't get committed. */
TEST(replication, noQuorum, setup, tear_down, 0, _params)
{
    struct fixture *f = data;
    struct raft_apply *req = munit_malloc(sizeof *req);
    unsigned i;
    (void)params;
    CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req, 1, apply_cb);
    CLUSTER_KILL_MAJORITY;
    CLUSTER_STEP_UNTIL_ELAPSED(10000);
    /* No server applied the new entry: last applied is still index 1 (the
     * initial configuration entry) everywhere. */
    for (i = 0; i < CLUSTER_N; i++) {
        munit_assert_int(CLUSTER_LAST_APPLIED(i), ==, 1);
    }
    return MUNIT_OK;
}
/* If the cluster is partitioned, entries don't get committed. */
TEST(replication, partitioned, setup, tear_down, 0, _params)
{
    struct fixture *f = data;
    /* req1/req2 are freed by apply_cb when their callbacks eventually fire. */
    struct raft_apply *req1 = munit_malloc(sizeof *req1);
    struct raft_apply *req2 = munit_malloc(sizeof *req2);
    unsigned leader_id;
    size_t i;
    size_t n;
    (void)params;
    leader_id = CLUSTER_LEADER + 1;
    /* Disconnect the leader from a majority of servers */
    n = 0;
    /* Loop terminates once n reaches a majority count, not when i runs out. */
    for (i = 0; n < (CLUSTER_N / 2) + 1; i++) {
        struct raft *raft = CLUSTER_RAFT(i);
        if (raft->id == leader_id) {
            continue;
        }
        raft_fixture_saturate(&f->cluster, leader_id - 1, raft->id - 1);
        raft_fixture_saturate(&f->cluster, raft->id - 1, leader_id - 1);
        n++;
    }
    /* Try to append a new entry using the disconnected leader. */
    CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req1, 1, apply_cb);
    /* The leader gets deposed. */
    CLUSTER_STEP_UNTIL_HAS_NO_LEADER(10000);
    /* The entry does not get committed. */
    CLUSTER_STEP_UNTIL_ELAPSED(5000);
    /* Reconnect the old leader */
    for (i = 0; i < CLUSTER_N; i++) {
        struct raft *raft = CLUSTER_RAFT(i);
        if (raft->id == leader_id) {
            continue;
        }
        raft_fixture_desaturate(&f->cluster, leader_id - 1, raft->id - 1);
    }
    // TODO this fails with seed 0x3914306f
    CLUSTER_STEP_UNTIL_HAS_LEADER(30000);
    /* Re-try now to append the entry. */
    CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req2, 1, apply_cb);
    CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, 2, 10000);
    return MUNIT_OK;
}
raft-0.22.1/test/integration/ 0000775 0000000 0000000 00000000000 14601504142 0016027 5 ustar 00root root 0000000 0000000 raft-0.22.1/test/integration/append_helpers.h 0000664 0000000 0000000 00000011273 14601504142 0021175 0 ustar 00root root 0000000 0000000 #include "../../include/raft.h"
#include "../lib/runner.h"
/******************************************************************************
*
* Helper macros
*
*****************************************************************************/
/* State shared between an append request and its completion callback. */
struct result
{
    int status; /* Completion status the callback expects to receive. */
    bool done;  /* Set to true once the callback has fired. */
    void *data; /* Optional extra payload for custom callbacks. */
};
/* Default append callback: assert that the completion status matches the one
 * recorded in the result struct attached to the request, then flag it done so
 * APPEND_WAIT can return. */
static void appendCbAssertResult(struct raft_io_append *req, int status)
{
    struct result *expected = req->data;
    munit_assert_int(status, ==, expected->status);
    expected->done = true;
}
/* Declare and fill the entries array for the append request identified by
 * I. The array will have N entries, and each entry will have a data buffer of
 * SIZE bytes. Each buffer is zeroed and then stamped with the current value of
 * f->count as a 64-bit little-endian-native word, after which f->count is
 * incremented. NOTE(review): this assumes SIZE >= 8 -- confirm callers never
 * pass a smaller entry size. */
#define ENTRIES(I, N, SIZE)                                 \
    struct raft_entry _entries##I[N];                       \
    uint8_t _entries_data##I[N * SIZE];                     \
    {                                                       \
        int _i;                                             \
        for (_i = 0; _i < N; _i++) {                        \
            struct raft_entry *entry = &_entries##I[_i];    \
            entry->term = 1;                                \
            entry->type = RAFT_COMMAND;                     \
            entry->buf.base = &_entries_data##I[_i * SIZE]; \
            entry->buf.len = SIZE;                          \
            entry->batch = NULL;                            \
            munit_assert_ptr_not_null(entry->buf.base);     \
            memset(entry->buf.base, 0, entry->buf.len);     \
            uint64_t _temporary = f->count;                 \
            memcpy(entry->buf.base, &_temporary, 8);        \
            f->count++;                                     \
        }                                                   \
    }
/* Submit an append request identified by I, with N_ENTRIES entries, each one of
 * size ENTRY_SIZE. When the append request completes, CB will be called
 * and DATA will be available in result->data. f->io.append is expected to
 * return RV. Declares the request, result and entries as local variables
 * suffixed with I, so distinct I values can coexist in one test body. */
#define APPEND_SUBMIT_CB_DATA(I, N_ENTRIES, ENTRY_SIZE, CB, DATA, RV)  \
    struct raft_io_append _req##I;                                     \
    struct result _result##I = {0, false, DATA};                       \
    int _rv##I;                                                        \
    ENTRIES(I, N_ENTRIES, ENTRY_SIZE);                                 \
    _req##I.data = &_result##I;                                        \
    _rv##I = f->io.append(&f->io, &_req##I, _entries##I, N_ENTRIES, CB); \
    munit_assert_int(_rv##I, ==, RV)
/* Submit an append request identified by I, with N_ENTRIES entries, each one of
 * size ENTRY_SIZE. The default expectation is for the operation to succeed. A
 * custom STATUS can be set with APPEND_EXPECT. */
#define APPEND_SUBMIT(I, N_ENTRIES, ENTRY_SIZE)                          \
    APPEND_SUBMIT_CB_DATA(I, N_ENTRIES, ENTRY_SIZE, appendCbAssertResult, \
                          NULL, 0)
/* Try to submit an append request and assert that the given error code and
 * message are returned. NOTE(review): the errmsg assertion is currently
 * disabled, so ERRMSG is accepted but not checked. */
#define APPEND_ERROR(N_ENTRIES, ENTRY_SIZE, RV, ERRMSG)                \
    do {                                                               \
        struct raft_io_append _req;                                    \
        int _rv;                                                       \
        ENTRIES(0, N_ENTRIES, ENTRY_SIZE);                             \
        _rv = f->io.append(&f->io, &_req, _entries0, N_ENTRIES, NULL); \
        munit_assert_int(_rv, ==, RV);                                 \
        /* munit_assert_string_equal(f->io.errmsg, ERRMSG);*/          \
    } while (0)
#define APPEND_EXPECT(I, STATUS) _result##I.status = STATUS
/* Wait for the append request identified by I to complete. */
#define APPEND_WAIT(I) LOOP_RUN_UNTIL(&_result##I.done)
/* Submit an append request with an entries array with N_ENTRIES entries, each
 * one of size ENTRY_SIZE, and wait for the operation to successfully
 * complete. */
#define APPEND(N_ENTRIES, ENTRY_SIZE)              \
    do {                                           \
        APPEND_SUBMIT(0, N_ENTRIES, ENTRY_SIZE);   \
        APPEND_WAIT(0);                            \
    } while (0)
/* Submit an append request with the given parameters and wait for the operation
 * to fail with the given code and message. Wrapped in do/while(0) -- like
 * APPEND and APPEND_ERROR -- so it expands to a single statement and is safe
 * inside unbraced if/else branches. */
#define APPEND_FAILURE(N_ENTRIES, ENTRY_SIZE, STATUS, ERRMSG)    \
    do {                                                         \
        APPEND_SUBMIT(0, N_ENTRIES, ENTRY_SIZE);                 \
        APPEND_EXPECT(0, STATUS);                                \
        APPEND_WAIT(0);                                          \
        munit_assert_string_equal(f->io.errmsg, ERRMSG);         \
    } while (0)
raft-0.22.1/test/integration/main_core.c 0000664 0000000 0000000 00000000053 14601504142 0020125 0 ustar 00root root 0000000 0000000 #include "../lib/runner.h"
RUNNER("core")
raft-0.22.1/test/integration/main_uv.c 0000664 0000000 0000000 00000000051 14601504142 0017625 0 ustar 00root root 0000000 0000000 #include "../lib/runner.h"
RUNNER("uv")
raft-0.22.1/test/integration/test_catch_up.c 0000664 0000000 0000000 00000004552 14601504142 0021026 0 ustar 00root root 0000000 0000000 #include "../lib/cluster.h"
#include "../lib/runner.h"
/* Test fixture: an empty cluster; each test configures and starts servers. */
struct fixture
{
    FIXTURE_CLUSTER;
};
/* Allocate the fixture and create an empty cluster; servers are bootstrapped
 * and started by each individual test. */
static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SETUP_CLUSTER();
    return f;
}
/* Release the cluster resources and the fixture itself. */
static void tearDown(void *data)
{
    struct fixture *f = data;
    TEAR_DOWN_CLUSTER();
    free(f);
}
SUITE(catch_up)
/* Trying to catch-up an unresponsive server eventually fails. */
TEST(catch_up, Unresponsive, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_configuration configuration;
    struct raft_entry entry;
    int status;
    int rv;
    /* Use a short catch-up round duration so the abort happens quickly. */
    raft_set_max_catch_up_round_duration(CLUSTER_RAFT(1), 90);
    /* Bootstrap a cluster with 1 voter and 1 spare. Server 1 has an
     * additional entry. Only start server 1 */
    CLUSTER_SET_TERM(1 /* ID */, 1 /* term */);
    CLUSTER_FILL_CONFIGURATION(&configuration, 2, 1, 0 /* stand-by */);
    entry.type = RAFT_CHANGE;
    entry.term = 1;
    rv = raft_configuration_encode(&configuration, &entry.buf);
    munit_assert_int(rv, ==, 0);
    raft_configuration_close(&configuration);
    test_cluster_add_entry(&f->cluster_, 1 /* ID */, &entry);
    raft_free(entry.buf.base);
    CLUSTER_ADD_ENTRY(1 /* ID */, RAFT_COMMAND, 1 /* term */, 0 /* payload */);
    CLUSTER_START(1 /* ID */);
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 2 entries (1^1..2^1)\n"
        " self elect and convert to leader\n");
    /* Start to catch-up server 2. */
    test_cluster_catch_up(&f->cluster_, 1 /* ID */, 2 /* Catch-up ID */);
    CLUSTER_TRACE(
        "[ 0] 1 > catch-up server 2\n"
        " probe server 2 sending a heartbeat (no entries)\n");
    /* Server is now catching up server 2. */
    rv = raft_catch_up(CLUSTER_RAFT(1), 2 /* ID */, &status);
    munit_assert_int(rv, ==, 0);
    munit_assert_int(status, ==, RAFT_CATCH_UP_RUNNING);
    /* Server 2 never answers, so after two heartbeat rounds the catch-up
     * attempt is aborted. */
    CLUSTER_TRACE(
        "[ 50] 1 > timeout as leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        "[ 100] 1 > timeout as leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        " server 2 is unresponsive -> abort catch-up\n");
    rv = raft_catch_up(CLUSTER_RAFT(1), 2 /* ID */, &status);
    munit_assert_int(rv, ==, 0);
    munit_assert_int(status, ==, RAFT_CATCH_UP_ABORTED);
    return MUNIT_OK;
}
raft-0.22.1/test/integration/test_digest.c 0000664 0000000 0000000 00000000557 14601504142 0020520 0 ustar 00root root 0000000 0000000 #include "../../include/raft.h"
#include "../lib/runner.h"
SUITE(raft_digest)
/* Generation of the ID of the bootstrap dqlite node. */
TEST(raft_digest, bootstrapServerId, NULL, NULL, 0, NULL)
{
    /* NOTE(review): the port is out of TCP range (65536); presumably
     * raft_digest treats the address as an opaque string -- confirm. */
    const char *address = "127.0.0.1:65536";
    unsigned long long id;
    id = raft_digest(address, 0);
    /* Pin the digest value to catch accidental changes to the algorithm. */
    munit_assert_int(id, ==, 138882483);
    return MUNIT_OK;
}
raft-0.22.1/test/integration/test_election.c 0000664 0000000 0000000 00000146174 14601504142 0021051 0 ustar 00root root 0000000 0000000 #include "../../src/configuration.h"
#include "../lib/cluster.h"
#include "../lib/runner.h"
/* Test fixture: an empty cluster; each test bootstraps its own servers. */
struct fixture
{
    FIXTURE_CLUSTER;
};
/* Allocate the fixture and create an empty cluster; servers are bootstrapped
 * and started by each individual test. */
static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SETUP_CLUSTER();
    return f;
}
/* Release the cluster resources and the fixture itself. */
static void tearDown(void *data)
{
    struct fixture *f = data;
    TEAR_DOWN_CLUSTER();
    free(f);
}
SUITE(election)
/* Test a successful election with 2 voters. */
TEST(election, TwoVoters, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Bootstrap and start a cluster with 2 voters. */
    for (id = 1; id <= 2; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
        CLUSTER_START(id);
    }
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n");
    /* Server 1 times out first and starts an election. */
    CLUSTER_TRACE(
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n");
    CLUSTER_TRACE(
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n");
    CLUSTER_TRACE(
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 2 -> convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n");
    CLUSTER_TRACE(
        "[ 130] 2 > recv append entries from server 1\n"
        " no new entries to persist\n");
    return MUNIT_OK;
}
/* If we have already voted and the same candidate requests the vote again, the
* vote is granted. */
TEST(election, GrantAgain, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Bootstrap a cluster with 2 voters. */
    for (id = 1; id <= 2; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
    }
    /* Prevent server 2 from timing out. */
    CLUSTER_SET_ELECTION_TIMEOUT(2 /* ID */, 250 /* timeout */, 0 /* delta */);
    /* Now start the cluster. */
    CLUSTER_START(1 /* ID */);
    CLUSTER_START(2 /* ID */);
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n");
    CLUSTER_TRACE(
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n");
    CLUSTER_TRACE(
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n");
    /* Disconnect server 2 from server 1, so server 1 does not receive the
     * result. Then stop and restart server 1, artificially resetting its term,
     * so it will ask again server 2's vote for term 2. */
    CLUSTER_DISCONNECT(2, 1);
    CLUSTER_STOP(1 /* ID */);
    CLUSTER_SET_TERM(1 /* ID */, 1 /* term */);
    CLUSTER_SET_VOTE(1 /* ID */, 0 /* vote */);
    CLUSTER_START(1);
    CLUSTER_TRACE(
        "[ 110] 1 > term 1, 1 entry (1^1)\n"
        "[ 210] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n");
    /* Server 2 grants again its vote. */
    CLUSTER_TRACE(
        "[ 220] 2 > recv request vote from server 1\n"
        " remote log is equal (1^1) -> grant vote\n");
    return MUNIT_OK;
}
/* If the requester last log entry index is the same, the vote is granted. */
TEST(election, GrantIfLastIndexIsSame, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Bootstrap a cluster with 2 voters having each 2 equal entries. */
    for (id = 1; id <= 2; id++) {
        CLUSTER_SET_TERM(id, 2 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
        CLUSTER_ADD_ENTRY(id, RAFT_COMMAND, 1 /* term */, 0 /* payload */);
        CLUSTER_START(id);
    }
    CLUSTER_TRACE(
        "[ 0] 1 > term 2, 2 entries (1^1..2^1)\n"
        "[ 0] 2 > term 2, 2 entries (1^1..2^1)\n");
    CLUSTER_TRACE(
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 3\n");
    /* Identical last entry (2^1) on both sides -> vote granted. */
    CLUSTER_TRACE(
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (3 vs 2) -> bump term\n"
        " remote log is equal (2^1) -> grant vote\n");
    return MUNIT_OK;
}
/* If the requester last log entry index is higher, the vote is granted. */
TEST(election, GrantIfRemoteLastIndexIsHigher, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Bootstrap a cluster with 2 voters, the first having 2 entries. */
    CLUSTER_SET_TERM(1 /* ID */, 2 /* term */);
    CLUSTER_ADD_ENTRY(1 /* ID */, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
    CLUSTER_ADD_ENTRY(1 /* ID */, RAFT_COMMAND, 1 /* term */, 0 /* payload */);
    CLUSTER_START(1 /* ID */);
    CLUSTER_SET_TERM(2 /* ID */, 1 /* term */);
    CLUSTER_ADD_ENTRY(2 /* ID */, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
    CLUSTER_START(2 /* ID */);
    CLUSTER_TRACE(
        "[ 0] 1 > term 2, 2 entries (1^1..2^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n");
    CLUSTER_TRACE(
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 3\n");
    /* Candidate's log is longer (index 2 vs 1) -> vote granted. */
    CLUSTER_TRACE(
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (3 vs 1) -> bump term\n"
        " remote log is longer (2^1 vs 1^1) -> grant vote\n");
    return MUNIT_OK;
}
/* If the requester last log entry term is higher, the vote is granted. */
TEST(election, GrantIfRemoteLastTermIsHigher, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Bootstrap a cluster with 2 voters. Both have 2 entries, however server
     * 1's last entry at index 2 has term 2, while server's 2 has term 1. */
    CLUSTER_SET_TERM(1 /* ID */, 2 /* term */);
    CLUSTER_ADD_ENTRY(1 /* ID */, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
    CLUSTER_ADD_ENTRY(1 /* ID */, RAFT_COMMAND, 2 /* term */, 0 /* payload */);
    CLUSTER_START(1 /* ID */);
    CLUSTER_SET_TERM(2 /* ID */, 1 /* term */);
    CLUSTER_ADD_ENTRY(2 /* ID */, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
    CLUSTER_ADD_ENTRY(2 /* ID */, RAFT_COMMAND, 1 /* term */, 0 /* payload */);
    CLUSTER_START(2 /* ID */);
    CLUSTER_TRACE(
        "[ 0] 1 > term 2, 2 entries (1^1..2^2)\n"
        "[ 0] 2 > term 1, 2 entries (1^1..2^1)\n");
    CLUSTER_TRACE(
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 3\n");
    /* Same last index, but candidate's last term is higher -> vote granted. */
    CLUSTER_TRACE(
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (3 vs 1) -> bump term\n"
        " remote log is more recent (2^2 vs 2^1) -> grant vote\n");
    return MUNIT_OK;
}
/* If a candidate receives a vote request response granting the vote but the
* quorum is not reached, it stays candidate. */
TEST(election, WaitQuorum, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Bootstrap and start 2 servers, having a configuration with 4 voters. */
    for (id = 1; id <= 2; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 4 /* servers */, 4 /* voters */);
        CLUSTER_START(id);
    }
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n");
    /* The first server converts to candidate and sends vote requests. */
    CLUSTER_TRACE(
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n");
    /* Server 2 receives the request, grants its vote and sends the reply. */
    CLUSTER_TRACE(
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n");
    /* Server 1 receives server 2's RequestVote result RPC but stays candidate
     * since it has only 2 votes, and 3 are required. */
    CLUSTER_TRACE(
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum not reached, only 2 votes out of 4\n");
    return MUNIT_OK;
}
/* The vote request gets rejected if the term of the candidate is lower. */
/* The vote request gets rejected if the term of the candidate is lower; the
 * candidate then steps down when it learns of the newer term. */
TEST(election, RejectIfRemoteTermLower, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Bootstrap a cluster with 2 voters. The second server is at term 3 */
    CLUSTER_SET_TERM(1 /* ID */, 1 /* term */);
    CLUSTER_ADD_ENTRY(1 /* ID */, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
    CLUSTER_START(1 /* ID */);
    CLUSTER_SET_TERM(2 /* ID */, 3 /* term */);
    CLUSTER_ADD_ENTRY(2 /* ID */, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
    CLUSTER_START(2 /* ID */);
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 3, 1 entry (1^1)\n");
    CLUSTER_TRACE(
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n");
    /* The second server receives a RequestVote RPC and rejects the vote for the
     * first server. */
    CLUSTER_TRACE(
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is lower (2 vs 3) -> reject\n");
    /* The first server receives the RequestVote result and converts to follower
     * because it discovers the newer term. */
    CLUSTER_TRACE(
        "[ 120] 1 > recv request vote result from server 2\n"
        " remote term is higher (3 vs 2) -> bump term, step down\n"
        " local server is follower -> ignore\n");
    /* Use MUNIT_OK (was a bare `return 0`) for consistency with the suite. */
    return MUNIT_OK;
}
/* If the server already has a leader, the vote is not granted (even if the
* request has a higher term). */
TEST(election, RejectIfHasLeader, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Bootstrap and start a cluster with 3 voters. */
    for (id = 1; id <= 3; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 3 /* servers */, 3 /* voters */);
        CLUSTER_START(id);
    }
    /* Server 1 wins the elections. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 0] 3 > term 1, 1 entry (1^1)\n"
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n"
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 110] 3 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 3 -> convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        " probe server 3 sending a heartbeat (no entries)\n");
    /* Server 1 receives the vote from server 3 as well. */
    CLUSTER_TRACE(
        "[ 120] 1 > recv request vote result from server 3\n"
        " local server is leader -> ignore\n");
    /* Disconnect server 2, which eventually becomes candidate. */
    CLUSTER_DISCONNECT(1, 2);
    CLUSTER_SET_ELECTION_TIMEOUT(2 /* ID */, 30 /* timeout */, 0 /* delta */);
    CLUSTER_TRACE(
        "[ 130] 3 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 140] 1 > recv append entries result from server 3\n"
        "[ 140] 2 > timeout as follower\n"
        " convert to candidate, start election for term 3\n");
    /* Server 3 rejects the vote request because it has a leader. */
    CLUSTER_TRACE(
        "[ 150] 1 > recv request vote from server 2\n"
        " local server is leader -> reject\n"
        "[ 150] 3 > recv request vote from server 2\n"
        " local server has a leader (server 1) -> reject\n");
    return MUNIT_OK;
}
/* If a server has already voted, vote is not granted. */
TEST(election, RejectIfAlreadyVoted, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Bootstrap and start a cluster with 3 voters. */
    for (id = 1; id <= 3; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 3 /* servers */, 3 /* voters */);
        CLUSTER_START(id);
    }
    /* Disconnect server 2 from server 1 and change its randomized election
     * timeout to match the one of server 1. This way server 2 will convert to
     * candidate but not receive vote requests from server 1 (and viceversa). */
    CLUSTER_DISCONNECT(1, 2);
    CLUSTER_DISCONNECT(2, 1);
    CLUSTER_SET_ELECTION_TIMEOUT(2 /* ID */, 100 /* timeout */, 0 /* delta */);
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 0] 3 > term 1, 1 entry (1^1)\n");
    /* Server 1 and server 2 both become candidates. */
    CLUSTER_TRACE(
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n"
        "[ 100] 2 > timeout as follower\n"
        " convert to candidate, start election for term 2\n");
    /* Server 3 receives the vote request from server 1 and grants it. */
    CLUSTER_TRACE(
        "[ 110] 3 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n");
    /* Server 3 receives the vote request from server 2 and rejects it because
     * it has already voted. */
    CLUSTER_TRACE(
        "[ 110] 3 > recv request vote from server 2\n"
        " already voted for server 1 -> don't grant vote\n");
    return MUNIT_OK;
}
/* If the requester last log entry term is lower than ours, the vote is not
* granted. */
TEST(election, RejectIfRemoteLastTermIsLower, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned i;
    /* Bootstrap a cluster with 2 voters. Both servers have a command entry at
     * index 2, but server 1 has it with term 1 while server 2 has it with
     * term 2 (the loop variable doubles as the entry's term). */
    for (i = 1; i <= 2; i++) {
        CLUSTER_SET_TERM(i, 1 /* term */);
        CLUSTER_ADD_ENTRY(i, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
        CLUSTER_ADD_ENTRY(i, RAFT_COMMAND, i /* term */, 0 /* payload */);
        CLUSTER_START(i);
    }
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 2 entries (1^1..2^1)\n"
        "[ 0] 2 > term 1, 2 entries (1^1..2^2)\n");
    /* The first server becomes candidate. */
    CLUSTER_TRACE(
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n");
    /* The second server receives a RequestVote RPC and rejects the vote for the
     * first server. */
    CLUSTER_TRACE(
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log older (2^1 vs 2^2) -> don't grant vote\n");
    return MUNIT_OK;
}
/* If the requester last log entry index is lower, the vote is not
* granted. */
TEST(election, RejectIfRemoteLastIndexIsLower, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Bootstrap a cluster with 2 voters. Server 2 has an entry at
     * index 2, while server 1 hasn't. */
    for (id = 1; id <= 2; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
        if (id == 2) {
            CLUSTER_ADD_ENTRY(id, RAFT_COMMAND, 1 /* term */, 0 /* payload */);
        }
        CLUSTER_START(id);
    }
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 2 entries (1^1..2^1)\n");
    /* Server 1 becomes candidate. */
    CLUSTER_TRACE(
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n");
    /* Server 2 receives a RequestVote RPC and rejects the vote for server 1. */
    CLUSTER_TRACE(
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log shorter (1^1 vs 2^1) -> don't grant vote\n");
    return MUNIT_OK;
}
/* If we are not a voting server, the vote is not granted. */
TEST(election, RejectIfNotVoter, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_message message;
    struct raft_event event;
    struct raft_update update;
    int rv;
    /* Server 2 is part of a 2-server configuration with a single voter, so
     * server 2 itself is not voting. */
    CLUSTER_SET_TERM(2 /* ID */, 1 /* term */);
    CLUSTER_ADD_ENTRY(2 /* ID */, RAFT_CHANGE, 2 /* servers */, 1 /* voters */);
    CLUSTER_START(2);
    CLUSTER_TRACE("[ 0] 2 > term 1, 1 entry (1^1)\n");
    /* Hand-craft a RequestVote message from server 1 and feed it directly to
     * server 2's state machine via raft_step(). */
    message.type = RAFT_REQUEST_VOTE;
    message.server_id = 1;
    message.server_address = "1";
    message.request_vote.version = 2;
    message.request_vote.term = 2;
    message.request_vote.candidate_id = 1;
    message.request_vote.last_log_index = 1;
    message.request_vote.last_log_term = 1;
    message.request_vote.disrupt_leader = false;
    message.request_vote.pre_vote = false;
    event.time = f->cluster_.time;
    event.type = RAFT_RECEIVE;
    event.receive.message = &message;
    rv = raft_step(CLUSTER_RAFT(2), &event, &update);
    munit_assert_int(rv, ==, 0);
    CLUSTER_TRACE(
        "[ 0] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " local server is not voting -> don't grant vote\n");
    return MUNIT_OK;
}
/* Non-voting servers are skipped when sending vote requests. */
TEST(election, SkipNonVoters, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Bootstrap a cluster with 3 servers, among which only server 1 and server
     * 2 are voters. */
    for (id = 1; id <= 3; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 3 /* servers */, 2 /* voters */);
        CLUSTER_START(id);
    }
    /* Disconnect server 1 from server 2, so server 1 can't win the elections,
     * since it needs the vote from 2. */
    CLUSTER_DISCONNECT(1, 2);
    CLUSTER_DISCONNECT(2, 1);
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 0] 3 > term 1, 1 entry (1^1)\n");
    /* Server 1 becomes candidate. */
    CLUSTER_TRACE(
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n");
    /* Server 1 stays candidate because it can't reach a quorum and eventually
     * server 2 becomes candidate as well. No request is ever sent to the
     * non-voting server 3. */
    CLUSTER_TRACE(
        "[ 130] 2 > timeout as follower\n"
        " convert to candidate, start election for term 2\n");
    return MUNIT_OK;
}
/* If a candidate server receives a response indicating that the vote was not
* granted, nothing happens (e.g. the server has already voted for someone
* else). */
TEST(election, ReceiveRejectResult, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Bootstrap a cluster with 3 servers, all voters. */
    for (id = 1; id <= 3; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 3 /* servers */, 3 /* voters */);
        CLUSTER_START(id);
    }
    /* Disconnect server 1 from server 2 and viceversa. */
    CLUSTER_DISCONNECT(1, 2);
    CLUSTER_DISCONNECT(2, 1);
    /* Server 1 becomes candidate, server 2 is still follower. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 0] 3 > term 1, 1 entry (1^1)\n"
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n");
    /* Server 3 receives a RequestVote RPC and grants its vote. */
    CLUSTER_TRACE(
        "[ 110] 3 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n");
    /* Disconnect server 3 from server 1, so it doesn't receive further
     * messages. */
    CLUSTER_DISCONNECT(3, 1);
    CLUSTER_DISCONNECT(1, 3);
    /* Server 2 server eventually becomes candidate */
    CLUSTER_TRACE(
        "[ 130] 2 > timeout as follower\n"
        " convert to candidate, start election for term 2\n");
    /* Server 3 receives a RequestVote RPC from server 2 but rejects its vote
     * since it has already voted for server 1. */
    CLUSTER_TRACE(
        "[ 140] 3 > recv request vote from server 2\n"
        " already voted for server 1 -> don't grant vote\n");
    /* Server 2 receives the response and stays candidate. */
    CLUSTER_TRACE(
        "[ 150] 2 > recv request vote result from server 3\n"
        " vote not granted\n");
    munit_assert_int(raft_state(CLUSTER_RAFT(2)), ==, RAFT_CANDIDATE);
    return MUNIT_OK;
}
/* Test an election round with two voters and pre-vote. */
TEST(election, PreVote, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Bootstrap a cluster with 2 servers, all voters with pre-vote enabled. */
    for (id = 1; id <= 2; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        raft_set_pre_vote(CLUSTER_RAFT(id), true);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
        CLUSTER_START(id);
    }
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start pre-election for term 2\n");
    /* Server 1 did not increment its term or persist its vote.*/
    munit_assert_ulong(raft_current_term(CLUSTER_RAFT(1)), ==, 1);
    munit_assert_ulong(raft_voted_for(CLUSTER_RAFT(1)), ==, 0);
    CLUSTER_TRACE(
        "[ 110] 2 > recv request vote from server 1\n"
        " remote log is equal (1^1) -> pre-vote ok\n");
    /* Server 2 has not incremented its term or persisted its vote.*/
    munit_assert_ulong(raft_current_term(CLUSTER_RAFT(2)), ==, 1);
    munit_assert_ulong(raft_voted_for(CLUSTER_RAFT(2)), ==, 0);
    CLUSTER_TRACE(
        "[ 120] 1 > recv request vote result from server 2\n"
        " votes quorum reached -> pre-vote successful\n");
    /* Server 1 has now incremented its term and persisted its vote. */
    munit_assert_ulong(raft_current_term(CLUSTER_RAFT(1)), ==, 2);
    munit_assert_ulong(raft_voted_for(CLUSTER_RAFT(1)), ==, 1);
    CLUSTER_TRACE(
        "[ 130] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n");
    munit_assert_ulong(raft_current_term(CLUSTER_RAFT(2)), ==, 2);
    munit_assert_ulong(raft_voted_for(CLUSTER_RAFT(2)), ==, 1);
    CLUSTER_TRACE(
        "[ 140] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 2 -> convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n");
    return MUNIT_OK;
}
/* A candidate receives votes then crashes. */
TEST(election, PreVoteWithcandidateCrash, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;

    /* Bootstrap a cluster with 3 servers, all voters with pre-vote enabled. */
    for (id = 1; id <= 3; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        raft_set_pre_vote(CLUSTER_RAFT(id), true);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 3 /* servers */, 3 /* voters */);
        CLUSTER_START(id);
    }
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 0] 3 > term 1, 1 entry (1^1)\n");
    /* Server 1 eventually times out and converts to candidate, but it does not
     * increment its term yet. */
    CLUSTER_TRACE(
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start pre-election for term 2\n");
    munit_assert_ulong(raft_current_term(CLUSTER_RAFT(1)), ==, 1);
    /* Server 2 receives the pre-vote RequestVote RPC but does not increment its
     * term. */
    CLUSTER_TRACE(
        "[ 110] 2 > recv request vote from server 1\n"
        " remote log is equal (1^1) -> pre-vote ok\n");
    munit_assert_ulong(raft_current_term(CLUSTER_RAFT(2)), ==, 1);
    /* Server 3 receives the pre-vote RequestVote RPC but does not increment its
     * term. */
    CLUSTER_TRACE(
        "[ 110] 3 > recv request vote from server 1\n"
        " remote log is equal (1^1) -> pre-vote ok\n");
    munit_assert_ulong(raft_current_term(CLUSTER_RAFT(3)), ==, 1);
    /* Server 1 receives the pre-vote RequestVote results and starts the actual
     * election, incrementing its term and persisting its vote. Server 3's
     * response arrives after the pre-election is already over, so it is
     * discarded as stale. */
    CLUSTER_TRACE(
        "[ 120] 1 > recv request vote result from server 2\n"
        " votes quorum reached -> pre-vote successful\n"
        "[ 120] 1 > recv request vote result from server 3\n"
        " receive stale pre-vote response -> ignore\n");
    munit_assert_ulong(raft_current_term(CLUSTER_RAFT(1)), ==, 2);
    munit_assert_ulong(raft_voted_for(CLUSTER_RAFT(1)), ==, 1);
    /* Server 2 receives the actual RequestVote RPC. */
    CLUSTER_TRACE(
        "[ 130] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n");
    /* Server 3 receives the actual RequestVote RPC. */
    CLUSTER_TRACE(
        "[ 130] 3 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n");
    /* Server 1 crashes before it can collect the results and win. */
    CLUSTER_STOP(1);
    /* Server 2 times out and starts an election. It doesn't increment its term
     * yet. It also can't reset its vote since it's still in the same term. */
    CLUSTER_TRACE(
        "[ 260] 2 > timeout as follower\n"
        " convert to candidate, start pre-election for term 3\n");
    munit_assert_ulong(raft_current_term(CLUSTER_RAFT(2)), ==, 2);
    munit_assert_ulong(raft_voted_for(CLUSTER_RAFT(2)), ==, 1);
    /* Server 3 has already voted for server 1 in term 2, but it didn't vote yet
     * for term 3, so it grants its pre-vote, albeit without bumping the term or
     * resetting its previous vote. */
    CLUSTER_TRACE(
        "[ 270] 3 > recv request vote from server 2\n"
        " remote log is equal (1^1) -> pre-vote ok\n");
    munit_assert_ulong(raft_current_term(CLUSTER_RAFT(3)), ==, 2);
    munit_assert_ulong(raft_voted_for(CLUSTER_RAFT(3)), ==, 1);
    /* Server 2 receives the pre-vote RequestVote result from server 3. It now
     * starts the actual election, bumping its term and persisting its vote for
     * itself. */
    CLUSTER_TRACE(
        "[ 280] 2 > recv request vote result from server 3\n"
        " votes quorum reached -> pre-vote successful\n");
    munit_assert_ulong(raft_current_term(CLUSTER_RAFT(2)), ==, 3);
    munit_assert_ulong(raft_voted_for(CLUSTER_RAFT(2)), ==, 2);
    /* Server 3 receives the actual RequestVote RPC. */
    CLUSTER_TRACE(
        "[ 290] 3 > recv request vote from server 2\n"
        " remote term is higher (3 vs 2) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n");
    /* Server 2 receives the actual RequestVote result and becomes leader. */
    CLUSTER_TRACE(
        "[ 300] 2 > recv request vote result from server 3\n"
        " quorum reached with 2 votes out of 3 -> convert to leader\n"
        " probe server 1 sending a heartbeat (no entries)\n"
        " probe server 3 sending a heartbeat (no entries)\n");
    return MUNIT_OK;
}
/* Ensure delayed pre-vote responses are not counted towards the real election
 * quorum. */
TEST(election, PreVoteNoStaleVotes, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;

    /* Bootstrap a cluster with 3 servers, all voters with pre-vote enabled.
     *
     * Server 3 is 1 term ahead of the other servers, this will allow it to send
     * stale pre-vote responses that pass the term checks. */
    for (id = 1; id <= 3; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        raft_set_pre_vote(CLUSTER_RAFT(id), true);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 3 /* servers */, 3 /* voters */);
        if (id == 3) {
            CLUSTER_SET_TERM(3, 2);
        }
        CLUSTER_START(id);
    }
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 0] 3 > term 2, 1 entry (1^1)\n");
    /* Server 1 eventually times out and converts to candidate, but it does not
     * increment its term yet. */
    CLUSTER_TRACE(
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start pre-election for term 2\n");
    munit_assert_ulong(raft_current_term(CLUSTER_RAFT(1)), ==, 1);
    /* Server 2 receives the pre-vote RequestVote RPC but does not increment its
     * term. */
    CLUSTER_TRACE(
        "[ 110] 2 > recv request vote from server 1\n"
        " remote log is equal (1^1) -> pre-vote ok\n");
    munit_assert_ulong(raft_current_term(CLUSTER_RAFT(2)), ==, 1);
    munit_assert_ulong(raft_voted_for(CLUSTER_RAFT(2)), ==, 0);
    /* Slow down responses of server 3 */
    CLUSTER_SET_NETWORK_LATENCY(3 /* ID */, 20 /* latency */);
    /* Server 3 grants the pre-vote as well.
     *
     * Fix: the original re-asserted server 2's state here (copy-paste);
     * check server 3 instead. It started at term 2 and stays there, and it
     * persists no vote since pre-votes are not persisted. */
    CLUSTER_TRACE(
        "[ 110] 3 > recv request vote from server 1\n"
        " remote log is equal (1^1) -> pre-vote ok\n");
    munit_assert_ulong(raft_current_term(CLUSTER_RAFT(3)), ==, 2);
    munit_assert_ulong(raft_voted_for(CLUSTER_RAFT(3)), ==, 0);
    /* Server 1 receives the pre-vote RequestVote result from server 2 and it
     * starts the actual election, bumping its term and voting for itself. */
    CLUSTER_TRACE(
        "[ 120] 1 > recv request vote result from server 2\n"
        " votes quorum reached -> pre-vote successful\n");
    munit_assert_ulong(raft_current_term(CLUSTER_RAFT(1)), ==, 2);
    munit_assert_ulong(raft_voted_for(CLUSTER_RAFT(1)), ==, 1);
    /* Server 1 eventually receives server 3's delayed RequestVote result for
     * the pre-vote message, but does not count it as real vote. */
    CLUSTER_TRACE(
        "[ 130] 1 > recv request vote result from server 3\n"
        " receive stale pre-vote response -> ignore\n");
    /* Make sure we haven't counted the pre-vote result as a real vote */
    munit_assert_int(raft_state(CLUSTER_RAFT(1)), ==, RAFT_CANDIDATE);
    return MUNIT_OK;
}
/* If a follower is a stand-by, it won't convert to candidate */
TEST(election, StayFollowerIfStandBy, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned i;
    unsigned id;
    struct raft_configuration configuration;
    struct raft_entry entry;
    int rv;

    /* Bootstrap a cluster with 4 servers, with 3 voters and 1 stand-by. */
    entry.type = RAFT_CHANGE;
    entry.term = 1;
    CLUSTER_FILL_CONFIGURATION(&configuration, 4 /* n servers */,
                               3 /* voters */, 1 /* stand-by */);
    rv = raft_configuration_encode(&configuration, &entry.buf);
    munit_assert_int(rv, ==, 0);
    entry.batch = NULL;
    for (id = 1; id <= 4; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, &entry);
        CLUSTER_START(id);
    }
    /* NOTE(review): freeing the encoded buffer right after adding it assumes
     * CLUSTER_ADD_ENTRY takes a copy — confirm against the fixture code. */
    raft_free(entry.buf.base);
    /* Server 2 takes a very long time to persist entries. */
    CLUSTER_SET_DISK_LATENCY(2, 1000);
    /* Disconnect server 2 from server 1, so it won't vote for it. */
    CLUSTER_DISCONNECT(2, 1);
    CLUSTER_DISCONNECT(1, 2);
    /* Increase the election timeout of server 2, so it will start just 1
     * election. */
    CLUSTER_SET_ELECTION_TIMEOUT(2 /* ID */, 150 /* timeout */, 0 /* delta */);
    /* Server 1 wins elections for term 2, with a vote from server 3. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 0] 3 > term 1, 1 entry (1^1)\n"
        "[ 0] 4 > term 1, 1 entry (1^1)\n"
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n"
        "[ 110] 3 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 120] 1 > recv request vote result from server 3\n"
        " quorum reached with 2 votes out of 3 -> convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        " probe server 3 sending a heartbeat (no entries)\n"
        " probe server 4 sending a heartbeat (no entries)\n"
        "[ 130] 3 > recv append entries from server 1\n"
        " no new entries to persist\n");
    /* Demote server 2 to stand-by, submitting a new configuration entry. */
    entry.term = 2;
    configuration.servers[1].role = RAFT_STANDBY;
    rv = raft_configuration_encode(&configuration, &entry.buf);
    munit_assert_int(rv, ==, 0);
    entry.batch = entry.buf.base;
    CLUSTER_SUBMIT(1 /* ID */, &entry);
    CLUSTER_TRACE(
        "[ 130] 1 > submit 1 new client entry\n"
        " replicate 1 new configuration entry (2^2)\n");
    /* While the configuration change is in progress, server 2 times out and
     * starts an unsuccessful election. */
    CLUSTER_TRACE(
        "[ 130] 4 > recv append entries from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " no new entries to persist\n"
        "[ 140] 1 > persisted 1 entry (2^2)\n"
        " next uncommitted entry (2^2) has 1 vote out of 2\n"
        "[ 140] 1 > recv append entries result from server 3\n"
        " pipeline server 3 sending 1 entry (2^2)\n"
        "[ 140] 1 > recv append entries result from server 4\n"
        " pipeline server 4 sending 1 entry (2^2)\n"
        "[ 150] 3 > recv append entries from server 1\n"
        " start persisting 1 new entry (2^2)\n"
        "[ 150] 4 > recv append entries from server 1\n"
        " start persisting 1 new entry (2^2)\n"
        "[ 150] 2 > timeout as follower\n"
        " convert to candidate, start election for term 2\n"
        "[ 160] 3 > persisted 1 entry (2^2)\n"
        " send success result to 1\n"
        "[ 160] 4 > persisted 1 entry (2^2)\n"
        " send success result to 1\n"
        "[ 160] 3 > recv request vote from server 2\n"
        " local server has a leader (server 1) -> reject\n");
    /* The configuration change is committed. */
    CLUSTER_TRACE(
        "[ 170] 1 > recv append entries result from server 3\n"
        " commit 1 new entry (2^2)\n"
        "[ 170] 1 > recv append entries result from server 4\n"
        "[ 170] 2 > recv request vote result from server 3\n"
        " vote not granted\n"
        "[ 170] 1 > timeout as leader\n"
        " probe server 2 sending 1 entry (2^2)\n");
    /* Promote server 4 to voter, submitting another configuration entry. */
    entry.term = 2;
    configuration.servers[3].role = RAFT_VOTER;
    rv = raft_configuration_encode(&configuration, &entry.buf);
    munit_assert_int(rv, ==, 0);
    raft_configuration_close(&configuration);
    entry.batch = entry.buf.base;
    CLUSTER_SUBMIT(1 /* ID */, &entry);
    CLUSTER_TRACE(
        "[ 170] 1 > submit 1 new client entry\n"
        " replicate 1 new configuration entry (3^2)\n"
        " pipeline server 3 sending 1 entry (3^2)\n"
        " pipeline server 4 sending 1 entry (3^2)\n");
    /* Wait for server 4 to persist the configuration change and apply it,
     * becoming aware that it's a voter. */
    CLUSTER_TRACE(
        "[ 180] 1 > persisted 1 entry (3^2)\n"
        " next uncommitted entry (3^2) has 1 vote out of 3\n"
        "[ 180] 3 > recv append entries from server 1\n"
        " start persisting 1 new entry (3^2)\n"
        "[ 180] 4 > recv append entries from server 1\n"
        " start persisting 1 new entry (3^2)\n"
        "[ 190] 3 > persisted 1 entry (3^2)\n"
        " send success result to 1\n"
        "[ 190] 4 > persisted 1 entry (3^2)\n"
        " send success result to 1\n");
    munit_assert_int(CLUSTER_RAFT(4)->configuration.servers[3].role, ==,
                     RAFT_VOTER);
    /* Server 2 is still candidate. */
    munit_assert_int(raft_state(CLUSTER_RAFT(2)), ==, RAFT_CANDIDATE);
    /* Reconnect server 2 to server 1, so it will receive up to index 4,
     * although it won't persist it since it has a high disk latency. */
    CLUSTER_RECONNECT(2, 1);
    CLUSTER_RECONNECT(1, 2);
    /* Server 2 gets contacted by server 1, steps down and receives
     * entries from it */
    CLUSTER_TRACE(
        "[ 200] 1 > recv append entries result from server 3\n"
        " commit 1 new entry (3^2)\n"
        "[ 200] 1 > recv append entries result from server 4\n"
        "[ 220] 1 > timeout as leader\n"
        " probe server 2 sending 2 entries (2^2..3^2)\n"
        " pipeline server 3 sending a heartbeat (no entries)\n"
        " pipeline server 4 sending a heartbeat (no entries)\n"
        "[ 230] 2 > recv append entries from server 1\n"
        " discovered leader (1) -> step down \n"
        " start persisting 2 new entries (2^2..3^2)\n"
        "[ 230] 3 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 230] 4 > recv append entries from server 1\n"
        " no new entries to persist\n");
    /* Create a network partition, with server 1 and 4 in one partition and
     * server 2 and 3 in another partition. */
    CLUSTER_DISCONNECT(1, 2);
    CLUSTER_DISCONNECT(2, 1);
    CLUSTER_DISCONNECT(1, 3);
    CLUSTER_DISCONNECT(3, 1);
    CLUSTER_DISCONNECT(4, 2);
    CLUSTER_DISCONNECT(2, 4);
    CLUSTER_DISCONNECT(4, 3);
    CLUSTER_DISCONNECT(3, 4);
    munit_assert_int(raft_state(CLUSTER_RAFT(2)), ==, RAFT_FOLLOWER);
    /* Eventually both server 2 and server 3 time out because they have been
     * disconnected from the leader.
     *
     * Server 3 immediately converts to candidate. Server 2 is still persisting
     * entries, but it applies configuration changes immediately so it finds out
     * it's a stand-by and stays follower. */
    CLUSTER_TRACE(
        "[ 240] 1 > recv append entries result from server 4\n"
        "[ 270] 1 > timeout as leader\n"
        " server 3 is unreachable -> abort pipeline\n"
        " probe server 2 sending 2 entries (2^2..3^2)\n"
        " probe server 3 sending a heartbeat (no entries)\n"
        " pipeline server 4 sending a heartbeat (no entries)\n"
        "[ 280] 4 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 290] 1 > recv append entries result from server 4\n"
        "[ 320] 1 > timeout as leader\n"
        " probe server 2 sending 2 entries (2^2..3^2)\n"
        " probe server 3 sending a heartbeat (no entries)\n"
        " pipeline server 4 sending a heartbeat (no entries)\n"
        "[ 330] 4 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 340] 1 > recv append entries result from server 4\n"
        "[ 370] 1 > timeout as leader\n"
        " probe server 2 sending 2 entries (2^2..3^2)\n"
        " probe server 3 sending a heartbeat (no entries)\n"
        " pipeline server 4 sending a heartbeat (no entries)\n"
        "[ 380] 4 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 380] 2 > timeout as follower\n"
        " stand-by server -> stay follower\n"
        "[ 390] 1 > recv append entries result from server 4\n"
        "[ 390] 3 > timeout as follower\n"
        " convert to candidate, start election for term 3\n");
    munit_assert_int(raft_state(CLUSTER_RAFT(2)), ==, RAFT_FOLLOWER);
    munit_assert_int(raft_state(CLUSTER_RAFT(3)), ==, RAFT_CANDIDATE);
    /* Server 3 can't win the election, because it does not consider server 2 a
     * voter, according to the configuration at index 4.
     *
     * Server 2 also can't win the election, because it stays follower. */
    CLUSTER_TRACE(
        "[ 420] 1 > timeout as leader\n"
        " probe server 2 sending 2 entries (2^2..3^2)\n"
        " probe server 3 sending a heartbeat (no entries)\n"
        " pipeline server 4 sending a heartbeat (no entries)\n"
        "[ 430] 4 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 440] 1 > recv append entries result from server 4\n"
        "[ 470] 1 > timeout as leader\n"
        " probe server 2 sending 2 entries (2^2..3^2)\n"
        " probe server 3 sending a heartbeat (no entries)\n"
        " pipeline server 4 sending a heartbeat (no entries)\n"
        "[ 480] 4 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 490] 1 > recv append entries result from server 4\n"
        "[ 520] 1 > timeout as leader\n"
        " probe server 2 sending 2 entries (2^2..3^2)\n"
        " probe server 3 sending a heartbeat (no entries)\n"
        " pipeline server 4 sending a heartbeat (no entries)\n"
        "[ 530] 4 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 530] 2 > timeout as follower\n"
        " stand-by server -> stay follower\n"
        "[ 540] 1 > recv append entries result from server 4\n");
    /* Let the cluster run for a while longer: no leadership change occurs. */
    for (i = 0; i < 40; i++) {
        test_cluster_step(&f->cluster_);
    }
    munit_assert_int(raft_state(CLUSTER_RAFT(2)), !=, RAFT_LEADER);
    munit_assert_int(raft_state(CLUSTER_RAFT(3)), !=, RAFT_LEADER);
    /* Server 1 is still leader, since it can contact server 4. */
    munit_assert_int(raft_state(CLUSTER_RAFT(1)), ==, RAFT_LEADER);
    return MUNIT_OK;
}
/* If a follower is installing a snapshot, it won't convert to
 * candidate */
TEST(election, StayFollowerIfInstallingSnapshot, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;

    /* Set very low threshold and trailing entries number */
    CLUSTER_SET_SNAPSHOT_THRESHOLD(1 /* ID */, 2 /* n. entries */);
    CLUSTER_SET_SNAPSHOT_TRAILING(1 /* ID */, 0 /* n. entries */);
    /* Bootstrap a cluster with 3 servers. */
    for (id = 1; id <= 3; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 3 /* servers */, 3 /* voters */);
        CLUSTER_START(id);
    }
    /* Server 1 becomes leader. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 0] 3 > term 1, 1 entry (1^1)\n"
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n"
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 110] 3 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 3 -> convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        " probe server 3 sending a heartbeat (no entries)\n");
    /* Submit a couple of entries, causing server 1 to eventually take a
     * snapshot. */
    CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
    CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
    /* Disconnect server 2 from server 1 in both directions, so it won't
     * receive these entries.
     *
     * Fix: the original second call was CLUSTER_DISCONNECT(2, 2), a
     * self-disconnect that contradicts this comment; sever the 2 -> 1
     * direction instead, matching the bidirectional partitions used
     * elsewhere in this suite. */
    CLUSTER_DISCONNECT(1, 2);
    CLUSTER_DISCONNECT(2, 1);
    CLUSTER_TRACE(
        "[ 120] 1 > submit 1 new client entry\n"
        " replicate 1 new command entry (2^2)\n"
        "[ 120] 1 > submit 1 new client entry\n"
        " replicate 1 new command entry (3^2)\n"
        "[ 120] 1 > recv request vote result from server 3\n"
        " local server is leader -> ignore\n"
        "[ 130] 1 > persisted 1 entry (2^2)\n"
        " next uncommitted entry (2^2) has 1 vote out of 3\n"
        "[ 130] 1 > persisted 1 entry (3^2)\n"
        " next uncommitted entry (2^2) has 1 vote out of 3\n"
        "[ 130] 3 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 140] 1 > recv append entries result from server 3\n"
        " pipeline server 3 sending 2 entries (2^2..3^2)\n"
        "[ 150] 3 > recv append entries from server 1\n"
        " start persisting 2 new entries (2^2..3^2)\n"
        "[ 160] 3 > persisted 2 entry (2^2..3^2)\n"
        " send success result to 1\n"
        "[ 170] 1 > recv append entries result from server 3\n"
        " commit 2 new entries (2^2..3^2)\n"
        "[ 170] 1 > new snapshot (3^2), 0 trailing entries\n");
    /* Reconnect server 2 to server 1 (both directions), so it will receive the
     * snapshot. Fix: the original second call was CLUSTER_RECONNECT(2, 2). */
    CLUSTER_RECONNECT(1, 2);
    CLUSTER_RECONNECT(2, 1);
    /* Server 2 takes a very long time to persist the snapshot. */
    CLUSTER_SET_DISK_LATENCY(2, 1000);
    CLUSTER_TRACE(
        "[ 170] 1 > timeout as leader\n"
        " missing previous entry at index 1 -> needs snapshot\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        "[ 180] 2 > recv append entries from server 1\n"
        " missing previous entry (3^2) -> reject\n"
        "[ 190] 1 > recv append entries result from server 2\n"
        " log mismatch -> send old entries\n"
        " missing previous entry at index 1 -> needs snapshot\n"
        " sending snapshot (3^2) to server 2\n"
        "[ 190] 1 > timeout as leader\n"
        " pipeline server 3 sending a heartbeat (no entries)\n"
        "[ 200] 2 > recv install snapshot from server 1\n"
        " start persisting snapshot (3^2)\n");
    /* Disconnect server 1 from server 2 and 3. */
    CLUSTER_DISCONNECT(1, 2);
    CLUSTER_DISCONNECT(2, 1);
    CLUSTER_DISCONNECT(1, 3);
    CLUSTER_DISCONNECT(3, 1);
    /* Server 2 eventually times out, but it does not convert to candidate
     * because it's still persisting the snapshot. */
    CLUSTER_TRACE(
        "[ 240] 1 > timeout as leader\n"
        " server 3 is unreachable -> abort pipeline\n"
        " timeout install snapshot at index 3\n"
        " missing previous entry at index 0 -> needs snapshot\n"
        " sending snapshot (3^2) to server 2\n"
        " probe server 3 sending a heartbeat (no entries)\n"
        "[ 290] 1 > timeout as leader\n"
        " server 2 is unreachable -> abort snapshot\n"
        " missing previous entry at index 0 -> needs snapshot\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        " probe server 3 sending a heartbeat (no entries)\n"
        "[ 310] 3 > timeout as follower\n"
        " convert to candidate, start election for term 3\n"
        "[ 320] 2 > recv request vote from server 3\n"
        " local server has a leader (server 1) -> reject\n"
        "[ 330] 3 > recv request vote result from server 2\n"
        " remote term is lower (2 vs 3) -> ignore\n"
        "[ 330] 2 > timeout as follower\n"
        " installing snapshot -> stay follower\n");
    return MUNIT_OK;
}
/* If a follower has entries that are still being persisted, it won't convert to
 * candidate */
TEST(election, StayFollowerIfPersistingEntries, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;

    /* Bootstrap a cluster with 3 servers. */
    for (id = 1; id <= 3; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 3 /* servers */, 3 /* voters */);
        CLUSTER_START(id);
    }
    /* Server 1 becomes leader. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 0] 3 > term 1, 1 entry (1^1)\n"
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n"
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 110] 3 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 3 -> convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        " probe server 3 sending a heartbeat (no entries)\n");
    /* Server 2 takes a very long time to persist entries. */
    CLUSTER_SET_DISK_LATENCY(2, 1000);
    /* Submit an entry and replicate it to server 2 and 3. */
    CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
    CLUSTER_TRACE(
        "[ 120] 1 > submit 1 new client entry\n"
        " replicate 1 new command entry (2^2)\n"
        "[ 120] 1 > recv request vote result from server 3\n"
        " local server is leader -> ignore\n"
        "[ 130] 1 > persisted 1 entry (2^2)\n"
        " next uncommitted entry (2^2) has 1 vote out of 3\n"
        "[ 130] 2 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 130] 3 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 140] 1 > recv append entries result from server 2\n"
        " pipeline server 2 sending 1 entry (2^2)\n"
        "[ 140] 1 > recv append entries result from server 3\n"
        " pipeline server 3 sending 1 entry (2^2)\n"
        "[ 150] 2 > recv append entries from server 1\n"
        " start persisting 1 new entry (2^2)\n"
        "[ 150] 3 > recv append entries from server 1\n"
        " start persisting 1 new entry (2^2)\n");
    /* Disconnect server 1 from server 2 and 3, so nobody hears from the
     * leader anymore. */
    CLUSTER_DISCONNECT(1, 2);
    CLUSTER_DISCONNECT(2, 1);
    CLUSTER_DISCONNECT(1, 3);
    CLUSTER_DISCONNECT(3, 1);
    /* Server 2 eventually times out, but stays follower because it's still
     * persisting the new entry (its disk write takes 1000 ms). */
    CLUSTER_TRACE(
        "[ 160] 3 > persisted 1 entry (2^2)\n"
        " send success result to 1\n"
        "[ 190] 1 > timeout as leader\n"
        " pipeline server 2 sending a heartbeat (no entries)\n"
        " pipeline server 3 sending a heartbeat (no entries)\n"
        "[ 240] 1 > timeout as leader\n"
        " server 2 is unreachable -> abort pipeline\n"
        " server 3 is unreachable -> abort pipeline\n"
        " probe server 2 sending 1 entry (2^2)\n"
        " probe server 3 sending 1 entry (2^2)\n"
        "[ 280] 2 > timeout as follower\n"
        " persisting 1 entries -> stay follower\n");
    return MUNIT_OK;
}
raft-0.22.1/test/integration/test_fixture.c 0000664 0000000 0000000 00000022261 14601504142 0020723 0 ustar 00root root 0000000 0000000 #include "../../include/raft/fixture.h"
#include "../lib/fsm.h"
#include "../lib/heap.h"
#include "../lib/runner.h"
#define N_SERVERS 3
/******************************************************************************
*
* Fixture
*
*****************************************************************************/
/* Test fixture: a cluster of N_SERVERS in-memory raft servers driven by the
 * raft_fixture test harness, each backed by its own test FSM. */
struct fixture
{
    FIXTURE_HEAP;                    /* Instrumented heap (see lib/heap.h). */
    struct raft_fsm fsms[N_SERVERS]; /* One test FSM per server. */
    struct raft_fixture fixture;     /* The cluster under test. */
};
/* Create a fixture with N_SERVERS bootstrapped and started servers. */
static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
{
    struct fixture *f = munit_calloc(1, sizeof *f);
    struct raft_configuration configuration;
    unsigned j;
    int rv;

    SET_UP_HEAP;

    /* Give each server its own test FSM. */
    for (j = 0; j < N_SERVERS; j++) {
        FsmInit(&f->fsms[j], 2);
    }

    /* Create the cluster and add one server per FSM. */
    rv = raft_fixture_init(&f->fixture);
    munit_assert_int(rv, ==, 0);
    for (j = 0; j < N_SERVERS; j++) {
        rv = raft_fixture_grow(&f->fixture, &f->fsms[j]);
        munit_assert_int(rv, ==, 0);
    }

    /* Bootstrap every server with the initial configuration, then start the
     * whole cluster. */
    rv = raft_fixture_configuration(&f->fixture, N_SERVERS, &configuration);
    munit_assert_int(rv, ==, 0);
    rv = raft_fixture_bootstrap(&f->fixture, &configuration);
    munit_assert_int(rv, ==, 0);
    raft_configuration_close(&configuration);

    rv = raft_fixture_start(&f->fixture);
    munit_assert_int(rv, ==, 0);

    return f;
}
/* Release all fixture resources, in reverse order of acquisition. */
static void tearDown(void *data)
{
    struct fixture *f = (struct fixture *)data;
    unsigned j;

    raft_fixture_close(&f->fixture);
    for (j = 0; j < N_SERVERS; j++) {
        FsmClose(&f->fsms[j]);
    }
    TEAR_DOWN_HEAP;
    free(f);
}
/******************************************************************************
*
* Helper macros
*
*****************************************************************************/
/* Return the raft instance of the I'th server. */
#define GET(I) raft_fixture_get(&f->fixture, I)
/* Advance the cluster by a single event, returning it. */
#define STEP raft_fixture_step(&f->fixture)
/* Advance the cluster by N events, returning the last one. */
#define STEP_N(N) raft_fixture_step_n(&f->fixture, N)
/* Step the cluster until server I reaches STATE, failing the test if that
 * does not happen within 2000 milliseconds of fixture time. */
#define STEP_UNTIL_STATE_IS(I, STATE) \
    { \
        bool done_; \
        done_ = raft_fixture_step_until_state_is(&f->fixture, I, STATE, 2000); \
        munit_assert_true(done_); \
    }
/* Current raft state of the I'th server. */
#define STATE(I) raft_state(GET(I))
/* Force the I'th server to become leader. */
#define ELECT(I) raft_fixture_elect(&f->fixture, I)
/* Force the current leader to step down. */
#define DEPOSE raft_fixture_depose(&f->fixture)
/* Submit an "add x" FSM command to the I'th server, using REQ as the
 * raft_apply request object; assert that submission succeeds. */
#define APPLY(I, REQ) \
    { \
        struct raft_buffer buf; \
        int rc; \
        FsmEncodeAddX(1, &buf); \
        rc = raft_apply(GET(I), REQ, &buf, 1, NULL); \
        munit_assert_int(rc, ==, 0); \
    }
/* Step the cluster until all N_SERVERS servers have applied the entry at
 * INDEX, allowing INDEX * 1000 milliseconds of fixture time. */
#define STEP_UNTIL_APPLIED(INDEX) \
    raft_fixture_step_until_applied(&f->fixture, N_SERVERS, INDEX, INDEX * 1000)
/******************************************************************************
*
* Assertions
*
*****************************************************************************/
/* Assert that the fixture time matches the given value */
#define ASSERT_TIME(TIME) \
munit_assert_int(raft_fixture_time(&f->fixture), ==, TIME)
/* Assert that the I'th server is in the given state. */
#define ASSERT_STATE(I, S) munit_assert_int(STATE(I), ==, S)
/* Assert that the x field of the FSM with the given index matches the given
* value. */
#define ASSERT_FSM_X(I, VALUE) munit_assert_int(FsmGetX(&f->fsms[I]), ==, VALUE)
/******************************************************************************
*
* raft_fixture_step
*
*****************************************************************************/
SUITE(raft_fixture_step)
/* If there is no disk I/O in progress or network messages in flight, the tick
 * callbacks are called. */
TEST(raft_fixture_step, tick, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_fixture_event *event;
    (void)params;

    ASSERT_TIME(0);

    /* The first step fires a tick for server 0 and moves fixture time to
     * 100; the following steps tick servers 1 and 2 without advancing time
     * further. */
    event = STEP;
    munit_assert_int(raft_fixture_event_server_index(event), ==, 0);
    munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_TICK);
    ASSERT_TIME(100);
    event = STEP;
    munit_assert_int(raft_fixture_event_server_index(event), ==, 1);
    munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_TICK);
    ASSERT_TIME(100);
    event = STEP;
    munit_assert_int(raft_fixture_event_server_index(event), ==, 2);
    munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_TICK);
    ASSERT_TIME(100);

    /* Once every server has ticked, time advances again. */
    event = STEP;
    munit_assert_int(raft_fixture_event_server_index(event), ==, 0);
    munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_TICK);
    ASSERT_TIME(200);

    return MUNIT_OK;
}
/* By default the election timeout of server 0 is the first to expire. */
TEST(raft_fixture_step, electionTimeout, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_fixture_event *event;
    (void)params;

    /* After 28 steps (1000 ms of fixture time) server 0's election timeout
     * expires: it converts to candidate while the others stay followers. */
    event = STEP_N(28);
    munit_assert_int(raft_fixture_event_server_index(event), ==, 0);
    munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_TICK);
    ASSERT_TIME(1000);
    ASSERT_STATE(0, RAFT_CANDIDATE);
    ASSERT_STATE(1, RAFT_FOLLOWER);
    ASSERT_STATE(2, RAFT_FOLLOWER);
    munit_log(MUNIT_LOG_INFO, "done");
    return MUNIT_OK;
}
/* Send requests are flushed immediately. */
TEST(raft_fixture_step, flushSend, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_fixture_event *event;
    (void)params;

    STEP_UNTIL_STATE_IS(0, RAFT_CANDIDATE);

    /* The two RequestVote messages that server 0 sends upon becoming
     * candidate produce network events at the very same fixture time. */
    event = STEP;
    munit_assert_int(raft_fixture_event_server_index(event), ==, 0);
    munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_NETWORK);
    ASSERT_TIME(1000);
    event = STEP;
    munit_assert_int(raft_fixture_event_server_index(event), ==, 0);
    munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_NETWORK);
    ASSERT_TIME(1000);
    return MUNIT_OK;
}
/* Messages are delivered according to the current network latency. */
TEST(raft_fixture_step, deliver, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_fixture_event *event;
    (void)params;

    STEP_UNTIL_STATE_IS(0, RAFT_CANDIDATE); /* Server 0 starts election */
    STEP_N(2);                              /* Server 0 sends 2 RequestVote */
    STEP_N(2);                              /* Ticks for server 1 and 2 */
    ASSERT_TIME(1000);

    /* The next network event is the delivery of a RequestVote message,
     * 15 ms (the default latency, judging by the timestamp) after send. */
    event = STEP;
    munit_assert_int(raft_fixture_event_server_index(event), ==, 0);
    munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_NETWORK);
    ASSERT_TIME(1015);
    return MUNIT_OK;
}
/******************************************************************************
*
* raft_fixture_elect
*
*****************************************************************************/
SUITE(raft_fixture_elect)
/* Trigger the election of the first server. */
TEST(raft_fixture_elect, first, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* ELECT drives the cluster until the given server is leader. */
    ELECT(0);
    ASSERT_STATE(0, RAFT_LEADER);
    ASSERT_STATE(1, RAFT_FOLLOWER);
    ASSERT_STATE(2, RAFT_FOLLOWER);
    return MUNIT_OK;
}
/* Trigger the election of the second server. */
TEST(raft_fixture_elect, second, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Any server, not just the first, can be forced to win. */
    ELECT(1);
    ASSERT_STATE(0, RAFT_FOLLOWER);
    ASSERT_STATE(1, RAFT_LEADER);
    ASSERT_STATE(2, RAFT_FOLLOWER);
    return MUNIT_OK;
}
/* Trigger an election change. */
TEST(raft_fixture_elect, change, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Elect server 0, then depose it: everybody is follower again. */
    ELECT(0);
    DEPOSE;
    ASSERT_STATE(0, RAFT_FOLLOWER);
    ASSERT_STATE(1, RAFT_FOLLOWER);
    ASSERT_STATE(2, RAFT_FOLLOWER);
    /* A different server can now be elected. */
    ELECT(1);
    ASSERT_STATE(0, RAFT_FOLLOWER);
    ASSERT_STATE(1, RAFT_LEADER);
    ASSERT_STATE(2, RAFT_FOLLOWER);
    return MUNIT_OK;
}
/* Trigger an election that re-elects the same node. */
TEST(raft_fixture_elect, again, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Elect server 0, depose it, then elect it again. */
    ELECT(0);
    DEPOSE;
    ASSERT_STATE(0, RAFT_FOLLOWER);
    ASSERT_STATE(1, RAFT_FOLLOWER);
    ASSERT_STATE(2, RAFT_FOLLOWER);
    ELECT(0);
    ASSERT_STATE(0, RAFT_LEADER);
    ASSERT_STATE(1, RAFT_FOLLOWER);
    ASSERT_STATE(2, RAFT_FOLLOWER);
    return MUNIT_OK;
}
/******************************************************************************
*
* raft_fixture_step_until_applied
*
*****************************************************************************/
SUITE(raft_fixture_step_until_applied)
/* Wait for one entry to be applied. */
TEST(raft_fixture_step_until_applied, one, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_apply *req = munit_malloc(sizeof *req);
    ELECT(0);
    APPLY(0, req);
    /* Step until every server has applied up to index 3 (presumably the
     * command lands after the bootstrap configuration and the leader's
     * initial entry — confirm against the fixture internals). */
    STEP_UNTIL_APPLIED(3);
    /* The "add 1" command reached every FSM. */
    ASSERT_FSM_X(0, 1);
    ASSERT_FSM_X(1, 1);
    ASSERT_FSM_X(2, 1);
    free(req);
    return MUNIT_OK;
}
/* Wait for two entries to be applied. */
TEST(raft_fixture_step_until_applied, two, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_apply *req1 = munit_malloc(sizeof *req1);
    struct raft_apply *req2 = munit_malloc(sizeof *req2);
    ELECT(0);
    /* Submit two "add 1" commands and wait until both are applied
     * everywhere. */
    APPLY(0, req1);
    APPLY(0, req2);
    STEP_UNTIL_APPLIED(4);
    /* Both commands reached every FSM, so x is 2 on all servers. */
    ASSERT_FSM_X(0, 2);
    ASSERT_FSM_X(1, 2);
    ASSERT_FSM_X(2, 2);
    free(req1);
    free(req2);
    return MUNIT_OK;
}
raft-0.22.1/test/integration/test_heap.c 0000664 0000000 0000000 00000002224 14601504142 0020147 0 ustar 00root root 0000000 0000000 #include "../../include/raft.h"
#include "../lib/runner.h"
/******************************************************************************
*
* Default heap functions
*
*****************************************************************************/
SUITE(raft_heap)
TEST(raft_heap, malloc, NULL, NULL, 0, NULL)
{
    /* A plain allocation with the default heap must succeed. */
    void *buf = raft_malloc(8);
    munit_assert_ptr_not_null(buf);
    raft_free(buf);
    return MUNIT_OK;
}
TEST(raft_heap, calloc, NULL, NULL, 0, NULL)
{
    /* raft_calloc must return zero-initialized memory. */
    void *buf = raft_calloc(1, 8);
    munit_assert_ptr_not_null(buf);
    munit_assert_int(*(uint64_t *)buf, ==, 0);
    raft_free(buf);
    return MUNIT_OK;
}
TEST(raft_heap, realloc, NULL, NULL, 0, NULL)
{
    /* Growing a buffer with raft_realloc must keep its contents. */
    uint64_t *buf = raft_realloc(NULL, 8);
    munit_assert_ptr_not_null(buf);
    *buf = 1;
    buf = raft_realloc(buf, 16);
    munit_assert_ptr_not_null(buf);
    munit_assert_int(*buf, ==, 1);
    raft_free(buf);
    return MUNIT_OK;
}
TEST(raft_heap, aligned_alloc, NULL, NULL, 0, NULL)
{
    /* The returned pointer must honor the requested 1024-byte alignment. */
    void *buf = raft_aligned_alloc(1024, 2048);
    munit_assert_ptr_not_null(buf);
    munit_assert_int((uintptr_t)buf % 1024, ==, 0);
    raft_free(buf);
    return MUNIT_OK;
}
raft-0.22.1/test/integration/test_init.c 0000664 0000000 0000000 00000003270 14601504142 0020177 0 ustar 00root root 0000000 0000000 #include "../../include/raft.h"
#include "../lib/heap.h"
#include "../lib/runner.h"
/******************************************************************************
*
 * Fixture holding an uninitialized raft object.
*
*****************************************************************************/
/* Test fixture: a fault-injecting heap plus a raft object that each test
 * initializes (or deliberately fails to initialize) itself. */
struct fixture
{
    FIXTURE_HEAP;
    struct raft raft;
};
/* Allocate the fixture and install the instrumented heap. The raft object is
 * left uninitialized on purpose, since raft_init() itself is under test. */
static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SET_UP_HEAP;
    return f;
}
/* Release the instrumented heap, then the fixture itself. */
static void tearDown(void *data)
{
    struct fixture *f = data;
    TEAR_DOWN_HEAP;
    free(f);
}
/******************************************************************************
*
* raft_init
*
*****************************************************************************/
SUITE(raft_init)
/* The io and fsm objects can be set to NULL */
TEST(raft_init, nullFsmAndIo, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    int rv;
    /* NULL io and fsm pointers are accepted: raft_init() returns 0. */
    rv = raft_init(&f->raft, NULL, NULL, 1, "1");
    munit_assert_int(rv, ==, 0);
    raft_close(&f->raft, NULL);
    return MUNIT_OK;
}
/* Heap fault parameters: delay "0" and repeat "1" -- presumably make the
 * very first allocation fail and keep failing; TODO confirm against the
 * test heap helper in ../lib/heap.h. */
static char *oom_heap_fault_delay[] = {"0", NULL};
static char *oom_heap_fault_repeat[] = {"1", NULL};
static MunitParameterEnum oom_params[] = {
    {TEST_HEAP_FAULT_DELAY, oom_heap_fault_delay},
    {TEST_HEAP_FAULT_REPEAT, oom_heap_fault_repeat},
    {NULL, NULL},
};
/* Out of memory failures. */
TEST(raft_init, oom, setUp, tearDown, 0, oom_params)
{
    struct fixture *f = data;
    int rv;
    /* With heap faults enabled, raft_init() must fail cleanly with
     * RAFT_NOMEM and set a descriptive error message. */
    HEAP_FAULT_ENABLE;
    rv = raft_init(&f->raft, NULL, NULL, 1, "1");
    munit_assert_int(rv, ==, RAFT_NOMEM);
    munit_assert_string_equal(raft_errmsg(&f->raft), "out of memory");
    return MUNIT_OK;
}
raft-0.22.1/test/integration/test_legacy.c 0000664 0000000 0000000 00000035056 14601504142 0020507 0 ustar 00root root 0000000 0000000 #include "../lib/legacy.h"
#include "../lib/runner.h"
/* Cluster-based fixture shared by the tests in this file. */
struct fixture
{
    FIXTURE_CLUSTER;
};
SUITE(legacy)
/* Bootstrap and start a 3-server cluster, electing server 0 as leader. */
static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SETUP_CLUSTER(3);
    CLUSTER_BOOTSTRAP;
    CLUSTER_START();
    CLUSTER_ELECT(0);
    return f;
}
/* Tear down the cluster and release the fixture. */
static void tearDown(void *data)
{
    struct fixture *f = data;
    TEAR_DOWN_CLUSTER;
    free(f);
}
/* Set the snapshot threshold on all servers of the cluster */
#define SET_SNAPSHOT_THRESHOLD(VALUE) \
{ \
unsigned i; \
for (i = 0; i < CLUSTER_N; i++) { \
raft_set_snapshot_threshold(CLUSTER_RAFT(i), VALUE); \
} \
}
/* Set the snapshot trailing logs number on all servers of the cluster */
#define SET_SNAPSHOT_TRAILING(VALUE) \
{ \
unsigned i; \
for (i = 0; i < CLUSTER_N; i++) { \
raft_set_snapshot_trailing(CLUSTER_RAFT(i), VALUE); \
} \
}
/* Fault-injecting raft_io->snapshot_put stub: fails synchronously with -1
 * and never invokes the callback. */
static int ioMethodSnapshotPutFail(struct raft_io *io,
                                   unsigned n_trailing,
                                   struct raft_io_snapshot_put *put,
                                   const struct raft_snapshot *snap,
                                   raft_io_snapshot_put_cb callback)
{
    (void)io;
    (void)n_trailing;
    (void)put;
    (void)snap;
    (void)callback;
    return -1;
}
#define SET_FAULTY_SNAPSHOT_PUT() \
{ \
unsigned i; \
for (i = 0; i < CLUSTER_N; i++) { \
CLUSTER_RAFT(i)->io->snapshot_put = ioMethodSnapshotPutFail; \
} \
}
static char *fsm_version[] = {"1", "2", NULL};
static MunitParameterEnum fsm_snapshot_async_params[] = {
{CLUSTER_FSM_VERSION_PARAM, fsm_version},
{NULL, NULL},
};
TEST(legacy,
takeSnapshotSnapshotPutFail,
setUp,
tearDown,
0,
fsm_snapshot_async_params)
{
struct fixture *f = data;
(void)params;
SET_FAULTY_SNAPSHOT_PUT();
/* Set very low threshold and trailing entries number */
SET_SNAPSHOT_THRESHOLD(3);
SET_SNAPSHOT_TRAILING(1);
/* Apply a few of entries, to force a snapshot to be taken. */
CLUSTER_MAKE_PROGRESS;
CLUSTER_MAKE_PROGRESS;
CLUSTER_MAKE_PROGRESS;
/* No crash or leaks have occurred */
return MUNIT_OK;
}
/* A follower doesn't convert to candidate state while it's installing a
* snapshot. */
TEST(legacy, snapshotBlocksCandidate, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
(void)params;
/* Set very low threshold and trailing entries number */
SET_SNAPSHOT_THRESHOLD(3);
SET_SNAPSHOT_TRAILING(1);
/* Apply a few of entries, to force a snapshot to be taken. Drop all
* network traffic between servers 0 and 2 in order for AppendEntries
* RPCs to not be replicated */
CLUSTER_SATURATE_BOTHWAYS(0, 2);
CLUSTER_MAKE_PROGRESS;
CLUSTER_MAKE_PROGRESS;
CLUSTER_MAKE_PROGRESS;
/* Reconnect both servers and set a high disk latency on server 2 */
CLUSTER_SET_DISK_LATENCY(2, 5000);
CLUSTER_DESATURATE_BOTHWAYS(0, 2);
/* Wait a while and check that the leader has sent a snapshot */
CLUSTER_STEP_UNTIL_ELAPSED(500);
munit_assert_int(CLUSTER_N_SEND(0, RAFT_INSTALL_SNAPSHOT), ==, 1);
munit_assert_int(CLUSTER_N_RECV(2, RAFT_INSTALL_SNAPSHOT), ==, 1);
/* Disconnect the servers again so that heartbeats, etc. won't arrive */
CLUSTER_SATURATE_BOTHWAYS(0, 2);
munit_assert_int(CLUSTER_STATE(2), ==, RAFT_FOLLOWER);
munit_assert_true(CLUSTER_RAFT(2)->snapshot.installing);
CLUSTER_STEP_UNTIL_ELAPSED(4000);
munit_assert_int(CLUSTER_STATE(2), ==, RAFT_FOLLOWER);
return MUNIT_OK;
}
/* Set up a bare 2-server cluster; it is neither bootstrapped nor started
 * here, each replication test configures the servers it needs. */
static void *setUpReplication(const MunitParameter params[],
                              MUNIT_UNUSED void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SETUP_CLUSTER(2);
    return f;
}
/* Tear down the cluster created by setUpReplication(). */
static void tearDownReplication(void *data)
{
    struct fixture *f = data;
    TEAR_DOWN_CLUSTER;
    free(f);
}
SUITE(replication)
/* If any of the new entry has the same index of an existing entry in our log,
* but different term, and that entry index is already committed, we bail out
* with an error. */
TEST(replication,
recvPrevIndexConflict,
setUpReplication,
tearDownReplication,
0,
NULL)
{
struct fixture *f = data;
struct raft_entry entry1;
struct raft_entry entry2;
CLUSTER_BOOTSTRAP;
/* The servers have an entry with a conflicting term. */
entry1.type = RAFT_COMMAND;
entry1.term = 2;
FsmEncodeSetX(1, &entry1.buf);
CLUSTER_ADD_ENTRY(0, &entry1);
entry2.type = RAFT_COMMAND;
entry2.term = 1;
FsmEncodeSetX(2, &entry2.buf);
CLUSTER_ADD_ENTRY(1, &entry2);
CLUSTER_START();
CLUSTER_ELECT(0);
/* Artificially bump the commit index on the second server */
CLUSTER_RAFT(1)->commit_index = 2;
CLUSTER_STEP;
CLUSTER_STEP;
return MUNIT_OK;
}
/* Assert that the fixture time matches the given value */
#define ASSERT_TIME(TIME) munit_assert_int(CLUSTER_TIME, ==, TIME)
/* Standard startup sequence, bootstrapping the cluster and electing server 0 */
#define BOOTSTRAP_START_AND_ELECT \
CLUSTER_BOOTSTRAP; \
CLUSTER_START(); \
CLUSTER_ELECT(0); \
ASSERT_TIME(1045)
/* A leader with slow disk commits an entry that it hasn't persisted yet,
 * because enough followers to have a majority have acknowledged that they have
* appended the entry. The leader's last_stored field hence lags behind its
* commit_index. A new leader gets elected, with a higher commit index and sends
* first a new entry than a heartbeat to the old leader, that needs to update
* its commit_index taking into account its lagging last_stored.
*
* XXX: this test duplicates the one above, but it's kept because the change it
* is associated with was fixing an assertion in the legacy compat layer. */
TEST(replication,
lastStoredLaggingBehindCommitIndex,
setUpReplication,
tearDownReplication,
0,
NULL)
{
struct fixture *f = data;
CLUSTER_GROW;
/* Server 0 takes a long time to persist entry 2 (the barrier) */
CLUSTER_SET_DISK_LATENCY(0, 10000);
/* Server 0 gets elected. */
BOOTSTRAP_START_AND_ELECT;
/* Create an entry at index 2. Server 0 commits and applies it even if
* it not persist it yet. */
CLUSTER_MAKE_PROGRESS;
munit_assert_int(CLUSTER_RAFT(0)->last_stored, ==, 1);
munit_assert_int(CLUSTER_RAFT(0)->commit_index, ==, 2);
munit_assert_int(CLUSTER_RAFT(0)->last_applied, ==, 2);
/* Server 1 stored barrier entry 2, but did not yet receive a
* notification from server 0 about the new commit index. */
munit_assert_int(CLUSTER_RAFT(1)->last_stored, ==, 2);
munit_assert_int(CLUSTER_RAFT(1)->commit_index, ==, 1);
munit_assert_int(CLUSTER_RAFT(1)->last_applied, ==, 1);
/* Disconnect server 0 from server 1 and 2. */
CLUSTER_DISCONNECT(0, 1);
CLUSTER_DISCONNECT(0, 2);
/* Set a very high election timeout on server 0, so it won't step down
* for a while, even if disconnected. */
raft_fixture_set_randomized_election_timeout(&f->cluster, 0, 10000);
raft_set_election_timeout(CLUSTER_RAFT(0), 10000);
/* Server 1 and 2 eventually timeout and start an election, server 1
* wins. */
CLUSTER_STEP_UNTIL_HAS_NO_LEADER(4000);
CLUSTER_STEP_UNTIL_HAS_LEADER(2000);
munit_assert_int(CLUSTER_LEADER, ==, 1);
/* Server 1 commits the barrier entry at index 3 that it created at the
* start of its term. */
CLUSTER_STEP_UNTIL_APPLIED(1, 3, 2000);
/* Reconnect server 0 to server 1, which will start replicating entry 3
* to it. */
CLUSTER_RECONNECT(0, 1);
CLUSTER_STEP_UNTIL_APPLIED(0, 3, 20000);
return MUNIT_OK;
}
/* A leader with faulty disk fails to persist the barrier entry upon election.
*/
TEST(replication,
failPersistBarrier,
setUpReplication,
tearDownReplication,
0,
NULL)
{
struct fixture *f = data;
CLUSTER_GROW;
/* Server 0 will fail to persist entry 2, a barrier */
CLUSTER_IO_FAULT(0, 10, 1);
/* Server 0 gets elected and creates a barrier entry at index 2 */
CLUSTER_BOOTSTRAP;
CLUSTER_START();
CLUSTER_START_ELECT(0);
/* Cluster recovers. */
CLUSTER_STEP_UNTIL_HAS_LEADER(20000);
return MUNIT_OK;
}
/******************************************************************************
*
* raft_assign
*
*****************************************************************************/
/* Bootstrap and start a 2-server cluster, electing server 0 as leader. */
static void *setUpAssign(const MunitParameter params[],
                         MUNIT_UNUSED void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SETUP_CLUSTER(2);
    CLUSTER_BOOTSTRAP;
    CLUSTER_START();
    CLUSTER_ELECT(0);
    return f;
}
/* Tear down the cluster created by setUpAssign(). */
static void tearDownAssign(void *data)
{
    struct fixture *f = data;
    TEAR_DOWN_CLUSTER;
    free(f);
}
/* Outcome of an asynchronous configuration change request. */
struct result
{
    int status; /* Expected completion status. */
    bool done;  /* True once the completion callback has fired. */
};
/* raft_change completion callback: assert that the status matches the one
 * expected in req->data, then mark the request as done. */
static void changeCbAssertResult(struct raft_change *req, int status)
{
    struct result *result = req->data;
    munit_assert_int(status, ==, result->status);
    result->done = true;
}
static bool changeCbHasFired(struct raft_fixture *f, void *arg)
{
struct result *result = arg;
(void)f;
return result->done;
}
/* Invoke raft_assign() against the I'th server and assert that it fails with
 * the given error code. */
#define ASSIGN_ERROR(I, ID, ROLE, RV, ERRMSG) \
{ \
struct raft_change __req; \
int __rv; \
__rv = raft_assign(CLUSTER_RAFT(I), &__req, ID, ROLE, NULL); \
munit_assert_int(__rv, ==, RV); \
munit_assert_string_equal(ERRMSG, CLUSTER_ERRMSG(I)); \
}
/* Add an empty server to the cluster and start it. */
#define GROW \
{ \
int rv__; \
CLUSTER_GROW; \
rv__ = raft_start(CLUSTER_RAFT(2)); \
munit_assert_int(rv__, ==, 0); \
}
/* Submit an add request. */
#define ADD_SUBMIT(I, ID) \
struct raft_change _req; \
char _address[16]; \
struct result _result = {0, false}; \
int _rv; \
_req.data = &_result; \
sprintf(_address, "%d", ID); \
_rv = \
raft_add(CLUSTER_RAFT(I), &_req, ID, _address, changeCbAssertResult); \
munit_assert_int(_rv, ==, 0);
#define ADD(I, ID) \
do { \
ADD_SUBMIT(I, ID); \
CLUSTER_STEP_UNTIL(changeCbHasFired, &_result, 2000); \
} while (0)
/* Submit an assign role request. */
#define ASSIGN_SUBMIT(I, ID, ROLE) \
struct raft_change _req; \
struct result _result = {0, false}; \
int _rv; \
_req.data = &_result; \
_rv = raft_assign(CLUSTER_RAFT(I), &_req, ID, ROLE, changeCbAssertResult); \
munit_assert_int(_rv, ==, 0);
/* Expect the request callback to fire with the given status. */
#define ASSIGN_EXPECT(STATUS) _result.status = STATUS;
/* Wait until a promote request completes. */
#define ASSIGN_WAIT CLUSTER_STEP_UNTIL(changeCbHasFired, &_result, 10000)
SUITE(raft_assign)
/* Trying to change the role of a server whose ID is unknown results in an
* error. */
TEST(raft_assign, unknownId, setUpAssign, tearDownAssign, 0, NULL)
{
    struct fixture *f = data;
    /* ID 3 is not part of the 2-server configuration -> RAFT_NOTFOUND. */
    ASSIGN_ERROR(0, 3, RAFT_VOTER, RAFT_NOTFOUND, "no server has ID 3");
    return MUNIT_OK;
}
/* Trying to promote a server to an unknown role results in an error. */
TEST(raft_assign, badRole, setUpAssign, tearDownAssign, 0, NULL)
{
    struct fixture *f = data;
    /* 999 is not a valid raft role -> RAFT_BADROLE. */
    ASSIGN_ERROR(0, 3, 999, RAFT_BADROLE, "server role is not valid");
    return MUNIT_OK;
}
/* Trying to assign the voter role to a server which has already it results in
* an error. */
TEST(raft_assign, alreadyHasRole, setUpAssign, tearDownAssign, 0, NULL)
{
    struct fixture *f = data;
    /* Server 1 is already a voter -> RAFT_BADROLE. */
    ASSIGN_ERROR(0, 1, RAFT_VOTER, RAFT_BADROLE, "server is already voter");
    return MUNIT_OK;
}
/* Trying to assign a new role to a server while a configuration change is in
* progress results in an error. */
TEST(raft_assign, alreadyInProgressAssign, setUpAssign, tearDownAssign, 0, NULL)
{
    struct fixture *f = data;
    GROW;
    ADD(0, 3);
    /* The promotion of server 3 is still in flight, so a second change must
     * be refused with RAFT_CANTCHANGE. */
    ASSIGN_SUBMIT(0, 3, RAFT_VOTER);
    ASSIGN_ERROR(0, 3, RAFT_VOTER, RAFT_CANTCHANGE,
                 "a configuration change is already in progress");
    ASSIGN_WAIT;
    return MUNIT_OK;
}
SUITE(raft_init)
TEST(raft_init, ioVersionNotSet, NULL, NULL, 0, NULL)
{
    /* raft_init() must refuse an io implementation whose version is 0. */
    struct raft r = {0};
    struct raft_io io = {0};
    struct raft_fsm fsm = {0};
    io.version = 0;
    fsm.version = 3;
    munit_assert_int(raft_init(&r, &io, &fsm, 1, "1"), ==, -1);
    munit_assert_string_equal(r.errmsg, "io->version must be set");
    return MUNIT_OK;
}
TEST(raft_init, fsmVersionNotSet, NULL, NULL, 0, NULL)
{
    /* raft_init() must refuse an fsm implementation whose version is 0. */
    struct raft r = {0};
    struct raft_io io = {0};
    struct raft_fsm fsm = {0};
    io.version = 2;
    fsm.version = 0;
    munit_assert_int(raft_init(&r, &io, &fsm, 1, "1"), ==, -1);
    munit_assert_string_equal(r.errmsg, "fsm->version must be set");
    return MUNIT_OK;
}
raft-0.22.1/test/integration/test_membership.c 0000664 0000000 0000000 00000047105 14601504142 0021374 0 ustar 00root root 0000000 0000000 #include "../../src/configuration.h"
#include "../../src/progress.h"
#include "../lib/cluster.h"
#include "../lib/runner.h"
/******************************************************************************
*
* Fixture
*
*****************************************************************************/
struct fixture
{
FIXTURE_CLUSTER;
};
/* Set up a bare cluster fixture; each test bootstraps, starts and elects the
 * servers it needs. */
/* Allocate the fixture and create a bare cluster; servers are configured,
 * started and elected by each individual test. */
static void *setup(const MunitParameter params[], MUNIT_UNUSED void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SETUP_CLUSTER();
    return f;
}
/* Tear down the cluster and release the fixture. */
static void tear_down(void *data)
{
    struct fixture *f = data;
    TEAR_DOWN_CLUSTER();
    free(f);
}
/* Assert the values of the committed and uncommitted configuration indexes on
* the raft instance with the given index. */
#define ASSERT_CONFIGURATION_INDEXES(I, COMMITTED, UNCOMMITTED) \
{ \
struct raft *raft_ = CLUSTER_RAFT(I); \
munit_assert_ullong(raft_->configuration_committed_index, ==, \
COMMITTED); \
munit_assert_ullong(raft_->configuration_uncommitted_index, ==, \
UNCOMMITTED); \
}
/******************************************************************************
*
* raft_add
*
*****************************************************************************/
SUITE(raft_add)
/* After a request to add a new non-voting server is committed, the new
* configuration is not marked as uncommitted anymore */
TEST(raft_add, Committed, setup, tear_down, 0, NULL)
{
struct fixture *f = data;
struct raft *raft = CLUSTER_RAFT(1);
const struct raft_server *server;
unsigned id;
/* Start a cluster with 2 voters. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
CLUSTER_START(id);
}
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n");
CLUSTER_SUBMIT(1 /* ID */, CHANGE, 3 /* n */, 2 /* V */, 0 /* S */);
CLUSTER_TRACE(
"[ 120] 1 > submit 1 new client entry\n"
" replicate 1 new configuration entry (2^2)\n");
/* The new configuration is already effective. */
munit_assert_uint(raft->configuration.n, ==, 3);
server = &raft->configuration.servers[2];
munit_assert_ullong(server->id, ==, 3);
munit_assert_string_equal(server->address, "3");
munit_assert_int(server->role, ==, RAFT_SPARE);
/* The new configuration is marked as uncommitted. */
ASSERT_CONFIGURATION_INDEXES(1, 1 /* committed */, 2 /* uncomitted */);
/* The next/match indexes now include an entry for the new server. */
munit_assert_ullong(raft->leader_state.progress[2].next_index, ==, 3);
munit_assert_ullong(raft->leader_state.progress[2].match_index, ==, 0);
CLUSTER_TRACE(
"[ 130] 1 > persisted 1 entry (2^2)\n"
" next uncommitted entry (2^2) has 1 vote out of 2\n"
"[ 130] 2 > recv append entries from server 1\n"
" no new entries to persist\n"
"[ 140] 1 > recv append entries result from server 2\n"
" pipeline server 2 sending 1 entry (2^2)\n"
"[ 150] 2 > recv append entries from server 1\n"
" start persisting 1 new entry (2^2)\n"
"[ 160] 2 > persisted 1 entry (2^2)\n"
" send success result to 1\n"
"[ 170] 1 > recv append entries result from server 2\n"
" commit 1 new entry (2^2)\n");
test_cluster_step(&f->cluster_);
/* The new configuration is marked as committed. */
ASSERT_CONFIGURATION_INDEXES(1, 2 /* committed */, 0 /* uncommitted */);
return MUNIT_OK;
}
/* Trying to add a server on a node which is not the leader results in an
* error. */
TEST(raft_add, NotLeader, setup, tear_down, 0, NULL)
{
struct fixture *f = data;
struct raft_configuration configuration;
struct raft_entry entry;
struct raft_event event;
struct raft_update update;
unsigned id;
int rv;
/* Start a cluster with 2 voters. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
CLUSTER_START(id);
}
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n");
CLUSTER_FILL_CONFIGURATION(&configuration, 3, 2 /* V */, 0 /* S */);
entry.type = RAFT_CHANGE;
entry.term = 2;
rv = raft_configuration_encode(&configuration, &entry.buf);
munit_assert_int(rv, ==, 0);
raft_configuration_close(&configuration);
entry.batch = entry.buf.base;
event.time = f->cluster_.time;
event.type = RAFT_SUBMIT;
event.submit.n = 1;
event.submit.entries = &entry;
rv = raft_step(CLUSTER_RAFT(2), &event, &update);
munit_assert_int(rv, ==, RAFT_NOTLEADER);
raft_free(entry.buf.base);
return MUNIT_OK;
}
/* Trying to add a server while a configuration change is already in progress
* results in an error. */
TEST(raft_add, Busy, setup, tear_down, 0, NULL)
{
struct fixture *f = data;
struct raft_configuration configuration;
struct raft_entry entry;
unsigned id;
int rv;
/* Start a cluster with 2 voters. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
CLUSTER_START(id);
}
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n");
CLUSTER_SUBMIT(1 /* ID */, CHANGE, 3 /* n */, 2 /* V */, 0 /* S */);
CLUSTER_FILL_CONFIGURATION(&configuration, 3, 2 /* V */, 1 /* S */);
entry.type = RAFT_CHANGE;
entry.term = 2;
rv = raft_configuration_encode(&configuration, &entry.buf);
munit_assert_int(rv, ==, 0);
raft_configuration_close(&configuration);
entry.batch = entry.buf.base;
rv = test_cluster_submit(&f->cluster_, 1 /* ID */, &entry);
munit_assert_int(rv, ==, RAFT_CANTCHANGE);
raft_free(entry.buf.base);
return MUNIT_OK;
}
/* Trying to add a server with an ID which is already in use results in an
* error. */
TEST(raft_add, DuplicateId, setup, tear_down, 0, NULL)
{
    struct raft_configuration conf;
    int rv;
    /* Build a 2-voter configuration, then try to add a spare server whose
     * ID (2) is already taken. */
    CLUSTER_FILL_CONFIGURATION(&conf, 2, 2 /* V */, 0 /* S */);
    rv = raft_configuration_add(&conf, 2, "3", RAFT_SPARE);
    munit_assert_int(rv, ==, RAFT_DUPLICATEID);
    raft_configuration_close(&conf);
    return MUNIT_OK;
}
/******************************************************************************
*
* raft_remove
*
*****************************************************************************/
SUITE(raft_remove)
/* After a request to remove server is committed, the new configuration is not
* marked as uncommitted anymore */
TEST(raft_remove, Committed, setup, tear_down, 0, NULL)
{
struct fixture *f = data;
struct raft *raft = CLUSTER_RAFT(1);
const struct raft_server *server;
unsigned id;
/* Start a cluster with 3 voters. */
for (id = 1; id <= 3; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 3 /* servers */, 3 /* voters */);
CLUSTER_START(id);
}
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 0] 3 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 110] 3 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 3 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n"
" probe server 3 sending a heartbeat (no entries)\n"
"[ 120] 1 > recv request vote result from server 3\n"
" local server is leader -> ignore\n");
CLUSTER_SUBMIT(1 /* ID */, CHANGE, 2 /* n */, 2 /* V */, 0 /* S */);
CLUSTER_TRACE(
"[ 120] 1 > submit 1 new client entry\n"
" replicate 1 new configuration entry (2^2)\n");
/* The new configuration is already effective. */
munit_assert_uint(raft->configuration.n, ==, 2);
server = &raft->configuration.servers[1];
munit_assert_ullong(server->id, ==, 2);
/* The new configuration is marked as uncommitted. */
ASSERT_CONFIGURATION_INDEXES(1, 1 /* committed */, 2 /* uncomitted */);
CLUSTER_TRACE(
"[ 130] 2 > recv append entries from server 1\n"
" no new entries to persist\n"
"[ 130] 3 > recv append entries from server 1\n"
" no new entries to persist\n"
"[ 130] 1 > persisted 1 entry (2^2)\n"
" next uncommitted entry (2^2) has 1 vote out of 2\n"
"[ 140] 1 > recv append entries result from server 2\n"
" pipeline server 2 sending 1 entry (2^2)\n"
"[ 140] 1 > recv append entries result from server 3\n"
" unknown server -> ignore\n"
"[ 150] 2 > recv append entries from server 1\n"
" start persisting 1 new entry (2^2)\n"
"[ 160] 2 > persisted 1 entry (2^2)\n"
" send success result to 1\n"
"[ 170] 1 > recv append entries result from server 2\n"
" commit 1 new entry (2^2)\n");
test_cluster_step(&f->cluster_);
/* The new configuration is marked as committed. */
ASSERT_CONFIGURATION_INDEXES(1, 2 /* committed */, 0 /* uncommitted */);
return MUNIT_OK;
}
/* A leader gets a request to remove itself. */
TEST(raft_remove, Self, setup, tear_down, 0, NULL)
{
struct fixture *f = data;
struct raft_configuration configuration;
struct raft_entry entry;
unsigned id;
int rv;
/* Start a cluster with 2 voters. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
CLUSTER_START(id);
}
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n");
raft_configuration_init(&configuration);
rv = raft_configuration_add(&configuration, 2, "2", RAFT_VOTER);
munit_assert_int(rv, ==, 0);
entry.type = RAFT_CHANGE;
entry.term = 2;
rv = raft_configuration_encode(&configuration, &entry.buf);
munit_assert_int(rv, ==, 0);
raft_configuration_close(&configuration);
entry.batch = entry.buf.base;
CLUSTER_SUBMIT(1 /* ID */, &entry);
CLUSTER_TRACE(
"[ 120] 1 > submit 1 new client entry\n"
" replicate 1 new configuration entry (2^2)\n"
"[ 130] 1 > persisted 1 entry (2^2)\n"
" next uncommitted entry (2^2) has 0 votes out of 1\n"
"[ 130] 2 > recv append entries from server 1\n"
" no new entries to persist\n"
"[ 140] 1 > recv append entries result from server 2\n"
" pipeline server 2 sending 1 entry (2^2)\n"
"[ 150] 2 > recv append entries from server 1\n"
" start persisting 1 new entry (2^2)\n"
"[ 160] 2 > persisted 1 entry (2^2)\n"
" send success result to 1\n"
"[ 170] 1 > recv append entries result from server 2\n"
" commit 1 new entry (2^2)\n"
" leader removed from config -> step down\n"
"[ 270] 1 > timeout as follower\n"
" server not in current configuration -> stay follower\n"
"[ 280] 2 > timeout as follower\n"
" convert to candidate, start election for term 3\n"
" self elect and convert to leader\n");
return MUNIT_OK;
}
/* A leader gets a request to remove itself from a 3-node cluster */
TEST(raft_remove, SelfThreeNodeCluster, setup, tear_down, 0, NULL)
{
struct fixture *f = data;
struct raft_configuration configuration;
struct raft_entry entry;
unsigned id;
raft_id leader_id;
const char *leader_address;
int rv;
/* Start a cluster with 3 voters. */
for (id = 1; id <= 3; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 3 /* servers */, 3 /* voters */);
CLUSTER_START(id);
}
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 0] 3 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 110] 3 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 3 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n"
" probe server 3 sending a heartbeat (no entries)\n"
"[ 120] 1 > recv request vote result from server 3\n"
" local server is leader -> ignore\n");
raft_configuration_init(&configuration);
rv = raft_configuration_add(&configuration, 2, "2", RAFT_VOTER);
munit_assert_int(rv, ==, 0);
rv = raft_configuration_add(&configuration, 3, "3", RAFT_VOTER);
munit_assert_int(rv, ==, 0);
entry.type = RAFT_CHANGE;
entry.term = 2;
rv = raft_configuration_encode(&configuration, &entry.buf);
munit_assert_int(rv, ==, 0);
raft_configuration_close(&configuration);
entry.batch = entry.buf.base;
CLUSTER_SUBMIT(1 /* ID */, &entry);
/* The removed- leader should still replicate entries.
*
* Raft dissertation 4.2.2:
*
* `First, there will be a period of time (while it is committing Cnew) when
* a leader can manage a cluster that does not include itself; it replicates
* log entries but does not count itself in majorities.`
*/
CLUSTER_TRACE(
"[ 120] 1 > submit 1 new client entry\n"
" replicate 1 new configuration entry (2^2)\n");
/* Verify node with id 1 is the leader */
raft_leader(CLUSTER_RAFT(1), &leader_id, &leader_address);
munit_assert_ulong(leader_id, ==, 1);
munit_assert_string_equal(leader_address, "1");
/* The removed leader eventually steps down */
CLUSTER_TRACE(
"[ 130] 2 > recv append entries from server 1\n"
" no new entries to persist\n"
"[ 130] 3 > recv append entries from server 1\n"
" no new entries to persist\n"
"[ 130] 1 > persisted 1 entry (2^2)\n"
" next uncommitted entry (2^2) has 0 votes out of 2\n"
"[ 140] 1 > recv append entries result from server 2\n"
" pipeline server 2 sending 1 entry (2^2)\n"
"[ 140] 1 > recv append entries result from server 3\n"
" pipeline server 3 sending 1 entry (2^2)\n"
"[ 150] 2 > recv append entries from server 1\n"
" start persisting 1 new entry (2^2)\n"
"[ 150] 3 > recv append entries from server 1\n"
" start persisting 1 new entry (2^2)\n"
"[ 160] 2 > persisted 1 entry (2^2)\n"
" send success result to 1\n"
"[ 160] 3 > persisted 1 entry (2^2)\n"
" send success result to 1\n"
"[ 170] 1 > recv append entries result from server 2\n"
" next uncommitted entry (2^2) has 1 vote out of 2\n"
"[ 170] 1 > recv append entries result from server 3\n"
" commit 1 new entry (2^2)\n"
" leader removed from config -> step down\n");
raft_leader(CLUSTER_RAFT(1), &leader_id, &leader_address);
munit_assert_ulong(leader_id, ==, 0);
munit_assert_ptr_null(leader_address);
return MUNIT_OK;
}
SUITE(raft_assign)
/* Trying to promote a server on a raft instance which is not the leader results
* in an error. */
TEST(raft_assign, NotLeader, setup, tear_down, 0, NULL)
{
    struct fixture *f = data;
    struct raft_configuration configuration;
    struct raft_entry entry;
    struct raft_event event;
    struct raft_update update;
    int rv;
    /* Start the non-voting server of a 2-server cluster with a single
     * voter. */
    CLUSTER_SET_TERM(2 /* ID */, 1 /* term */);
    CLUSTER_ADD_ENTRY(2 /* ID */, RAFT_CHANGE, 2 /* servers */, 1 /* voters */);
    CLUSTER_START(2 /* ID */);
    /* Encode a new 2-voter configuration as a RAFT_CHANGE entry. */
    CLUSTER_FILL_CONFIGURATION(&configuration, 2, 2 /* V */, 0 /* S */);
    entry.type = RAFT_CHANGE;
    entry.term = 2;
    rv = raft_configuration_encode(&configuration, &entry.buf);
    munit_assert_int(rv, ==, 0);
    raft_configuration_close(&configuration);
    entry.batch = entry.buf.base;
    /* Submit the entry directly to the non-leader via raft_step(): it must
     * be refused with RAFT_NOTLEADER. */
    event.time = f->cluster_.time;
    event.type = RAFT_SUBMIT;
    event.submit.n = 1;
    event.submit.entries = &entry;
    rv = raft_step(CLUSTER_RAFT(2), &event, &update);
    munit_assert_int(rv, ==, RAFT_NOTLEADER);
    raft_free(entry.buf.base);
    return MUNIT_OK;
}
raft-0.22.1/test/integration/test_replication.c 0000664 0000000 0000000 00000277475 14601504142 0021571 0 ustar 00root root 0000000 0000000 #include "../../src/progress.h"
#include "../lib/cluster.h"
#include "../lib/runner.h"
/* Cluster-based fixture shared by the replication tests. */
struct fixture
{
    FIXTURE_CLUSTER;
};
/* Allocate the fixture and create a bare cluster; servers are configured and
 * started by each individual test. */
static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SETUP_CLUSTER();
    return f;
}
/* Tear down the cluster and release the fixture. */
static void tearDown(void *data)
{
    struct fixture *f = data;
    TEAR_DOWN_CLUSTER();
    free(f);
}
SUITE(replication)
/* A leader doesn't send an initial no-op barrier entry if its committed index
* is as big as its last log index. */
TEST(replication, NoInitialBarrier, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
unsigned id;
/* Bootstrap and start a cluster with 2 voters. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
CLUSTER_START(id);
}
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n");
/* Server 1 becomes candidate and sends a vote request to server 2. */
CLUSTER_TRACE(
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n");
/* Server 1 receives the vote result and becomes leader. It does not append
* any barrier entry. */
CLUSTER_TRACE(
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n");
return MUNIT_OK;
}
/* A leader sends an initial no-op barrier entry if its committed index
* is behind its last log index. */
TEST(replication, InitialBarrier, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
unsigned id;
/* Bootstrap and start a cluster with 2 voters. Server 1 has an additioanl
* entry. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
if (id == 1) {
CLUSTER_ADD_ENTRY(id, RAFT_COMMAND, 1 /* term */, 0 /* payload */);
}
CLUSTER_START(id);
}
CLUSTER_TRACE(
"[ 0] 1 > term 1, 2 entries (1^1..2^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n");
/* Server 1 becomes candidate and sends a vote request to server 2. */
CLUSTER_TRACE(
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is longer (2^1 vs 1^1) -> grant vote\n");
/* Server 1 receives the vote result and becomes leader. It appends
* a barrier in order to commit all entries from previous terms. */
CLUSTER_TRACE(
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" replicate 1 new barrier entry (3^2)\n"
" probe server 2 sending 1 entry (3^2)\n");
return MUNIT_OK;
}
/* After receiving an AppendEntriesResult, a leader has set the feature flags of
* a node. */
TEST(replication, FeatureFlags, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
struct raft *raft;
unsigned id;
/* Bootstrap and start a cluster with 2 voters. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
CLUSTER_START(id);
}
/* Server 1 becomes leader and sends the initial heartbeat. */
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n");
/* Features were already populated via RequestVote result. */
raft = CLUSTER_RAFT(1);
munit_assert_uint(raft->leader_state.progress[1].features, ==, 1);
/* Server 2 receives the heartbeat and replies. When server 1 receives the
* response, the feature flags are set. */
CLUSTER_TRACE(
"[ 130] 2 > recv append entries from server 1\n"
" no new entries to persist\n"
"[ 140] 1 > recv append entries result from server 2\n");
munit_assert_uint(raft->leader_state.progress[1].features, ==, 1);
return MUNIT_OK;
}
/* A leader keeps sending heartbeat messages at regular intervals to
* maintain leadership. */
TEST(replication, Heartbeat, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
unsigned id;
/* Bootstrap and start a cluster with 2 voters. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
CLUSTER_START(id);
}
/* Server 1 becomes leader. */
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n");
/* Server 2 receives the first the heartbeat. */
CLUSTER_TRACE(
"[ 130] 2 > recv append entries from server 1\n"
" no new entries to persist\n");
/* Server 2 receives a second heartbeat. */
CLUSTER_TRACE(
"[ 140] 1 > recv append entries result from server 2\n"
"[ 170] 1 > timeout as leader\n"
" pipeline server 2 sending a heartbeat (no entries)\n"
"[ 180] 2 > recv append entries from server 1\n"
" no new entries to persist\n");
return MUNIT_OK;
}
/* If a leader replicates some entries during a given heartbeat interval, it
* skips sending the heartbeat for that interval. */
TEST(replication, SkipHeartbeatIfEntriesHaveSent, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
unsigned id;
struct raft *raft;
struct raft_entry entry;
/* Bootstrap and start a cluster with 2 voters. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
CLUSTER_START(id);
}
/* Server 1 becomes leader. */
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n"
"[ 130] 2 > recv append entries from server 1\n"
" no new entries to persist\n"
"[ 140] 1 > recv append entries result from server 2\n");
raft = CLUSTER_RAFT(1);
munit_assert_ullong(raft->leader_state.progress[1].last_send, ==, 120);
/* Server 1 starts replicating a new entry after 5 milliseconds. The
* heartbeat timeout gets postponed. */
CLUSTER_ELAPSE(5);
munit_assert_ullong(raft_timeout(CLUSTER_RAFT(1)), ==, 170);
entry.term = 2;
entry.type = RAFT_COMMAND;
entry.buf.len = 8;
entry.buf.base = raft_malloc(entry.buf.len);
munit_assert_not_null(entry.buf.base);
entry.batch = entry.buf.base;
CLUSTER_SUBMIT(1 /* ID */, &entry);
CLUSTER_TRACE(
"[ 145] 1 > submit 1 new client entry\n"
" replicate 1 new command entry (2^2)\n"
" pipeline server 2 sending 1 entry (2^2)\n");
munit_assert_ullong(raft->leader_state.progress[1].last_send, ==, 145);
munit_assert_ullong(raft_timeout(CLUSTER_RAFT(1)), ==, 195);
CLUSTER_TRACE(
"[ 155] 1 > persisted 1 entry (2^2)\n"
" next uncommitted entry (2^2) has 1 vote out of 2\n"
"[ 155] 2 > recv append entries from server 1\n"
" start persisting 1 new entry (2^2)\n"
"[ 165] 2 > persisted 1 entry (2^2)\n"
" send success result to 1\n"
"[ 175] 1 > recv append entries result from server 2\n"
" commit 1 new entry (2^2)\n");
/* When the heartbeat timeout expires again, server 1 sends a fresh
* heartbeat round.
*
* XXX: should we immediately send a heartbeat after the commit index
* changes? In order to notify followers. */
CLUSTER_TRACE(
"[ 195] 1 > timeout as leader\n"
" pipeline server 2 sending a heartbeat (no entries)\n");
munit_assert_ullong(raft->leader_state.progress[1].last_send, ==, 195);
munit_assert_ullong(raft_timeout(CLUSTER_RAFT(1)), ==, 245);
return MUNIT_OK;
}
/* The leader doesn't send replication messages to idle servers. */
TEST(replication, SkipSpare, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
unsigned id;
/* Bootstrap and start a cluster with one voter and one spare. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 1 /* voters */);
CLUSTER_START(id);
}
/* Server 1 self-elects, but it does not replicate any entry or send any
* heartbeat to server 2. */
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
" self elect and convert to leader\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as leader\n");
CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
CLUSTER_TRACE(
"[ 100] 1 > submit 1 new client entry\n"
" replicate 1 new command entry (2^1)\n"
"[ 110] 1 > persisted 1 entry (2^1)\n"
" commit 1 new entry (2^1)\n");
munit_assert_ullong(raft_commit_index(CLUSTER_RAFT(1)), ==, 2);
munit_assert_ullong(raft_commit_index(CLUSTER_RAFT(2)), ==, 1);
return MUNIT_OK;
}
/* A follower remains in probe mode until the leader receives a successful
* AppendEntries response. */
TEST(replication, Probe, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
unsigned id;
/* Bootstrap and start a cluster with 2 voters. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
CLUSTER_START(id);
}
/* Server 1 becomes leader. */
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n");
/* Set a network latency higher than the heartbeat timeout for server 2, so
* server 1 will send a second probe AppendEntries without transitioning to
* pipeline mode. */
CLUSTER_SET_NETWORK_LATENCY(2 /* ID */, 100 /* msecs */);
CLUSTER_TRACE(
"[ 130] 2 > recv append entries from server 1\n"
" no new entries to persist\n");
/* Server 1 receives a new entry after a few milliseconds. Since the
* follower is still in probe mode and since an AppendEntries message was
* already sent recently, it does not send the new entry immediately. */
CLUSTER_ELAPSE(5);
CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
CLUSTER_TRACE(
"[ 135] 1 > submit 1 new client entry\n"
" replicate 1 new command entry (2^2)\n");
/* A heartbeat timeout elapses without receiving a response, so server 1
* sends an new AppendEntries to server 2. This time it includes also the
* new entry that was accepted in the meantime. */
CLUSTER_TRACE(
"[ 145] 1 > persisted 1 entry (2^2)\n"
" next uncommitted entry (2^2) has 1 vote out of 2\n"
"[ 170] 1 > timeout as leader\n"
" probe server 2 sending 1 entry (2^2)\n");
/* Now lower the network latency of server 2, so the AppendEntries result
* for this last AppendEntries request will get delivered before the
* response for the original heartbeat AppendEntries request . */
CLUSTER_SET_NETWORK_LATENCY(2 /* ID */, 10 /* msecs */);
CLUSTER_TRACE(
"[ 180] 2 > recv append entries from server 1\n"
" start persisting 1 new entry (2^2)\n"
"[ 190] 2 > persisted 1 entry (2^2)\n"
" send success result to 1\n");
/* Server 1 receives a second entry. Since the follower is still in probe
* mode and since an AppendEntries message was already sent recently, it
* does not send the new entry immediately. */
CLUSTER_ELAPSE(5);
CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
CLUSTER_TRACE(
"[ 195] 1 > submit 1 new client entry\n"
" replicate 1 new command entry (3^2)\n");
/* Eventually server 1 receives the AppendEntries result for the second
* request, at that point it transitions to pipeline mode and sends
* the second entry immediately. */
CLUSTER_TRACE(
"[ 200] 1 > recv append entries result from server 2\n"
" pipeline server 2 sending 1 entry (3^2)\n"
" commit 1 new entry (2^2)\n");
return MUNIT_OK;
}
/* A follower transitions to pipeline mode after the leader receives a
* successful AppendEntries response from it. */
TEST(replication, Pipeline, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
struct raft *raft;
unsigned id;
/* Bootstrap and start a cluster with 2 voters. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
CLUSTER_START(id);
}
/* Server 1 becomes leader. */
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n"
"[ 130] 2 > recv append entries from server 1\n"
" no new entries to persist\n"
"[ 140] 1 > recv append entries result from server 2\n");
/* Server 1 receives a new entry after 5 milliseconds, just before the
* heartbeat timeout expires. Since the follower has transitioned to
* pipeline mode the new entry is sent immediately and the next index is
* optimistically increased. */
CLUSTER_ELAPSE(5);
CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
CLUSTER_TRACE(
"[ 145] 1 > submit 1 new client entry\n"
" replicate 1 new command entry (2^2)\n"
" pipeline server 2 sending 1 entry (2^2)\n");
raft = CLUSTER_RAFT(1);
munit_assert_ullong(raft->leader_state.progress[1].next_index, ==, 3);
/* After another 15 milliseconds, before receiving the response for this
* last AppendEntries RPC and before the heartbeat timeout expires, server 1
* accepts a second entry, which is also replicated immediately. */
CLUSTER_TRACE(
"[ 155] 1 > persisted 1 entry (2^2)\n"
" next uncommitted entry (2^2) has 1 vote out of 2\n"
"[ 155] 2 > recv append entries from server 1\n"
" start persisting 1 new entry (2^2)\n"
"[ 165] 2 > persisted 1 entry (2^2)\n"
" send success result to 1\n");
CLUSTER_ELAPSE(5);
CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
CLUSTER_TRACE(
"[ 170] 1 > submit 1 new client entry\n"
" replicate 1 new command entry (3^2)\n"
" pipeline server 2 sending 1 entry (3^2)\n");
munit_assert_ullong(raft->leader_state.progress[1].next_index, ==, 4);
/* Eventually server 1 receives AppendEntries results for both entries. */
CLUSTER_TRACE(
"[ 175] 1 > recv append entries result from server 2\n"
" commit 1 new entry (2^2)\n"
"[ 180] 1 > persisted 1 entry (3^2)\n"
" next uncommitted entry (3^2) has 1 vote out of 2\n"
"[ 180] 2 > recv append entries from server 1\n"
" start persisting 1 new entry (3^2)\n"
"[ 190] 2 > persisted 1 entry (3^2)\n"
" send success result to 1\n"
"[ 200] 1 > recv append entries result from server 2\n"
" commit 1 new entry (3^2)\n");
return MUNIT_OK;
}
/* A follower disconnects while in probe mode. */
TEST(replication, Disconnect, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
unsigned id;
/* Bootstrap and start a cluster with 2 voters. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
CLUSTER_START(id);
}
/* Server 1 becomes leader. */
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n");
CLUSTER_DISCONNECT(1, 2);
/* After the heartbeat timeout server 1 retries, this time it succeeds. */
CLUSTER_TRACE(
"[ 170] 1 > timeout as leader\n"
" probe server 2 sending a heartbeat (no entries)\n");
CLUSTER_RECONNECT(1, 2);
CLUSTER_TRACE(
"[ 180] 2 > recv append entries from server 1\n"
" no new entries to persist\n");
return MUNIT_OK;
}
/* A follower disconnects while in pipeline mode. */
TEST(replication, PipelineDisconnect, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
struct raft *raft;
unsigned id;
/* Bootstrap and start a cluster with 2 voters. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
CLUSTER_START(id);
}
/* Server 1 becomes leader and then sends a first round of heartbeats,
* transitioning server 2 into pipeline mode. */
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n"
"[ 130] 2 > recv append entries from server 1\n"
" no new entries to persist\n"
"[ 140] 1 > recv append entries result from server 2\n");
/* Server 1 starts to replicate a few entries, however server 2 disconnects
* before it can receive them. */
CLUSTER_ELAPSE(10);
CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
CLUSTER_TRACE(
"[ 150] 1 > submit 1 new client entry\n"
" replicate 1 new command entry (2^2)\n"
" pipeline server 2 sending 1 entry (2^2)\n"
"[ 150] 1 > submit 1 new client entry\n"
" replicate 1 new command entry (3^2)\n"
" pipeline server 2 sending 1 entry (3^2)\n");
raft = CLUSTER_RAFT(1);
munit_assert_ullong(raft->leader_state.progress[1].next_index, ==, 4);
CLUSTER_DISCONNECT(1, 2);
/* A full election timeout eventually elapses, and since server 1 did not
* receive any message from server 2, it transitions server 2 back to probe
* mode. */
CLUSTER_TRACE(
"[ 160] 1 > persisted 1 entry (2^2)\n"
" next uncommitted entry (2^2) has 1 vote out of 2\n"
"[ 160] 1 > persisted 1 entry (3^2)\n"
" next uncommitted entry (2^2) has 1 vote out of 2\n"
"[ 200] 1 > timeout as leader\n"
" pipeline server 2 sending a heartbeat (no entries)\n"
"[ 250] 1 > timeout as leader\n"
" server 2 is unreachable -> abort pipeline\n"
" probe server 2 sending 2 entries (2^2..3^2)\n");
munit_assert_ullong(raft->leader_state.progress[1].next_index, ==, 2);
/* After reconnection the follower eventually replicates the entries and
* reports back. */
CLUSTER_RECONNECT(1, 2);
CLUSTER_TRACE(
"[ 260] 2 > recv append entries from server 1\n"
" start persisting 2 new entries (2^2..3^2)\n"
"[ 270] 2 > persisted 2 entry (2^2..3^2)\n"
" send success result to 1\n");
return MUNIT_OK;
}
/* Receive the same entry a second time, before the first has been persisted. */
TEST(replication, ReceiveSameEntryTwice, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
unsigned id;
/* Bootstrap and start a cluster with 2 voters. The both have an additional
* uncommitted entry at index 2. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
CLUSTER_ADD_ENTRY(id, RAFT_COMMAND, 1 /* term */, 0 /* payload */);
CLUSTER_START(id);
}
/* Set a high disk latency for server 2, so persisting the entry at index 2
* will takes a long time. */
CLUSTER_SET_DISK_LATENCY(2 /* ID */, 60 /* msecs */);
/* Server 1 becomes leader and then sends a barrier since it has an
* uncommitted entry at index 1. */
CLUSTER_TRACE(
"[ 0] 1 > term 1, 2 entries (1^1..2^1)\n"
"[ 0] 2 > term 1, 2 entries (1^1..2^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (2^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" replicate 1 new barrier entry (3^2)\n"
" probe server 2 sending 1 entry (3^2)\n");
/* Server 2 takes a long time to persist the entry, and since replication
* for server 2 is still in the probe state, server 1 eventually sends again
* the same entry. Server 2 receives it, but it doesn't persist it again to
* disk, since the first persist request is still in flight. */
CLUSTER_TRACE(
"[ 130] 1 > persisted 1 entry (3^2)\n"
" next uncommitted entry (2^1) has 1 vote out of 2\n"
"[ 130] 2 > recv append entries from server 1\n"
" start persisting 1 new entry (3^2)\n"
"[ 170] 1 > timeout as leader\n"
" probe server 2 sending 1 entry (3^2)\n");
/* Eventually the original persist entries request from server 2 succeeds,
* and is reported back to server 1. */
CLUSTER_TRACE(
"[ 180] 2 > recv append entries from server 1\n"
" no new entries to persist\n"
"[ 190] 2 > persisted 1 entry (3^2)\n"
" send success result to 1\n"
"[ 190] 1 > recv append entries result from server 2\n"
" pipeline server 2 sending 1 entry (3^2)\n"
" next uncommitted entry (2^1) has 2 votes out of 2\n"
"[ 200] 1 > recv append entries result from server 2\n"
" commit 2 new entries (2^1..3^2)\n");
return MUNIT_OK;
}
/* If the term in the request is stale, the server rejects it. */
TEST(replication, AppendEntriesRequestHasStaleTerm, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
unsigned id;
/* Bootstrap and start a cluster with 3 voters. */
for (id = 1; id <= 3; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 3 /* servers */, 3 /* voters */);
CLUSTER_START(id);
}
/* Server 1 wins the elections. */
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 0] 3 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 110] 3 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 3 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n"
" probe server 3 sending a heartbeat (no entries)\n");
/* Partition server 1 from the other two and set a very high election
* timeout on it, so it will keep sending heartbeats. */
CLUSTER_SET_ELECTION_TIMEOUT(1 /* ID */, 250 /* timeout */, 0 /* delta */);
CLUSTER_DISCONNECT(1, 2);
CLUSTER_DISCONNECT(2, 1);
CLUSTER_DISCONNECT(1, 3);
CLUSTER_DISCONNECT(3, 1);
/* Server 2 eventually times out and starts an election. */
CLUSTER_SET_ELECTION_TIMEOUT(2 /* ID */, 30 /* timeout */, 0 /* delta */);
CLUSTER_TRACE(
"[ 140] 2 > timeout as follower\n"
" convert to candidate, start election for term 3\n");
/* Reconnect server 1 with server 3, so server 3 will receive the next
* hearbeat that server 1 sends to it */
CLUSTER_RECONNECT(1, 3);
CLUSTER_RECONNECT(3, 1);
/* Eventually server 2 gets elected and server 1 sends a new heartbeat. */
CLUSTER_TRACE(
"[ 150] 3 > recv request vote from server 2\n"
" remote term is higher (3 vs 2) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 160] 2 > recv request vote result from server 3\n"
" quorum reached with 2 votes out of 3 -> convert to leader\n"
" probe server 1 sending a heartbeat (no entries)\n"
" probe server 3 sending a heartbeat (no entries)\n");
/* Server 3 receives the heartbeat from server 1 and rejects it. */
CLUSTER_TRACE(
"[ 170] 3 > recv append entries from server 2\n"
" no new entries to persist\n"
"[ 170] 1 > timeout as leader\n"
" probe server 2 sending a heartbeat (no entries)\n"
" probe server 3 sending a heartbeat (no entries)\n"
"[ 180] 2 > recv append entries result from server 3\n"
"[ 180] 3 > recv append entries from server 1\n"
" local term is higher (3 vs 2) -> reject\n");
/* Server 1 receives the reject message and steps down. */
CLUSTER_TRACE(
"[ 190] 1 > recv append entries result from server 3\n"
" remote term is higher (3 vs 2) -> bump term, step down\n");
return MUNIT_OK;
}
/* If the log of the receiving server is shorter than prevLogIndex, the request
* is rejected . */
TEST(replication, FollowerHasMissingEntries, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
unsigned id;
/* Bootstrap and start a cluster with 2 voters. Server 1 has an entry that
* server 2 doesn't have. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
if (id == 1) {
CLUSTER_ADD_ENTRY(id, RAFT_COMMAND, 1 /* term */, 0 /* payload */);
}
CLUSTER_START(id);
}
/* Server 1 wins the election because it has a longer log. */
CLUSTER_TRACE(
"[ 0] 1 > term 1, 2 entries (1^1..2^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is longer (2^1 vs 1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" replicate 1 new barrier entry (3^2)\n"
" probe server 2 sending 1 entry (3^2)\n"
"[ 130] 1 > persisted 1 entry (3^2)\n"
" next uncommitted entry (2^1) has 1 vote out of 2\n");
/* Server 1 replicates a no-op entry to server 2, which initially rejects
* it, because it's missing the one before. */
CLUSTER_TRACE(
"[ 130] 2 > recv append entries from server 1\n"
" missing previous entry (2^1) -> reject\n");
/* Server 1 sends the missing entry. */
CLUSTER_TRACE(
"[ 140] 1 > recv append entries result from server 2\n"
" log mismatch -> send old entries\n"
" probe server 2 sending 2 entries (2^1..3^2)\n"
"[ 150] 2 > recv append entries from server 1\n"
" start persisting 2 new entries (2^1..3^2)\n"
"[ 160] 2 > persisted 2 entry (2^1..3^2)\n"
" send success result to 1\n"
"[ 170] 1 > recv append entries result from server 2\n"
" commit 2 new entries (2^1..3^2)\n");
return MUNIT_OK;
}
/* If the term of the last log entry on the server is different from the one
* in prevLogTerm, and value of prevLogIndex is greater than the server's commit
* index (i.e. this is a normal inconsistency), we reject the request. */
TEST(replication, PrevLogTermMismatch, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
unsigned id;
return 0;
/* Bootstrap and start a cluster with 2 voters. The two servers have an
* entry with conflicting terms at index 2. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 3 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
}
CLUSTER_ADD_ENTRY(1, RAFT_COMMAND, 3 /* term */, 0 /* payload */);
CLUSTER_ADD_ENTRY(2, RAFT_COMMAND, 2 /* term */, 0 /* payload */);
CLUSTER_START(1);
CLUSTER_START(2);
/* Server 1 becomes leader because its last entry has a higher term. */
CLUSTER_TRACE(
"[ 0] 1 > term 3, 2 entries (1^1..2^3)\n"
"[ 0] 2 > term 3, 2 entries (1^1..2^2)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 4\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (4 vs 3) -> bump term\n"
" local log older (2^2 vs 2^3) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" replicate 1 new entry (3^4)\n"
" probe server 2 sending 1 entry (3^4)\n");
/* Server 2 rejects the initial AppendEntries request from server 1. */
CLUSTER_TRACE(
"[ 130] 1 > persisted 1 entry (3^4)\n"
" next uncommitted entry (3^4) has 1 vote out of 2\n"
"[ 130] 2 > recv append entries from server 1\n"
" previous term mismatch -> reject\n");
/* Server 1 overwrites server 2's log. */
CLUSTER_TRACE(
"[ 140] 1 > recv append entries result from server 2\n"
" log mismatch -> send old entries\n"
" probe server 2 sending 2 entries (2.3..3.4)\n"
"[ 150] 2 > recv append entries from server 1\n"
" log mismatch (2^2 vs 2^3) -> truncate\n"
" start persisting 2 new entries (2^3..3^4)\n"
"[ 160] 2 > persisted 2 entry (2^3..3^4)\n"
" send success result to 1\n"
"[ 160] 1 > timeout as leader\n"
"[ 170] 1 > recv append entries result from server 2\n"
" commit 2 new entries (2^3..3^4)\n");
return MUNIT_OK;
}
/* The follower has an uncommitted log entry that conflicts with a new one sent
* by the leader (same index but different term). The follower's conflicting log
* entry happens to be a configuration change. In that case the follower
* discards the conflicting entry from its log and rolls back its configuration
* to the initial one contained in the log entry at index 1. */
TEST(replication, RollbackConfigurationToInitial, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
unsigned id;
struct raft_configuration conf; /* Uncommitted configuration at index 2 */
struct raft_entry entry;
struct raft *raft;
int rv;
/* Bootstrap and start a cluster with 2 voters. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 2 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
}
/* Both servers have an entry at index 2, but with conflicting terms. The
* entry of the second server is a configuration change. */
CLUSTER_ADD_ENTRY(1 /* ID */, RAFT_COMMAND, 2 /* term */, 0 /* payload */);
CLUSTER_FILL_CONFIGURATION(&conf, 2 /* n */, 2 /* voters */, 0 /* stand */);
entry.type = RAFT_CHANGE;
entry.term = 1;
rv = raft_configuration_add(&conf, 3, "3", 2);
munit_assert_int(rv, ==, 0);
raft_configuration_encode(&conf, &entry.buf);
munit_assert_int(rv, ==, 0);
CLUSTER_ADD_ENTRY(2 /* ID */, &entry);
raft_free(entry.buf.base);
raft_configuration_close(&conf);
/* At startup the server 2 uses the most recent configuration, i.e. the
* one contained in the entry that we just added. The server can't know yet
* if it's committed or not, and regards it as pending configuration
* change. */
CLUSTER_START(1 /* ID */);
CLUSTER_START(2 /* ID */);
CLUSTER_TRACE(
"[ 0] 1 > term 2, 2 entries (1^1..2^2)\n"
"[ 0] 2 > term 2, 2 entries (1^1..2^1)\n");
raft = CLUSTER_RAFT(2);
munit_assert_uint(raft->configuration.n, ==, 3);
munit_assert_ullong(raft->configuration_uncommitted_index, ==, 2);
munit_assert_ullong(raft->configuration_committed_index, ==, 1);
/* Server 1 gets elected. */
CLUSTER_TRACE(
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 3\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (3 vs 2) -> bump term\n"
" remote log is more recent (2^2 vs 2^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" replicate 1 new barrier entry (3^3)\n"
" probe server 2 sending 1 entry (3^3)\n");
/* Server 2 eventually replicates the server 1's log entry at index 2,
* truncating its own log and rolling back to the configuration contained in
* the log entry at index 1. */
CLUSTER_TRACE(
"[ 130] 1 > persisted 1 entry (3^3)\n"
" next uncommitted entry (2^2) has 1 vote out of 2\n"
"[ 130] 2 > recv append entries from server 1\n"
" previous term mismatch -> reject\n"
"[ 140] 1 > recv append entries result from server 2\n"
" log mismatch -> send old entries\n"
" probe server 2 sending 2 entries (2^2..3^3)\n"
"[ 150] 2 > recv append entries from server 1\n"
" log mismatch (2^1 vs 2^2) -> truncate\n"
" roll back uncommitted configuration (2^1)\n"
" start persisting 2 new entries (2^2..3^3)\n");
munit_assert_uint(raft->configuration.n, ==, 2);
munit_assert_ullong(raft->configuration_uncommitted_index, ==, 0);
munit_assert_ullong(raft->configuration_committed_index, ==, 1);
return MUNIT_OK;
}
/* The follower has an uncommitted log entry that conflicts with a new one sent
* by the leader (same index but different term). The follower's conflicting log
* entry happens to be a configuration change. There's also an older committed
* configuration entry present. In that case the follower discards the
* conflicting entry from its log and rolls back its configuration to the
* committed one in the older configuration entry. */
TEST(replication, RollbackConfigurationToPrevious, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    struct raft_entry entry;
    struct raft_configuration conf; /* Uncommitted configuration at index 3 */
    struct raft *raft;
    int rv;

    /* Bootstrap a cluster with 2 voters. */
    for (id = 1; id <= 2; id++) {
        CLUSTER_SET_TERM(id, 3 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
    }

    /* Both servers have a matching configuration entry at index 2. */
    CLUSTER_ADD_ENTRY(1, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
    CLUSTER_ADD_ENTRY(2, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);

    /* Both servers have an entry at index 3, but with conflicting terms. The
     * entry of the second server is a configuration change. */
    CLUSTER_ADD_ENTRY(1, RAFT_COMMAND, 2 /* term */, 0 /* payload */);
    CLUSTER_FILL_CONFIGURATION(&conf, 2 /* n */, 2 /* voters */, 0 /* stand */);
    entry.type = RAFT_CHANGE;
    entry.term = 1;
    /* Check the return code, consistently with the sibling
     * RollbackConfigurationToSnapshot test. */
    rv = raft_configuration_add(&conf, 3, "3", 2);
    munit_assert_int(rv, ==, 0);
    rv = raft_configuration_encode(&conf, &entry.buf);
    munit_assert_int(rv, ==, 0);
    CLUSTER_ADD_ENTRY(2, &entry);
    raft_configuration_close(&conf);
    raft_free(entry.buf.base);

    CLUSTER_START(1);
    CLUSTER_START(2);

    /* At startup the second server uses the most recent configuration, i.e. the
     * one contained in the log entry at index 3. The server can't know yet if
     * it's committed or not, and regards it as pending configuration change. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 3, 3 entries (1^1..3^2)\n"
        "[ 0] 2 > term 3, 3 entries (1^1..3^1)\n");
    raft = CLUSTER_RAFT(2);
    munit_assert_uint(raft->configuration.n, ==, 3);
    munit_assert_ullong(raft->configuration_uncommitted_index, ==, 3);
    munit_assert_ullong(raft->configuration_committed_index, ==, 2);

    /* The first server gets elected. */
    CLUSTER_TRACE(
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 4\n"
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (4 vs 3) -> bump term\n"
        " remote log is more recent (3^2 vs 3^1) -> grant vote\n"
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 2 -> convert to leader\n"
        " replicate 1 new barrier entry (4^4)\n"
        " probe server 2 sending 1 entry (4^4)\n");

    /* Server 2 eventually replicates the server 1's log entry at index 3,
     * truncating its own log and rolling back to the configuration contained in
     * the log entry at index 2.
     *
     * NOTE(review): this section used to be unreachable because of a stray
     * `return 0` left before it; the early return has been removed, the entry
     * notation in the trace fixed (3^2..4^4 instead of 3.2..4.4) and the
     * missing statement terminator added. Confirm the trace below matches the
     * actual simulator output. */
    CLUSTER_TRACE(
        "[ 130] 1 > persisted 1 entry (4^4)\n"
        " next uncommitted entry (4^4) has 1 vote out of 2\n"
        "[ 130] 2 > recv append entries from server 1\n"
        " previous term mismatch -> reject\n"
        "[ 140] 1 > recv append entries result from server 2\n"
        " log mismatch -> send old entries\n"
        " probe server 2 sending 2 entries (3^2..4^4)\n"
        "[ 150] 2 > recv append entries from server 1\n"
        " log mismatch (3^1 vs 3^2) -> truncate\n"
        " roll back uncommitted configuration (3^1)\n"
        " start persisting 2 new entries (3^2..4^4)\n");
    munit_assert_uint(raft->configuration.n, ==, 2);
    munit_assert_ullong(raft->configuration_uncommitted_index, ==, 0);
    munit_assert_ullong(raft->configuration_committed_index, ==, 2);

    return MUNIT_OK;
}
/* The follower has an uncommitted log entry that conflicts with a new one sent
 * by the leader (same index but different term). The follower's conflicting log
 * entry happens to be a configuration change. The follower's log has been
 * truncated after a snapshot and does not contain the previous committed
 * configuration anymore. In that case the follower discards the conflicting
 * entry from its log and rolls back its configuration to the previous committed
 * one, which was cached when the snapshot was restored. */
TEST(replication, RollbackConfigurationToSnapshot, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_entry entry;
    struct raft_configuration conf; /* Uncommitted configuration at index 2 */
    struct raft *raft;
    int rv;
    /* Bootstrap server 1, creating a log entry at index 1 containing
     * the initial configuration. */
    CLUSTER_SET_TERM(1 /* ID */, 3 /* term */);
    CLUSTER_ADD_ENTRY(1 /* ID */, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
    /* Server 2 has a snapshot up to entry 1. Entry 1 is not present in the
     * log. */
    CLUSTER_SET_TERM(2 /* ID */, 3 /* term */);
    CLUSTER_SET_SNAPSHOT(2 /* */,
                         1 /* last index */,
                         1 /* last term */,
                         2 /* N servers */,
                         2 /* N voting */,
                         1 /* conf index */);
    /* Both servers have an entry at index 2, but with conflicting terms. The
     * entry of the second server is a configuration change and gets appended to
     * the truncated log. */
    CLUSTER_ADD_ENTRY(1 /* ID */, RAFT_COMMAND, 3 /* term */, 0 /* payload */);
    /* Build a 3-server configuration (the 2 voters plus a new server 3) and
     * encode it into the conflicting entry, with term 2. */
    CLUSTER_FILL_CONFIGURATION(&conf, 2 /* n */, 2 /* voters */, 0 /* stand */);
    entry.type = RAFT_CHANGE;
    entry.term = 2;
    rv = raft_configuration_add(&conf, 3, "3", 2);
    munit_assert_int(rv, ==, 0);
    rv = raft_configuration_encode(&conf, &entry.buf);
    munit_assert_int(rv, ==, 0);
    CLUSTER_ADD_ENTRY(2 /* ID */, &entry);
    raft_configuration_close(&conf);
    raft_free(entry.buf.base);
    /* At startup server 2 uses the most recent configuration, i.e. the one
     * contained in the log entry at index 2. The server can't know yet if it's
     * committed or not, and regards it as pending configuration change. */
    CLUSTER_START(1 /* ID */);
    CLUSTER_START(2 /* ID */);
    CLUSTER_TRACE(
        "[ 0] 1 > term 3, 2 entries (1^1..2^3)\n"
        "[ 0] 2 > term 3, 1 snapshot (1^1), 1 entry (2^2)\n");
    raft = CLUSTER_RAFT(2);
    munit_assert_uint(raft->configuration.n, ==, 3);
    munit_assert_ullong(raft->configuration_uncommitted_index, ==, 2);
    munit_assert_ullong(raft->configuration_committed_index, ==, 1);
    /* Server 1 becomes leader. */
    CLUSTER_TRACE(
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 4\n"
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (4 vs 3) -> bump term\n"
        " remote log is more recent (2^3 vs 2^2) -> grant vote\n"
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 2 -> convert to leader\n"
        " replicate 1 new barrier entry (3^4)\n"
        " probe server 2 sending 1 entry (3^4)\n");
    /* Server 2 eventually replicates the server 1's log entry at index 3,
     * truncating its own log and rolling back to the configuration contained in
     * the snapshot, which is not present in the log anymore but was cached at
     * startup. */
    CLUSTER_TRACE(
        "[ 130] 1 > persisted 1 entry (3^4)\n"
        " next uncommitted entry (2^3) has 1 vote out of 2\n"
        "[ 130] 2 > recv append entries from server 1\n"
        " previous term mismatch -> reject\n"
        "[ 140] 1 > recv append entries result from server 2\n"
        " log mismatch -> send old entries\n"
        " probe server 2 sending 2 entries (2^3..3^4)\n"
        "[ 150] 2 > recv append entries from server 1\n"
        " log mismatch (2^2 vs 2^3) -> truncate\n"
        " roll back uncommitted configuration (2^2)\n"
        " start persisting 2 new entries (2^3..3^4)\n");
    /* The pending configuration was discarded: server 2 is back to the
     * 2-server configuration cached from the snapshot. */
    munit_assert_uint(raft->configuration.n, ==, 2);
    munit_assert_ullong(raft->configuration_uncommitted_index, ==, 0);
    munit_assert_ullong(raft->configuration_committed_index, ==, 1);
    return MUNIT_OK;
}
/* A write log request is submitted for outstanding log entries. If some entries
* are already existing in the log, they will be skipped. */
TEST(replication, SkipExistingEntries, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned i;

    /* Boot a two-voter cluster; each server starts from the same one-entry
     * log containing the initial configuration. */
    for (i = 1; i <= 2; i++) {
        CLUSTER_SET_TERM(i, 1 /* term */);
        CLUSTER_ADD_ENTRY(i, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
        CLUSTER_START(i);
    }

    /* Election: server 1 wins and becomes leader. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n"
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 2 -> convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n");

    /* After the first heartbeat round-trip the leader switches server 2 to
     * pipeline mode. */
    CLUSTER_TRACE(
        "[ 130] 2 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 140] 1 > recv append entries result from server 2\n");

    /* Submit a fresh command entry on the leader. */
    CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);

    /* The entry reaches server 2, which persists and acknowledges it. */
    CLUSTER_TRACE(
        "[ 140] 1 > submit 1 new client entry\n"
        " replicate 1 new command entry (2^2)\n"
        " pipeline server 2 sending 1 entry (2^2)\n"
        "[ 150] 1 > persisted 1 entry (2^2)\n"
        " next uncommitted entry (2^2) has 1 vote out of 2\n"
        "[ 150] 2 > recv append entries from server 1\n"
        " start persisting 1 new entry (2^2)\n"
        "[ 160] 2 > persisted 1 entry (2^2)\n"
        " send success result to 1\n");

    /* Cut the link in both directions: the acknowledgement never reaches the
     * leader and its heartbeats fail too, so it moves server 2 back to probe
     * mode and eventually resends the same entry. */
    CLUSTER_DISCONNECT(1, 2);
    CLUSTER_DISCONNECT(2, 1);
    CLUSTER_TRACE(
        "[ 190] 1 > timeout as leader\n"
        " pipeline server 2 sending a heartbeat (no entries)\n"
        "[ 240] 1 > timeout as leader\n"
        " server 2 is unreachable -> abort pipeline\n"
        " probe server 2 sending 1 entry (2^2)\n");

    /* Once the link is back, server 2 receives the duplicate entry but skips
     * persisting it a second time. */
    CLUSTER_RECONNECT(1, 2);
    CLUSTER_RECONNECT(2, 1);
    CLUSTER_TRACE(
        "[ 250] 2 > recv append entries from server 1\n"
        " no new entries to persist\n");

    return MUNIT_OK;
}
/* If the index and term of the last snapshot on the server matches prevLogIndex
* and prevLogTerm the request is accepted. */
TEST(replication, MatchingLastSnapshot, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;

    /* Server 1 has two log entries, at index 1 and 2. */
    CLUSTER_SET_TERM(1 /* ID */, 2 /* term */);
    CLUSTER_ADD_ENTRY(1 /* ID */, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
    CLUSTER_ADD_ENTRY(1 /* ID */, RAFT_COMMAND, 2 /* term */, 0 /* payload */);

    /* Server 2 only has a snapshot whose last included entry is (2^2). */
    CLUSTER_SET_TERM(2 /* ID */, 2 /* term */);
    CLUSTER_SET_SNAPSHOT(2 /* ID */,
                         2 /* last index */,
                         2 /* last term */,
                         2 /* N servers */,
                         2 /* N voting */,
                         1 /* conf index */);

    CLUSTER_START(1 /* ID */);
    CLUSTER_START(2 /* ID */);

    /* Server 1 wins the election: server 2 deems the logs equal since its
     * snapshot's last index/term match server 1's last entry. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 2, 2 entries (1^1..2^2)\n"
        "[ 0] 2 > term 2, 1 snapshot (2^2)\n"
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 3\n"
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (3 vs 2) -> bump term\n"
        " remote log is equal (2^2) -> grant vote\n"
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 2 -> convert to leader\n"
        " replicate 1 new barrier entry (3^3)\n"
        " probe server 2 sending 1 entry (3^3)\n");

    /* The AppendEntries request is accepted because prevLogIndex/prevLogTerm
     * match server 2's snapshot, and the new entry gets committed. */
    CLUSTER_TRACE(
        "[ 130] 1 > persisted 1 entry (3^3)\n"
        " next uncommitted entry (2^2) has 1 vote out of 2\n"
        "[ 130] 2 > recv append entries from server 1\n"
        " start persisting 1 new entry (3^3)\n"
        "[ 140] 2 > persisted 1 entry (3^3)\n"
        " send success result to 1\n"
        "[ 150] 1 > recv append entries result from server 2\n"
        " commit 2 new entries (2^2..3^3)\n");

    return MUNIT_OK;
}
/* If a candidate server receives a request containing the same term as its
 * own, it steps down to follower and accepts the request. */
TEST(replication, CandidateRecvRequestWithSameTerm, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Bootstrap and start a cluster with 3 voters and one existing entry. */
    for (id = 1; id <= 3; id++) {
        CLUSTER_SET_TERM(id, 2 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 3 /* servers */, 3 /* voters */);
        CLUSTER_ADD_ENTRY(id, RAFT_COMMAND, 2 /* term */, 0 /* payload */);
        CLUSTER_START(id);
    }
    /* Disconnect server 3 from the other two and set a low election timeout on
     * it, so it will immediately start an election. */
    CLUSTER_DISCONNECT(3, 1);
    CLUSTER_DISCONNECT(1, 3);
    CLUSTER_DISCONNECT(3, 2);
    CLUSTER_DISCONNECT(2, 3);
    CLUSTER_SET_ELECTION_TIMEOUT(3 /* ID */, 50 /* timeout */, 40 /* delta */);
    /* Server 3 becomes candidate for term 3 while still disconnected, so its
     * vote requests reach nobody. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 2, 2 entries (1^1..2^2)\n"
        "[ 0] 2 > term 2, 2 entries (1^1..2^2)\n"
        "[ 0] 3 > term 2, 2 entries (1^1..2^2)\n"
        "[ 90] 3 > timeout as follower\n"
        " convert to candidate, start election for term 3\n");
    /* Server 1 wins the election and starts replicating a barrier entry, since
     * there is an uncommitted log entry from another term. */
    CLUSTER_TRACE(
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 3\n"
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (3 vs 2) -> bump term\n"
        " remote log is equal (2^2) -> grant vote\n"
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 3 -> convert to leader\n"
        " replicate 1 new barrier entry (3^3)\n"
        " probe server 2 sending 1 entry (3^3)\n"
        " probe server 3 sending 1 entry (3^3)\n");
    /* Now reconnect server 3, which eventually steps down and replicates the
     * barrier entry: the AppendEntries request from leader 1 carries term 3,
     * the same term candidate 3 is campaigning in. */
    CLUSTER_RECONNECT(3, 1);
    CLUSTER_RECONNECT(1, 3);
    CLUSTER_TRACE(
        "[ 130] 1 > persisted 1 entry (3^3)\n"
        " next uncommitted entry (2^2) has 1 vote out of 3\n"
        "[ 130] 2 > recv append entries from server 1\n"
        " start persisting 1 new entry (3^3)\n"
        "[ 130] 3 > recv append entries from server 1\n"
        " discovered leader (1) -> step down \n"
        " start persisting 1 new entry (3^3)\n");
    return MUNIT_OK;
}
/* If a candidate server receives an append entries request containing a higher
 * term than its own, it steps down to follower and accepts the request. */
TEST(replication, CandidateRecvRequestWithHigherTerm, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Bootstrap and start a cluster with 3 voters. */
    for (id = 1; id <= 3; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 3 /* servers */, 3 /* voters */);
        CLUSTER_START(id);
    }
    /* Set a high election timeout on server 2, so it won't become candidate */
    CLUSTER_SET_ELECTION_TIMEOUT(2 /* ID */, 250 /* timeout */, 0 /* delta */);
    /* Disconnect server 3 from the other two. */
    CLUSTER_DISCONNECT(3, 1);
    CLUSTER_DISCONNECT(1, 3);
    CLUSTER_DISCONNECT(3, 2);
    CLUSTER_DISCONNECT(2, 3);
    /* Set a low election timeout on server 1, and disconnect it from server 2,
     * so by the time it starts the second round, server 3 will have turned
     * candidate */
    CLUSTER_SET_ELECTION_TIMEOUT(1 /* ID */, 50 /* timeout */, 47 /* delta */);
    CLUSTER_DISCONNECT(1, 2);
    CLUSTER_DISCONNECT(2, 1);
    /* Server 3 becomes candidate, and server 1 already is candidate. (The
     * timeout fires at timeout + delta: 50 + 47 = 97 for server 1.) */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 0] 3 > term 1, 1 entry (1^1)\n"
        "[ 97] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n"
        "[ 160] 3 > timeout as follower\n"
        " convert to candidate, start election for term 2\n");
    /* Server 1 starts a new election, while server 3 is still candidate */
    CLUSTER_SET_ELECTION_TIMEOUT(1 /* ID */, 50 /* timeout */, 14 /* delta */);
    CLUSTER_TRACE(
        "[ 161] 1 > timeout as candidate\n"
        " stay candidate, start election for term 3\n");
    /* Reconnect the server 1 and server 2, let the election succeed and
     * the initial heartbeat to be sent. */
    CLUSTER_RECONNECT(1, 2);
    CLUSTER_RECONNECT(2, 1);
    CLUSTER_TRACE(
        "[ 171] 2 > recv request vote from server 1\n"
        " remote term is higher (3 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 181] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 3 -> convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        " probe server 3 sending a heartbeat (no entries)\n");
    /* Now reconnect the server 3, which eventually steps down when it receives
     * the heartbeat: the AppendEntries request carries term 3, higher than
     * candidate server 3's term 2. */
    CLUSTER_RECONNECT(3, 1);
    CLUSTER_RECONNECT(1, 3);
    CLUSTER_TRACE(
        "[ 191] 2 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 191] 3 > recv append entries from server 1\n"
        " remote term is higher (3 vs 2) -> bump term, step down\n"
        " no new entries to persist\n");
    return MUNIT_OK;
}
/* If the server handling the response is not the leader, the result is
* ignored. */
TEST(replication, ReceiveResultButNotLeader, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Bootstrap and start a cluster with 2 voters. */
    for (id = 1; id <= 2; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
        CLUSTER_START(id);
    }
    /* Server 1 becomes leader. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n"
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 2 -> convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n");
    /* Set a very high-latency for server 2's outgoing messages, so server 1
     * won't get notified about the results for a while. */
    CLUSTER_SET_NETWORK_LATENCY(2 /* ID */, 100 /* latency */);
    /* Set a low election timeout on server 1 so it will step down very soon. */
    CLUSTER_SET_ELECTION_TIMEOUT(1 /* ID */, 30 /* timeout */, 15 /* delta */);
    /* Eventually server 1 steps down and then becomes candidate. */
    CLUSTER_TRACE(
        "[ 130] 2 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 150] 1 > timeout as leader\n"
        " unable to contact majority of cluster -> step down\n"
        "[ 195] 1 > timeout as follower\n"
        " convert to candidate, start election for term 3\n");
    /* The AppendEntries result eventually gets delivered, but the candidate
     * ignores it since it is not leader anymore. */
    CLUSTER_TRACE(
        "[ 205] 2 > recv request vote from server 1\n"
        " local server has a leader (server 1) -> reject\n"
        "[ 230] 1 > recv append entries result from server 2\n"
        " local server is not leader -> ignore\n");
    return MUNIT_OK;
}
/* If the response has a term which is lower than the server's one, it's
* ignored. */
TEST(replication, ReceiveResultWithLowerTerm, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Bootstrap and start a cluster with 3 voters. */
    for (id = 1; id <= 3; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 3 /* servers */, 3 /* voters */);
        CLUSTER_START(id);
    }
    /* Server 1 becomes leader. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 0] 3 > term 1, 1 entry (1^1)\n"
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n"
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 110] 3 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 3 -> convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        " probe server 3 sending a heartbeat (no entries)\n");
    /* Set a very high-latency for the server 2's outgoing messages, so server 1
     * won't get notified about the results for a while. */
    CLUSTER_SET_NETWORK_LATENCY(2 /* ID */, 80 /* latency */);
    /* Set a high election timeout on server 2, so it won't become candidate */
    CLUSTER_SET_ELECTION_TIMEOUT(2 /* ID */, 500 /* timeout */, 0 /* delta */);
    /* Disconnect server 1 from server 3 and set a low election timeout on it so
     * it will step down very soon. */
    CLUSTER_DISCONNECT(1, 3);
    CLUSTER_DISCONNECT(3, 1);
    CLUSTER_SET_ELECTION_TIMEOUT(1 /* ID */, 20 /* timeout */, 20 /* delta */);
    CLUSTER_TRACE(
        "[ 130] 2 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 140] 1 > timeout as leader\n"
        " unable to contact majority of cluster -> step down\n");
    /* Make server 1 become leader again, now at term 3. */
    CLUSTER_RECONNECT(1, 3);
    CLUSTER_RECONNECT(3, 1);
    CLUSTER_TRACE(
        "[ 180] 1 > timeout as follower\n"
        " convert to candidate, start election for term 3\n"
        "[ 190] 2 > recv request vote from server 1\n"
        " local server has a leader (server 1) -> reject\n"
        "[ 190] 3 > recv request vote from server 1\n"
        " remote term is higher (3 vs 2) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 200] 1 > recv request vote result from server 3\n"
        " quorum reached with 2 votes out of 3 -> convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        " probe server 3 sending a heartbeat (no entries)\n");
    /* Eventually deliver to server 1 the result message sent by server 2 in the
     * previous term: it carries term 2, lower than server 1's current term 3,
     * so it is ignored. */
    CLUSTER_TRACE(
        "[ 210] 1 > recv append entries result from server 2\n"
        " local term is higher (3 vs 2) -> ignore\n");
    return MUNIT_OK;
}
/* If the response has a term which is higher than the server's one, step down
* to follower. */
TEST(replication, ReceiveResultWithHigherTerm, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Bootstrap and start a cluster with 3 voters. */
    for (id = 1; id <= 3; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 3 /* servers */, 3 /* voters */);
        CLUSTER_START(id);
    }
    /* Server 1 becomes leader. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 0] 3 > term 1, 1 entry (1^1)\n"
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n"
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 110] 3 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 3 -> convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        " probe server 3 sending a heartbeat (no entries)\n");
    /* Set a very high election timeout for server 1 so it won't step down.
     */
    CLUSTER_SET_ELECTION_TIMEOUT(1 /* ID */, 500 /* timeout */, 0 /* delta */);
    /* Disconnect the server 1 from the rest of the cluster. */
    CLUSTER_DISCONNECT(1, 2);
    CLUSTER_DISCONNECT(2, 1);
    CLUSTER_DISCONNECT(1, 3);
    CLUSTER_DISCONNECT(3, 1);
    /* Eventually a new leader gets elected at term 3, while server 1 still
     * believes it is the term-2 leader. */
    CLUSTER_TRACE(
        "[ 170] 1 > timeout as leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        " probe server 3 sending a heartbeat (no entries)\n"
        "[ 220] 1 > timeout as leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        " probe server 3 sending a heartbeat (no entries)\n"
        "[ 240] 2 > timeout as follower\n"
        " convert to candidate, start election for term 3\n"
        "[ 250] 3 > recv request vote from server 2\n"
        " remote term is higher (3 vs 2) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 260] 2 > recv request vote result from server 3\n"
        " quorum reached with 2 votes out of 3 -> convert to leader\n"
        " probe server 1 sending a heartbeat (no entries)\n"
        " probe server 3 sending a heartbeat (no entries)\n");
    /* Reconnect the old leader server 1 to the current follower server 3,
     * which eventually replies with an AppendEntries result containing a
     * higher term, making server 1 step down. */
    CLUSTER_RECONNECT(1, 3);
    CLUSTER_RECONNECT(3, 1);
    CLUSTER_TRACE(
        "[ 270] 3 > recv append entries from server 2\n"
        " no new entries to persist\n"
        "[ 270] 1 > timeout as leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        " probe server 3 sending a heartbeat (no entries)\n"
        "[ 280] 2 > recv append entries result from server 3\n"
        "[ 280] 3 > recv append entries from server 1\n"
        " local term is higher (3 vs 2) -> reject\n"
        "[ 290] 1 > recv append entries result from server 3\n"
        " remote term is higher (3 vs 2) -> bump term, step down\n");
    return MUNIT_OK;
}
/* A leader with slow disk commits an entry that it hasn't persisted yet,
 * because enough followers to have a majority have acknowledged that they have
 * appended the entry. The leader's last_stored field hence lags behind its
 * commit_index. A new leader gets elected, with a higher commit index, and
 * sends an entry to the old leader, that needs to update its commit_index
 * taking into account its lagging last_stored. */
TEST(replication, LastStoredLaggingBehindCommitIndex, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Bootstrap and start a cluster with 3 voters. All servers have an
     * uncommitted log entry. */
    for (id = 1; id <= 3; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 3 /* servers */, 3 /* voters */);
        CLUSTER_ADD_ENTRY(id, RAFT_COMMAND, 1 /* term */, 0 /* payload */);
        CLUSTER_START(id);
    }
    /* Server 1 will take a long time to persist the initial barrier entry at
     * index 3. */
    CLUSTER_SET_DISK_LATENCY(1 /* ID */, 1000 /* latency */);
    /* Server 1 gets elected and replicates a barrier entry at index 3. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 2 entries (1^1..2^1)\n"
        "[ 0] 2 > term 1, 2 entries (1^1..2^1)\n"
        "[ 0] 3 > term 1, 2 entries (1^1..2^1)\n"
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n"
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (2^1) -> grant vote\n"
        "[ 110] 3 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (2^1) -> grant vote\n"
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 3 -> convert to leader\n"
        " replicate 1 new barrier entry (3^2)\n"
        " probe server 2 sending 1 entry (3^2)\n"
        " probe server 3 sending 1 entry (3^2)\n");
    /* Server 1 commits entries up to index 3 even though it did not persist
     * the barrier entry itself yet: servers 2 and 3 form a majority. */
    CLUSTER_TRACE(
        "[ 120] 1 > recv request vote result from server 3\n"
        " local server is leader -> ignore\n"
        "[ 130] 2 > recv append entries from server 1\n"
        " start persisting 1 new entry (3^2)\n"
        "[ 130] 3 > recv append entries from server 1\n"
        " start persisting 1 new entry (3^2)\n"
        "[ 140] 2 > persisted 1 entry (3^2)\n"
        " send success result to 1\n"
        "[ 140] 3 > persisted 1 entry (3^2)\n"
        " send success result to 1\n"
        "[ 150] 1 > recv append entries result from server 2\n"
        " next uncommitted entry (2^1) has 2 votes out of 3\n"
        "[ 150] 1 > recv append entries result from server 3\n"
        " commit 2 new entries (2^1..3^2)\n");
    /* The leader's last_stored now lags behind its commit_index. */
    munit_assert_ullong(CLUSTER_RAFT(1)->last_stored, ==, 2);
    munit_assert_ullong(CLUSTER_RAFT(1)->commit_index, ==, 3);
    /* Server 2 stored barrier entry 3, but did not yet receive a
     * notification from server 1 about the new commit index. */
    munit_assert_ullong(CLUSTER_RAFT(2)->last_stored, ==, 3);
    munit_assert_ullong(CLUSTER_RAFT(2)->commit_index, ==, 1);
    /* Disconnect server 1 from server 2 and 3. */
    CLUSTER_DISCONNECT(1, 2);
    CLUSTER_DISCONNECT(1, 3);
    /* Set a very high election timeout on server 1, so it won't step down
     * for a while, even if disconnected. */
    CLUSTER_SET_ELECTION_TIMEOUT(1, 500 /* timeout */, 0 /* delta */);
    /* Server 2 and 3 eventually timeout and start an election, server 2
     * wins (lower election timeouts to make that happen faster). */
    CLUSTER_SET_ELECTION_TIMEOUT(2, 40 /* timeout */, 0 /* delta */);
    CLUSTER_SET_ELECTION_TIMEOUT(3, 40 /* timeout */, 10 /* delta */);
    CLUSTER_TRACE(
        "[ 170] 1 > timeout as leader\n"
        " pipeline server 2 sending a heartbeat (no entries)\n"
        " pipeline server 3 sending a heartbeat (no entries)\n"
        "[ 170] 2 > timeout as follower\n"
        " convert to candidate, start election for term 3\n"
        "[ 180] 1 > recv request vote from server 2\n"
        " local server is leader -> reject\n"
        "[ 180] 3 > recv request vote from server 2\n"
        " local server has a leader (server 1) -> reject\n"
        "[ 180] 3 > timeout as follower\n"
        " convert to candidate, start election for term 3\n"
        "[ 190] 2 > recv request vote result from server 3\n"
        " remote term is lower (2 vs 3) -> ignore\n"
        "[ 190] 1 > recv request vote from server 3\n"
        " local server is leader -> reject\n"
        "[ 190] 2 > recv request vote from server 3\n"
        " already voted for server 2 -> don't grant vote\n"
        "[ 200] 3 > recv request vote result from server 2\n"
        " vote not granted\n"
        "[ 210] 2 > timeout as candidate\n"
        " stay candidate, start election for term 4\n"
        "[ 220] 1 > recv request vote from server 2\n"
        " local server is leader -> reject\n"
        "[ 220] 3 > recv request vote from server 2\n"
        " remote term is higher (4 vs 3) -> bump term, step down\n"
        " remote log is equal (3^2) -> grant vote\n"
        "[ 220] 1 > timeout as leader\n"
        " server 2 is unreachable -> abort pipeline\n"
        " server 3 is unreachable -> abort pipeline\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        " probe server 3 sending a heartbeat (no entries)\n"
        "[ 230] 2 > recv request vote result from server 3\n"
        " quorum reached with 2 votes out of 3 -> convert to leader\n"
        " replicate 1 new barrier entry (4^4)\n"
        " probe server 1 sending 1 entry (4^4)\n"
        " probe server 3 sending 1 entry (4^4)\n");
    /* Server 2 commits the barrier entry at index 4 that it created at the
     * start of its term. */
    CLUSTER_TRACE(
        "[ 240] 2 > persisted 1 entry (4^4)\n"
        " next uncommitted entry (3^2) has 1 vote out of 3\n"
        "[ 240] 1 > recv append entries from server 2\n"
        " remote term is higher (4 vs 2) -> bump term, step down\n"
        " start persisting 1 new entry (4^4)\n"
        "[ 240] 3 > recv append entries from server 2\n"
        " start persisting 1 new entry (4^4)\n"
        "[ 250] 3 > persisted 1 entry (4^4)\n"
        " send success result to 2\n"
        "[ 260] 2 > recv append entries result from server 3\n"
        " commit 3 new entries (2^1..4^4)\n");
    /* Reconnect server 1 to server 2, which will replicate entry 4 to
     * it. Server 1 must update its commit index taking into account its
     * still-lagging last_stored. */
    CLUSTER_RECONNECT(1, 2);
    CLUSTER_TRACE(
        "[ 270] 2 > timeout as leader\n"
        "[ 280] 2 > timeout as leader\n"
        " server 3 is unreachable -> abort pipeline\n"
        " probe server 1 sending 1 entry (4^4)\n"
        " probe server 3 sending a heartbeat (no entries)\n"
        "[ 290] 1 > recv append entries from server 2\n"
        " no new entries to persist\n");
    return MUNIT_OK;
}
/* A leader with slow disk commits an entry that it hasn't persisted yet,
 * because enough followers to have a majority have acknowledged that they have
 * appended the entry. The leader's last_stored field hence lags behind its
 * commit_index. A new leader gets elected, with a higher commit index, and
 * sends first a new entry and then a heartbeat to the old leader, that needs to
 * update its commit_index taking into account its lagging last_stored.
 *
 * XXX: this test duplicates the one above, but it's kept because the change it
 * is associated with was fixing an assertion in the legacy compat layer. */
/* A follower finds that it has no leader anymore after it completes persisting
 * entries. No AppendEntries RPC result is sent in that case. */
TEST(replication, NoLeaderAfterPersistingEntries, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Bootstrap and start a cluster with 2 voters. Both have an additional
     * entry at index 2.*/
    for (id = 1; id <= 2; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
        CLUSTER_ADD_ENTRY(id, RAFT_COMMAND, 1 /* term */, 0 /* payload */);
        CLUSTER_START(id);
    }
    /* Make sure that persisting entries on server 1 will take a long time. */
    CLUSTER_SET_DISK_LATENCY(1 /* ID */, 50 /* latency */);
    /* Server 1 becomes the leader and starts persisting the barrier entry it
     * created at the start of its term. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 2 entries (1^1..2^1)\n"
        "[ 0] 2 > term 1, 2 entries (1^1..2^1)\n"
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n"
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (2^1) -> grant vote\n"
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 2 -> convert to leader\n"
        " replicate 1 new barrier entry (3^2)\n"
        " probe server 2 sending 1 entry (3^2)\n");
    /* Disconnect server 1, so it will step down and become follower (lower
     * its election timeout to make this happen sooner). */
    CLUSTER_DISCONNECT(1, 2);
    CLUSTER_DISCONNECT(2, 1);
    CLUSTER_SET_ELECTION_TIMEOUT(1 /* ID */, 40 /* timeout */, 0 /* delta*/);
    CLUSTER_TRACE(
        "[ 160] 1 > timeout as leader\n"
        " unable to contact majority of cluster -> step down\n");
    /* Server 1 has stepped down and is now a follower, however it hasn't
     * persisted the barrier entry yet. */
    munit_assert_int(raft_state(CLUSTER_RAFT(1)), ==, RAFT_FOLLOWER);
    munit_assert_ullong(CLUSTER_RAFT(1)->last_stored, ==, 2);
    /* Wait for the long disk write to complete. Per the scenario under test,
     * no AppendEntries result is sent since there is no leader anymore. */
    CLUSTER_TRACE("[ 170] 1 > persisted 1 entry (3^2)\n");
    /* The disk write has now completed and the barrier entry created at the
     * start of the term is persisted. */
    munit_assert_ullong(CLUSTER_RAFT(1)->last_stored, ==, 3);
    return MUNIT_OK;
}
/* While pipelining entries, the leader receives an AppendEntries response with
 * a stale reject index: by the time the rejection arrives, a later response
 * already made the rejected index match, so the leader must ignore it instead
 * of stepping the follower's next index back. */
TEST(replication, PipelineStaleRejectedIndex, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Bootstrap and start a cluster with 2 voters. Server 1 has an additional
     * entry at index 2. */
    for (id = 1; id <= 2; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
        if (id == 1) {
            CLUSTER_ADD_ENTRY(id, RAFT_COMMAND, 1 /* term */, 0 /* payload */);
        }
        CLUSTER_START(id);
    }
    /* Server 1 becomes the leader. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 2 entries (1^1..2^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n"
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is longer (2^1 vs 1^1) -> grant vote\n"
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 2 -> convert to leader\n"
        " replicate 1 new barrier entry (3^2)\n"
        " probe server 2 sending 1 entry (3^2)\n"
        "[ 130] 1 > persisted 1 entry (3^2)\n"
        " next uncommitted entry (2^1) has 1 vote out of 2\n");
    /* Server 2 receives a heartbeat with an entry that it does not have, but
     * the network is slow, so server 1 will receive the response much later. */
    CLUSTER_SET_NETWORK_LATENCY(2 /* ID */, 100 /* latency */);
    CLUSTER_TRACE(
        "[ 130] 2 > recv append entries from server 1\n"
        " missing previous entry (2^1) -> reject\n"
        "[ 170] 1 > timeout as leader\n"
        " probe server 2 sending 1 entry (3^2)\n");
    /* This time the network is faster, and server 2's response to the second
     * heartbeat will arrive before the first response. */
    CLUSTER_SET_NETWORK_LATENCY(2 /* ID */, 10 /* latency */);
    CLUSTER_TRACE(
        "[ 180] 2 > recv append entries from server 1\n"
        " missing previous entry (2^1) -> reject\n"
        "[ 190] 1 > recv append entries result from server 2\n"
        " log mismatch -> send old entries\n"
        " probe server 2 sending 2 entries (2^1..3^2)\n"
        "[ 200] 2 > recv append entries from server 1\n"
        " start persisting 2 new entries (2^1..3^2)\n"
        "[ 210] 2 > persisted 2 entry (2^1..3^2)\n"
        " send success result to 1\n"
        "[ 220] 1 > recv append entries result from server 2\n"
        " commit 2 new entries (2^1..3^2)\n"
        "[ 220] 1 > timeout as leader\n"
        "[ 230] 1 > recv append entries result from server 2\n"
        " stale rejected index (2 vs match index 3) -> ignore\n");
    return MUNIT_OK;
}
/* The raft_max_inflight_entries() setting controls how many un-acknowledged
 * entries can be in-flight in pipeline mode: once the limit is reached, new
 * entries are appended to the log but not sent until earlier ones are acked. */
TEST(replication, PipelineMaxInflight, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Set the maximum in-flight entries number to 2. */
    raft_set_max_inflight_entries(CLUSTER_RAFT(1), 2);
    /* Bootstrap and start a cluster with 2 voters. */
    for (id = 1; id <= 2; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
        CLUSTER_START(id);
    }
    /* Server 1 becomes leader and eventually switches server 2 to pipeline
     * mode. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n"
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 2 -> convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        "[ 130] 2 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 140] 1 > recv append entries result from server 2\n"
        "[ 170] 1 > timeout as leader\n"
        " pipeline server 2 sending a heartbeat (no entries)\n");
    /* Two entries are submitted and immediately sent to server 2. */
    CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
    CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
    CLUSTER_TRACE(
        "[ 170] 1 > submit 1 new client entry\n"
        " replicate 1 new command entry (2^2)\n"
        " pipeline server 2 sending 1 entry (2^2)\n"
        "[ 170] 1 > submit 1 new client entry\n"
        " replicate 1 new command entry (3^2)\n"
        " pipeline server 2 sending 1 entry (3^2)\n");
    /* A third entry is submitted, but it's not replicated immediately, because
     * the inflight limit has been reached. */
    CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
    CLUSTER_TRACE(
        "[ 170] 1 > submit 1 new client entry\n"
        " replicate 1 new command entry (4^2)\n"
        " pipeline server 2 sending a heartbeat (no entries)\n");
    return MUNIT_OK;
}
/* After having sent a snapshot and waiting for a response, the leader receives
 * an AppendEntries response with a stale reject index. The rejected index is
 * below the snapshot's last index, so the leader must ignore it. */
TEST(replication, StaleRejectedIndexSnapshot, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_configuration configuration;
    struct raft_entry entry;
    unsigned id;
    int rv;
    /* Set very low threshold and trailing entries number. */
    CLUSTER_SET_SNAPSHOT_THRESHOLD(1 /* ID */, 3 /* n. entries */);
    CLUSTER_SET_SNAPSHOT_TRAILING(1 /* ID */, 0 /* n. entries */);
    /* Bootstrap and start a cluster with 1 voter and 1 stand-by. Server 1 has
     * an additional entry. */
    for (id = 1; id <= 2; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_FILL_CONFIGURATION(&configuration, 2, 1, 1 /* stand-by */);
        entry.type = RAFT_CHANGE;
        entry.term = 1;
        rv = raft_configuration_encode(&configuration, &entry.buf);
        munit_assert_int(rv, ==, 0);
        raft_configuration_close(&configuration);
        test_cluster_add_entry(&f->cluster_, id, &entry);
        raft_free(entry.buf.base);
        if (id == 1) {
            CLUSTER_ADD_ENTRY(id, RAFT_COMMAND, 1 /* term */, 0 /* payload */);
        }
        CLUSTER_START(id);
    }
    /* Server 1 becomes leader. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 2 entries (1^1..2^1)\n"
        " self elect and convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n");
    /* Server 2 receives a heartbeat with an entry that it does not have, but
     * the network is slow, so server 1 will receive the response much later. */
    CLUSTER_SET_NETWORK_LATENCY(2 /* ID */, 80 /* latency */);
    CLUSTER_TRACE(
        "[ 10] 2 > recv append entries from server 1\n"
        " missing previous entry (2^1) -> reject\n");
    /* Server 1 commits a new entry and takes a snapshot. */
    CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
    CLUSTER_TRACE(
        "[ 10] 1 > submit 1 new client entry\n"
        " replicate 1 new command entry (3^1)\n"
        "[ 20] 1 > persisted 1 entry (3^1)\n"
        " commit 1 new entry (3^1)\n"
        "[ 20] 1 > new snapshot (3^1), 0 trailing entries\n");
    /* Eventually server 2 receives the snapshot, but takes a long time to
     * persist it. */
    CLUSTER_SET_NETWORK_LATENCY(2 /* ID */, 10 /* latency */);
    CLUSTER_SET_DISK_LATENCY(2 /* ID */, 100 /* latency */);
    CLUSTER_TRACE(
        "[ 50] 1 > timeout as leader\n"
        " missing previous entry at index 2 -> needs snapshot\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        "[ 60] 2 > recv append entries from server 1\n"
        " missing previous entry (3^1) -> reject\n"
        "[ 70] 1 > recv append entries result from server 2\n"
        " log mismatch -> send old entries\n"
        " missing previous entry at index 1 -> needs snapshot\n"
        " sending snapshot (3^1) to server 2\n"
        "[ 80] 2 > recv install snapshot from server 1\n"
        " start persisting snapshot (3^1)\n");
    /* Server 1 finally receives the original AppendEntries response and ignores
     * it. */
    CLUSTER_TRACE(
        "[ 90] 1 > recv append entries result from server 2\n"
        " stale rejected index (2 vs snapshot index 3) -> ignore\n");
    return MUNIT_OK;
}
/* If a follower receives a heartbeat containing a prev_log_index which is
 * behind its last_stored index, it sets the last_log_index to the value of
 * prev_log_index. This prevents the follower from telling the leader that it
 * reached a certain index without first checking the log matching property. */
TEST(replication, LastStoredAheadOfPrevLogIndex, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_configuration configuration;
    struct raft_entry entry;
    unsigned id;
    int rv;
    /* Bootstrap and start a cluster with 1 voter and 1 stand-by. The stand-by
     * has an additional entry. */
    for (id = 1; id <= 2; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_FILL_CONFIGURATION(&configuration, 2, 1, 1 /* stand-by */);
        entry.type = RAFT_CHANGE;
        entry.term = 1;
        rv = raft_configuration_encode(&configuration, &entry.buf);
        munit_assert_int(rv, ==, 0);
        raft_configuration_close(&configuration);
        test_cluster_add_entry(&f->cluster_, id, &entry);
        raft_free(entry.buf.base);
        if (id == 2) {
            CLUSTER_ADD_ENTRY(id, RAFT_COMMAND, 2 /* term */, 0 /* payload */);
        }
        CLUSTER_START(id);
    }
    /* Start the cluster, server 1 will self elect and send an heartbeat to
     * server 2. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        " self elect and convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        "[ 0] 2 > term 1, 2 entries (1^1..2^2)\n");
    /* Submit an entry, which won't be immediately sent to server 2, because
     * it's still in probe mode. */
    CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
    CLUSTER_TRACE(
        "[ 0] 1 > submit 1 new client entry\n"
        " replicate 1 new command entry (2^1)\n"
        "[ 10] 1 > persisted 1 entry (2^1)\n"
        " commit 1 new entry (2^1)\n");
    /* Server 2 receives the heartbeat from server 1. The two servers have now a
     * conflicting entry at term 2, but since this heartbeat only contains up to
     * entry 1, no conflict is detected yet by server 2, which sends a success
     * result. */
    CLUSTER_TRACE(
        "[ 10] 2 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 20] 1 > recv append entries result from server 2\n"
        " pipeline server 2 sending 1 entry (2^1)\n");
    /* Disconnect server 2, which eventually gets back to probe mode. */
    CLUSTER_DISCONNECT(1, 2);
    CLUSTER_TRACE(
        "[ 70] 1 > timeout as leader\n"
        " pipeline server 2 sending a heartbeat (no entries)\n"
        "[ 120] 1 > timeout as leader\n"
        " server 2 is unreachable -> abort pipeline\n"
        " probe server 2 sending 1 entry (2^1)\n");
    /* Reconnect server 2, which eventually receives the conflicting entry at
     * index 2 and truncates its log. */
    CLUSTER_RECONNECT(1, 2);
    CLUSTER_TRACE(
        "[ 130] 2 > recv append entries from server 1\n"
        " log mismatch (2^2 vs 2^1) -> truncate\n"
        " start persisting 1 new entry (2^1)\n");
    return MUNIT_OK;
}
/* If a follower completes persisting an entry at an index that was not yet sent
 * by the current leader and checked via the log matching property, no
 * successful result for it will be sent. */
TEST(replication, LastStoredAheadOfLastMatched, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Bootstrap and start a cluster with 5 voters. */
    for (id = 1; id <= 5; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 5 /* servers */, 5 /* voters */);
        CLUSTER_START(id);
    }
    /* Server 1 becomes leader. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 0] 3 > term 1, 1 entry (1^1)\n"
        "[ 0] 4 > term 1, 1 entry (1^1)\n"
        "[ 0] 5 > term 1, 1 entry (1^1)\n"
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n"
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 110] 3 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 110] 4 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 110] 5 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum not reached, only 2 votes out of 5\n"
        "[ 120] 1 > recv request vote result from server 3\n"
        " quorum reached with 3 votes out of 5 -> convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        " probe server 3 sending a heartbeat (no entries)\n"
        " probe server 4 sending a heartbeat (no entries)\n"
        " probe server 5 sending a heartbeat (no entries)\n");
    /* Submit a new entry. */
    CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
    /* Disconnect server 1 from all servers except server 3, which will be the
     * only one receiving it. */
    CLUSTER_DISCONNECT(1, 2);
    CLUSTER_DISCONNECT(2, 1);
    CLUSTER_DISCONNECT(1, 4);
    CLUSTER_DISCONNECT(4, 1);
    CLUSTER_DISCONNECT(1, 5);
    CLUSTER_DISCONNECT(5, 1);
    /* Server 3 receives it but will take a long time to persist it. */
    CLUSTER_SET_DISK_LATENCY(3 /* ID */, 130 /* latency */);
    CLUSTER_TRACE(
        "[ 120] 1 > submit 1 new client entry\n"
        " replicate 1 new command entry (2^2)\n"
        "[ 130] 1 > persisted 1 entry (2^2)\n"
        " next uncommitted entry (2^2) has 1 vote out of 5\n"
        "[ 130] 3 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 140] 1 > recv append entries result from server 3\n"
        " pipeline server 3 sending 1 entry (2^2)\n"
        "[ 150] 3 > recv append entries from server 1\n"
        " start persisting 1 new entry (2^2)\n");
    /* Crash server 1. Eventually server 2 becomes leader with votes from
     * servers 4 and 5. */
    CLUSTER_STOP(1 /* ID */);
    CLUSTER_TRACE(
        "[ 240] 2 > timeout as follower\n"
        " convert to candidate, start election for term 3\n"
        "[ 250] 3 > recv request vote from server 2\n"
        " local server has a leader (server 1) -> reject\n"
        "[ 250] 4 > recv request vote from server 2\n"
        " remote term is higher (3 vs 2) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 250] 5 > recv request vote from server 2\n"
        " remote term is higher (3 vs 2) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 260] 2 > recv request vote result from server 3\n"
        " remote term is lower (2 vs 3) -> ignore\n"
        "[ 260] 2 > recv request vote result from server 4\n"
        " quorum not reached, only 2 votes out of 5\n"
        "[ 260] 2 > recv request vote result from server 5\n"
        " quorum reached with 3 votes out of 5 -> convert to leader\n"
        " probe server 1 sending a heartbeat (no entries)\n"
        " probe server 3 sending a heartbeat (no entries)\n"
        " probe server 4 sending a heartbeat (no entries)\n"
        " probe server 5 sending a heartbeat (no entries)\n");
    /* Server 3 receives the heartbeat from server 2 and changes its term and
     * leader. */
    CLUSTER_TRACE(
        "[ 270] 3 > recv append entries from server 2\n"
        " remote term is higher (3 vs 2) -> bump term\n"
        " no new entries to persist\n");
    CLUSTER_DISCONNECT(3, 2);
    /* Server 3 completes persisting the entry at index 2 that was sent to it by
     * server 1 at term 2. */
    CLUSTER_TRACE(
        "[ 270] 4 > recv append entries from server 2\n"
        " no new entries to persist\n"
        "[ 270] 5 > recv append entries from server 2\n"
        " no new entries to persist\n"
        "[ 280] 3 > persisted 1 entry (2^2)\n"
        " send success result to 2\n");
    CLUSTER_RECONNECT(3, 2);
    /* Submit a new entry to server 2, which will have the same index of the
     * entry that server 3 just persisted. */
    CLUSTER_SUBMIT(2 /* ID */, COMMAND, 8 /* size */);
    CLUSTER_TRACE(
        "[ 280] 2 > submit 1 new client entry\n"
        " replicate 1 new command entry (2^3)\n"
        "[ 280] 2 > recv append entries result from server 4\n"
        " pipeline server 4 sending 1 entry (2^3)\n"
        "[ 280] 2 > recv append entries result from server 5\n"
        " pipeline server 5 sending 1 entry (2^3)\n"
        "[ 290] 2 > persisted 1 entry (2^3)\n"
        " next uncommitted entry (2^3) has 1 vote out of 5\n");
    /* Server 2 receives the result from server 3, which does *not* contain the
     * conflicting entry. */
    CLUSTER_TRACE(
        "[ 290] 2 > recv append entries result from server 3\n"
        " pipeline server 3 sending 1 entry (2^3)\n");
    /* Servers 4 and 5 start persisting the new entry, while server 3 receives
     * the conflicting entry, truncates its log and replaces its own one. */
    CLUSTER_TRACE(
        "[ 290] 4 > recv append entries from server 2\n"
        " start persisting 1 new entry (2^3)\n"
        "[ 290] 5 > recv append entries from server 2\n"
        " start persisting 1 new entry (2^3)\n"
        "[ 300] 3 > recv append entries from server 2\n"
        " log mismatch (2^2 vs 2^3) -> truncate\n"
        " start persisting 1 new entry (2^3)\n");
    return MUNIT_OK;
}
raft-0.22.1/test/integration/test_snapshot.c 0000664 0000000 0000000 00000070045 14601504142 0021077 0 ustar 00root root 0000000 0000000 #include "../lib/cluster.h"
#include "../lib/runner.h"
/* Test fixture: embeds the simulated cluster used by every test in this
 * file (FIXTURE_CLUSTER comes from ../lib/cluster.h). */
struct fixture
{
    FIXTURE_CLUSTER;
};
/* Per-test setup: allocate the fixture and initialize the cluster. The
 * returned pointer is passed to each test as its `data` argument. */
static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SETUP_CLUSTER();
    return f;
}
/* Per-test teardown: release the cluster resources and free the fixture
 * allocated by setUp(). */
static void tearDown(void *data)
{
    struct fixture *f = data;
    TEAR_DOWN_CLUSTER();
    free(f);
}
/* Declare the test suite holding all snapshot integration tests below. */
SUITE(snapshot)
/* Install a snapshot on a follower that has fallen behind. */
TEST(snapshot, Install, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_configuration configuration;
    struct raft_entry entry;
    unsigned id;
    int rv;
    /* Set very low threshold and trailing entries number */
    CLUSTER_SET_SNAPSHOT_THRESHOLD(1 /* ID */, 2 /* n. entries */);
    CLUSTER_SET_SNAPSHOT_TRAILING(1 /* ID */, 0 /* n. entries */);
    /* Don't let server 2 time out (just for terser traces). */
    raft_set_election_timeout(CLUSTER_RAFT(2), 200);
    /* Bootstrap and start a cluster with 1 voter and 1 stand-by. */
    for (id = 1; id <= 2; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_FILL_CONFIGURATION(&configuration, 2, 1, 1 /* stand-by */);
        entry.type = RAFT_CHANGE;
        entry.term = 1;
        rv = raft_configuration_encode(&configuration, &entry.buf);
        munit_assert_int(rv, ==, 0);
        raft_configuration_close(&configuration);
        test_cluster_add_entry(&f->cluster_, id, &entry);
        raft_free(entry.buf.base);
        CLUSTER_START(id);
    }
    /* Server 2 won't receive any entry from server 1. */
    CLUSTER_DISCONNECT(1, 2);
    /* Server 1 becomes leader. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        " self elect and convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n");
    /* Submit an entry which will force a snapshot to be taken. */
    CLUSTER_ELAPSE(10);
    CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
    CLUSTER_TRACE(
        "[ 10] 1 > submit 1 new client entry\n"
        " replicate 1 new command entry (2^1)\n"
        "[ 20] 1 > persisted 1 entry (2^1)\n"
        " commit 1 new entry (2^1)\n"
        "[ 20] 1 > new snapshot (2^1), 0 trailing entries\n");
    /* Reconnect server 2, which eventually receives the snapshot. */
    CLUSTER_RECONNECT(1, 2);
    CLUSTER_TRACE(
        "[ 50] 1 > timeout as leader\n"
        " missing previous entry at index 1 -> needs snapshot\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        "[ 60] 2 > recv append entries from server 1\n"
        " missing previous entry (2^1) -> reject\n"
        "[ 70] 1 > recv append entries result from server 2\n"
        " log mismatch -> send old entries\n"
        " missing previous entry at index 1 -> needs snapshot\n"
        " sending snapshot (2^1) to server 2\n"
        "[ 80] 2 > recv install snapshot from server 1\n"
        " start persisting snapshot (2^1)\n"
        "[ 90] 2 > persisted snapshot (2^1)\n"
        " send success result to 1\n"
        "[ 100] 1 > recv append entries result from server 2\n"
        " pipeline server 2 sending a heartbeat (no entries)\n");
    return MUNIT_OK;
}
/* Install snapshot times out (the follower is too slow persisting it) and the
 * leader retries, sending the snapshot again. */
TEST(snapshot, InstallTimeout, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_configuration configuration;
    struct raft_entry entry;
    unsigned id;
    int rv;
    /* Set very low threshold and trailing entries number */
    CLUSTER_SET_SNAPSHOT_THRESHOLD(1 /* ID */, 2 /* n. entries */);
    CLUSTER_SET_SNAPSHOT_TRAILING(1 /* ID */, 0 /* n. entries */);
    /* Don't let server 2 time out (just for terser traces). */
    raft_set_election_timeout(CLUSTER_RAFT(2), 200);
    /* Bootstrap and start a cluster with 1 voter and 1 stand-by. */
    for (id = 1; id <= 2; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_FILL_CONFIGURATION(&configuration, 2, 1, 1 /* stand-by */);
        entry.type = RAFT_CHANGE;
        entry.term = 1;
        rv = raft_configuration_encode(&configuration, &entry.buf);
        munit_assert_int(rv, ==, 0);
        raft_configuration_close(&configuration);
        test_cluster_add_entry(&f->cluster_, id, &entry);
        raft_free(entry.buf.base);
        CLUSTER_START(id);
    }
    /* Server 2 won't receive any entry from server 1. */
    CLUSTER_DISCONNECT(1, 2);
    /* Server 1 becomes leader. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        " self elect and convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n");
    /* Submit an entry which will force a snapshot to be taken. */
    CLUSTER_ELAPSE(10);
    CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
    CLUSTER_TRACE(
        "[ 10] 1 > submit 1 new client entry\n"
        " replicate 1 new command entry (2^1)\n"
        "[ 20] 1 > persisted 1 entry (2^1)\n"
        " commit 1 new entry (2^1)\n"
        "[ 20] 1 > new snapshot (2^1), 0 trailing entries\n");
    /* Reconnect server 2, which eventually receives the snapshot. Set a very
     * high disk latency on it, so it won't reply to server 1 fast enough,
     * making server 1 retry. */
    CLUSTER_RECONNECT(1, 2);
    CLUSTER_SET_DISK_LATENCY(2, 80);
    CLUSTER_TRACE(
        "[ 50] 1 > timeout as leader\n"
        " missing previous entry at index 1 -> needs snapshot\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        "[ 60] 2 > recv append entries from server 1\n"
        " missing previous entry (2^1) -> reject\n"
        "[ 70] 1 > recv append entries result from server 2\n"
        " log mismatch -> send old entries\n"
        " missing previous entry at index 1 -> needs snapshot\n"
        " sending snapshot (2^1) to server 2\n"
        "[ 80] 2 > recv install snapshot from server 1\n"
        " start persisting snapshot (2^1)\n"
        "[ 100] 1 > timeout as leader\n"
        " missing previous entry at index 2 -> needs snapshot\n"
        " snapshot server 2 sending a heartbeat (no entries)\n"
        "[ 110] 2 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 120] 1 > recv append entries result from server 2\n"
        "[ 150] 1 > timeout as leader\n"
        " timeout install snapshot at index 2\n"
        " missing previous entry at index 0 -> needs snapshot\n"
        " sending snapshot (2^1) to server 2\n");
    return MUNIT_OK;
}
/* Snapshots are not sent to offline nodes. */
TEST(snapshot, SkipOffline, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_configuration configuration;
    struct raft_entry entry;
    int rv;
    /* Set very low threshold and trailing entries number */
    CLUSTER_SET_SNAPSHOT_THRESHOLD(1 /* ID */, 2 /* n. entries */);
    CLUSTER_SET_SNAPSHOT_TRAILING(1 /* ID */, 0 /* n. entries */);
    /* Bootstrap and start a cluster with 1 voter and 1 stand-by. Just start
     * server 1. */
    CLUSTER_SET_TERM(1, 1 /* term */);
    CLUSTER_FILL_CONFIGURATION(&configuration, 2, 1, 1 /* stand-by */);
    entry.type = RAFT_CHANGE;
    entry.term = 1;
    rv = raft_configuration_encode(&configuration, &entry.buf);
    munit_assert_int(rv, ==, 0);
    raft_configuration_close(&configuration);
    test_cluster_add_entry(&f->cluster_, 1, &entry);
    raft_free(entry.buf.base);
    CLUSTER_START(1);
    /* Server 1 becomes leader. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        " self elect and convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n");
    /* Submit an entry which will force a snapshot to be taken. */
    CLUSTER_ELAPSE(10);
    CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
    CLUSTER_TRACE(
        "[ 10] 1 > submit 1 new client entry\n"
        " replicate 1 new command entry (2^1)\n"
        "[ 20] 1 > persisted 1 entry (2^1)\n"
        " commit 1 new entry (2^1)\n"
        "[ 20] 1 > new snapshot (2^1), 0 trailing entries\n");
    /* Server 2 never comes online, so server 1 doesn't send it any
     * snapshot. */
    CLUSTER_TRACE(
        "[ 50] 1 > timeout as leader\n"
        " missing previous entry at index 1 -> needs snapshot\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        "[ 100] 1 > timeout as leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        "[ 150] 1 > timeout as leader\n"
        " probe server 2 sending a heartbeat (no entries)\n");
    return MUNIT_OK;
}
/* A follower crashes while persisting a snapshot. After it resumes it sends a
 * reject response for the snapshot index, and the leader sends the snapshot
 * again. */
TEST(snapshot, AbortIfRejected, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Server 1 has a snapshot with last index 2. */
    CLUSTER_SET_TERM(1 /* ID */, 2 /* term */);
    CLUSTER_SET_SNAPSHOT(1, /* ID */
                         2, /* last index */
                         2, /* last term */
                         2, /* N servers */
                         2, /* N voting */
                         1 /* conf index */);
    CLUSTER_START(1 /* ID */);
    /* Server 2 has just the initial configuration entry at index 1. */
    CLUSTER_SET_TERM(2, 1 /* term */);
    CLUSTER_ADD_ENTRY(2, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
    CLUSTER_START(2);
    /* Server 1 becomes leader and eventually sends its snapshot to server 2. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 2, 1 snapshot (2^2)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 3\n"
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (3 vs 1) -> bump term\n"
        " remote log is more recent (2^2 vs 1^1) -> grant vote\n"
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 2 -> convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        "[ 130] 2 > recv append entries from server 1\n"
        " missing previous entry (2^2) -> reject\n"
        "[ 140] 1 > recv append entries result from server 2\n"
        " log mismatch -> send old entries\n"
        " missing previous entry at index 1 -> needs snapshot\n"
        " sending snapshot (2^2) to server 2\n");
    /* Server 2 receives the snapshot, but crashes while persisting it. Then it
     * restarts. */
    CLUSTER_TRACE(
        "[ 150] 2 > recv install snapshot from server 1\n"
        " start persisting snapshot (2^2)\n");
    CLUSTER_STOP(2);
    CLUSTER_START(2);
    CLUSTER_TRACE("[ 150] 2 > term 3, voted for 1, 1 entry (1^1)\n");
    /* Server 1 eventually sends server 2 a heartbeat, which server 2 rejects.
     * At that point server 1 sends again the snapshot. */
    CLUSTER_TRACE(
        "[ 170] 1 > timeout as leader\n"
        " missing previous entry at index 2 -> needs snapshot\n"
        " snapshot server 2 sending a heartbeat (no entries)\n"
        "[ 180] 2 > recv append entries from server 1\n"
        " missing previous entry (2^2) -> reject\n"
        "[ 190] 1 > recv append entries result from server 2\n"
        " log mismatch -> send old entries\n"
        " missing previous entry at index 0 -> needs snapshot\n"
        " sending snapshot (2^2) to server 2\n");
    return MUNIT_OK;
}
/* A follower receives an AppendEntries message while installing a snapshot. */
TEST(snapshot, ReceiveAppendEntriesWhileInstalling, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Set a very low threshold and trailing entries number on server 1. */
    CLUSTER_SET_SNAPSHOT_THRESHOLD(1 /* ID */, 2 /* n. entries */);
    CLUSTER_SET_SNAPSHOT_TRAILING(1 /* ID */, 1 /* n. entries */);
    raft_set_install_snapshot_timeout(CLUSTER_RAFT(1), 100);
    /* Prevent server 3 from receiving messages from server 1. */
    CLUSTER_DISCONNECT(1, 3);
    /* Set a high disk latency on server 3, so it will take a while to
     * complete installing the snapshot. */
    CLUSTER_SET_DISK_LATENCY(3, 250);
    /* Increase the election timeout on server 3, so it won't convert to
     * candidate. */
    raft_set_election_timeout(CLUSTER_RAFT(3), 250);
    /* Bootstrap and start a cluster with 3 voters. */
    for (id = 1; id <= 3; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 3 /* servers */, 3 /* voters */);
        CLUSTER_START(id);
    }
    /* Server 1 becomes leader. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 0] 3 > term 1, 1 entry (1^1)\n"
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n"
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 3 -> convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        " probe server 3 sending a heartbeat (no entries)\n");
    /* Apply a few entries, to force a snapshot to be taken. */
    CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
    CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
    CLUSTER_TRACE(
        "[ 120] 1 > submit 1 new client entry\n"
        " replicate 1 new command entry (2^2)\n"
        "[ 120] 1 > submit 1 new client entry\n"
        " replicate 1 new command entry (3^2)\n"
        "[ 130] 1 > persisted 1 entry (2^2)\n"
        " next uncommitted entry (2^2) has 1 vote out of 3\n"
        "[ 130] 1 > persisted 1 entry (3^2)\n"
        " next uncommitted entry (2^2) has 1 vote out of 3\n"
        "[ 130] 2 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 140] 1 > recv append entries result from server 2\n"
        " pipeline server 2 sending 2 entries (2^2..3^2)\n"
        "[ 150] 2 > recv append entries from server 1\n"
        " start persisting 2 new entries (2^2..3^2)\n"
        "[ 160] 2 > persisted 2 entry (2^2..3^2)\n"
        " send success result to 1\n"
        "[ 170] 1 > recv append entries result from server 2\n"
        " commit 2 new entries (2^2..3^2)\n"
        "[ 170] 1 > new snapshot (3^2), 1 trailing entry\n");
    /* Reconnect server 3 and wait for it to receive the snapshot. */
    CLUSTER_RECONNECT(1, 3);
    CLUSTER_TRACE(
        "[ 170] 1 > timeout as leader\n"
        " missing previous entry at index 1 -> needs snapshot\n"
        " probe server 3 sending a heartbeat (no entries)\n"
        "[ 180] 3 > recv append entries from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " missing previous entry (3^2) -> reject\n"
        "[ 190] 1 > recv append entries result from server 3\n"
        " log mismatch -> send old entries\n"
        " missing previous entry at index 1 -> needs snapshot\n"
        " sending snapshot (3^2) to server 3\n"
        "[ 190] 1 > timeout as leader\n"
        " pipeline server 2 sending a heartbeat (no entries)\n"
        "[ 200] 3 > recv install snapshot from server 1\n"
        " start persisting snapshot (3^2)\n");
    /* Apply a new entry, server 1 won't send it to server 3 since it is
     * waiting for it to complete installing the snapshot. */
    CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
    CLUSTER_TRACE(
        "[ 200] 1 > submit 1 new client entry\n"
        " replicate 1 new command entry (4^2)\n"
        " pipeline server 2 sending 1 entry (4^2)\n"
        "[ 200] 2 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 210] 1 > persisted 1 entry (4^2)\n"
        " next uncommitted entry (4^2) has 1 vote out of 3\n"
        "[ 210] 2 > recv append entries from server 1\n"
        " start persisting 1 new entry (4^2)\n"
        "[ 210] 1 > recv append entries result from server 2\n"
        "[ 220] 2 > persisted 1 entry (4^2)\n"
        " send success result to 1\n"
        "[ 230] 1 > recv append entries result from server 2\n"
        " commit 1 new entry (4^2)\n");
    /* Transfer leadership from server 1 to server 2. Server 3, still busy
     * installing the snapshot, ignores the AppendEntries from the new
     * leader. */
    test_cluster_transfer(&f->cluster_, 1, 2);
    CLUSTER_TRACE(
        "[ 230] 1 > transfer leadership to 2\n"
        " send timeout to 2\n"
        "[ 240] 2 > recv timeout now from server 1\n"
        " convert to candidate, start election for term 3\n"
        "[ 240] 1 > timeout as leader\n"
        " missing previous entry at index 3 -> needs snapshot\n"
        " snapshot server 3 sending a heartbeat (no entries)\n"
        "[ 250] 1 > recv request vote from server 2\n"
        " remote term is higher (3 vs 2) -> bump term, step down\n"
        " remote log is equal (4^2) -> grant vote\n"
        "[ 250] 3 > recv request vote from server 2\n"
        " remote term is higher (3 vs 2) -> bump term\n"
        "[ 250] 3 > recv append entries from server 1\n"
        " local term is higher (3 vs 2) -> reject\n"
        "[ 260] 2 > recv request vote result from server 1\n"
        " quorum reached with 2 votes out of 3 -> convert to leader\n"
        " replicate 1 new barrier entry (5^3)\n"
        " probe server 1 sending 1 entry (5^3)\n"
        " probe server 3 sending 1 entry (5^3)\n"
        "[ 260] 2 > recv request vote result from server 3\n"
        " local server is leader -> ignore\n"
        "[ 260] 1 > recv append entries result from server 3\n"
        " local server is not leader -> ignore\n"
        "[ 270] 2 > persisted 1 entry (5^3)\n"
        " next uncommitted entry (4^2) has 1 vote out of 3\n"
        "[ 270] 1 > recv append entries from server 2\n"
        " start persisting 1 new entry (5^3)\n"
        "[ 270] 3 > recv append entries from server 2\n"
        " snapshot install in progress -> ignore\n");
    return MUNIT_OK;
}
/* An InstallSnapshot RPC arrives while persisting Entries */
TEST(snapshot, InstallDuringEntriesWrite, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
struct raft_configuration configuration;
struct raft_entry entry;
unsigned id;
int rv;
/* Set very low threshold and trailing entries number */
CLUSTER_SET_SNAPSHOT_THRESHOLD(1 /* ID */, 3 /* n. entries */);
CLUSTER_SET_SNAPSHOT_TRAILING(1 /* ID */, 0 /* n. entries */);
/* Don't let server 2 time out (just for terser traces). */
raft_set_election_timeout(CLUSTER_RAFT(2), 200);
/* Bootstrap and start a cluster with 1 voter and 1 stand-by. Server 1 has
* an additional entry that server 2 does not have. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_FILL_CONFIGURATION(&configuration, 2, 1, 1 /* stand-by */);
entry.type = RAFT_CHANGE;
entry.term = 1;
rv = raft_configuration_encode(&configuration, &entry.buf);
munit_assert_int(rv, ==, 0);
raft_configuration_close(&configuration);
test_cluster_add_entry(&f->cluster_, id, &entry);
raft_free(entry.buf.base);
if (id == 1) {
CLUSTER_ADD_ENTRY(id, RAFT_COMMAND, 1 /* term */, 0 /* payload */);
}
CLUSTER_START(id);
}
/* Server 1 starts and eventually replicates the entry that server 2 is
* missing. */
CLUSTER_TRACE(
"[ 0] 1 > term 1, 2 entries (1^1..2^1)\n"
" self elect and convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 10] 2 > recv append entries from server 1\n"
" missing previous entry (2^1) -> reject\n"
"[ 20] 1 > recv append entries result from server 2\n"
" log mismatch -> send old entries\n"
" probe server 2 sending 1 entry (2^1)\n");
/* Set a large disk latency on server 2, so later the InstallSnapshot
* message will arrive while the entry is still being persisted. */
CLUSTER_SET_DISK_LATENCY(2, 60);
CLUSTER_TRACE(
"[ 30] 2 > recv append entries from server 1\n"
" start persisting 1 new entry (2^1)\n");
/* Apply an entry, to force a snapshot to be taken. */
CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
CLUSTER_TRACE(
"[ 30] 1 > submit 1 new client entry\n"
" replicate 1 new command entry (3^1)\n"
"[ 40] 1 > persisted 1 entry (3^1)\n"
" commit 1 new entry (3^1)\n"
"[ 40] 1 > new snapshot (3^1), 0 trailing entries\n");
/* Eventually server 1 replicates the snapshot to server 2. The inital write
* never gets fired because it's stale. */
CLUSTER_TRACE(
"[ 70] 1 > timeout as leader\n"
" missing previous entry at index 1 -> needs snapshot\n"
" sending snapshot (3^1) to server 2\n"
"[ 80] 2 > recv install snapshot from server 1\n"
" start persisting snapshot (3^1)\n"
"[ 120] 1 > timeout as leader\n"
" server 2 is unreachable -> abort snapshot\n"
" missing previous entry at index 0 -> needs snapshot\n"
" probe server 2 sending a heartbeat (no entries)\n");
return MUNIT_OK;
}
/* A new term starts while a node is installing a snapshot. */
TEST(snapshot, NewTermWhileInstalling, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
unsigned id;
/* Set very low threshold and trailing entries number */
CLUSTER_SET_SNAPSHOT_THRESHOLD(1 /* ID */, 2 /* n. entries */);
CLUSTER_SET_SNAPSHOT_TRAILING(1 /* ID */, 0 /* n. entries */);
/* Bootstrap and start a cluster with 3 voters. */
for (id = 1; id <= 3; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 3 /* servers */, 3 /* voters */);
CLUSTER_START(id);
}
/* Server 1 becomes leader. */
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 0] 3 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 110] 3 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 3 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n"
" probe server 3 sending a heartbeat (no entries)\n"
"[ 120] 1 > recv request vote result from server 3\n"
" local server is leader -> ignore\n");
/* Disconnect server 3, so it won't get any new entry. */
CLUSTER_DISCONNECT(1, 3);
/* Submit a new entry, to trigger a snapshot on server 1. */
CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
CLUSTER_TRACE(
"[ 120] 1 > submit 1 new client entry\n"
" replicate 1 new command entry (2^2)\n"
"[ 130] 2 > recv append entries from server 1\n"
" no new entries to persist\n"
"[ 130] 1 > persisted 1 entry (2^2)\n"
" next uncommitted entry (2^2) has 1 vote out of 3\n"
"[ 140] 1 > recv append entries result from server 2\n"
" pipeline server 2 sending 1 entry (2^2)\n"
"[ 150] 2 > recv append entries from server 1\n"
" start persisting 1 new entry (2^2)\n"
"[ 160] 2 > persisted 1 entry (2^2)\n"
" send success result to 1\n"
"[ 170] 1 > recv append entries result from server 2\n"
" commit 1 new entry (2^2)\n"
"[ 170] 1 > new snapshot (2^2), 0 trailing entries\n");
/* Reconnect server 3, so it receive the snapshot. */
CLUSTER_RECONNECT(1, 3);
/* Set a very high disk latency so server 3 will take a lot of time to
* install the snapshot and server 1 will have stepped down in the
* meantime. */
CLUSTER_SET_DISK_LATENCY(3, 250);
CLUSTER_TRACE(
"[ 170] 1 > timeout as leader\n"
" missing previous entry at index 1 -> needs snapshot\n"
" probe server 3 sending a heartbeat (no entries)\n"
"[ 180] 3 > recv append entries from server 1\n"
" missing previous entry (2^2) -> reject\n"
"[ 190] 1 > recv append entries result from server 3\n"
" log mismatch -> send old entries\n"
" missing previous entry at index 1 -> needs snapshot\n"
" sending snapshot (2^2) to server 3\n"
"[ 190] 1 > timeout as leader\n"
" pipeline server 2 sending a heartbeat (no entries)\n"
"[ 200] 3 > recv install snapshot from server 1\n"
" start persisting snapshot (2^2)\n");
/* Disconnect server 1 from server 2 and 3, so it will step down. */
CLUSTER_SET_ELECTION_TIMEOUT(1 /* ID */, 20 /* timeout */, 0 /* delta */);
CLUSTER_SET_ELECTION_TIMEOUT(2 /* ID */, 80 /* timeout */, 0 /* delta */);
CLUSTER_DISCONNECT(2, 1);
CLUSTER_DISCONNECT(1, 2);
CLUSTER_DISCONNECT(3, 1);
CLUSTER_DISCONNECT(1, 3);
CLUSTER_TRACE(
"[ 210] 1 > timeout as leader\n"
" server 2 is unreachable -> abort pipeline\n"
"[ 230] 1 > timeout as leader\n"
" server 3 is unreachable -> abort snapshot\n"
" unable to contact majority of cluster -> step down\n");
/* Let server 2 win the elections */
CLUSTER_RECONNECT(2, 1);
CLUSTER_RECONNECT(1, 2);
CLUSTER_RECONNECT(3, 1);
CLUSTER_RECONNECT(1, 3);
CLUSTER_TRACE(
"[ 230] 2 > timeout as follower\n"
" convert to candidate, start election for term 3\n"
"[ 240] 1 > recv request vote from server 2\n"
" remote term is higher (3 vs 2) -> bump term\n"
" remote log is equal (2^2) -> grant vote\n"
"[ 240] 3 > recv request vote from server 2\n"
" local server has a leader (server 1) -> reject\n"
"[ 250] 2 > recv request vote result from server 1\n"
" quorum reached with 2 votes out of 3 -> convert to leader\n"
" replicate 1 new barrier entry (3^3)\n"
" probe server 1 sending 1 entry (3^3)\n"
" probe server 3 sending 1 entry (3^3)\n"
"[ 250] 2 > recv request vote result from server 3\n"
" local server is leader -> ignore\n"
"[ 260] 2 > persisted 1 entry (3^3)\n"
" next uncommitted entry (2^2) has 1 vote out of 3\n"
"[ 260] 1 > recv append entries from server 2\n"
" start persisting 1 new entry (3^3)\n"
"[ 260] 3 > recv append entries from server 2\n"
" remote term is higher (3 vs 2) -> bump term\n"
" snapshot install in progress -> ignore\n");
return MUNIT_OK;
}
raft-0.22.1/test/integration/test_start.c 0000664 0000000 0000000 00000020622 14601504142 0020371 0 ustar 00root root 0000000 0000000 #include "../lib/cluster.h"
#include "../lib/runner.h"
struct fixture
{
FIXTURE_CLUSTER;
};
static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
{
struct fixture *f = munit_malloc(sizeof *f);
SETUP_CLUSTER();
return f;
}
static void tearDown(void *data)
{
struct fixture *f = data;
TEAR_DOWN_CLUSTER();
free(f);
}
SUITE(start)
/* Start a server that has no persisted state whatsoever. */
TEST(start, NoState, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
CLUSTER_START(1);
CLUSTER_TRACE("[ 0] 1 > no state\n");
munit_assert_ullong(raft_timeout(CLUSTER_RAFT(1)), ==, 100);
return MUNIT_OK;
}
/* Start a server that has a persisted its term. */
TEST(start, PersistedTerm, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
CLUSTER_SET_TERM(1 /* ID */, 1 /* term */);
CLUSTER_START(1 /* ID */);
CLUSTER_TRACE("[ 0] 1 > term 1\n");
return MUNIT_OK;
}
/* Start a server that has a persisted its term and has a snapshot. */
TEST(start, PersistedTermAndSnapshot, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
CLUSTER_SET_TERM(1 /* ID */, 2 /* term */);
CLUSTER_SET_SNAPSHOT(1, /* ID */
6, /* last index */
2, /* last term */
2, /* N servers */
2, /* N voting */
1 /* conf index */);
CLUSTER_START(1 /* ID */);
CLUSTER_TRACE("[ 0] 1 > term 2, 1 snapshot (6^2)\n");
return MUNIT_OK;
}
/* Start a server that has a persisted its term and has the initial bootstrap
* log entry. */
TEST(start, PersistedTermAndEntries, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
CLUSTER_SET_TERM(1 /* ID */, 1 /* term */);
CLUSTER_ADD_ENTRY(1 /* ID */, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
CLUSTER_START(1 /* ID */);
CLUSTER_TRACE("[ 0] 1 > term 1, 1 entry (1^1)\n");
return MUNIT_OK;
}
/* There are two servers. The first has a snapshot present and no other
* entries. */
TEST(start, OneSnapshotAndNoEntries, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
CLUSTER_SET_TERM(1 /* ID */, 2 /* term */);
CLUSTER_SET_SNAPSHOT(1, /* ID */
6, /* last index */
2, /* last term */
2, /* N servers */
2, /* N voting */
1 /* conf index */);
CLUSTER_SET_TERM(2 /* ID */, 1 /* term */);
CLUSTER_ADD_ENTRY(2 /* ID */, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
/* Server 1 becomes leader. */
CLUSTER_START(1 /* ID */);
CLUSTER_START(2 /* ID */);
CLUSTER_TRACE(
"[ 0] 1 > term 2, 1 snapshot (6^2)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 3\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (3 vs 1) -> bump term\n"
" remote log is more recent (6^2 vs 1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n");
/* It eventually replicates the snapshot. */
CLUSTER_TRACE(
"[ 130] 2 > recv append entries from server 1\n"
" missing previous entry (6^2) -> reject\n"
"[ 140] 1 > recv append entries result from server 2\n"
" log mismatch -> send old entries\n"
" missing previous entry at index 1 -> needs snapshot\n"
" sending snapshot (6^2) to server 2\n"
"[ 150] 2 > recv install snapshot from server 1\n"
" start persisting snapshot (6^2)\n"
"[ 160] 2 > persisted snapshot (6^2)\n"
" send success result to 1\n");
/* When the server 1 receives the result it immediately transition server 2
* to pipeline mode. */
CLUSTER_TRACE(
"[ 170] 1 > recv append entries result from server 2\n"
" pipeline server 2 sending a heartbeat (no entries)\n");
return MUNIT_OK;
}
/* There are two servers. The first has a snapshot along with some follow-up
* entries. */
TEST(start, OneSnapshotAndSomeFollowUpEntries, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
CLUSTER_SET_TERM(1 /* ID */, 2 /* term */);
CLUSTER_SET_SNAPSHOT(1, /* ID */
6, /* last index */
2, /* last term */
2, /* N servers */
2, /* N voting */
1 /* conf index */);
CLUSTER_ADD_ENTRY(1 /* ID */, RAFT_COMMAND, 1 /* term */, 0 /* payload */);
CLUSTER_ADD_ENTRY(1 /* ID */, RAFT_COMMAND, 1 /* term */, 0 /* payload */);
CLUSTER_SET_TERM(2 /* ID */, 1 /* term */);
CLUSTER_ADD_ENTRY(2 /* ID */, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
/* Server 1 becomes leader. */
CLUSTER_START(1 /* ID */);
CLUSTER_START(2 /* ID */);
CLUSTER_TRACE(
"[ 0] 1 > term 2, 1 snapshot (6^2), 2 entries (7^1..8^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 3\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (3 vs 1) -> bump term\n"
" remote log is longer (8^1 vs 1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" replicate 1 new barrier entry (9^3)\n"
" probe server 2 sending 1 entry (9^3)\n"
"[ 130] 1 > persisted 1 entry (9^3)\n"
" next uncommitted entry (8^1) has 1 vote out of 2\n"
"[ 130] 2 > recv append entries from server 1\n"
" missing previous entry (8^1) -> reject\n"
"[ 140] 1 > recv append entries result from server 2\n"
" log mismatch -> send old entries\n"
" missing previous entry at index 1 -> needs snapshot\n"
" sending snapshot (6^2) to server 2\n");
test_cluster_step(&f->cluster_);
test_cluster_step(&f->cluster_);
test_cluster_step(&f->cluster_);
test_cluster_step(&f->cluster_);
return MUNIT_OK;
}
/* There is a single voting server in the cluster, which immediately elects
* itself when starting. */
TEST(start, SingleVotingSelfElect, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
CLUSTER_SET_TERM(1 /* ID */, 1 /* term */);
CLUSTER_ADD_ENTRY(1 /* ID */, RAFT_CHANGE, 1 /* servers */, 1 /* voters */);
CLUSTER_START(1 /* ID */);
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
" self elect and convert to leader\n");
munit_assert_int(raft_state(CLUSTER_RAFT(1)), ==, RAFT_LEADER);
/* The server can make progress alone. */
CLUSTER_SUBMIT(1 /* ID */, COMMAND, 8 /* size */);
CLUSTER_TRACE(
"[ 0] 1 > submit 1 new client entry\n"
" replicate 1 new command entry (2^1)\n"
"[ 10] 1 > persisted 1 entry (2^1)\n"
" commit 1 new entry (2^1)\n");
return MUNIT_OK;
}
/* There are two servers in the cluster, one is voting and the other is
* not. When started, the non-voting server does not elects itself. */
TEST(start, SingleVotingNotUs, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
CLUSTER_SET_TERM(2 /* ID */, 1 /* term */);
CLUSTER_ADD_ENTRY(2 /* ID */, RAFT_CHANGE, 2 /* servers */, 1 /* voters */);
CLUSTER_START(2 /* ID */);
CLUSTER_TRACE("[ 0] 2 > term 1, 1 entry (1^1)\n");
munit_assert_int(raft_state(CLUSTER_RAFT(2)), ==, RAFT_FOLLOWER);
return MUNIT_OK;
}
raft-0.22.1/test/integration/test_strerror.c 0000664 0000000 0000000 00000002637 14601504142 0021124 0 ustar 00root root 0000000 0000000 #include "../../include/raft.h"
#include "../lib/runner.h"
/******************************************************************************
*
* raft_strerror
*
*****************************************************************************/
SUITE(raft_strerror)
#define ERR_CODE_MAP(X) \
X(RAFT_NOMEM) \
X(RAFT_BADID) \
X(RAFT_DUPLICATEID) \
X(RAFT_DUPLICATEADDRESS) \
X(RAFT_BADROLE) \
X(RAFT_MALFORMED) \
X(RAFT_NOTLEADER) \
X(RAFT_LEADERSHIPLOST) \
X(RAFT_SHUTDOWN) \
X(RAFT_CANTBOOTSTRAP) \
X(RAFT_CANTCHANGE) \
X(RAFT_CORRUPT) \
X(RAFT_CANCELED) \
X(RAFT_NAMETOOLONG) \
X(RAFT_TOOBIG) \
X(RAFT_NOCONNECTION) \
X(RAFT_BUSY) \
X(RAFT_IOERR)
#define TEST_CASE_STRERROR(CODE) \
TEST(raft_strerror, CODE, NULL, NULL, 0, NULL) \
{ \
(void)data; \
(void)params; \
munit_assert_not_null(raft_strerror(CODE)); \
return MUNIT_OK; \
}
ERR_CODE_MAP(TEST_CASE_STRERROR)
TEST(raft_strerror, default, NULL, NULL, 0, NULL)
{
(void)data;
(void)params;
munit_assert_string_equal(raft_strerror(666), "unknown error");
return MUNIT_OK;
}
raft-0.22.1/test/integration/test_submit.c 0000664 0000000 0000000 00000005173 14601504142 0020543 0 ustar 00root root 0000000 0000000 #include "../lib/cluster.h"
#include "../lib/runner.h"
struct fixture
{
FIXTURE_CLUSTER;
};
static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
{
struct fixture *f = munit_malloc(sizeof *f);
SETUP_CLUSTER();
return f;
}
static void tearDown(void *data)
{
struct fixture *f = data;
TEAR_DOWN_CLUSTER();
free(f);
}
SUITE(submit)
/* If there isn't a majority of voting servers with enough capacity, an error is
* returned. */
TEST(submit, CapacityBelowThreshold, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
unsigned id;
struct raft_entry entry;
char buf[8];
int rv;
/* Set a capacity threshold close to the disk capacity. */
raft_set_capacity_threshold(CLUSTER_RAFT(1), 240);
/* Bootstrap and start a cluster with 2 voters. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
CLUSTER_START(id);
}
/* Server 1 becomes leader. */
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n");
raft_set_capacity_threshold(CLUSTER_RAFT(1), 240);
/* Submitting an entry fails because there's not enough capacity. */
entry.type = RAFT_COMMAND;
entry.term = raft_current_term(CLUSTER_RAFT(1));
entry.buf.len = 8;
entry.buf.base = buf;
munit_assert_not_null(entry.buf.base);
entry.batch = entry.buf.base;
rv = test_cluster_submit(&f->cluster_, 1 /* ID */, &entry);
munit_assert_int(rv, ==, RAFT_NOSPACE);
CLUSTER_TRACE(
"[ 120] 1 > submit 1 new client entry\n"
"[ 130] 2 > recv append entries from server 1\n"
" no new entries to persist\n"
"[ 140] 1 > recv append entries result from server 2\n");
/* Trying to submit again after the first round of heartbeat still fails,
* because the follower is still reporting the same capacity in the
* AppendEntries result. */
rv = test_cluster_submit(&f->cluster_, 1 /* ID */, &entry);
munit_assert_int(rv, ==, RAFT_NOSPACE);
return MUNIT_OK;
}
raft-0.22.1/test/integration/test_tick.c 0000664 0000000 0000000 00000013675 14601504142 0020200 0 ustar 00root root 0000000 0000000 #include "../lib/cluster.h"
#include "../lib/runner.h"
/******************************************************************************
*
* Fixture
*
*****************************************************************************/
struct fixture
{
FIXTURE_CLUSTER;
};
static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
{
struct fixture *f = munit_malloc(sizeof *f);
SETUP_CLUSTER();
return f;
}
static void tearDown(void *data)
{
struct fixture *f = data;
TEAR_DOWN_CLUSTER();
free(f);
}
SUITE(tick)
/* If the election timeout expires, the follower is a voting server, and it
* hasn't voted yet in this term, then become candidate and start a new
* election. */
TEST(tick, ConvertToCandidate, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
struct raft *raft;
unsigned id;
/* Bootstrap and start a cluster with 2 voters. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
CLUSTER_START(id);
}
/* Server 1 becomes leader */
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n");
/* Stop server 1, eventually server 2 converts to candidate. */
CLUSTER_STOP(1);
CLUSTER_TRACE(
"[ 240] 2 > timeout as follower\n"
" convert to candidate, start election for term 3\n");
/* The term has been incremeted. */
raft = CLUSTER_RAFT(2);
munit_assert_ullong(raft_current_term(raft), ==, 3);
/* We have voted for ouselves. */
munit_assert_ullong(raft_voted_for(raft), ==, 2);
/* We are candidate */
munit_assert_int(raft_state(raft), ==, RAFT_CANDIDATE);
/* The vote results array is initialized */
munit_assert_ptr_not_null(raft->candidate_state.votes);
munit_assert_false(raft->candidate_state.votes[0].grant);
munit_assert_true(raft->candidate_state.votes[1].grant);
return MUNIT_OK;
}
static char *elapse_non_voter_n_voting[] = {"1", NULL};
static MunitParameterEnum elapse_non_voter_params[] = {
{"n_voting", elapse_non_voter_n_voting},
{NULL, NULL},
};
/* If the election timeout has elapsed, but we're not part of the current
* configuration, stay follower. */
TEST(tick, NotInCurrentConfiguration, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
CLUSTER_SET_TERM(2, 1 /* term */);
CLUSTER_ADD_ENTRY(2, RAFT_CHANGE, 1 /* servers */, 1 /* voters */);
CLUSTER_START(2);
CLUSTER_TRACE(
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 130] 2 > timeout as follower\n"
" server not in current configuration -> stay follower\n");
munit_assert_int(raft_state(CLUSTER_RAFT(2)), ==, RAFT_FOLLOWER);
return MUNIT_OK;
}
/* If the election timeout has elapsed, but we're not voters, stay follower. */
TEST(tick, NotVoter, setUp, tearDown, 0, elapse_non_voter_params)
{
struct fixture *f = data;
CLUSTER_SET_TERM(2, 1 /* term */);
CLUSTER_ADD_ENTRY(2, RAFT_CHANGE, 2 /* servers */, 1 /* voters */);
CLUSTER_START(2);
CLUSTER_TRACE(
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 130] 2 > timeout as follower\n"
" spare server -> stay follower\n");
munit_assert_int(raft_state(CLUSTER_RAFT(2)), ==, RAFT_FOLLOWER);
return MUNIT_OK;
}
/* If we're leader and an election timeout elapses without hearing from a
* majority of the cluster, step down. */
TEST(tick, StepDownIfNoContact, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
unsigned id;
/* Bootstrap and start a cluster with 2 voters. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
CLUSTER_START(id);
}
/* Server 1 becomes leader */
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n");
/* Stop server 2, eventually server 1 steps down. */
CLUSTER_STOP(2);
CLUSTER_TRACE(
"[ 170] 1 > timeout as leader\n"
" probe server 2 sending a heartbeat (no entries)\n"
"[ 220] 1 > timeout as leader\n"
" unable to contact majority of cluster -> step down\n");
return MUNIT_OK;
}
/* If we're candidate and the election timeout has elapsed, start a new
* election. */
TEST(tick, NewElection, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
CLUSTER_SET_TERM(1, 1 /* term */);
CLUSTER_ADD_ENTRY(1, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
CLUSTER_START(1);
/* Server 1 becomes candidate. */
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n");
CLUSTER_TRACE(
"[ 200] 1 > timeout as candidate\n"
" stay candidate, start election for term 3\n");
return MUNIT_OK;
}
raft-0.22.1/test/integration/test_transfer.c 0000664 0000000 0000000 00000044413 14601504142 0021064 0 ustar 00root root 0000000 0000000 #include "../lib/cluster.h"
#include "../lib/runner.h"
struct fixture
{
FIXTURE_CLUSTER;
};
static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
{
struct fixture *f = munit_malloc(sizeof *f);
SETUP_CLUSTER();
return f;
}
static void tearDown(void *data)
{
struct fixture *f = data;
TEAR_DOWN_CLUSTER();
free(f);
}
/******************************************************************************
*
* raft_transfer
*
*****************************************************************************/
SUITE(raft_transfer)
/* The follower we ask to transfer leadership to is up-to-date. */
TEST(raft_transfer, UpToDate, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
unsigned id;
/* Bootstrap and start a cluster with 2 voters. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
CLUSTER_START(id);
}
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n"
"[ 130] 2 > recv append entries from server 1\n"
" no new entries to persist\n"
"[ 140] 1 > recv append entries result from server 2\n");
test_cluster_transfer(&f->cluster_, 1, 2);
munit_assert_ullong(raft_transferee(CLUSTER_RAFT(1)), ==, 2);
CLUSTER_TRACE(
"[ 140] 1 > transfer leadership to 2\n"
" send timeout to 2\n"
"[ 150] 2 > recv timeout now from server 1\n"
" convert to candidate, start election for term 3\n"
"[ 160] 1 > recv request vote from server 2\n"
" remote term is higher (3 vs 2) -> bump term, step down\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 170] 2 > recv request vote result from server 1\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" probe server 1 sending a heartbeat (no entries)\n");
munit_assert_ullong(raft_transferee(CLUSTER_RAFT(1)), ==, 0);
return MUNIT_OK;
}
/* The follower we ask to transfer leadership to needs to catch up. */
TEST(raft_transfer, CatchUp, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
unsigned id;
/* Bootstrap and start a cluster with 2 voters. Server 1 has an additioanl
* entry. */
for (id = 1; id <= 2; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
if (id == 1) {
CLUSTER_ADD_ENTRY(id, RAFT_COMMAND, 1 /* term */, 0 /* payload */);
}
CLUSTER_START(id);
}
/* Server 1 becomes leader. */
CLUSTER_TRACE(
"[ 0] 1 > term 1, 2 entries (1^1..2^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is longer (2^1 vs 1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" replicate 1 new barrier entry (3^2)\n"
" probe server 2 sending 1 entry (3^2)\n");
/* Fire a transfer event. Since server 2 is not up-to-date, server 1 will
* not send it a TimeoutNow message immediately. */
test_cluster_transfer(&f->cluster_, 1, 2);
munit_assert_ullong(raft_transferee(CLUSTER_RAFT(1)), ==, 2);
CLUSTER_TRACE(
"[ 120] 1 > transfer leadership to 2\n"
" wait for transferee to catch up\n"
"[ 130] 1 > persisted 1 entry (3^2)\n"
" next uncommitted entry (2^1) has 1 vote out of 2\n"
"[ 130] 2 > recv append entries from server 1\n"
" missing previous entry (2^1) -> reject\n"
"[ 140] 1 > recv append entries result from server 2\n"
" log mismatch -> send old entries\n"
" probe server 2 sending 2 entries (2^1..3^2)\n"
"[ 150] 2 > recv append entries from server 1\n"
" start persisting 2 new entries (2^1..3^2)\n"
"[ 160] 2 > persisted 2 entry (2^1..3^2)\n"
" send success result to 1\n");
munit_assert_ullong(raft_transferee(CLUSTER_RAFT(1)), ==, 2);
/* Server 2 is now up-to-date, so server 1 sends it a timeout. */
CLUSTER_TRACE(
"[ 170] 1 > recv append entries result from server 2\n"
" send timeout to 2\n"
" commit 2 new entries (2^1..3^2)\n"
"[ 180] 2 > recv timeout now from server 1\n"
" convert to candidate, start election for term 3\n"
"[ 190] 1 > recv request vote from server 2\n"
" remote term is higher (3 vs 2) -> bump term, step down\n"
" remote log is equal (3^2) -> grant vote\n"
"[ 200] 2 > recv request vote result from server 1\n"
" quorum reached with 2 votes out of 2 -> convert to leader\n"
" replicate 1 new barrier entry (4^3)\n"
" probe server 1 sending 1 entry (4^3)\n");
munit_assert_ullong(raft_transferee(CLUSTER_RAFT(1)), ==, 0);
return MUNIT_OK;
}
/* The follower we ask to transfer leadership to is down and the leadership
* transfer does not succeed. */
TEST(raft_transfer, Expire, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
unsigned id;
/* Bootstrap and start a cluster with 3 voters. */
for (id = 1; id <= 3; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 3 /* servers */, 3 /* voters */);
CLUSTER_START(id);
}
/* Server 1 becomes leader. */
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 0] 3 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 110] 3 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 3 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n"
" probe server 3 sending a heartbeat (no entries)\n"
"[ 120] 1 > recv request vote result from server 3\n"
" local server is leader -> ignore\n"
"[ 130] 2 > recv append entries from server 1\n"
" no new entries to persist\n"
"[ 130] 3 > recv append entries from server 1\n"
" no new entries to persist\n"
"[ 140] 1 > recv append entries result from server 2\n"
"[ 140] 1 > recv append entries result from server 3\n");
/* Stop server 2 and try to transfer leadership to it. */
CLUSTER_STOP(2 /* ID */);
test_cluster_transfer(&f->cluster_, 1, 2);
munit_assert_ullong(raft_transferee(CLUSTER_RAFT(1)), ==, 2);
/* Eventually server 1 stops trying to transfer its leadership. */
CLUSTER_TRACE(
"[ 140] 1 > transfer leadership to 2\n"
" send timeout to 2\n"
"[ 170] 1 > timeout as leader\n"
" pipeline server 2 sending a heartbeat (no entries)\n"
" pipeline server 3 sending a heartbeat (no entries)\n"
"[ 180] 3 > recv append entries from server 1\n"
" no new entries to persist\n"
"[ 190] 1 > recv append entries result from server 3\n"
"[ 220] 1 > timeout as leader\n"
" server 2 is unreachable -> abort pipeline\n"
" probe server 2 sending a heartbeat (no entries)\n"
" pipeline server 3 sending a heartbeat (no entries)\n"
"[ 230] 3 > recv append entries from server 1\n"
" no new entries to persist\n"
"[ 240] 1 > recv append entries result from server 3\n"
"[ 270] 1 > timeout as leader\n"
" probe server 2 sending a heartbeat (no entries)\n"
" pipeline server 3 sending a heartbeat (no entries)\n"
" server 2 not replicating fast enough -> abort transfer\n");
munit_assert_ullong(raft_transferee(CLUSTER_RAFT(1)), ==, 0);
return MUNIT_OK;
}
/* The given ID doesn't match any server in the current configuration. */
TEST(raft_transfer, UnknownServer, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
struct raft_event event;
struct raft_update update;
int rv;
/* Start a single-node cluster. */
CLUSTER_SET_TERM(1 /* ID */, 1 /* term */);
CLUSTER_ADD_ENTRY(1 /* ID */, RAFT_CHANGE, 1 /* servers */, 1 /* voters */);
CLUSTER_START(1 /* ID */);
CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
" self elect and convert to leader\n");
event.time = f->cluster_.time;
event.type = RAFT_TRANSFER;
event.transfer.server_id = 2;
rv = raft_step(CLUSTER_RAFT(1), &event, &update);
munit_assert_int(rv, ==, RAFT_BADID);
return MUNIT_OK;
}
/* Submitting a transfer request twice is an error. */
TEST(raft_transfer, Twice, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_event event;
    struct raft_update update;
    unsigned id;
    int rv;
    /* Bootstrap and start a cluster with 2 voters. */
    for (id = 1; id <= 2; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
        CLUSTER_START(id);
    }
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n"
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 2 -> convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n");
    /* First transfer request: submitted successfully and now in flight. */
    test_cluster_transfer(&f->cluster_, 1, 2);
    /* Second transfer request while the first is still pending: rejected
     * (the implementation reports RAFT_NOTLEADER for this case). */
    event.time = f->cluster_.time;
    event.type = RAFT_TRANSFER;
    event.transfer.server_id = 2;
    rv = raft_step(CLUSTER_RAFT(1), &event, &update);
    munit_assert_int(rv, ==, RAFT_NOTLEADER);
    return MUNIT_OK;
}
/* If the given ID is zero, the target is selected automatically. */
TEST(raft_transfer, AutoSelect, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Bootstrap and start a cluster with 3 voters. */
    for (id = 1; id <= 3; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 3 /* servers */, 3 /* voters */);
        CLUSTER_START(id);
    }
    /* Server 1 becomes leader. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 0] 3 > term 1, 1 entry (1^1)\n"
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n"
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 110] 3 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 3 -> convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        " probe server 3 sending a heartbeat (no entries)\n"
        "[ 120] 1 > recv request vote result from server 3\n"
        " local server is leader -> ignore\n"
        "[ 130] 2 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 130] 3 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 140] 1 > recv append entries result from server 2\n");
    /* Target ID 0 triggers automatic selection: server 2 (the first
     * acknowledged follower in this trace) ends up as transferee. */
    test_cluster_transfer(&f->cluster_, 1, 0);
    munit_assert_ullong(raft_transferee(CLUSTER_RAFT(1)), ==, 2);
    return MUNIT_OK;
}
/* If the given ID is zero, the target is selected automatically. Followers
 * that are up-to-date are preferred. */
TEST(raft_transfer, AutoSelectUpToDate, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Bootstrap and start a cluster with 3 voters. Server 1 has an additional
     * entry. */
    for (id = 1; id <= 3; id++) {
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 3 /* servers */, 3 /* voters */);
        if (id == 1) {
            CLUSTER_ADD_ENTRY(id, RAFT_COMMAND, 1 /* term */, 0 /* payload */);
        }
        CLUSTER_START(id);
    }
    /* Server 1 becomes leader and starts replicating the additional entry. */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 2 entries (1^1..2^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 0] 3 > term 1, 1 entry (1^1)\n"
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start election for term 2\n"
        "[ 110] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is longer (2^1 vs 1^1) -> grant vote\n"
        "[ 110] 3 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is longer (2^1 vs 1^1) -> grant vote\n"
        "[ 120] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 3 -> convert to leader\n"
        " replicate 1 new barrier entry (3^2)\n"
        " probe server 2 sending 1 entry (3^2)\n"
        " probe server 3 sending 1 entry (3^2)\n");
    /* Stop server 2, so it won't get the extra entry, while server 3 does. */
    CLUSTER_STOP(2 /* ID */);
    CLUSTER_TRACE(
        "[ 120] 1 > recv request vote result from server 3\n"
        " local server is leader -> ignore\n"
        "[ 130] 1 > persisted 1 entry (3^2)\n"
        " next uncommitted entry (2^1) has 1 vote out of 3\n"
        "[ 130] 3 > recv append entries from server 1\n"
        " missing previous entry (2^1) -> reject\n"
        "[ 140] 1 > recv append entries result from server 3\n"
        " log mismatch -> send old entries\n"
        " probe server 3 sending 2 entries (2^1..3^2)\n"
        "[ 150] 3 > recv append entries from server 1\n"
        " start persisting 2 new entries (2^1..3^2)\n"
        "[ 160] 3 > persisted 2 entry (2^1..3^2)\n"
        " send success result to 1\n"
        "[ 170] 1 > recv append entries result from server 3\n"
        " commit 2 new entries (2^1..3^2)\n");
    /* The auto-selection logic prefers server 3, since it's more up-to-date
     * (server 2 was stopped and never replicated entries 2^1..3^2). */
    test_cluster_transfer(&f->cluster_, 1, 0);
    munit_assert_ullong(raft_transferee(CLUSTER_RAFT(1)), ==, 3);
    return MUNIT_OK;
}
/* It's possible to transfer leadership also when pre-vote is active. */
TEST(raft_transfer, PreVote, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    unsigned id;
    /* Bootstrap and start a cluster with 2 voters, enabling pre-vote. */
    for (id = 1; id <= 2; id++) {
        raft_set_pre_vote(CLUSTER_RAFT(id), true);
        CLUSTER_SET_TERM(id, 1 /* term */);
        CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, 2 /* servers */, 2 /* voters */);
        CLUSTER_START(id);
    }
    /* Server 1 becomes leader (note the extra pre-election round before the
     * actual election, due to pre-vote being enabled). */
    CLUSTER_TRACE(
        "[ 0] 1 > term 1, 1 entry (1^1)\n"
        "[ 0] 2 > term 1, 1 entry (1^1)\n"
        "[ 100] 1 > timeout as follower\n"
        " convert to candidate, start pre-election for term 2\n"
        "[ 110] 2 > recv request vote from server 1\n"
        " remote log is equal (1^1) -> pre-vote ok\n"
        "[ 120] 1 > recv request vote result from server 2\n"
        " votes quorum reached -> pre-vote successful\n"
        "[ 130] 2 > recv request vote from server 1\n"
        " remote term is higher (2 vs 1) -> bump term\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 140] 1 > recv request vote result from server 2\n"
        " quorum reached with 2 votes out of 2 -> convert to leader\n"
        " probe server 2 sending a heartbeat (no entries)\n"
        "[ 150] 2 > recv append entries from server 1\n"
        " no new entries to persist\n"
        "[ 160] 1 > recv append entries result from server 2\n");
    /* Perform a successful leadership transfer. */
    test_cluster_transfer(&f->cluster_, 1, 2);
    CLUSTER_TRACE(
        "[ 160] 1 > transfer leadership to 2\n"
        " send timeout to 2\n"
        "[ 170] 2 > recv timeout now from server 1\n"
        " convert to candidate, start election for term 3\n"
        "[ 180] 1 > recv request vote from server 2\n"
        " remote term is higher (3 vs 2) -> bump term, step down\n"
        " remote log is equal (1^1) -> grant vote\n"
        "[ 190] 2 > recv request vote result from server 1\n"
        " quorum reached with 2 votes out of 2 -> convert to leader\n"
        " probe server 1 sending a heartbeat (no entries)\n");
    return MUNIT_OK;
}
raft-0.22.1/test/integration/test_uv_append.c 0000664 0000000 0000000 00000103702 14601504142 0021216 0 ustar 00root root 0000000 0000000 #include "../lib/aio.h"
#include "../lib/runner.h"
#include "../lib/uv.h"
#include "append_helpers.h"
#include
/* Maximum number of blocks a segment can have */
#define MAX_SEGMENT_BLOCKS 4
/* This block size should work fine for all file systems. */
#define SEGMENT_BLOCK_SIZE 4096
/* Default segment size (4 blocks of 4096 bytes).
 *
 * The expansion is parenthesized so the macro behaves as a single value in
 * any expression: the unparenthesized form `4096 * MAX_SEGMENT_BLOCKS` would
 * bind incorrectly next to higher-precedence operators (e.g.
 * `x % SEGMENT_SIZE` would parse as `(x % 4096) * 4`). */
#define SEGMENT_SIZE (4096 * MAX_SEGMENT_BLOCKS)
/* XX: Define the symbols below only to let the source code compile. All tests
 * making use of them will be skipped. */
/* Minimal stand-in for the real uv backend state; only the fields referenced
 * by the (skipped) barrier tests are declared. */
struct uv
{
    char dir[8];                  /* Data directory path (stub-sized) */
    struct uv_loop_s *loop;       /* UV event loop */
    raft_index append_next_index; /* Index of next entry to append */
};
/* Stub barrier request, mirroring the shape the skipped tests expect. */
struct UvBarrierReq;
/* Signature of the barrier completion callback. */
typedef void (*UvBarrierCb)(struct UvBarrierReq *req);
struct UvBarrierReq
{
    void *data;     /* User data, passed through to the callback */
    bool blocking;  /* Whether this barrier should block future writes */
    UvBarrierCb cb; /* Completion callback */
};
/* Stub implementation: always fails with -1, since the tests exercising
 * barriers are skipped (see the XX note above). */
int UvBarrier(struct uv *uv, raft_index next_index, struct UvBarrierReq *req)
{
    (void)uv;
    (void)next_index;
    (void)req;
    return -1;
}
/* Stub implementation: no-op, paired with the UvBarrier stub above. */
void UvUnblock(struct uv *uv)
{
    (void)uv;
}
/******************************************************************************
 *
 * Fixture with a libuv-based raft_io instance.
 *
 *****************************************************************************/

struct fixture
{
    FIXTURE_UV_DEPS; /* Loop, heap, transport, etc. (see lib/uv.h) */
    FIXTURE_UV;      /* The raft_io instance under test */
    int count;       /* To generate deterministic entry data */
};
/******************************************************************************
 *
 * Set up and tear down.
 *
 *****************************************************************************/

/* Allocate the fixture and initialize the uv backend with the test block and
 * segment sizes defined at the top of this file. */
static void *setUp(const MunitParameter params[], void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SETUP_UV_DEPS;
    SETUP_UV;
    raft_uv_set_block_size(&f->io, SEGMENT_BLOCK_SIZE);
    raft_uv_set_segment_size(&f->io, SEGMENT_SIZE);
    f->count = 0;
    return f;
}
/* Release the fixture's dependencies (loop, transport, ...) and the fixture
 * itself. Used directly by tests that already tore down the uv backend. */
static void tearDownDeps(void *data)
{
    struct fixture *f = data;
    if (f != NULL) {
        TEAR_DOWN_UV_DEPS;
        free(f);
    }
}
/* Full teardown: close the uv backend first, then its dependencies. */
static void tearDown(void *data)
{
    struct fixture *f = data;
    if (f != NULL) {
        TEAR_DOWN_UV;
        tearDownDeps(f);
    }
}
/******************************************************************************
 *
 * Assertions
 *
 *****************************************************************************/

/* Shutdown the fixture's raft_io instance, then load all entries on disk using
 * a new raft_io instance, and assert that there are N entries with a total data
 * size of TOTAL_DATA_SIZE bytes.
 *
 * The whole expansion (including the initial TEAR_DOWN_UV) is wrapped in a
 * single do { ... } while (0) so the macro behaves as one statement — safe
 * inside an unbraced if/else — and the trailing semicolon is supplied by the
 * caller. */
#define ASSERT_ENTRIES(N, TOTAL_DATA_SIZE)                                    \
    do {                                                                      \
        struct uv_loop_s _loop;                                               \
        struct raft_uv_transport _transport;                                  \
        struct raft_io _io;                                                   \
        raft_term _term;                                                      \
        raft_id _voted_for;                                                   \
        struct raft_snapshot *_snapshot;                                      \
        raft_index _start_index;                                              \
        struct raft_entry *_entries;                                          \
        size_t _i;                                                            \
        size_t _n;                                                            \
        void *_batch = NULL;                                                  \
        size_t _total_data_size = 0;                                          \
        int _rv;                                                              \
                                                                              \
        /* Close the fixture's backend before re-opening its data dir. */     \
        TEAR_DOWN_UV;                                                         \
                                                                              \
        _rv = uv_loop_init(&_loop);                                           \
        munit_assert_int(_rv, ==, 0);                                         \
        _transport.version = 1;                                               \
        _rv = raft_uv_tcp_init(&_transport, &_loop);                          \
        munit_assert_int(_rv, ==, 0);                                         \
        _rv = raft_uv_init(&_io, &_loop, f->dir, &_transport);                \
        munit_assert_int(_rv, ==, 0);                                         \
        _rv = _io.init(&_io, 1, "1");                                         \
        if (_rv != 0) {                                                       \
            munit_errorf("io->init(): %s (%d)", _io.errmsg, _rv);             \
        }                                                                     \
        _rv = _io.load(&_io, &_term, &_voted_for, &_snapshot, &_start_index,  \
                       &_entries, &_n);                                       \
        if (_rv != 0) {                                                       \
            munit_errorf("io->load(): %s (%d)", _io.errmsg, _rv);             \
        }                                                                     \
        _io.close(&_io, NULL);                                                \
        uv_run(&_loop, UV_RUN_NOWAIT);                                        \
        raft_uv_close(&_io);                                                  \
        raft_uv_tcp_close(&_transport);                                       \
        uv_loop_close(&_loop);                                                \
                                                                              \
        munit_assert_ptr_null(_snapshot);                                     \
        munit_assert_int(_n, ==, N);                                          \
        /* First pass: check term/type and the deterministic payload (each    \
         * entry's first 8 bytes hold its own index). */                      \
        for (_i = 0; _i < _n; _i++) {                                         \
            struct raft_entry *_entry = &_entries[_i];                        \
            uint64_t _value = *(uint64_t *)_entry->buf.base;                  \
            munit_assert_int(_entry->term, ==, 1);                            \
            munit_assert_int(_entry->type, ==, RAFT_COMMAND);                 \
            munit_assert_int(_value, ==, _i);                                 \
            munit_assert_ptr_not_null(_entry->batch);                         \
        }                                                                     \
        /* Second pass: free each distinct batch exactly once and sum up the  \
         * payload sizes. */                                                  \
        for (_i = 0; _i < _n; _i++) {                                         \
            struct raft_entry *_entry = &_entries[_i];                        \
            if (_entry->batch != _batch) {                                    \
                _batch = _entry->batch;                                       \
                raft_free(_batch);                                            \
            }                                                                 \
            _total_data_size += _entry->buf.len;                              \
        }                                                                     \
        raft_free(_entries);                                                  \
        munit_assert_int(_total_data_size, ==, TOTAL_DATA_SIZE);              \
    } while (0)
/******************************************************************************
 *
 * raft_io->append()
 *
 *****************************************************************************/

/* Test suite covering the append path of the libuv raft_io backend. */
SUITE(append)
/* Append an entries array containing unaligned buffers: sizes 9 and 63 are
 * not multiples of 8, so the submission must fail with RAFT_INVALID. */
TEST(append, unaligned, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND_SUBMIT_CB_DATA(0, 1, 9, NULL, NULL, RAFT_INVALID);
    munit_assert_string_equal(f->io.errmsg,
                              "entry buffers must be 8-byte aligned");
    APPEND_SUBMIT_CB_DATA(1, 3, 63, NULL, NULL, RAFT_INVALID);
    munit_assert_string_equal(f->io.errmsg,
                              "entry buffers must be 8-byte aligned");
    return MUNIT_OK;
}
/* Append the very first batch of entries, then reload the data dir and check
 * it contains exactly that entry. */
TEST(append, first, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    APPEND(1, 64);
    ASSERT_ENTRIES(1, 64); /* Closes the backend internally */
    return MUNIT_OK;
}
/* As soon as the backend starts writing the first open segment, a second one
 * and a third one get prepared. */
TEST(append, prepareSegments, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(1, 64);
    /* Spin the loop until the third open segment shows up on disk. */
    while (!DirHasFile(f->dir, "open-3")) {
        LOOP_RUN(1);
    }
    munit_assert_true(DirHasFile(f->dir, "open-1"));
    munit_assert_true(DirHasFile(f->dir, "open-2"));
    munit_assert_true(DirHasFile(f->dir, "open-3"));
    return MUNIT_OK;
}
/* Once the first segment fills up, it gets finalized, and an additional one
 * gets prepared, to maintain the available segments pool size. */
TEST(append, finalizeSegment, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Fill the first segment exactly, then overflow into the next one. */
    APPEND(MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE);
    APPEND(1, 64);
    while (!DirHasFile(f->dir, "open-4")) {
        LOOP_RUN(1);
    }
    /* The filled segment gets renamed to its closed-segment name. */
    while (!DirHasFile(f->dir, "0000000000000001-0000000000000004")) {
        LOOP_RUN(1);
    }
    munit_assert_false(DirHasFile(f->dir, "open-1"));
    munit_assert_true(DirHasFile(f->dir, "open-2"));
    munit_assert_true(DirHasFile(f->dir, "open-3"));
    return MUNIT_OK;
}
/* The very first batch of entries to append is bigger than the regular open
 * segment size. */
TEST(append, firstBig, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    APPEND(MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE);
    ASSERT_ENTRIES(MAX_SEGMENT_BLOCKS, MAX_SEGMENT_BLOCKS * SEGMENT_BLOCK_SIZE);
    return MUNIT_OK;
}
/* The second batch of entries to append is bigger than the regular open
 * segment size. */
TEST(append, secondBig, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(1, 64);
    APPEND(MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE);
    return MUNIT_OK;
}
/* Schedule multiple appends each one exceeding the segment size. */
TEST(append, severalBig, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    /* Three concurrent submissions of 2 oversized entries each. */
    APPEND_SUBMIT(0, 2, MAX_SEGMENT_BLOCKS * SEGMENT_BLOCK_SIZE);
    APPEND_SUBMIT(1, 2, MAX_SEGMENT_BLOCKS * SEGMENT_BLOCK_SIZE);
    APPEND_SUBMIT(2, 2, MAX_SEGMENT_BLOCKS * SEGMENT_BLOCK_SIZE);
    APPEND_WAIT(0);
    APPEND_WAIT(1);
    APPEND_WAIT(2);
    ASSERT_ENTRIES(6, 6 * MAX_SEGMENT_BLOCKS * SEGMENT_BLOCK_SIZE);
    return MUNIT_OK;
}
/* Write the very first entry and then another one, both fitting in the same
 * block. */
TEST(append, fitBlock, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    APPEND(1, 64);
    APPEND(1, 64);
    ASSERT_ENTRIES(2, 128);
    return MUNIT_OK;
}
/* Write an entry that fills the first block exactly and then another one. */
TEST(append, matchBlock, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    size_t size;
    /* Size the payload so payload + on-disk framing == one block. */
    size = SEGMENT_BLOCK_SIZE;
    size -= sizeof(uint64_t) + /* Format */
            sizeof(uint64_t) + /* Checksums */
            8 + 16;            /* Header */
    APPEND(1, size);
    APPEND(1, 64);
    ASSERT_ENTRIES(2, size + 64);
    return MUNIT_OK;
}
/* Write an entry that exceeds the first block, then another one that fits in
 * the second block, then a third one that fills the rest of the second block
 * plus the whole third block exactly, and finally a fourth entry that fits in
 * the fourth block */
TEST(append, exceedBlock, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    size_t written;
    size_t size1;
    size_t size2;
    size1 = SEGMENT_BLOCK_SIZE;
    APPEND(1, size1);
    APPEND(1, 64);
    /* Account for everything written so far (format word, per-batch CRCs,
     * headers and payloads) to locate the current position in the block. */
    written = sizeof(uint64_t) +     /* Format version */
              2 * sizeof(uint32_t) + /* CRC sums of first batch */
              8 + 16 +               /* Header of first batch */
              size1 +                /* Size of first batch */
              2 * sizeof(uint32_t) + /* CRC of second batch */
              8 + 16 +               /* Header of second batch */
              64;                    /* Size of second batch */
    /* Write a third entry that fills the second block exactly */
    size2 = SEGMENT_BLOCK_SIZE - (written % SEGMENT_BLOCK_SIZE);
    size2 -= (2 * sizeof(uint32_t) + 8 + 16);
    size2 += SEGMENT_BLOCK_SIZE;
    APPEND(1, size2);
    /* Write a fourth entry */
    APPEND(1, 64);
    ASSERT_ENTRIES(4, size1 + 64 + size2 + 64);
    return MUNIT_OK;
}
/* If an append request is submitted before the write operation of the previous
 * append request is started, then a single write will be performed for both
 * requests. */
TEST(append, batch, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Two submissions with no loop iteration in between. */
    APPEND_SUBMIT(0, 1, 64);
    APPEND_SUBMIT(1, 1, 64);
    APPEND_WAIT(0);
    APPEND_WAIT(1);
    return MUNIT_OK;
}
/* An append request submitted while a write operation is in progress gets
 * executed only when the write completes. */
TEST(append, wait, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND_SUBMIT(0, 1, 64);
    LOOP_RUN(1); /* Let the first write start before submitting the second */
    APPEND_SUBMIT(1, 1, 64);
    APPEND_WAIT(0);
    APPEND_WAIT(1);
    return MUNIT_OK;
}
/* Several batches with different size gets appended in fast pace, forcing the
 * segment arena to grow. */
TEST(append, resizeArena, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    /* 7 entries in total: 4 of 64 bytes and 3 of a full block each. */
    APPEND_SUBMIT(0, 2, 64);
    APPEND_SUBMIT(1, 1, SEGMENT_BLOCK_SIZE);
    APPEND_SUBMIT(2, 2, 64);
    APPEND_SUBMIT(3, 1, SEGMENT_BLOCK_SIZE);
    APPEND_SUBMIT(4, 1, SEGMENT_BLOCK_SIZE);
    APPEND_WAIT(0);
    APPEND_WAIT(1);
    APPEND_WAIT(2);
    APPEND_WAIT(3);
    APPEND_WAIT(4);
    ASSERT_ENTRIES(7, 64 * 4 + SEGMENT_BLOCK_SIZE * 3);
    return MUNIT_OK;
}
/* A few append requests get queued, then a truncate request comes in and other
 * append requests right after, before truncation is fully completed. */
TEST(append, truncate, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    int rv;
    /* NOTE: intentionally skipped; the code below is kept for when the
     * flakiness is resolved. */
    return MUNIT_SKIP; /* FIXME: flaky */
    APPEND(2, 64);
    APPEND_SUBMIT(0, 2, 64);
    rv = f->io.truncate(&f->io, 2);
    munit_assert_int(rv, ==, 0);
    APPEND_SUBMIT(1, 2, 64);
    APPEND_WAIT(0);
    APPEND_WAIT(1);
    return MUNIT_OK;
}
/* A few append requests get queued, then a truncate request comes in and other
 * append requests right after, before truncation is fully completed. However
 * the backend is closed before the truncation request can be processed. */
TEST(append, truncateClosing, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    int rv;
    APPEND(2, 64);
    APPEND_SUBMIT(0, 2, 64);
    rv = f->io.truncate(&f->io, 2);
    munit_assert_int(rv, ==, 0);
    APPEND_SUBMIT(1, 2, 64);
    /* Request 1 is still pending when the backend closes -> canceled. */
    APPEND_EXPECT(1, RAFT_CANCELED);
    TEAR_DOWN_UV;
    return MUNIT_OK;
}
/* A few append requests get queued, however the backend is closed before
 * preparing the second segment completes. */
TEST(append, prepareClosing, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    APPEND_SUBMIT(0, 2, 64);
    LOOP_RUN(1); /* Start the write, but don't let prepare finish */
    TEAR_DOWN_UV;
    return MUNIT_OK;
}
/* The counters of the open segments get increased as they are closed. */
TEST(append, counter, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    size_t size = SEGMENT_BLOCK_SIZE;
    int i;
    /* 10 block-sized entries fill several segments in sequence. */
    for (i = 0; i < 10; i++) {
        APPEND(1, size);
    }
    munit_assert_true(DirHasFile(f->dir, "0000000000000001-0000000000000003"));
    munit_assert_true(DirHasFile(f->dir, "0000000000000004-0000000000000006"));
    munit_assert_true(DirHasFile(f->dir, "open-4"));
    return MUNIT_OK;
}
/* If the I/O instance is closed, all pending append requests get canceled. */
TEST(append, cancel, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    APPEND_SUBMIT(0, 1, 64);
    APPEND_EXPECT(0, RAFT_CANCELED);
    TEAR_DOWN_UV;
    return MUNIT_OK;
}
/* If creation of the current open segment fails because there's no space, it
 * will be retried at regular intervals. */
TEST(append, noSpaceUponPrepareCurrent, setUp, tearDownDeps, 0, DirTmpfsParams)
{
    struct fixture *f = data;
    SKIP_IF_NO_FIXTURE;
    /* Make the segment far larger than the tmpfs so prepare keeps failing,
     * with a 10 ms retry interval. */
    raft_uv_set_segment_size(&f->io, SEGMENT_BLOCK_SIZE * 32768);
    raft_uv_set_disk_retry(&f->io, 10);
    APPEND_SUBMIT(0, 1, 64);
    LOOP_RUN(5);
    /* The request never completes; closing the backend cancels it. */
    APPEND_EXPECT(0, RAFT_CANCELED);
    TEAR_DOWN_UV;
    return MUNIT_OK;
}
/* If creation of a spare open segment fails because there's no space, it
 * will be retried at regular intervals. */
TEST(append, noSpaceUponPrepareSpare, setUp, tearDownDeps, 0, DirTmpfsParams)
{
    struct fixture *f = data;
    SKIP_IF_NO_FIXTURE;
#if defined(__powerpc64__)
    /* XXX: fails on ppc64el */
    return MUNIT_SKIP;
#endif
    raft_uv_set_segment_size(&f->io, SEGMENT_BLOCK_SIZE * 2);
    raft_uv_set_disk_retry(&f->io, 10);
    /* Leave just enough space for the first segment, not a spare one. */
    DirFill(f->dir, SEGMENT_BLOCK_SIZE * 3);
    APPEND(1, SEGMENT_BLOCK_SIZE);
    APPEND_SUBMIT(0, 1, SEGMENT_BLOCK_SIZE);
    LOOP_RUN(5);
    APPEND_EXPECT(0, RAFT_CANCELED);
    TEAR_DOWN_UV;
    return MUNIT_OK;
}
/* If a write request fails because there's not enough space, it will be
 * retried at regular intervals. */
TEST(append, noSpaceUponWrite, setUp, tearDownDeps, 0, DirTmpfsParams)
{
    struct fixture *f = data;
    SKIP_IF_NO_FIXTURE;
#if defined(__powerpc64__)
    /* XXX: fails on ppc64el */
    TEAR_DOWN_UV;
    return MUNIT_SKIP;
#endif
    raft_uv_set_segment_size(&f->io, SEGMENT_BLOCK_SIZE);
    raft_uv_set_disk_retry(&f->io, 10);
    DirFill(f->dir, SEGMENT_BLOCK_SIZE);
    /* The batch is bigger than the segment, and the disk is full. */
    APPEND_SUBMIT(0, 1, SEGMENT_BLOCK_SIZE * 2);
    LOOP_RUN(5);
    APPEND_EXPECT(0, RAFT_CANCELED);
    TEAR_DOWN_UV;
    return MUNIT_OK;
}
/* A request gets delayed because not enough disk space is available. Eventually
 * the space is released and the request succeeds. */
TEST(append, noSpaceResolved, setUp, tearDownDeps, 0, DirTmpfsParams)
{
    struct fixture *f = data;
    SKIP_IF_NO_FIXTURE;
#if defined(__powerpc64__)
    /* XXX: fails on ppc64el */
    TEAR_DOWN_UV;
    return MUNIT_SKIP;
#endif
    raft_uv_set_disk_retry(&f->io, 10);
    DirFill(f->dir, SEGMENT_BLOCK_SIZE);
    APPEND_SUBMIT(0, 1, 64);
    LOOP_RUN(5);
    /* Removing the filler file frees the space; the retry then succeeds. */
    DirRemoveFile(f->dir, ".fill");
    APPEND_WAIT(0);
    ASSERT_ENTRIES(1, 64);
    return MUNIT_OK;
}
/* An error occurs while performing a write. */
TEST(append, writeError, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    aio_context_t ctx = 0;
    /* FIXME: doesn't fail anymore after
     * https://github.com/CanonicalLtd/raft/pull/49 */
    return MUNIT_SKIP;
    APPEND_SUBMIT(0, 1, 64);
    AioFill(&ctx, 0); /* Exhaust AIO contexts so the write fails */
    APPEND_WAIT(0);
    AioDestroy(ctx);
    return MUNIT_OK;
}
/* Heap fault-injection parameters for the oom test: inject a single
 * allocation failure after the first allocation. */
static char *oomHeapFaultDelay[] = {"1", /* FIXME "2", */ NULL};
static char *oomHeapFaultRepeat[] = {"1", NULL};
static MunitParameterEnum oomParams[] = {
    {TEST_HEAP_FAULT_DELAY, oomHeapFaultDelay},
    {TEST_HEAP_FAULT_REPEAT, oomHeapFaultRepeat},
    {NULL, NULL},
};
/* Out of memory conditions: the append call must fail with RAFT_NOMEM. */
TEST(append, oom, setUp, tearDown, 0, oomParams)
{
    struct fixture *f = data;
    HEAP_FAULT_ENABLE;
    APPEND_ERROR(1, 64, RAFT_NOMEM, "");
    return MUNIT_OK;
}
/* The uv instance is closed while a write request is in progress. */
TEST(append, closeDuringWrite, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* TODO: broken */
    return MUNIT_SKIP;
    APPEND_SUBMIT(0, 1, 64);
    LOOP_RUN(1);
    TEAR_DOWN_UV;
    return MUNIT_OK;
}
/* When the backend is closed, all unused open segments get removed. */
TEST(append, removeSegmentUponClose, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    APPEND(1, 64);
    /* Wait for the spare segment to be prepared... */
    while (!DirHasFile(f->dir, "open-2")) {
        LOOP_RUN(1);
    }
    /* ...then close and check it was deleted. */
    TEAR_DOWN_UV;
    munit_assert_false(DirHasFile(f->dir, "open-2"));
    return MUNIT_OK;
}
/* When the backend is closed, all pending prepare get requests get canceled. */
TEST(append, cancelPrepareRequest, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* TODO: find a way to test a prepare request cancelation */
    return MUNIT_SKIP;
    APPEND(MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE);
    APPEND_SUBMIT(0, 1, 64);
    APPEND_EXPECT(0, RAFT_CANCELED);
    TEAR_DOWN_UV;
    return MUNIT_OK;
}
/* When the backend gets closed, the segment that was being written gets
 * finalized (renamed to its closed-segment name). */
TEST(append, currentSegment, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    APPEND(1, 64);
    TEAR_DOWN_UV;
    munit_assert_true(DirHasFile(f->dir, "0000000000000001-0000000000000001"));
    return MUNIT_OK;
}
/* The kernel has run out of available AIO events. */
TEST(append, ioSetupError, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    aio_context_t ctx = 0;
    int rv;
    /* Exhaust the AIO events limit; skip if that's not possible here. */
    rv = AioFill(&ctx, 0);
    if (rv != 0) {
        return MUNIT_SKIP;
    }
    APPEND_FAILURE(1, 64, RAFT_TOOMANY,
                   "setup writer for open-1: AIO events user limit exceeded");
    return MUNIT_OK;
}
/*===========================================================================
  Test interaction between UvAppend and UvBarrier
  ===========================================================================*/

/* Shared state passed (via the req's data pointer) between the barrier and
 * append callbacks of the tests below. */
struct barrierData
{
    int current;     /* Count the number of finished AppendEntries RPCs */
    int expected;    /* Expected number of finished AppendEntries RPCs */
    bool done;       /* @true if the Barrier CB has fired */
    bool expectDone; /* Expect the Barrier CB to have fired or not */
    char **files;    /* Expected files in the directory, NULL terminated */
    struct uv *uv;
};
/* Barrier callback: mark the barrier done, unblock the backend, and assert
 * that the expected number of appends finished and (optionally) that the
 * expected segment files exist on disk. Must fire at most once. */
static void barrierCbCompareCounter(struct UvBarrierReq *barrier)
{
    struct barrierData *bd = barrier->data;
    munit_assert_false(bd->done);
    bd->done = true;
    struct uv *uv = bd->uv;
    UvUnblock(uv);
    munit_assert_int(bd->current, ==, bd->expected);
    if (bd->files != NULL) {
        int i = 0;
        while (bd->files[i] != NULL) {
            munit_assert_true(DirHasFile(uv->dir, bd->files[i]));
            ++i;
        }
    }
}
/* Minimal barrier callback: only record that it fired (exactly once). */
static void barrierDoneCb(struct UvBarrierReq *barrier)
{
    struct barrierData *bd = barrier->data;
    munit_assert_false(bd->done);
    bd->done = true;
}
/* Append callback: check the request's status, record completion and bump the
 * shared counter, asserting the barrier's done-state matches expectDone. */
static void appendCbIncreaseCounterAssertResult(struct raft_io_append *req,
                                                int status)
{
    struct result *result = req->data;
    munit_assert_int(status, ==, result->status);
    result->done = true;
    struct barrierData *bd = result->data;
    munit_assert_true(bd->done == bd->expectDone);
    bd->current += 1;
}
/* Append callback that ignores the result (used by the exit-early tests). */
static void appendDummyCb(struct raft_io_append *req, int status)
{
    (void)req;
    (void)status;
}
/* Parameterize the barrier tests over blocking ("1") and non-blocking ("0")
 * barriers. */
static char *bools[] = {"0", "1", NULL};
static MunitParameterEnum blocking_bool_params[] = {
    {"bool", bools},
    {NULL, NULL},
};
/* Fill up 3 segments worth of AppendEntries RPC's.
 * Request a Barrier and expect that the AppendEntries RPC's are finished before
 * the Barrier callback is fired.
 */
TEST(append, barrierOpenSegments, setUp, tearDown, 0, blocking_bool_params)
{
    /* NOTE: skipped; the body below relies on the UvBarrier stub. */
    return MUNIT_SKIP; /* TODO: modify this test to not use UvBarrier() */
    struct fixture *f = data;
    struct barrierData bd = {0};
    bd.current = 0;
    bd.expected = 3;
    bd.done = false;
    bd.expectDone = false;
    bd.uv = f->io.impl;
    char *files[] = {"0000000000000001-0000000000000004",
                     "0000000000000005-0000000000000008",
                     "0000000000000009-0000000000000012", NULL};
    bd.files = files;
    APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
                          appendCbIncreaseCounterAssertResult, &bd, 0);
    APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
                          appendCbIncreaseCounterAssertResult, &bd, 0);
    APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
                          appendCbIncreaseCounterAssertResult, &bd, 0);
    struct UvBarrierReq barrier = {0};
    barrier.data = (void *)&bd;
    barrier.blocking =
        (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0);
    barrier.cb = barrierCbCompareCounter;
    UvBarrier(f->io.impl, 1, &barrier);
    /* Make sure every callback fired */
    LOOP_RUN_UNTIL(&bd.done);
    APPEND_WAIT(0);
    APPEND_WAIT(1);
    APPEND_WAIT(2);
    return MUNIT_OK;
}
/* Fill up 3 segments worth of AppendEntries RPC's.
 * Request a Barrier and stop early.
 */
TEST(append, barrierOpenSegmentsExitEarly, setUp, NULL, 0, blocking_bool_params)
{
    /* NOTE: skipped; the body below relies on the UvBarrier stub. Teardown
     * is invoked manually (the TEST's tearDown slot is NULL). */
    return MUNIT_SKIP; /* TODO: modify this test to not use UvBarrier() */
    struct fixture *f = data;
    struct barrierData bd = {0};
    bd.current = 0;
    bd.expected = 3;
    bd.done = false;
    bd.expectDone = false;
    bd.uv = f->io.impl;
    char *files[] = {"0000000000000001-0000000000000004",
                     "0000000000000005-0000000000000008",
                     "0000000000000009-0000000000000012", NULL};
    bd.files = files;
    APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
                          appendDummyCb, NULL, 0);
    APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
                          appendDummyCb, NULL, 0);
    APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
                          appendDummyCb, NULL, 0);
    struct UvBarrierReq barrier = {0};
    barrier.data = (void *)&bd;
    barrier.blocking =
        (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0);
    barrier.cb = barrierDoneCb;
    UvBarrier(f->io.impl, 1, &barrier);
    /* Exit early. */
    tearDown(data);
    munit_assert_true(bd.done);
    return MUNIT_OK;
}
/* Fill up 3 segments worth of AppendEntries RPC's.
 * Request 2 barriers and expect their callbacks to fire.
 */
TEST(append, twoBarriersOpenSegments, setUp, tearDown, 0, blocking_bool_params)
{
    /* NOTE: skipped; the body below relies on the UvBarrier stub. */
    return MUNIT_SKIP; /* TODO: modify this test to not use UvBarrier() */
    struct fixture *f = data;
    struct barrierData bd1 = {0};
    bd1.current = 0;
    bd1.expected = 3;
    bd1.done = false;
    bd1.expectDone = false;
    bd1.uv = f->io.impl;
    char *files[] = {"0000000000000001-0000000000000004",
                     "0000000000000005-0000000000000008",
                     "0000000000000009-0000000000000012", NULL};
    bd1.files = files;
    /* Only expect the callback to eventually fire. */
    struct barrierData bd2 = {0};
    bd2.uv = f->io.impl;
    APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
                          appendCbIncreaseCounterAssertResult, &bd1, 0);
    APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
                          appendCbIncreaseCounterAssertResult, &bd1, 0);
    APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
                          appendCbIncreaseCounterAssertResult, &bd1, 0);
    struct UvBarrierReq barrier1 = {0};
    barrier1.data = (void *)&bd1;
    barrier1.blocking =
        (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0);
    barrier1.cb = barrierCbCompareCounter;
    UvBarrier(f->io.impl, 1, &barrier1);
    struct UvBarrierReq barrier2 = {0};
    barrier2.data = (void *)&bd2;
    barrier2.blocking =
        (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0);
    barrier2.cb = barrierCbCompareCounter;
    UvBarrier(f->io.impl, 1, &barrier2);
    /* Make sure every callback fired */
    LOOP_RUN_UNTIL(&bd1.done);
    LOOP_RUN_UNTIL(&bd2.done);
    APPEND_WAIT(0);
    APPEND_WAIT(1);
    APPEND_WAIT(2);
    return MUNIT_OK;
}
/* Fill up 3 segments worth of AppendEntries RPC's.
 * Request 2 barriers and exit early.
 */
TEST(append, twoBarriersExitEarly, setUp, NULL, 0, blocking_bool_params)
{
    /* NOTE: skipped; the body below relies on the UvBarrier stub. Teardown
     * is invoked manually (the TEST's tearDown slot is NULL). */
    return MUNIT_SKIP; /* TODO: modify this test to not use UvBarrier() */
    struct fixture *f = data;
    struct barrierData bd1 = {0};
    bd1.current = 0;
    bd1.expected = 3;
    bd1.done = false;
    bd1.expectDone = false;
    bd1.uv = f->io.impl;
    char *files[] = {"0000000000000001-0000000000000004",
                     "0000000000000005-0000000000000008",
                     "0000000000000009-0000000000000012", NULL};
    bd1.files = files;
    /* Only expect the callback to eventually fire. */
    struct barrierData bd2 = {0};
    bd2.uv = f->io.impl;
    APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
                          appendDummyCb, NULL, 0);
    APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
                          appendDummyCb, NULL, 0);
    APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
                          appendDummyCb, NULL, 0);
    struct UvBarrierReq barrier1 = {0};
    barrier1.data = (void *)&bd1;
    barrier1.blocking =
        (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0);
    barrier1.cb = barrierDoneCb;
    UvBarrier(f->io.impl, 1, &barrier1);
    struct UvBarrierReq barrier2 = {0};
    barrier2.data = (void *)&bd2;
    barrier2.blocking =
        (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0);
    barrier2.cb = barrierDoneCb;
    UvBarrier(f->io.impl, 1, &barrier2);
    /* Exit early. */
    tearDown(data);
    munit_assert_true(bd1.done);
    munit_assert_true(bd2.done);
    return MUNIT_OK;
}
/* Request a blocking Barrier and expect that no AppendEntries RPCs have
 * finished before the Barrier callback is fired.
 */
TEST(append, blockingBarrierNoOpenSegments, setUp, tearDown, 0, NULL)
{
    /* NOTE: skipped; the body below relies on the UvBarrier stub. */
    return MUNIT_SKIP; /* TODO: modify this test to not use UvBarrier() */
    struct fixture *f = data;
    struct barrierData bd = {0};
    bd.current = 0;
    bd.expected = 0;
    bd.done = false;
    bd.expectDone = true;
    bd.uv = f->io.impl;
    struct UvBarrierReq barrier = {0};
    barrier.data = (void *)&bd;
    barrier.blocking = true;
    barrier.cb = barrierCbCompareCounter;
    UvBarrier(f->io.impl, 1, &barrier);
    APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
                          appendCbIncreaseCounterAssertResult, &bd, 0);
    APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
                          appendCbIncreaseCounterAssertResult, &bd, 0);
    APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
                          appendCbIncreaseCounterAssertResult, &bd, 0);
    /* Make sure every callback fired */
    LOOP_RUN_UNTIL(&bd.done);
    APPEND_WAIT(0);
    APPEND_WAIT(1);
    APPEND_WAIT(2);
    return MUNIT_OK;
}
/* Request a blocking Barrier and expect that no AppendEntries RPCs have
 * finished before the Barrier callback is fired. */
TEST(append, blockingBarrierSingleOpenSegment, setUp, tearDown, 0, NULL)
{
    /* NOTE: skipped; the body below relies on the UvBarrier stub. */
    return MUNIT_SKIP; /* TODO: modify this test to not use UvBarrier() */
    struct fixture *f = data;
    struct barrierData bd = {0};
    bd.current = 0;
    bd.expected = 0;
    bd.done = false;
    bd.expectDone = true;
    bd.uv = f->io.impl;
    char *files[] = {"0000000000000001-0000000000000001", NULL};
    bd.files = files;
    /* Wait until there is at least 1 open segment otherwise
     * the barrier Cb is fired immediately. */
    APPEND(1, 64);
    while (!DirHasFile(f->dir, "open-1")) {
        LOOP_RUN(1);
    }
    struct UvBarrierReq barrier = {0};
    barrier.data = (void *)&bd;
    barrier.blocking = true;
    barrier.cb = barrierCbCompareCounter;
    UvBarrier(f->io.impl, 1, &barrier);
    APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
                          appendCbIncreaseCounterAssertResult, &bd, 0);
    APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
                          appendCbIncreaseCounterAssertResult, &bd, 0);
    APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
                          appendCbIncreaseCounterAssertResult, &bd, 0);
    /* Make sure every callback fired */
    LOOP_RUN_UNTIL(&bd.done);
    APPEND_WAIT(0);
    APPEND_WAIT(1);
    APPEND_WAIT(2);
    return MUNIT_OK;
}
/* Thread-pool work callback that simulates a long-running task by sleeping
 * for one second. */
static void longWorkCb(uv_work_t *req)
{
    (void)req; /* unused */
    sleep(1);
}
/* After-work callback, invoked on the loop thread once longWorkCb() has
 * finished. Marks the barrier data as done, unblocks the uv instance and
 * checks that exactly the expected number of append callbacks ran while the
 * work was in flight. */
static void longAfterWorkCb(uv_work_t *work, int status)
{
    struct barrierData *bd = work->data;
    munit_assert_false(bd->done); /* Must fire at most once. */
    bd->done = true;
    munit_assert_int(status, ==, 0);
    struct uv *uv = bd->uv;
    UvUnblock(uv);
    /* All append callbacks expected during the blocking window have run. */
    munit_assert_int(bd->current, ==, bd->expected);
    free(work);
}
/* Barrier callback that, instead of completing inline, schedules a
 * long-running task (longWorkCb) on the loop's thread pool; completion is
 * handled by longAfterWorkCb. */
static void barrierCbLongWork(struct UvBarrierReq *barrier)
{
    struct barrierData *bd = barrier->data;
    munit_assert_false(bd->done);
    uv_work_t *req = munit_malloc(sizeof *req);
    munit_assert_ptr_not_null(req);
    req->data = bd;
    struct uv *uv = bd->uv;
    int rc = uv_queue_work(uv->loop, req, longWorkCb, longAfterWorkCb);
    munit_assert_int(rc, ==, 0);
}
/* Request a non-blocking Barrier that triggers a long-running task; the
 * barrier is removed when the long running task completes. This simulates a
 * large snapshot write. Ensure Append requests complete before the long
 * running task completes. */
TEST(append, nonBlockingBarrierLongBlockingTask, setUp, tearDown, 0, NULL)
{
    return MUNIT_SKIP; /* TODO: modify this test to not use UvBarrier() */
    /* NOTE(review): everything below is intentionally unreachable until the
     * test is reworked; it is kept as a reference for the intended flow. */
    struct fixture *f = data;
    struct barrierData bd = {0};
    bd.current = 0;
    bd.expected = 1; /* The append must complete while the work runs. */
    bd.done = false;
    bd.expectDone = false;
    bd.uv = f->io.impl;
    struct UvBarrierReq barrier = {0};
    barrier.data = (void *)&bd;
    barrier.blocking = false;
    barrier.cb = barrierCbLongWork;
    UvBarrier(f->io.impl, bd.uv->append_next_index, &barrier);
    APPEND_SUBMIT_CB_DATA(0, 1, 64, appendCbIncreaseCounterAssertResult, &bd,
                          0);
    /* Make sure every callback fired */
    LOOP_RUN_UNTIL(&bd.done);
    APPEND_WAIT(0);
    return MUNIT_OK;
}
/* Request a blocking Barrier that triggers a long-running task; the barrier
 * is unblocked and removed when the long running task completes. This
 * simulates a large snapshot install. Ensure Append requests complete after
 * the work completes. */
TEST(append, blockingBarrierLongBlockingTask, setUp, tearDown, 0, NULL)
{
    return MUNIT_SKIP; /* TODO: modify this test to not use UvBarrier() */
    /* NOTE(review): everything below is intentionally unreachable until the
     * test is reworked; it is kept as a reference for the intended flow. */
    struct fixture *f = data;
    struct barrierData bd = {0};
    bd.current = 0;
    bd.expected = 0; /* No appends may complete before the work finishes. */
    bd.done = false;
    bd.expectDone = true;
    bd.uv = f->io.impl;
    struct UvBarrierReq barrier = {0};
    barrier.data = (void *)&bd;
    barrier.blocking = true;
    barrier.cb = barrierCbLongWork;
    UvBarrier(f->io.impl, bd.uv->append_next_index, &barrier);
    APPEND_SUBMIT_CB_DATA(0, 1, 64, appendCbIncreaseCounterAssertResult, &bd,
                          0);
    /* Make sure every callback fired */
    LOOP_RUN_UNTIL(&bd.done);
    APPEND_WAIT(0);
    return MUNIT_OK;
}
raft-0.22.1/test/integration/test_uv_bootstrap.c 0000664 0000000 0000000 00000005750 14601504142 0021770 0 ustar 00root root 0000000 0000000 #include "../lib/runner.h"
#include "../lib/uv.h"
/******************************************************************************
*
* Fixture with a libuv-based raft_io instance and an empty configuration.
*
*****************************************************************************/
/* Test fixture: libuv raft_io dependencies plus the configuration passed to
 * raft_io->bootstrap(). */
struct fixture
{
    FIXTURE_UV_DEPS;
    FIXTURE_UV;
    struct raft_configuration conf; /* Bootstrap input configuration. */
};
/******************************************************************************
*
* Helper macros
*
*****************************************************************************/
/* Add a voting server with the given ID and ADDRESS to the fixture's
 * configuration, asserting success. */
#define CONFIGURATION_ADD(ID, ADDRESS)                                   \
    {                                                                    \
        int rv_;                                                         \
        rv_ = raft_configuration_add(&f->conf, ID, ADDRESS, RAFT_VOTER); \
        munit_assert_int(rv_, ==, 0);                                    \
    }
/* Invoke f->io->bootstrap() with the fixture's configuration and assert that
 * no error occurs. */
#define BOOTSTRAP                                \
    {                                            \
        int rv_;                                 \
        rv_ = f->io.bootstrap(&f->io, &f->conf); \
        munit_assert_int(rv_, ==, 0);            \
    }
/******************************************************************************
*
* Set up and tear down.
*
*****************************************************************************/
/* Allocate the fixture, set up the uv dependencies and the raft_io instance,
 * and initialize an empty configuration for the tests to populate. */
static void *setUp(const MunitParameter params[], void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SETUP_UV_DEPS;
    SETUP_UV;
    raft_configuration_init(&f->conf);
    return f;
}
/* Release the configuration, the raft_io instance, its dependencies and the
 * fixture itself. Tolerates a NULL fixture (e.g. when setup was skipped),
 * consistent with the other uv test suites' tearDown functions. */
static void tearDown(void *data)
{
    struct fixture *f = data;
    if (f == NULL) {
        return;
    }
    raft_configuration_close(&f->conf);
    TEAR_DOWN_UV;
    TEAR_DOWN_UV_DEPS;
    free(f);
}
/******************************************************************************
*
* raft_io->bootstrap()
*
*****************************************************************************/
SUITE(bootstrap)
/* Invoke f->io->bootstrap() and assert that it returns the given error code
 * and that the instance's errmsg matches the given message. */
#define BOOTSTRAP_ERROR(RV, ERRMSG)                   \
    {                                                 \
        int rv_;                                      \
        rv_ = f->io.bootstrap(&f->io, &f->conf);      \
        munit_assert_int(rv_, ==, RV);                \
        munit_assert_string_equal(f->io.errmsg, ERRMSG); \
    }
/* Bootstrap a pristine server: a single voter in an empty data directory. */
TEST(bootstrap, pristine, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    CONFIGURATION_ADD(1, "1");
    BOOTSTRAP;
    return MUNIT_OK;
}
/* The data directory already has metadata files with a non-zero term, so a
 * second bootstrap attempt must fail with RAFT_CANTBOOTSTRAP. */
TEST(bootstrap, termIsNonZero, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    CONFIGURATION_ADD(1, "1");
    BOOTSTRAP;
    BOOTSTRAP_ERROR(RAFT_CANTBOOTSTRAP, "metadata contains term 1");
    return MUNIT_OK;
}
raft-0.22.1/test/integration/test_uv_init.c 0000664 0000000 0000000 00000023436 14601504142 0020717 0 ustar 00root root 0000000 0000000 #include "../../include/raft/uv.h"
#include "../../src/byte.h"
#include "../lib/runner.h"
#include "../lib/uv.h"
#include
#include
/******************************************************************************
*
* Fixture with a non-initialized raft_io instance and uv dependencies.
*
*****************************************************************************/
/* Test fixture: uv dependencies plus a raft_io instance that each test
 * initializes itself via the INIT/INIT_ERROR macros. */
struct fixture
{
    FIXTURE_UV_DEPS;
    FIXTURE_UV;
    bool closed; /* Set to true by closeCb() once raft_io->close() is done. */
};
/******************************************************************************
*
* Helper macros
*
*****************************************************************************/
/* raft_io->close() completion callback: flag the fixture as closed so tests
 * can spin the loop until shutdown has finished. */
static void closeCb(struct raft_io *io)
{
    ((struct fixture *)io->data)->closed = true;
}
/* Invoke raft_uv_init() followed by raft_io->init() and assert that no error
 * occurs. */
#define INIT(DIR)                                                 \
    do {                                                          \
        int _rv;                                                  \
        _rv = raft_uv_init(&f->io, &f->loop, DIR, &f->transport); \
        munit_assert_int(_rv, ==, 0);                             \
        _rv = f->io.init(&f->io, 1, "1");                         \
        munit_assert_int(_rv, ==, 0);                             \
    } while (0)
/* Invoke raft_io->close(), spin the loop until the close callback has fired,
 * then release the uv-specific resources. */
#define CLOSE                         \
    do {                              \
        f->io.close(&f->io, closeCb); \
        LOOP_RUN_UNTIL(&f->closed);   \
        raft_uv_close(&f->io);        \
    } while (0)
/* Invoke raft_uv_init() (expected to succeed) and then raft_io->init(),
 * asserting that the latter returns the given error code and sets the given
 * error message. The instance is closed before the macro returns. */
#define INIT_ERROR(DIR, RV, ERRMSG)                               \
    do {                                                          \
        int _rv;                                                  \
        _rv = raft_uv_init(&f->io, &f->loop, DIR, &f->transport); \
        munit_assert_int(_rv, ==, 0);                             \
        _rv = f->io.init(&f->io, 1, "1");                         \
        munit_assert_int(_rv, ==, RV);                            \
        munit_assert_string_equal(f->io.errmsg, ERRMSG);          \
        CLOSE;                                                    \
    } while (0)
/* Write either the metadata1 or metadata2 file (N must be a single digit:
 * the filename buffer is sized for "metadataN"), filling it with the given
 * values. The on-disk layout is four 64-bit words: format, version, term and
 * voted-for. */
#define WRITE_METADATA_FILE(N, FORMAT, VERSION, TERM, VOTED_FOR) \
    {                                                            \
        uint8_t buf[8 * 4];                                      \
        uint8_t *cursor = buf;                                   \
        char filename[strlen("metadataN") + 1];                  \
        sprintf(filename, "metadata%d", N);                      \
        bytePut64(&cursor, FORMAT);                              \
        bytePut64(&cursor, VERSION);                             \
        bytePut64(&cursor, TERM);                                \
        bytePut64(&cursor, VOTED_FOR);                           \
        DirWriteFile(f->dir, filename, buf, sizeof buf);         \
    }
#define LONG_DIR \
"/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" \
"/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" \
"/ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc" \
"/ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd" \
"/eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee" \
"/fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff" \
"/ggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggg" \
"/hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh" \
"/iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii" \
"/jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj" \
"/kkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkk" \
"/lllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllll" \
"/mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm"
/* Allocate the fixture and set up only the uv dependencies: the raft_io
 * instance itself is initialized by each test through INIT/INIT_ERROR. */
static void *setUp(const MunitParameter params[], void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SETUP_UV_DEPS;
    f->io.data = f; /* Lets closeCb() find the fixture. */
    f->closed = false;
    return f;
}
/* Release the uv dependencies and the fixture, tolerating a NULL fixture. */
static void tearDown(void *data)
{
    struct fixture *f = data;
    if (f != NULL) {
        TEAR_DOWN_UV_DEPS;
        free(f);
    }
}
/******************************************************************************
*
* raft_io->init()
*
*****************************************************************************/
SUITE(init)
/* The given directory path exceeds the maximum length accepted by
 * raft_uv_init(). */
TEST(init, dirTooLong, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_io io = {0};
    int rv;
    rv = raft_uv_init(&io, &f->loop, LONG_DIR, &f->transport);
    munit_assert_int(rv, ==, RAFT_NAMETOOLONG);
    munit_assert_string_equal(io.errmsg, "directory path too long");
    return 0;
}
/* Out of memory conditions upon probing for direct I/O. */
TEST(init, probeDirectIoOom, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* XXX: tmpfs seems to not support O_DIRECT */
    struct statfs info;
    int rv;
    rv = statfs(f->dir, &info);
    munit_assert_int(rv, ==, 0);
    if (info.f_type == TMPFS_MAGIC) {
        return MUNIT_SKIP;
    }
#if defined(__powerpc64__)
    /* XXX: fails on ppc64el */
    return MUNIT_SKIP;
#endif
    /* Inject a single heap fault; with delay 1 it presumably hits the
     * allocation made by the direct I/O probe — TODO confirm against
     * HeapFaultConfig's semantics. */
    HeapFaultConfig(&f->heap, 1 /* delay */, 1 /* repeat */);
    HEAP_FAULT_ENABLE;
    INIT_ERROR(f->dir, RAFT_NOMEM, "probe Direct I/O: out of memory");
    return 0;
}
/* Out of memory conditions upon probing for async I/O. */
TEST(init, probeAsyncIoOom, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* XXX: tmpfs seems to not support O_DIRECT */
    struct statfs info;
    int rv;
    rv = statfs(f->dir, &info);
    munit_assert_int(rv, ==, 0);
    if (info.f_type == TMPFS_MAGIC) {
        return MUNIT_SKIP;
    }
#if defined(__powerpc64__)
    /* XXX: fails on ppc64el */
    return MUNIT_SKIP;
#endif
    /* Delay 2: skip the direct I/O probe allocation, fail the next one. */
    HeapFaultConfig(&f->heap, 2 /* delay */, 1 /* repeat */);
    HEAP_FAULT_ENABLE;
    INIT_ERROR(f->dir, RAFT_NOMEM, "probe Async I/O: out of memory");
    return 0;
}
/* The given directory does not exist. */
TEST(init, dirDoesNotExist, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    INIT_ERROR("/foo/bar/egg/baz", RAFT_NOTFOUND,
               "directory '/foo/bar/egg/baz' does not exist");
    return MUNIT_OK;
}
/* The given directory is not accessible (execute permission removed). */
TEST(init, dirNotAccessible, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    char errmsg[RAFT_ERRMSG_BUF_SIZE];
    sprintf(errmsg, "directory '%s' is not writable", f->dir);
    DirMakeUnexecutable(f->dir);
    INIT_ERROR(f->dir, RAFT_INVALID, errmsg);
    return MUNIT_OK;
}
/* No space is left for probing I/O capabilities. */
TEST(init, noSpace, setUp, tearDown, 0, DirTmpfsParams)
{
    struct fixture *f = data;
    SKIP_IF_NO_FIXTURE;
    DirFill(f->dir, 4); /* Leave only 4 bytes free. */
    INIT_ERROR(f->dir, RAFT_NOSPACE,
               "create I/O capabilities probe file: not enough space to "
               "allocate 4096 bytes");
    return MUNIT_OK;
}
/* The metadata1 file has not the expected number of bytes. In this case the
 * file is not considered at all, and the effect is as if this was a brand new
 * server. */
TEST(init, metadataOneTooShort, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    uint8_t buf[16] = {0}; /* Valid metadata files are 32 bytes. */
    DirWriteFile(f->dir, "metadata1", buf, sizeof buf);
    INIT(f->dir);
    CLOSE;
    return MUNIT_OK;
}
/* The metadata1 file has not the expected format. */
TEST(init, metadataOneBadFormat, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    WRITE_METADATA_FILE(1, /* Metadata file index                  */
                        2, /* Format                               */
                        1, /* Version                              */
                        1, /* Term                                 */
                        0 /* Voted for                            */);
    INIT_ERROR(f->dir, RAFT_MALFORMED,
               "decode content of metadata1: bad format version 2");
    return MUNIT_OK;
}
/* The metadata1 file has not a valid version. */
TEST(init, metadataOneBadVersion, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    WRITE_METADATA_FILE(1, /* Metadata file index                  */
                        1, /* Format                               */
                        0, /* Version                              */
                        1, /* Term                                 */
                        0 /* Voted for                            */);
    INIT_ERROR(f->dir, RAFT_CORRUPT,
               "decode content of metadata1: version is set to zero");
    return MUNIT_OK;
}
/* The data directory has both metadata files, but they have the same
 * version. */
TEST(init, metadataOneAndTwoSameVersion, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    WRITE_METADATA_FILE(1, /* Metadata file index                  */
                        1, /* Format                               */
                        2, /* Version                              */
                        3, /* Term                                 */
                        0 /* Voted for                            */);
    WRITE_METADATA_FILE(2, /* Metadata file index                  */
                        1, /* Format                               */
                        2, /* Version                              */
                        2, /* Term                                 */
                        0 /* Voted for                            */);
    INIT_ERROR(f->dir, RAFT_CORRUPT,
               "metadata1 and metadata2 are both at version 2");
    return MUNIT_OK;
}
raft-0.22.1/test/integration/test_uv_load.c 0000664 0000000 0000000 00000212301 14601504142 0020662 0 ustar 00root root 0000000 0000000 #include
#include "../../src/byte.h"
#include "../lib/runner.h"
#include "../lib/uv.h"
/******************************************************************************
*
* Fixture with a non-initialized libuv-based raft_io instance.
*
*****************************************************************************/
/* Test fixture: uv dependencies plus a raft_io instance that the LOAD*
 * macros initialize on demand via SETUP_UV. */
struct fixture
{
    FIXTURE_UV_DEPS;
    FIXTURE_UV;
};
/******************************************************************************
*
* Helper macros
*
*****************************************************************************/
static void closeCb(struct raft_io *io)
{
bool *done = io->data;
*done = true;
}
static void appendCb(struct raft_io_append *req, int status)
{
bool *done = req->data;
munit_assert_int(status, ==, 0);
*done = true;
}
static void snapshotPutCb(struct raft_io_snapshot_put *req, int status)
{
bool *done = req->data;
munit_assert_int(status, ==, 0);
*done = true;
}
/* Expected attributes of a loaded snapshot: its term, last included index
 * and the single 64-bit word these tests use as snapshot content. */
struct snapshot
{
    raft_term term;
    raft_index index;
    uint64_t data;
};
#define WORD_SIZE 8
/* Maximum number of blocks a segment can have */
#define MAX_SEGMENT_BLOCKS 4
/* This block size should work fine for all file systems. */
#define SEGMENT_BLOCK_SIZE 4096
/* Desired segment size. Parenthesized so the macro expands safely inside
 * larger expressions (e.g. division, where the unparenthesized form
 * `SEGMENT_BLOCK_SIZE *MAX_SEGMENT_BLOCKS` would bind incorrectly). */
#define SEGMENT_SIZE (SEGMENT_BLOCK_SIZE * MAX_SEGMENT_BLOCKS)
/* Filename of the closed segment covering entries START through END. Only
 * correct for single-digit indexes, which is all these tests use. */
#define CLOSED_SEGMENT_FILENAME(START, END) \
    "000000000000000" #START                \
    "-"                                     \
    "000000000000000" #END
/* Check if open segment file exists. */
#define HAS_OPEN_SEGMENT_FILE(COUNT) DirHasFile(f->dir, "open-" #COUNT)
/* Check if closed segment file exists. */
#define HAS_CLOSED_SEGMENT_FILE(START, END) \
    DirHasFile(f->dir, CLOSED_SEGMENT_FILENAME(START, END))
/* Initialize a standalone raft_io instance and use it to append N batches of
 * entries, each containing one entry. DATA should be an integer used as the
 * base value for the data of the first entry; it is then incremented for each
 * subsequent entry. */
#define APPEND(N, DATA) \
do { \
struct raft_uv_transport _transport; \
struct raft_io _io; \
raft_term _term; \
raft_id _voted_for; \
struct raft_snapshot *_snapshot; \
raft_index _start_index; \
struct raft_entry *_entries; \
size_t _i; \
size_t _n; \
void *_batch = NULL; \
struct raft_entry _new_entry; \
uint64_t _new_entry_data; \
uint64_t _data = DATA; \
struct raft_io_append _req; \
bool _done = false; \
int _rv; \
\
/* Initialize the instance, loading existing data, but discarding \
* it. This makes sure that the start index is correctly set. */ \
_transport.version = 1; \
_rv = raft_uv_tcp_init(&_transport, &f->loop); \
munit_assert_int(_rv, ==, 0); \
_rv = raft_uv_init(&_io, &f->loop, f->dir, &_transport); \
munit_assert_int(_rv, ==, 0); \
_rv = _io.init(&_io, 1, "1"); \
munit_assert_int(_rv, ==, 0); \
raft_uv_set_block_size(&_io, SEGMENT_BLOCK_SIZE); \
raft_uv_set_segment_size(&_io, SEGMENT_SIZE); \
_rv = _io.load(&_io, &_term, &_voted_for, &_snapshot, &_start_index, \
&_entries, &_n); \
munit_assert_int(_rv, ==, 0); \
for (_i = 0; _i < _n; _i++) { \
struct raft_entry *_entry = &_entries[_i]; \
if (_entry->batch != _batch) { \
_batch = _entry->batch; \
raft_free(_batch); \
} \
} \
if (_entries != NULL) { \
raft_free(_entries); \
} \
if (_snapshot != NULL) { \
raft_configuration_close(&_snapshot->configuration); \
munit_assert_int(_snapshot->n_bufs, ==, 1); \
raft_free(_snapshot->bufs[0].base); \
raft_free(_snapshot->bufs); \
raft_free(_snapshot); \
} \
\
/* Append the new entries. */ \
for (_i = 0; _i < N; _i++) { \
struct raft_entry *entry = &_new_entry; \
entry->term = 1; \
entry->type = RAFT_COMMAND; \
entry->buf.base = &_new_entry_data; \
entry->buf.len = sizeof _new_entry_data; \
entry->batch = NULL; \
munit_assert_ptr_not_null(entry->buf.base); \
memset(entry->buf.base, 0, entry->buf.len); \
*(uint64_t *)entry->buf.base = _data; \
_data++; \
_req.data = &_done; \
_rv = _io.append(&_io, &_req, entry, 1, appendCb); \
munit_assert_int(_rv, ==, 0); \
LOOP_RUN_UNTIL(&_done); \
_done = false; \
} \
\
/* Shutdown the standalone raft_io instance. */ \
_done = false; \
_io.data = &_done; \
_io.close(&_io, closeCb); \
LOOP_RUN_UNTIL(&_done); \
raft_uv_close(&_io); \
raft_uv_tcp_close(&_transport); \
} while (0);
/* Initialize a standalone raft_io instance and use it to persist a new
 * snapshot at the given INDEX and TERM. DATA should be an integer that will
 * be used as the snapshot content. */
#define SNAPSHOT_PUT(TERM, INDEX, DATA) \
do { \
struct raft_uv_transport _transport; \
struct raft_io _io; \
raft_term _term; \
raft_id _voted_for; \
struct raft_snapshot *_snapshot; \
raft_index _start_index; \
struct raft_entry *_entries; \
size_t _i; \
size_t _n; \
void *_batch = NULL; \
struct raft_snapshot _new_snapshot; \
struct raft_buffer _new_snapshot_buf; \
uint64_t _new_snapshot_data = DATA; \
struct raft_io_snapshot_put _req; \
bool _done = false; \
int _rv; \
\
/* Initialize the instance, loading existing data, but discarding \
* it. This makes sure that the start index is correctly set. */ \
_transport.version = 1; \
_rv = raft_uv_tcp_init(&_transport, &f->loop); \
munit_assert_int(_rv, ==, 0); \
_rv = raft_uv_init(&_io, &f->loop, f->dir, &_transport); \
munit_assert_int(_rv, ==, 0); \
_rv = _io.init(&_io, 1, "1"); \
munit_assert_int(_rv, ==, 0); \
raft_uv_set_block_size(&_io, SEGMENT_BLOCK_SIZE); \
raft_uv_set_segment_size(&_io, SEGMENT_SIZE); \
_rv = _io.load(&_io, &_term, &_voted_for, &_snapshot, &_start_index, \
&_entries, &_n); \
munit_assert_int(_rv, ==, 0); \
for (_i = 0; _i < _n; _i++) { \
struct raft_entry *_entry = &_entries[_i]; \
if (_entry->batch != _batch) { \
_batch = _entry->batch; \
raft_free(_batch); \
} \
} \
if (_entries != NULL) { \
raft_free(_entries); \
} \
if (_snapshot != NULL) { \
raft_configuration_close(&_snapshot->configuration); \
munit_assert_int(_snapshot->n_bufs, ==, 1); \
raft_free(_snapshot->bufs[0].base); \
raft_free(_snapshot->bufs); \
raft_free(_snapshot); \
} \
\
/* Persist the new snapshot. */ \
_new_snapshot.index = INDEX; \
_new_snapshot.term = TERM; \
raft_configuration_init(&_new_snapshot.configuration); \
_rv = raft_configuration_add(&_new_snapshot.configuration, 1, "1", \
RAFT_VOTER); \
munit_assert_int(_rv, ==, 0); \
_new_snapshot.bufs = &_new_snapshot_buf; \
_new_snapshot.n_bufs = 1; \
_new_snapshot_buf.base = &_new_snapshot_data; \
_new_snapshot_buf.len = sizeof _new_snapshot_data; \
_req.data = &_done; \
_rv = \
_io.snapshot_put(&_io, 10, &_req, &_new_snapshot, snapshotPutCb); \
munit_assert_int(_rv, ==, 0); \
LOOP_RUN_UNTIL(&_done); \
raft_configuration_close(&_new_snapshot.configuration); \
\
/* Shutdown the standalone raft_io instance. */ \
_done = false; \
_io.data = &_done; \
_io.close(&_io, closeCb); \
LOOP_RUN_UNTIL(&_done); \
raft_uv_close(&_io); \
raft_uv_tcp_close(&_transport); \
} while (0);
/* Forcibly turn a closed segment into an open one by renaming the underlying
 * file and growing it to the full segment size, so that load() treats it as
 * an in-progress open segment. */
#define UNFINALIZE(FIRST_INDEX, LAST_INDEX, COUNTER) \
do { \
const char *_filename1 = \
CLOSED_SEGMENT_FILENAME(FIRST_INDEX, LAST_INDEX); \
char _filename2[64]; \
sprintf(_filename2, "open-%u", (unsigned)COUNTER); \
munit_assert_true(DirHasFile(f->dir, _filename1)); \
munit_assert_false(DirHasFile(f->dir, _filename2)); \
DirRenameFile(f->dir, _filename1, _filename2); \
DirGrowFile(f->dir, _filename2, SEGMENT_SIZE); \
} while (0)
/* Declare the local variables shared by all LOAD* macros: the return code and
 * the out-parameters of raft_io->load(). */
#define LOAD_VARS                   \
    int _rv;                        \
    raft_term _term;                \
    raft_id _voted_for;             \
    struct raft_snapshot *_snapshot; \
    raft_index _start_index;        \
    struct raft_entry *_entries;    \
    size_t _n;
/* Initialize the raft_io instance, then call raft_io->load() and assert that
 * it returns the given error code and sets the given error message. */
#define LOAD_ERROR(RV, ERRMSG)                                   \
    do {                                                         \
        LOAD_VARS;                                               \
        SETUP_UV;                                                \
        _rv = f->io.load(&f->io, &_term, &_voted_for, &_snapshot, \
                         &_start_index, &_entries, &_n);         \
        munit_assert_int(_rv, ==, RV);                           \
        munit_assert_string_equal(f->io.errmsg, ERRMSG);         \
    } while (0)
/* Same as LOAD_ERROR but without invoking SETUP_UV: for tests that have
 * already initialized the raft_io instance themselves. */
#define LOAD_ERROR_NO_SETUP(RV, ERRMSG)                          \
    do {                                                         \
        LOAD_VARS;                                               \
        _rv = f->io.load(&f->io, &_term, &_voted_for, &_snapshot, \
                         &_start_index, &_entries, &_n);         \
        munit_assert_int(_rv, ==, RV);                           \
        munit_assert_string_equal(f->io.errmsg, ERRMSG);         \
    } while (0)
/* Initialize the raft_io instance, then call raft_io->load() and assert that
 * it returns the given error code and message. Auto-recovery is left at its
 * default (off), which makes this an exact alias of LOAD_ERROR; defining it
 * as such keeps the two macros from drifting apart. */
#define LOAD_ERROR_NO_RECOVER(RV, ERRMSG) LOAD_ERROR(RV, ERRMSG)
/* Core load-and-check logic shared by the LOAD* macros. Invokes
 * raft_io->load() and asserts success, then checks the returned term,
 * voted-for ID and start index. If a snapshot was loaded, its term, index and
 * single-buffer content must match the given struct snapshot; if entries were
 * loaded, there must be N_ENTRIES of them carrying consecutive integer
 * payloads starting at _data. All memory returned by load() is released.
 * Expects the variables from LOAD_VARS plus _batch, _data and _i in scope. */
#define _LOAD(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, N_ENTRIES)             \
    _rv = f->io.load(&f->io, &_term, &_voted_for, &_snapshot, &_start_index, \
                     &_entries, &_n);                                        \
    munit_assert_int(_rv, ==, 0);                                            \
    munit_assert_int(_term, ==, TERM);                                       \
    munit_assert_int(_voted_for, ==, VOTED_FOR);                             \
    munit_assert_int(_start_index, ==, START_INDEX);                         \
    if (_snapshot != NULL) {                                                 \
        struct snapshot *_expected = (struct snapshot *)(SNAPSHOT);          \
        munit_assert_ptr_not_null(_snapshot);                                \
        munit_assert_int(_snapshot->term, ==, _expected->term);              \
        munit_assert_int(_snapshot->index, ==, _expected->index);            \
        munit_assert_int(_snapshot->n_bufs, ==, 1);                          \
        munit_assert_int(*(uint64_t *)_snapshot->bufs[0].base, ==,           \
                         _expected->data);                                   \
        raft_configuration_close(&_snapshot->configuration);                 \
        raft_free(_snapshot->bufs[0].base);                                  \
        raft_free(_snapshot->bufs);                                          \
        raft_free(_snapshot);                                                \
    }                                                                        \
    if (_n != 0) {                                                           \
        munit_assert_int(_n, ==, N_ENTRIES);                                 \
        for (_i = 0; _i < _n; _i++) {                                        \
            struct raft_entry *_entry = &_entries[_i];                       \
            uint64_t _value = *(uint64_t *)_entry->buf.base;                 \
            munit_assert_int(_value, ==, _data);                             \
            _data++;                                                         \
        }                                                                    \
        for (_i = 0; _i < _n; _i++) {                                        \
            struct raft_entry *_entry = &_entries[_i];                       \
            if (_entry->batch != _batch) {                                   \
                _batch = _entry->batch;                                      \
                raft_free(_batch);                                           \
            }                                                                \
        }                                                                    \
        raft_free(_entries);                                                 \
    }
/* Initialize the raft_io instance, then invoke raft_io->load() and assert that
 * it returns the given state. If non-NULL, SNAPSHOT points to a struct
 * snapshot object whose attributes must match the loaded snapshot.
 * ENTRIES_DATA is the integer expected in the data of the first loaded
 * entry. */
#define LOAD(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, ENTRIES_DATA, N_ENTRIES) \
do { \
LOAD_VARS; \
void *_batch = NULL; \
uint64_t _data = ENTRIES_DATA; \
unsigned _i; \
SETUP_UV; \
_LOAD(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, N_ENTRIES) \
} while (0)
/* Same as LOAD but with auto recovery turned on. */
#define LOAD_WITH_AUTO_RECOVERY(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, \
ENTRIES_DATA, N_ENTRIES) \
do { \
LOAD_VARS; \
void *_batch = NULL; \
uint64_t _data = ENTRIES_DATA; \
unsigned _i; \
SETUP_UV; \
raft_uv_set_auto_recovery(&f->io, true); \
_LOAD(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, N_ENTRIES) \
} while (0)
/* Same as LOAD without SETUP_UV */
#define LOAD_NO_SETUP(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, ENTRIES_DATA, \
N_ENTRIES) \
do { \
LOAD_VARS; \
void *_batch = NULL; \
uint64_t _data = ENTRIES_DATA; \
unsigned _i; \
_LOAD(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, N_ENTRIES) \
} while (0)
/******************************************************************************
*
* Set up and tear down.
*
*****************************************************************************/
/* Allocate the fixture and set up only the uv dependencies: SETUP_UV itself
 * is deferred to the LOAD* macros so each test controls when the raft_io
 * instance gets initialized. */
static void *setUp(const MunitParameter params[], void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SETUP_UV_DEPS;
    return f;
}
/* Release the raft_io instance, the uv dependencies and the fixture. */
static void tearDown(void *data)
{
    struct fixture *f = data;
    TEAR_DOWN_UV;
    TEAR_DOWN_UV_DEPS;
    free(f);
}
/******************************************************************************
*
* raft_io->load()
*
*****************************************************************************/
SUITE(load)
/* Load the initial state of a pristine server: everything zero/empty and the
 * start index at 1. */
TEST(load, emptyDir, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    LOAD(0,    /* term */
         0,    /* voted for */
         NULL, /* snapshot */
         1,    /* start index */
         0,    /* data for first loaded entry */
         0     /* n entries */
    );
    return MUNIT_OK;
}
static char *unknownFiles[] = {
"garbage",
"0000000000000000000000000001-00000000001garbage",
"open-1garbage",
NULL,
};
static MunitParameterEnum unknownFilesParams[] = {
{"filename", unknownFiles},
{NULL, NULL},
};
/* Files that are not part of the raft state are ignored: load still reports a
 * pristine server. */
TEST(load, ignoreUnknownFiles, setUp, tearDown, 0, unknownFilesParams)
{
    struct fixture *f = data;
    const char *filename = munit_parameters_get(params, "filename");
    DirWriteFileWithZeros(f->dir, filename, 128);
    LOAD(0,    /* term */
         0,    /* voted for */
         NULL, /* snapshot */
         1,    /* start index */
         0,    /* data for first loaded entry */
         0     /* n entries */
    );
    return MUNIT_OK;
}
static char *unusableFiles[] = {"tmp-0000000001221212-0000000001221217",
"tmp-snapshot-15-8260687-512469866",
"snapshot-525-43326736-880259052",
"snapshot-999-13371337-880259052.meta",
"snapshot-20-8260687-512469866",
"snapshot-88-8260687-512469866.meta",
"snapshot-88-8260999-512469866.meta",
"tmp-snapshot-88-8260999-512469866.meta",
"tmp-snapshot-33-8260687-512469866",
"snapshot-33-8260687-512469866.meta",
"tmp-metadata1",
"tmp-metadata2",
"tmp-open1",
"tmp-open13",
NULL};
static MunitParameterEnum unusableFilesParams[] = {
{"filename", unusableFiles},
{NULL, NULL},
};
/* Files that can no longer be used (tmp files, orphaned snapshot halves) are
 * removed by load. */
TEST(load, removeUnusableFiles, setUp, tearDown, 0, unusableFilesParams)
{
    struct fixture *f = data;
    const char *filename = munit_parameters_get(params, "filename");
    DirWriteFileWithZeros(f->dir, filename, 128);
    munit_assert_true(DirHasFile(f->dir, filename));
    LOAD(0,    /* term */
         0,    /* voted for */
         NULL, /* snapshot */
         1,    /* start index */
         0,    /* data for first loaded entry */
         0     /* n entries */
    );
    /* load() must have deleted the unusable file. */
    munit_assert_false(DirHasFile(f->dir, filename));
    return MUNIT_OK;
}
/* The data directory has an empty open segment. */
TEST(load, emptyOpenSegment, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    DirWriteFile(f->dir, "open-1", NULL, 0);
    LOAD(0,    /* term */
         0,    /* voted for */
         NULL, /* snapshot */
         1,    /* start index */
         0,    /* data for first loaded entry */
         0     /* n entries */
    );
    /* The empty segment has been removed. */
    munit_assert_false(HAS_OPEN_SEGMENT_FILE(1));
    return MUNIT_OK;
}
/* The data directory has a freshly allocated open segment filled with zeros,
 * i.e. pre-allocated but never written. */
TEST(load, openSegmentWithTrailingZeros, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    DirWriteFileWithZeros(f->dir, "open-1", 256);
    LOAD(0,    /* term */
         0,    /* voted for */
         NULL, /* snapshot */
         1,    /* start index */
         0,    /* data for first loaded entry */
         0     /* n entries */
    );
    /* The empty segment has been removed. */
    munit_assert_false(HAS_OPEN_SEGMENT_FILE(1));
    return MUNIT_OK;
}
/* The data directory has valid closed and open segments: all four entries are
 * loaded. */
TEST(load, bothOpenAndClosedSegments, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(2, 1);
    APPEND(1, 3);
    APPEND(1, 4);
    UNFINALIZE(4, 4, 1);
    LOAD(0,    /* term */
         0,    /* voted for */
         NULL, /* snapshot */
         1,    /* start index */
         1,    /* data for first loaded entry */
         4     /* n entries */
    );
    return MUNIT_OK;
}
/* The data directory has an allocated open segment which contains non-zero
 * corrupted data in its second batch: only the first batch survives. */
TEST(load, openSegmentWithNonZeroData, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    uint64_t corrupt = 123456789;
    APPEND(2, 1);
    UNFINALIZE(1, 2, 1);
    /* Clobber bytes at offset 60, inside the second batch. */
    DirOverwriteFile(f->dir, "open-1", &corrupt, sizeof corrupt, 60);
    LOAD(0,    /* term */
         0,    /* voted for */
         NULL, /* snapshot */
         1,    /* start index */
         1,    /* data for first loaded entry */
         1     /* n entries */
    );
    /* The segment has been removed. */
    munit_assert_false(HAS_OPEN_SEGMENT_FILE(1));
    return MUNIT_OK;
}
/* The data directory has an open segment with a partially written batch that
 * needs to be truncated: only the first batch is kept. */
TEST(load, openSegmentWithIncompleteBatch, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    uint8_t zero[256];
    APPEND(2, 1);
    UNFINALIZE(1, 2, 1);
    /* Zero out the data from offset 62, truncating the second batch. */
    memset(zero, 0, sizeof zero);
    DirOverwriteFile(f->dir, "open-1", &zero, sizeof zero, 62);
    LOAD(0,    /* term */
         0,    /* voted for */
         NULL, /* snapshot */
         1,    /* start index */
         1,    /* data for first loaded entry */
         1     /* n entries */
    );
    return MUNIT_OK;
}
/* The data directory has an open segment whose first batch is only
 * partially written. In that case the segment gets removed. */
TEST(load, openSegmentWithIncompleteFirstBatch, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    uint8_t buf[4 * WORD_SIZE] = {
        1, 0, 0, 0, 0, 0, 0, 0, /* Format version */
        0, 0, 0, 0, 0, 0, 0, 0, /* CRC32 checksums */
        0, 0, 0, 0, 0, 0, 0, 0, /* Number of entries */
        0, 0, 0, 0, 0, 0, 0, 0  /* Batch data */
    };
    APPEND(1, 1);
    UNFINALIZE(1, 1, 1);
    DirOverwriteFile(f->dir, "open-1", buf, sizeof buf, 0);
    LOAD(0,    /* term */
         0,    /* voted for */
         NULL, /* snapshot */
         1,    /* start index */
         0,    /* data for first loaded entry */
         0     /* n entries */
    );
    return MUNIT_OK;
}
/* The data directory has two open segments, each holding one entry: both get
 * finalized (renamed to closed segments) on load. */
TEST(load, twoOpenSegments, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(1, 1);
    APPEND(1, 2);
    UNFINALIZE(1, 1, 1);
    UNFINALIZE(2, 2, 2);
    LOAD(0,    /* term */
         0,    /* voted for */
         NULL, /* snapshot */
         1,    /* start index */
         1,    /* data for first loaded entry */
         2     /* n entries */
    );
    /* The first and second segments have been renamed. */
    munit_assert_false(HAS_OPEN_SEGMENT_FILE(1));
    munit_assert_false(HAS_OPEN_SEGMENT_FILE(2));
    munit_assert_true(HAS_CLOSED_SEGMENT_FILE(1, 1));
    munit_assert_true(HAS_CLOSED_SEGMENT_FILE(2, 2));
    return MUNIT_OK;
}
/* The data directory has two open segments, with the second one filled with
 * zeros: the first is finalized, the second removed. */
TEST(load, secondOpenSegmentIsAllZeros, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(1, 1);
    UNFINALIZE(1, 1, 1);
    DirWriteFileWithZeros(f->dir, "open-2", SEGMENT_SIZE);
    LOAD(0,    /* term */
         0,    /* voted for */
         NULL, /* snapshot */
         1,    /* start index */
         1,    /* data for first loaded entry */
         1     /* n entries */
    );
    /* The first segment has been renamed. */
    munit_assert_false(HAS_OPEN_SEGMENT_FILE(1));
    munit_assert_true(HAS_CLOSED_SEGMENT_FILE(1, 1));
    /* The second segment has been removed. */
    munit_assert_false(HAS_OPEN_SEGMENT_FILE(2));
    return MUNIT_OK;
}
/* The data directory has two open segments, the first one has a corrupt
 * header and auto-recovery is on: the corrupt data is discarded and load
 * reports a pristine server. */
TEST(load, twoOpenSegmentsFirstCorruptAutoRecovery, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(1, 1);
    UNFINALIZE(1, 1, 1);
    DirWriteFileWithZeros(f->dir, "open-2", SEGMENT_SIZE);
    /* Corrupt open segment */
    uint64_t version = 0 /* Format version */;
    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
    /* Load is successful and equals pristine condition. */
    LOAD_WITH_AUTO_RECOVERY(0,    /* term */
                            0,    /* voted for */
                            NULL, /* snapshot */
                            1,    /* start index */
                            0,    /* data for first loaded entry */
                            0     /* n entries */
    );
    /* The open segments are renamed, and there is no closed segment. */
    munit_assert_false(HAS_OPEN_SEGMENT_FILE(1));
    munit_assert_false(HAS_OPEN_SEGMENT_FILE(2));
    munit_assert_false(HAS_CLOSED_SEGMENT_FILE(1, 1));
    return MUNIT_OK;
}
/* The data directory has two open segments, the first one has a corrupt
 * header and auto-recovery is off (default): load fails with RAFT_CORRUPT
 * and leaves the files in place. */
TEST(load, twoOpenSegmentsFirstCorrupt, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(1, 1);
    UNFINALIZE(1, 1, 1);
    DirWriteFileWithZeros(f->dir, "open-2", SEGMENT_SIZE);
    /* Corrupt open segment */
    uint64_t version = 0 /* Format version */;
    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
    LOAD_ERROR(RAFT_CORRUPT,
               "load open segment open-1: unexpected format version 0");
    /* The open segments are left untouched. */
    munit_assert_true(HAS_OPEN_SEGMENT_FILE(1));
    munit_assert_true(HAS_OPEN_SEGMENT_FILE(2));
    return MUNIT_OK;
}
/* The data directory has a valid open segment. */
TEST(load, openSegment, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* One entry, written to a finalized segment then reverted to open. */
    APPEND(1, 1);
    UNFINALIZE(1, 1, 1);
    LOAD(0,    /* term */
         0,    /* voted for */
         NULL, /* snapshot */
         1,    /* start index */
         1,    /* data for first loaded entry */
         1     /* n entries */
    );
    return MUNIT_OK;
}
/* There is exactly one snapshot and no segments. */
TEST(load, onlyOneSnapshot, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Expected snapshot, matching the SNAPSHOT_PUT call below. */
    struct snapshot snapshot = {
        1, /* term */
        1, /* index */
        1  /* data */
    };
    SNAPSHOT_PUT(1, 1, 1);
    LOAD(0,         /* term */
         0,         /* voted for */
         &snapshot, /* snapshot */
         2,         /* start index: snapshot index + 1 */
         0,         /* data for first loaded entry */
         0          /* n entries */
    );
    return MUNIT_OK;
}
/* There are several snapshots, including an incomplete one. The last one is
 * loaded and the incomplete or older ones are removed. */
TEST(load, manySnapshots, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Expected snapshot: the newest complete one put below. */
    struct snapshot snapshot = {
        2, /* term */
        9, /* index */
        4  /* data */
    };
    char filename[64];
    uint64_t now;
    /* Take a snapshot but then remove the data file, as if the server crashed
     * before it could complete writing it. */
    uv_update_time(&f->loop);
    now = uv_now(&f->loop);
    /* Use snprintf with an explicit bound, and cast to uintmax_t: the %ju
     * conversion expects uintmax_t, which is not guaranteed to be the same
     * type as uint64_t on every platform. */
    snprintf(filename, sizeof filename, "snapshot-1-8-%ju", (uintmax_t)now);
    SNAPSHOT_PUT(1, 8, 1);
    DirRemoveFile(f->dir, filename);
    /* Older and newest snapshots; only (term 2, index 9) should survive. */
    SNAPSHOT_PUT(1, 8, 2);
    SNAPSHOT_PUT(2, 6, 3);
    SNAPSHOT_PUT(2, 9, 4);
    LOAD(0,         /* term */
         0,         /* voted for */
         &snapshot, /* snapshot */
         10,        /* start index */
         0,         /* data for first loaded entry */
         0          /* n entries */
    );
    /* The orphaned .meta file is removed */
    char meta_filename[128];
    snprintf(meta_filename, sizeof meta_filename, "%s%s", filename, ".meta");
    munit_assert_false(DirHasFile(f->dir, meta_filename));
    return MUNIT_OK;
}
/* There are two snapshots, but the last one has an empty data file. The first
 * one is loaded and the empty one is discarded. */
TEST(load, emptySnapshot, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Expected snapshot: the first, intact one. */
    struct snapshot snapshot = {
        1, /* term */
        4, /* index */
        1  /* data */
    };
    char filename[64];
    uint64_t now;
    SNAPSHOT_PUT(1, 4, 1);
    /* Take a snapshot but then truncate the data file, as if the server ran out
     * of space before it could write it. */
    uv_update_time(&f->loop);
    now = uv_now(&f->loop);
    /* Bounded snprintf; %ju expects uintmax_t, so cast for portability. */
    snprintf(filename, sizeof filename, "snapshot-2-6-%ju", (uintmax_t)now);
    SNAPSHOT_PUT(2, 6, 2);
    DirTruncateFile(f->dir, filename, 0);
    LOAD(0,         /* term */
         0,         /* voted for */
         &snapshot, /* snapshot */
         5,         /* start index */
         0,         /* data for first loaded entry */
         0          /* n entries */
    );
    return MUNIT_OK;
}
/* There are two snapshots, but the last one has an invalid metadata file. The
 * first one is loaded and the invalid one is discarded. */
TEST(load, invalidSnapshotMetadata, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Expected snapshot: the first, intact one. */
    struct snapshot snapshot = {
        1, /* term */
        4, /* index */
        1  /* data */
    };
    char filename[64];
    uint64_t now;
    uint64_t format = 0; /* Invalid format version to write into .meta. */
    SNAPSHOT_PUT(1, 4, 1);
    /* Take a snapshot but then overwrite the format word of its .meta file
     * with the invalid version 0, as if it got corrupted on disk. */
    uv_update_time(&f->loop);
    now = uv_now(&f->loop);
    /* Bounded snprintf; %ju expects uintmax_t, so cast for portability. */
    snprintf(filename, sizeof filename, "snapshot-2-6-%ju.meta",
             (uintmax_t)now);
    SNAPSHOT_PUT(2, 6, 2);
    DirOverwriteFile(f->dir, filename, &format, sizeof format, 0);
    LOAD(0,         /* term */
         0,         /* voted for */
         &snapshot, /* snapshot */
         5,         /* start index */
         0,         /* data for first loaded entry */
         0          /* n entries */
    );
    return MUNIT_OK;
}
/* There is an orphaned snapshot and an orphaned snapshot .meta file,
 * make sure they are removed */
TEST(load, orphanedSnapshotFiles, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    uv_update_time(&f->loop);
    uint64_t now = uv_now(&f->loop);
    /* Expected snapshot: the only complete one, put last. */
    struct snapshot expected_snapshot = {
        2,  /* term */
        16, /* index */
        4   /* data */
    };
    char filename1_removed[64];
    char metafilename1_removed[64];
    char filename2_removed[64];
    char metafilename2_removed[64];
    /* Take a snapshot but then remove the data file, as if the server crashed
     * before it could complete writing it. Use bounded snprintf and cast to
     * uintmax_t, which is what %ju expects (uint64_t need not be the same
     * type). */
    snprintf(filename1_removed, sizeof filename1_removed, "snapshot-2-18-%ju",
             (uintmax_t)now);
    snprintf(metafilename1_removed, sizeof metafilename1_removed,
             "snapshot-2-18-%ju.meta", (uintmax_t)now);
    SNAPSHOT_PUT(2, 18, 1);
    munit_assert_true(DirHasFile(f->dir, filename1_removed));
    munit_assert_true(DirHasFile(f->dir, metafilename1_removed));
    DirRemoveFile(f->dir, filename1_removed);
    /* Take a snapshot but then remove the .meta file */
    now = uv_now(&f->loop);
    snprintf(filename2_removed, sizeof filename2_removed, "snapshot-2-19-%ju",
             (uintmax_t)now);
    snprintf(metafilename2_removed, sizeof metafilename2_removed,
             "snapshot-2-19-%ju.meta", (uintmax_t)now);
    SNAPSHOT_PUT(2, 19, 2);
    munit_assert_true(DirHasFile(f->dir, filename2_removed));
    munit_assert_true(DirHasFile(f->dir, metafilename2_removed));
    DirRemoveFile(f->dir, metafilename2_removed);
    /* Take a valid snapshot and make sure it's loaded */
    SNAPSHOT_PUT(2, 16, 4);
    LOAD(0,                  /* term */
         0,                  /* voted for */
         &expected_snapshot, /* snapshot */
         17,                 /* start index */
         0,                  /* data for first loaded entry */
         0                   /* n entries */
    );
    /* The orphaned files are removed */
    munit_assert_false(DirHasFile(f->dir, metafilename1_removed));
    munit_assert_false(DirHasFile(f->dir, filename2_removed));
    return MUNIT_OK;
}
/* The data directory has a closed segment with entries that are no longer
 * needed, since they are included in a snapshot. We still keep those segments
 * and just let the next snapshot logic delete them. */
TEST(load, closedSegmentWithEntriesBehindSnapshot, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct snapshot snapshot = {
        1, /* term */
        2, /* index */
        1  /* data */
    };
    /* One closed segment with entry 1, fully covered by the snapshot. */
    APPEND(1, 1);
    SNAPSHOT_PUT(1, 2, 1);
    LOAD(0,         /* term */
         0,         /* voted for */
         &snapshot, /* snapshot */
         3,         /* start index */
         0,         /* data for first loaded entry */
         0          /* n entries */
    );
    /* The stale closed segment is still on disk. */
    munit_assert_true(HAS_CLOSED_SEGMENT_FILE(1, 1));
    return MUNIT_OK;
}
/* The data directory has a closed segment with entries that are no longer
 * needed, since they are included in a snapshot. However it also has an open
 * segment that has enough entries to reach the snapshot last index. */
TEST(load, openSegmentWithEntriesPastSnapshot, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct snapshot snapshot = {
        1, /* term */
        2, /* index */
        1  /* data */
    };
    APPEND(1, 1);
    APPEND(1, 2);
    SNAPSHOT_PUT(1, 2, 1);
    /* Revert the second segment to open state. */
    UNFINALIZE(2, 2, 1);
    /* All entries are loaded, starting from index 1, since the log reaches
     * the snapshot's last index. */
    LOAD(0,         /* term */
         0,         /* voted for */
         &snapshot, /* snapshot */
         1,         /* start index */
         1,         /* data for first loaded entry */
         2          /* n entries */
    );
    return MUNIT_OK;
}
/* The data directory has a closed segment whose filename encodes a number of
 * entries which is different from the number it actually contains. */
TEST(load, closedSegmentWithInconsistentFilename, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(3, 1);
    /* Rename the segment so the filename claims 4 entries while the file
     * holds only 3. */
    DirRenameFile(f->dir, "0000000000000001-0000000000000003",
                  "0000000000000001-0000000000000004");
    LOAD_ERROR(RAFT_CORRUPT,
               "load closed segment 0000000000000001-0000000000000004: found 3 "
               "entries (expected 4)");
    return MUNIT_OK;
}
/* The data directory has a closed segment whose filename encodes a number of
 * entries which is different from the number it actually contains, and
 * auto-recovery is turned on. */
TEST(load,
     closedSegmentWithInconsistentFilenameAutoRecovery,
     setUp,
     tearDown,
     0,
     NULL)
{
    struct fixture *f = data;
    APPEND(3, 1);
    /* Filename claims 4 entries while the file holds only 3. */
    DirRenameFile(f->dir, "0000000000000001-0000000000000003",
                  "0000000000000001-0000000000000004");
    /* Load in pristine condition */
    LOAD_WITH_AUTO_RECOVERY(0,    /* term */
                            0,    /* voted for */
                            NULL, /* snapshot */
                            1,    /* start index */
                            0,    /* data for first loaded entry */
                            0     /* n entries */
    );
    return MUNIT_OK;
}
/* The data directory has a closed segment with entries that are no longer
 * needed, since they are included in a snapshot. It also has an open segment,
 * however that does not have enough entries to reach the snapshot last
 * index. */
TEST(load, openSegmentWithEntriesBehindSnapshot, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(1, 1);
    APPEND(1, 2);
    /* The snapshot covers up to index 3, but the log only reaches index 2. */
    SNAPSHOT_PUT(1, 3, 1);
    UNFINALIZE(2, 2, 1);
    LOAD_ERROR(RAFT_CORRUPT,
               "last entry on disk has index 2, which is behind last "
               "snapshot's index 3");
    return MUNIT_OK;
}
/* The data directory has a closed segment with entries that are no longer
 * needed, since they are included in a snapshot. It also has an open segment,
 * however that does not have enough entries to reach the snapshot last
 * index, and auto-recovery is turned on. */
TEST(load,
     openSegmentWithEntriesBehindSnapshotAutoRecovery,
     setUp,
     tearDown,
     0,
     NULL)
{
    struct fixture *f = data;
    struct snapshot snapshot = {
        1, /* term */
        3, /* index */
        1  /* data */
    };
    APPEND(1, 1);
    APPEND(1, 2);
    /* Snapshot covers up to index 3; the log stops at index 2. */
    SNAPSHOT_PUT(1, 3, 1);
    UNFINALIZE(2, 2, 1);
    /* Auto-recovery drops the stale log and restarts from the snapshot. */
    LOAD_WITH_AUTO_RECOVERY(0,         /* term */
                            0,         /* voted for */
                            &snapshot, /* snapshot */
                            4,         /* start index */
                            0,         /* data for first loaded entry */
                            0          /* n entries */
    );
    return MUNIT_OK;
}
/* The data directory contains a snapshot and an open segment containing a valid
 * entry, and no closed segments. */
TEST(load, openSegmentNoClosedSegmentsSnapshotPresent, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct snapshot snapshot = {
        1, /* term */
        3, /* index */
        1  /* data */
    };
    SNAPSHOT_PUT(1, 3, 1);
    /* One entry right past the snapshot, left in an open segment. */
    APPEND(1, 4);
    UNFINALIZE(4, 4, 1);
    LOAD(0,         /* term */
         0,         /* voted for */
         &snapshot, /* snapshot */
         4,         /* start index */
         4,         /* data for first loaded entry */
         1          /* n entries */
    );
    return MUNIT_OK;
}
/* The data directory contains a snapshot and an open segment with a corrupt
 * format header and no closed segments. */
TEST(load,
     corruptOpenSegmentNoClosedSegmentsSnapshotPresent,
     setUp,
     tearDown,
     0,
     NULL)
{
    struct fixture *f = data;
    SNAPSHOT_PUT(1, 3, 1);
    APPEND(1, 4);
    UNFINALIZE(4, 4, 1);
    /* Corrupt open segment: overwrite its format word with the invalid
     * version 0. */
    uint64_t version = 0 /* Format version */;
    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
    /* Without auto-recovery the load fails. */
    LOAD_ERROR(RAFT_CORRUPT,
               "load open segment open-1: unexpected format version 0");
    return MUNIT_OK;
}
/* The data directory contains a snapshot and an open segment with a corrupt
 * format header and no closed segments. Auto-recovery is turned on. */
TEST(load,
     corruptOpenSegmentNoClosedSegmentsSnapshotPresentWithAutoRecovery,
     setUp,
     tearDown,
     0,
     NULL)
{
    struct fixture *f = data;
    struct snapshot snapshot = {
        1, /* term */
        3, /* index */
        1  /* data */
    };
    SNAPSHOT_PUT(1, 3, 1);
    APPEND(1, 4);
    UNFINALIZE(4, 4, 1);
    /* Corrupt open segment */
    uint64_t version = 0 /* Format version */;
    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
    /* Load is successful. */
    LOAD_WITH_AUTO_RECOVERY(0,         /* term */
                            0,         /* voted for */
                            &snapshot, /* snapshot */
                            4,         /* start index */
                            1,         /* data for first loaded entry */
                            1          /* n entries */
    );
    return MUNIT_OK;
}
/* The data directory contains a snapshot and an open segment with a corrupt
 * format header and a closed segment. */
TEST(load,
     corruptOpenSegmentClosedSegmentSnapshotPresent,
     setUp,
     tearDown,
     0,
     NULL)
{
    struct fixture *f = data;
    SNAPSHOT_PUT(1, 3, 1);
    APPEND(1, 4); /* Stays closed. */
    APPEND(1, 5); /* Reverted to open below. */
    UNFINALIZE(5, 5, 1);
    /* Corrupt open segment */
    uint64_t version = 0 /* Format version */;
    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
    /* Without auto-recovery the load fails. */
    LOAD_ERROR(RAFT_CORRUPT,
               "load open segment open-1: unexpected format version 0");
    return MUNIT_OK;
}
/* The data directory contains a snapshot and an open segment with a corrupt
 * format header and a closed segment. Auto-recovery is turned on. */
TEST(load,
     corruptOpenSegmentClosedSegmentSnapshotPresentWithAutoRecovery,
     setUp,
     tearDown,
     0,
     NULL)
{
    struct fixture *f = data;
    struct snapshot snapshot = {
        1, /* term */
        3, /* index */
        1  /* data */
    };
    SNAPSHOT_PUT(1, 3, 1);
    APPEND(1, 4); /* Stays closed. */
    APPEND(1, 5); /* Reverted to open below. */
    UNFINALIZE(5, 5, 1);
    /* Corrupt open segment */
    uint64_t version = 0 /* Format version */;
    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
    /* Load is successful: only the intact closed segment survives. */
    LOAD_WITH_AUTO_RECOVERY(0,         /* term */
                            0,         /* voted for */
                            &snapshot, /* snapshot */
                            4,         /* start index */
                            4,         /* data for first loaded entry */
                            1          /* n entries */
    );
    /* Open segment has been renamed */
    munit_assert_false(DirHasFile(f->dir, "open-1"));
    return MUNIT_OK;
}
/* The data directory contains a snapshot and an open segment with a corrupt
 * format header and multiple closed segments. Auto-recovery is turned on. */
TEST(load,
     corruptOpenSegmentClosedSegmentsSnapshotPresentWithAutoRecovery,
     setUp,
     tearDown,
     0,
     NULL)
{
    struct fixture *f = data;
    struct snapshot snapshot = {
        1, /* term */
        3, /* index */
        1  /* data */
    };
    SNAPSHOT_PUT(1, 3, 1);
    APPEND(1, 4);
    APPEND(1, 5);
    APPEND(1, 6);
    UNFINALIZE(6, 6, 1);
    /* Corrupt open segment */
    uint64_t version = 0 /* Format version */;
    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
    /* The two intact closed segments (entries 4 and 5) are loaded. */
    LOAD_WITH_AUTO_RECOVERY(0,         /* term */
                            0,         /* voted for */
                            &snapshot, /* snapshot */
                            4,         /* start index */
                            4,         /* data for first loaded entry */
                            2          /* n entries */
    );
    /* Open segment has been renamed during the load */
    munit_assert_false(DirHasFile(f->dir, "open-1"));
    return MUNIT_OK;
}
/* The data directory contains a snapshot and an open segment with a corrupt
 * format header and multiple closed segments. */
TEST(load,
     corruptOpenSegmentClosedSegmentsSnapshotPresent,
     setUp,
     tearDown,
     0,
     NULL)
{
    struct fixture *f = data;
    SNAPSHOT_PUT(1, 3, 1);
    APPEND(1, 4);
    APPEND(1, 5);
    APPEND(1, 6);
    UNFINALIZE(6, 6, 1);
    /* Corrupt open segment */
    uint64_t version = 0 /* Format version */;
    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
    /* Without auto-recovery the load fails. */
    LOAD_ERROR(RAFT_CORRUPT,
               "load open segment open-1: unexpected format version 0");
    return MUNIT_OK;
}
/* The data directory contains a closed segment and an open segment with a
 * corrupt format header and no snapshot. */
TEST(load, corruptOpenSegmentClosedSegments, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(4, 1); /* Closed segment with entries 1-4. */
    APPEND(1, 5);
    UNFINALIZE(5, 5, 1);
    /* Corrupt open segment */
    uint64_t version = 0 /* Format version */;
    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
    /* Without auto-recovery the load fails. */
    LOAD_ERROR(RAFT_CORRUPT,
               "load open segment open-1: unexpected format version 0");
    return MUNIT_OK;
}
/* The data directory contains a closed segment and an open segment with a
 * corrupt format header and no snapshot. Auto-recovery is turned on. */
TEST(load,
     corruptOpenSegmentClosedSegmentsWithAutoRecovery,
     setUp,
     tearDown,
     0,
     NULL)
{
    struct fixture *f = data;
    APPEND(4, 1); /* Closed segment with entries 1-4. */
    APPEND(1, 5);
    UNFINALIZE(5, 5, 1);
    /* Corrupt open segment */
    uint64_t version = 0 /* Format version */;
    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
    /* load is successful: the four entries of the closed segment survive. */
    LOAD_WITH_AUTO_RECOVERY(0,    /* term */
                            0,    /* voted for */
                            NULL, /* snapshot */
                            1,    /* start index */
                            1,    /* data for first loaded entry */
                            4     /* n entries */
    );
    /* Open segment has been renamed */
    munit_assert_false(DirHasFile(f->dir, "open-1"));
    return MUNIT_OK;
}
/* The data directory contains a closed segment and two open segments.
 * The first open segment has a corrupt header. */
TEST(load, corruptOpenSegmentsClosedSegments, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(3, 1); /* Closed segment with entries 1-3. */
    APPEND(1, 4);
    APPEND(1, 5);
    UNFINALIZE(4, 4, 1);
    UNFINALIZE(5, 5, 2);
    /* Corrupt open segment */
    uint64_t version = 0 /* Format version */;
    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
    /* Without auto-recovery the load fails. */
    LOAD_ERROR(RAFT_CORRUPT,
               "load open segment open-1: unexpected format version 0");
    return MUNIT_OK;
}
/* The data directory contains a closed segment and two open segments.
 * The first open segment has a corrupt header. Auto-recovery is turned on. */
TEST(load,
     corruptOpenSegmentsClosedSegmentsWithAutoRecovery,
     setUp,
     tearDown,
     0,
     NULL)
{
    struct fixture *f = data;
    APPEND(3, 1); /* Closed segment with entries 1-3. */
    APPEND(1, 4);
    APPEND(1, 5);
    UNFINALIZE(4, 4, 1);
    UNFINALIZE(5, 5, 2);
    /* Corrupt open segment */
    uint64_t version = 0 /* Format version */;
    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
    /* Only the three entries of the closed segment are recovered. */
    LOAD_WITH_AUTO_RECOVERY(0,    /* term */
                            0,    /* voted for */
                            NULL, /* snapshot */
                            1,    /* start index */
                            1,    /* data for first loaded entry */
                            3     /* n entries */
    );
    /* Open segments have been renamed */
    munit_assert_false(DirHasFile(f->dir, "open-1"));
    munit_assert_false(DirHasFile(f->dir, "open-2"));
    return MUNIT_OK;
}
/* The data directory contains a closed segment and two open segments.
 * The second open segment has a corrupt header. */
TEST(load, corruptLastOpenSegmentClosedSegments, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(3, 1); /* Closed segment with entries 1-3. */
    APPEND(1, 4);
    APPEND(1, 5);
    UNFINALIZE(4, 4, 1);
    UNFINALIZE(5, 5, 2);
    /* Corrupt the second open segment this time. */
    uint64_t version = 0 /* Format version */;
    DirOverwriteFile(f->dir, "open-2", &version, sizeof version, 0);
    /* Without auto-recovery the load fails. */
    LOAD_ERROR(RAFT_CORRUPT,
               "load open segment open-2: unexpected format version 0");
    return MUNIT_OK;
}
/* The data directory contains a closed segment and two open segments.
 * The second open segment has a corrupt header. Auto-recovery is turned on. */
TEST(load,
     corruptLastOpenSegmentClosedSegmentsWithAutoRecovery,
     setUp,
     tearDown,
     0,
     NULL)
{
    struct fixture *f = data;
    APPEND(3, 1); /* Closed segment with entries 1-3. */
    APPEND(1, 4);
    APPEND(1, 5);
    UNFINALIZE(4, 4, 1);
    UNFINALIZE(5, 5, 2);
    /* Corrupt the second open segment this time. */
    uint64_t version = 0 /* Format version */;
    DirOverwriteFile(f->dir, "open-2", &version, sizeof version, 0);
    /* The closed segment plus the intact first open segment are recovered. */
    LOAD_WITH_AUTO_RECOVERY(0,    /* term */
                            0,    /* voted for */
                            NULL, /* snapshot */
                            1,    /* start index */
                            1,    /* data for first loaded entry */
                            4     /* n entries */
    );
    /* Open segment has been renamed during the load */
    munit_assert_false(DirHasFile(f->dir, "open-2"));
    return MUNIT_OK;
}
/* The data directory has several closed segments, all with entries compatible
 * with the snapshot. */
TEST(load, closedSegmentsOverlappingWithSnapshot, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct snapshot snapshot = {
        1, /* term */
        4, /* index */
        1  /* data */
    };
    /* Three contiguous closed segments covering entries 1-6. */
    APPEND(1, 1);
    APPEND(2, 2);
    APPEND(3, 4);
    SNAPSHOT_PUT(1, 4, 1);
    /* All six entries are loaded from index 1, overlapping the snapshot. */
    LOAD(0,         /* term */
         0,         /* voted for */
         &snapshot, /* snapshot */
         1,         /* start index */
         1,         /* data for first loaded entry */
         6          /* n entries */
    );
    return MUNIT_OK;
}
/* The data directory has several closed segments, the last of which is corrupt.
 * There is a snapshot. */
TEST(load,
     closedSegmentsWithSnapshotLastSegmentCorrupt,
     setUp,
     tearDown,
     0,
     NULL)
{
    struct fixture *f = data;
    SNAPSHOT_PUT(1, 4, 1);
    APPEND(1, 5);
    APPEND(2, 6);
    APPEND(2, 8);
    /* Corrupt the last closed segment by clobbering the data just after the
     * format word and header checksum, so the data checksum won't match. */
    size_t offset =
        WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */;
    uint32_t corrupted = 123456789;
    DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(8, 9), &corrupted,
                     sizeof corrupted, offset);
    LOAD_ERROR(RAFT_CORRUPT,
               "load closed segment 0000000000000008-0000000000000009: entries "
               "batch 1 starting at byte 8: data checksum mismatch");
    return MUNIT_OK;
}
/* The data directory has several closed segments, the last of which is corrupt.
 * There is a snapshot. Auto-recovery is turned on. */
TEST(load,
     closedSegmentsWithSnapshotLastSegmentCorruptAutoRecovery,
     setUp,
     tearDown,
     0,
     NULL)
{
    struct fixture *f = data;
    struct snapshot snapshot = {
        1, /* term */
        4, /* index */
        1  /* data */
    };
    SNAPSHOT_PUT(1, 4, 1);
    APPEND(1, 5);
    APPEND(2, 6);
    APPEND(2, 8);
    /* Corrupt the last closed segment */
    size_t offset =
        WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */;
    uint32_t corrupted = 123456789;
    DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(8, 9), &corrupted,
                     sizeof corrupted, offset);
    /* The corrupt segment is dropped; entries 5-7 are recovered. */
    LOAD_WITH_AUTO_RECOVERY(0,         /* term */
                            0,         /* voted for */
                            &snapshot, /* snapshot */
                            5,         /* start index */
                            5,         /* data for first loaded entry */
                            3          /* n entries */
    );
    return MUNIT_OK;
}
/* The data directory has several closed segments, the last of which is corrupt.
 * There is an open segment and a snapshot. Auto-recovery is turned on. */
TEST(load,
     closedSegmentsWithSnapshotLastSegmentCorruptOpenSegmentWithAutoRecovery,
     setUp,
     tearDown,
     0,
     NULL)
{
    struct fixture *f = data;
    struct snapshot snapshot = {
        1, /* term */
        4, /* index */
        1  /* data */
    };
    SNAPSHOT_PUT(1, 4, 1);
    APPEND(1, 5);
    APPEND(2, 6);
    APPEND(1, 8);
    APPEND(1, 9);
    UNFINALIZE(9, 9, 1);
    /* Corrupt the last closed segment */
    size_t offset =
        WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */;
    uint32_t corrupted = 123456789;
    DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(8, 8), &corrupted,
                     sizeof corrupted, offset);
    munit_assert_true(HAS_OPEN_SEGMENT_FILE(1));
    /* Entries 5-7 survive; the corrupt closed segment and the open segment
     * are both discarded. */
    LOAD_WITH_AUTO_RECOVERY(0,         /* term */
                            0,         /* voted for */
                            &snapshot, /* snapshot */
                            5,         /* start index */
                            5,         /* data for first loaded entry */
                            3          /* n entries */
    );
    munit_assert_false(HAS_OPEN_SEGMENT_FILE(1));
    return MUNIT_OK;
}
/* The data directory has several closed segments, the last of which is corrupt.
 * There is an open segment and a snapshot. */
TEST(load,
     closedSegmentsWithSnapshotLastSegmentCorruptOpenSegment,
     setUp,
     tearDown,
     0,
     NULL)
{
    struct fixture *f = data;
    SNAPSHOT_PUT(1, 4, 1);
    APPEND(1, 5);
    APPEND(2, 6);
    APPEND(1, 8);
    APPEND(1, 9);
    UNFINALIZE(9, 9, 1);
    /* Corrupt the last closed segment */
    size_t offset =
        WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */;
    uint32_t corrupted = 123456789;
    DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(8, 8), &corrupted,
                     sizeof corrupted, offset);
    munit_assert_true(HAS_OPEN_SEGMENT_FILE(1));
    /* Without auto-recovery the load fails. */
    LOAD_ERROR(RAFT_CORRUPT,
               "load closed segment 0000000000000008-0000000000000008: entries "
               "batch 1 starting at byte 8: data checksum mismatch");
    return MUNIT_OK;
}
/* The data directory has several closed segments, the second to last one of
 * which is corrupt. There is a snapshot. */
TEST(load,
     closedSegmentsWithSnapshotSecondLastSegmentCorrupt,
     setUp,
     tearDown,
     0,
     NULL)
{
    struct fixture *f = data;
    SNAPSHOT_PUT(1, 4, 1);
    APPEND(1, 5);
    APPEND(2, 6);
    APPEND(2, 8);
    /* Corrupt the second last closed segment */
    size_t offset =
        WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */;
    uint32_t corrupted = 123456789;
    DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(6, 7), &corrupted,
                     sizeof corrupted, offset);
    LOAD_ERROR(RAFT_CORRUPT,
               "load closed segment 0000000000000006-0000000000000007: entries "
               "batch 1 starting at byte 8: data checksum mismatch");
    /* Second load still fails: the error is persistent, nothing was
     * modified on disk by the failed load. */
    LOAD_ERROR_NO_SETUP(
        RAFT_CORRUPT,
        "load closed segment 0000000000000006-0000000000000007: entries "
        "batch 1 starting at byte 8: data checksum mismatch");
    return MUNIT_OK;
}
/* The data directory has several closed segments, some of which have a gap,
 * which is still compatible with the snapshot. */
TEST(load, nonContiguousClosedSegments, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct snapshot snapshot = {
        1, /* term */
        4, /* index */
        1  /* data */
    };
    APPEND(1, 1);
    APPEND(2, 2);
    APPEND(3, 4);
    SNAPSHOT_PUT(1, 4, 1);
    /* Remove the middle segment (entries 2-3), creating a gap that the
     * snapshot still covers. */
    DirRemoveFile(f->dir, CLOSED_SEGMENT_FILENAME(2, 3));
    LOAD(0,         /* term */
         0,         /* voted for */
         &snapshot, /* snapshot */
         4,         /* start index */
         4,         /* data for first loaded entry */
         3          /* n entries */
    );
    return MUNIT_OK;
}
/* If the data directory has a closed segment whose start index is beyond the
 * snapshot's last index, an error is returned. */
TEST(load, closedSegmentWithEntriesPastSnapshot, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    uint64_t now;
    char errmsg[128];
    APPEND(5, 1);
    APPEND(1, 5);
    uv_update_time(&f->loop);
    now = uv_now(&f->loop);
    /* Bounded snprintf; cast to uintmax_t since %ju expects uintmax_t,
     * which is not guaranteed to be uint64_t on every platform. */
    snprintf(errmsg, sizeof errmsg,
             "closed segment 0000000000000006-0000000000000006 is past last "
             "snapshot snapshot-1-4-%ju",
             (uintmax_t)now);
    SNAPSHOT_PUT(1, 4, 1);
    /* Remove the first closed segment, leaving only the one whose start
     * index (6) is past the snapshot's last index (4). */
    DirRemoveFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 5));
    LOAD_ERROR(RAFT_CORRUPT, errmsg);
    return MUNIT_OK;
}
/* The data directory has an open segment which has incomplete format data. */
TEST(load, openSegmentWithIncompleteFormat, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Only half of the 8-byte format word is present. */
    DirWriteFileWithZeros(f->dir, "open-1", WORD_SIZE / 2);
    LOAD_ERROR(RAFT_IOERR, "load open segment open-1: file has only 4 bytes");
    return MUNIT_OK;
}
/* The data directory has an open segment which has an incomplete batch
 * preamble. */
TEST(load, openSegmentWithIncompletePreamble, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Truncate right after the format word and the checksums, leaving no
     * preamble bytes for the first batch. */
    size_t offset = WORD_SIZE /* Format version */ + WORD_SIZE /* Checksums */;
    APPEND(1, 1);
    UNFINALIZE(1, 1, 1);
    DirTruncateFile(f->dir, "open-1", offset);
    LOAD_ERROR(RAFT_IOERR,
               "load open segment open-1: entries batch 1 starting at byte 16: "
               "read preamble: short read: 0 bytes instead of 8");
    return MUNIT_OK;
}
/* The data directory has an open segment which has incomplete batch header. */
TEST(load, openSegmentWithIncompleteBatchHeader, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Truncate in the middle of the first batch's header. */
    size_t offset = WORD_SIZE + /* Format version */
                    WORD_SIZE + /* Checksums */
                    WORD_SIZE + /* Number of entries */
                    WORD_SIZE /* Partial batch header */;
    APPEND(1, 1);
    UNFINALIZE(1, 1, 1);
    DirTruncateFile(f->dir, "open-1", offset);
    LOAD_ERROR(RAFT_IOERR,
               "load open segment open-1: entries batch 1 starting at byte 8: "
               "read header: short read: 8 bytes instead of 16");
    return MUNIT_OK;
}
/* The data directory has an open segment which has incomplete batch data. */
TEST(load, openSegmentWithIncompleteBatchData, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Truncate in the middle of the first entry's data. */
    size_t offset = WORD_SIZE + /* Format version */
                    WORD_SIZE + /* Checksums */
                    WORD_SIZE + /* Number of entries */
                    WORD_SIZE + /* Entry term */
                    WORD_SIZE + /* Entry type and data size */
                    WORD_SIZE / 2 /* Partial entry data */;
    APPEND(1, 1);
    UNFINALIZE(1, 1, 1);
    DirTruncateFile(f->dir, "open-1", offset);
    LOAD_ERROR(RAFT_IOERR,
               "load open segment open-1: entries batch 1 starting at byte 8: "
               "read data: short read: 4 bytes instead of 8");
    return MUNIT_OK;
}
/* The data directory has a closed segment which has corrupted batch header. */
TEST(load, closedSegmentWithCorruptedBatchHeader, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Overwrite the header checksum, right after the format word. */
    size_t offset = WORD_SIZE /* Format version */;
    uint64_t corrupted = 12345678;
    APPEND(1, 1);
    DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 1), &corrupted,
                     sizeof corrupted, offset);
    LOAD_ERROR(RAFT_CORRUPT,
               "load closed segment 0000000000000001-0000000000000001: entries "
               "batch 1 starting at byte 8: header checksum mismatch");
    return MUNIT_OK;
}
/* The data directory has a closed segment which has corrupted batch data. */
TEST(load, closedSegmentWithCorruptedBatchData, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Overwrite the data checksum, right after the header checksum. */
    size_t offset =
        WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */;
    uint32_t corrupted = 123456789;
    APPEND(1, 1);
    DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 1), &corrupted,
                     sizeof corrupted, offset);
    LOAD_ERROR(RAFT_CORRUPT,
               "load closed segment 0000000000000001-0000000000000001: entries "
               "batch 1 starting at byte 8: data checksum mismatch");
    return MUNIT_OK;
}
/* The data directory has a closed segment whose first index does not match what
 * we expect. */
TEST(load, closedSegmentWithBadIndex, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(1, 1);
    APPEND(1, 2);
    /* Remove the first segment, so the remaining one starts at index 2
     * instead of the expected 1. */
    DirRemoveFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 1));
    LOAD_ERROR(RAFT_CORRUPT,
               "unexpected closed segment 0000000000000002-0000000000000002: "
               "first index should have been 1");
    return MUNIT_OK;
}
/* The data directory has an empty closed segment. */
TEST(load, emptyClosedSegment, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Write a zero-byte closed segment file for index 1. */
    DirWriteFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 1), NULL, 0);
    LOAD_ERROR(
        RAFT_CORRUPT,
        "load closed segment 0000000000000001-0000000000000001: file is empty");
    return MUNIT_OK;
}
/* The data directory has a closed segment with an unexpected format. */
TEST(load, closedSegmentWithBadFormat, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Little-endian format word with unsupported version 2. */
    uint8_t buf[8] = {2, 0, 0, 0, 0, 0, 0, 0};
    DirWriteFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 1), buf, sizeof buf);
    LOAD_ERROR(RAFT_CORRUPT,
               "load closed segment 0000000000000001-0000000000000001: "
               "unexpected format version 2");
    return MUNIT_OK;
}
/* The data directory has an open segment which is not readable. */
TEST(load, openSegmentWithNoAccessPermission, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Skip the test when running as root, since EACCES would not be triggered
     * in that case. */
    if (getuid() == 0) {
        SETUP_UV; /* Setup the uv object since teardown expects it. */
        return MUNIT_SKIP;
    }
    APPEND(1, 1);
    UNFINALIZE(1, 1, 1);
    /* Drop read permission on the open segment. */
    DirMakeFileUnreadable(f->dir, "open-1");
    LOAD_ERROR(RAFT_IOERR,
               "load open segment open-1: read file: open: permission denied");
    return MUNIT_OK;
}
/* The data directory has an open segment with format set to 0 and non-zero
 * content. */
TEST(load, openSegmentWithZeroFormatAndThenData, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    uint64_t version = 0 /* Format version */;
    APPEND(1, 1);
    UNFINALIZE(1, 1, 1);
    /* Zero out only the format word, leaving the entry data behind it. */
    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
    LOAD_ERROR(RAFT_CORRUPT,
               "load open segment open-1: unexpected format version 0");
    return MUNIT_OK;
}
/* The data directory has an open segment with an unexpected format. */
TEST(load, openSegmentWithBadFormat, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Little-endian format word with unsupported version 2. */
    uint8_t version[8] = {2, 0, 0, 0, 0, 0, 0, 0};
    APPEND(1, 1);
    UNFINALIZE(1, 1, 1);
    DirOverwriteFile(f->dir, "open-1", version, sizeof version, 0);
    LOAD_ERROR(RAFT_CORRUPT,
               "load open segment open-1: unexpected format version 2");
    return MUNIT_OK;
}
raft-0.22.1/test/integration/test_uv_recover.c 0000664 0000000 0000000 00000004274 14601504142 0021420 0 ustar 00root root 0000000 0000000 #include "../lib/runner.h"
#include "../lib/uv.h"
/******************************************************************************
*
* Fixture
*
*****************************************************************************/
/* Test fixture wrapping a libuv-backed raft_io instance (members come from
 * the FIXTURE_* macros declared in the test lib). */
struct fixture
{
    FIXTURE_UV_DEPS; /* Dependencies needed by the uv raft_io instance. */
    FIXTURE_UV;      /* The raft_io instance under test. */
};
/* Allocate the fixture and initialize the uv dependencies and the raft_io
 * instance. Returned pointer is passed to each test as `data`. */
static void *setUp(const MunitParameter params[], void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SETUP_UV_DEPS;
    SETUP_UV;
    return f;
}
/* Release the raft_io instance and its dependencies (reverse order of
 * setUp), then free the fixture itself. */
static void tearDown(void *data)
{
    struct fixture *f = data;
    TEAR_DOWN_UV;
    TEAR_DOWN_UV_DEPS;
    free(f);
}
/******************************************************************************
*
* raft_io->recover()
*
*****************************************************************************/
SUITE(recover)
/* Invoke recover and assert that it fails with the given error. Relies on a
 * `struct fixture *f` being in scope at the expansion site. */
#define RECOVER_ERROR(RV, CONF)                  \
    {                                            \
        int rv_;                                 \
        rv_ = f->io.recover(&f->io, CONF);       \
        munit_assert_int(rv_, ==, RV);           \
    }
/* Invoke recover and assert that it succeeds (returns 0). */
#define RECOVER(CONF) RECOVER_ERROR(0, CONF)
/* NOTE(review): the following comment appears to belong to a removed test
 * ("If the instance has been already initialized, an error is returned.") —
 * no such test follows; verify and consider deleting it. */
/* A new configuration is saved as last entry on disk. */
TEST(recover, newConfiguration, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_configuration configuration1;
    struct raft_configuration configuration2;
    int rv;
    /* Bootstrap using an initial configuration */
    raft_configuration_init(&configuration1);
    rv = raft_configuration_add(&configuration1, 1, "1", RAFT_VOTER);
    munit_assert_int(rv, ==, 0);
    rv = raft_configuration_add(&configuration1, 2, "2", RAFT_VOTER);
    munit_assert_int(rv, ==, 0);
    rv = f->io.bootstrap(&f->io, &configuration1);
    munit_assert_int(rv, ==, 0);
    /* Recover using a different, single-voter configuration */
    raft_configuration_init(&configuration2);
    rv = raft_configuration_add(&configuration2, 1, "1", RAFT_VOTER);
    munit_assert_int(rv, ==, 0);
    RECOVER(&configuration2);
    raft_configuration_close(&configuration1);
    raft_configuration_close(&configuration2);
    return 0;
}
raft-0.22.1/test/integration/test_uv_recv.c 0000664 0000000 0000000 00000042433 14601504142 0020711 0 ustar 00root root 0000000 0000000 #include "../lib/runner.h"
#include "../lib/tcp.h"
#include "../lib/uv.h"
/******************************************************************************
*
* Fixture with a libuv-based raft_io instance.
*
*****************************************************************************/
/* A remote endpoint with its own event loop, transport and raft_io instance,
 * used to send messages to the fixture's instance. */
struct peer
{
    struct uv_loop_s loop;            /* Dedicated libuv loop. */
    struct raft_uv_transport transport; /* Network transport for raft I/O. */
    struct raft_io io;                /* The peer's raft_io instance. */
};
/* Test fixture: a libuv-backed raft_io instance plus a TCP helper and a peer
 * endpoint that sends it messages. */
struct fixture
{
    FIXTURE_UV_DEPS;  /* Dependencies needed by the uv raft_io instance. */
    FIXTURE_TCP;      /* Raw TCP helper, for hand-crafted traffic. */
    FIXTURE_UV;       /* The raft_io instance under test. */
    struct peer peer; /* Remote endpoint sending messages to the fixture. */
    bool closed;      /* Presumably tracks teardown state — confirm usage. */
};
/******************************************************************************
*
* Helper macros
*
*****************************************************************************/
struct result
{
struct raft_message *message;
bool done;
};
/* Receive callback for the raft_io instance under test.
 *
 * Compares the received message m1 against the expected message m2 stored in
 * the result struct reachable through io->data, field by field according to
 * the message type, then flags completion. For AppendEntries and
 * InstallSnapshot it also releases the payload memory allocated on receive. */
static void recvCb(struct raft_io *io, struct raft_message *m1)
{
    struct result *result = io->data;
    struct raft_message *m2 = result->message;
    unsigned i;
    munit_assert_int(m1->type, ==, m2->type);
    switch (m1->type) {
        case RAFT_REQUEST_VOTE:
            munit_assert_int(m1->request_vote.term, ==, m2->request_vote.term);
            munit_assert_int(m1->request_vote.candidate_id, ==,
                             m2->request_vote.candidate_id);
            munit_assert_int(m1->request_vote.last_log_index, ==,
                             m2->request_vote.last_log_index);
            munit_assert_int(m1->request_vote.last_log_term, ==,
                             m2->request_vote.last_log_term);
            munit_assert_int(m1->request_vote.disrupt_leader, ==,
                             m2->request_vote.disrupt_leader);
            break;
        case RAFT_REQUEST_VOTE_RESULT:
            munit_assert_int(m1->request_vote_result.term, ==,
                             m2->request_vote_result.term);
            munit_assert_int(m1->request_vote_result.vote_granted, ==,
                             m2->request_vote_result.vote_granted);
            break;
        case RAFT_APPEND_ENTRIES:
            /* Compare each entry's metadata and payload bytes. */
            munit_assert_int(m1->append_entries.n_entries, ==,
                             m2->append_entries.n_entries);
            for (i = 0; i < m1->append_entries.n_entries; i++) {
                struct raft_entry *entry1 = &m1->append_entries.entries[i];
                struct raft_entry *entry2 = &m2->append_entries.entries[i];
                munit_assert_int(entry1->term, ==, entry2->term);
                munit_assert_int(entry1->type, ==, entry2->type);
                munit_assert_int(entry1->buf.len, ==, entry2->buf.len);
                munit_assert_int(
                    memcmp(entry1->buf.base, entry2->buf.base, entry1->buf.len),
                    ==, 0);
            }
            /* Free the batch allocation the entries arrived in. */
            if (m1->append_entries.n_entries > 0) {
                raft_free(m1->append_entries.entries[0].batch);
                raft_free(m1->append_entries.entries);
            }
            break;
        case RAFT_APPEND_ENTRIES_RESULT:
            munit_assert_int(m1->append_entries_result.term, ==,
                             m2->append_entries_result.term);
            munit_assert_int(m1->append_entries_result.rejected, ==,
                             m2->append_entries_result.rejected);
            munit_assert_int(m1->append_entries_result.last_log_index, ==,
                             m2->append_entries_result.last_log_index);
            break;
        case RAFT_INSTALL_SNAPSHOT:
            /* Compare the configuration embedded in the snapshot... */
            munit_assert_int(m1->install_snapshot.conf.n, ==,
                             m2->install_snapshot.conf.n);
            for (i = 0; i < m1->install_snapshot.conf.n; i++) {
                struct raft_server *s1 = &m1->install_snapshot.conf.servers[i];
                struct raft_server *s2 = &m2->install_snapshot.conf.servers[i];
                munit_assert_int(s1->id, ==, s2->id);
                munit_assert_string_equal(s1->address, s2->address);
                munit_assert_int(s1->role, ==, s2->role);
            }
            /* ...and the snapshot payload bytes. */
            munit_assert_int(m1->install_snapshot.data.len, ==,
                             m2->install_snapshot.data.len);
            munit_assert_int(memcmp(m1->install_snapshot.data.base,
                                    m2->install_snapshot.data.base,
                                    m2->install_snapshot.data.len),
                             ==, 0);
            raft_configuration_close(&m1->install_snapshot.conf);
            raft_free(m1->install_snapshot.data.base);
            break;
        case RAFT_TIMEOUT_NOW:
            munit_assert_int(m1->timeout_now.term, ==, m2->timeout_now.term);
            munit_assert_int(m1->timeout_now.last_log_index, ==,
                             m2->timeout_now.last_log_index);
            munit_assert_int(m1->timeout_now.last_log_term, ==,
                             m2->timeout_now.last_log_term);
            break;
    }; /* NOTE(review): stray semicolon after the switch -- harmless. */
    result->done = true;
}
/* Completion callback for sends issued by the peer instance: require success
 * and flag completion through req->data. */
static void peerSendCb(struct raft_io_send *req, int status)
{
    bool *flag = req->data;
    munit_assert_int(status, ==, 0);
    *flag = true;
}
/* Close callback for the peer instance: flag completion through io->data. */
static void peerCloseCb(struct raft_io *io)
{
    bool *flag = io->data;
    *flag = true;
}
/* Set up the fixture's peer raft_io instance: init its dedicated loop, its
 * TCP transport and its raft_io backend, bound to ID 2 at 127.0.0.1:9002. */
#define PEER_SETUP                                                \
    do {                                                          \
        struct uv_loop_s *_loop = &f->peer.loop;                  \
        struct raft_uv_transport *_transport = &f->peer.transport; \
        struct raft_io *_io = &f->peer.io;                        \
        int _rv;                                                  \
        _rv = uv_loop_init(_loop);                                \
        munit_assert_int(_rv, ==, 0);                             \
        _transport->version = 1;                                  \
        _rv = raft_uv_tcp_init(_transport, _loop);                \
        munit_assert_int(_rv, ==, 0);                             \
        _rv = raft_uv_init(_io, _loop, f->dir, _transport);       \
        munit_assert_int(_rv, ==, 0);                             \
        _rv = _io->init(_io, 2, "127.0.0.1:9002");                \
        munit_assert_int(_rv, ==, 0);                             \
    } while (0)
/* Tear down the fixture's peer raft_io instance: close it, drive its loop
 * until the close callback fires, then release transport and loop. */
#define PEER_TEAR_DOWN                                            \
    do {                                                          \
        struct uv_loop_s *_loop = &f->peer.loop;                  \
        struct raft_uv_transport *_transport = &f->peer.transport; \
        struct raft_io *_io = &f->peer.io;                        \
        bool _done = false;                                       \
        int _i;                                                   \
        _done = false;                                            \
        _io->data = &_done;                                       \
        _io->close(_io, peerCloseCb);                             \
        for (_i = 0; _i < 10; _i++) {                             \
            if (_done) {                                          \
                break;                                            \
            }                                                     \
            uv_run(_loop, UV_RUN_ONCE);                           \
        }                                                         \
        uv_run(_loop, UV_RUN_DEFAULT);                            \
        munit_assert_true(_done);                                 \
        raft_uv_close(_io);                                       \
        raft_uv_tcp_close(_transport);                            \
        uv_loop_close(_loop);                                     \
    } while (0)
/* Send a message to the main fixture's raft_io instance (ID 1, port 9001)
 * using the fixture's peer instance, driving the peer's loop (at most 10
 * iterations) until the send completes. */
#define PEER_SEND(MESSAGE)                                        \
    do {                                                          \
        struct uv_loop_s *_loop = &f->peer.loop;                  \
        struct raft_io *_io = &f->peer.io;                        \
        struct raft_io_send _req;                                 \
        bool _done = false;                                       \
        int _i;                                                   \
        int _rv;                                                  \
        (MESSAGE)->server_id = 1;                                 \
        (MESSAGE)->server_address = "127.0.0.1:9001";             \
        _req.data = &_done;                                       \
        _rv = _io->send(_io, &_req, MESSAGE, peerSendCb);         \
        munit_assert_int(_rv, ==, 0);                             \
        for (_i = 0; _i < 10; _i++) {                             \
            if (_done) {                                          \
                break;                                            \
            }                                                     \
            uv_run(_loop, UV_RUN_ONCE);                           \
        }                                                         \
        munit_assert_true(_done);                                 \
    } while (0)
/* Establish a connection and send a handshake using plain TCP, advertising
 * server ID 1 at address "127.0.0.1:666".
 *
 * NOTE(review): the protocol word here is the same one the badProtocol test
 * sends -- confirm it is the value the receiving end accepts.
 *
 * Fixed: the macro definition used to end with `} while (0);` -- the trailing
 * semicolon defeats the swallow-the-semicolon idiom and breaks use inside an
 * unbraced if/else. Existing `PEER_HANDSHAKE;` call sites are unaffected. */
#define PEER_HANDSHAKE                                             \
    do {                                                           \
        uint8_t _handshake[] = {                                   \
            6, 6, 6, 0, 0, 0, 0, 0, /* Protocol */                 \
            1, 0, 0, 0, 0, 0, 0, 0, /* Server ID */                \
            2, 0, 0, 0, 0, 0, 0, 0, /* Address length, in words */ \
            0, 0, 0, 0, 0, 0, 0, 0, /* First address word */       \
            0, 0, 0, 0, 0, 0, 0, 0  /* Second address word */      \
        };                                                         \
        sprintf((char *)&_handshake[24], "127.0.0.1:666");         \
        TCP_CLIENT_CONNECT(9001);                                  \
        TCP_CLIENT_SEND(_handshake, sizeof _handshake);            \
    } while (0)
/* Run the loop until a new message is received, wiring MESSAGE as the
 * expectation that recvCb will check the received message against. */
#define RECV(MESSAGE)                                   \
    do {                                                \
        struct result _result = {MESSAGE, false};       \
        f->io.data = &_result;                          \
        LOOP_RUN_UNTIL(&_result.done);                  \
        f->io.data = NULL;                              \
    } while (0)
/******************************************************************************
*
* Set up and tear down.
*
*****************************************************************************/
/* Set up the fixture's dependencies: heap/loop, TCP client and the peer
 * instance. The raft_io instance under test is not started here. */
static void *setUpDeps(const MunitParameter params[], void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SETUP_UV_DEPS;
    SETUP_TCP;
    PEER_SETUP;
    f->io.version = 0; /* Magic value to avoid assuming that io.data is raft */
    f->io.data = f;
    f->closed = false;
    return f;
}
/* Tear down everything set up by setUpDeps, peer first. */
static void tearDownDeps(void *data)
{
    struct fixture *f = data;
    PEER_TEAR_DOWN;
    TEAR_DOWN_TCP;
    TEAR_DOWN_UV_DEPS;
    free(f);
}
/* Full setup: dependencies plus the raft_io instance under test, started with
 * recvCb as receive callback.
 *
 * NOTE(review): io.version/io.data are assigned both here and in setUpDeps;
 * presumably SETUP_UV resets them -- confirm the duplication is intended. */
static void *setUp(const MunitParameter params[], void *user_data)
{
    struct fixture *f = setUpDeps(params, user_data);
    int rv;
    SETUP_UV;
    f->io.version = 0; /* Magic value to avoid assuming that io.data is raft */
    f->io.data = f;
    rv = f->io.start(&f->io, 10000, NULL, recvCb);
    munit_assert_int(rv, ==, 0);
    return f;
}
/* Tear down the instance under test, then its dependencies. */
static void tearDown(void *data)
{
    struct fixture *f = data;
    TEAR_DOWN_UV;
    tearDownDeps(f);
}
/******************************************************************************
*
* raft_io_recv_cb
*
*****************************************************************************/
SUITE(recv)
/* Receive the very first message over the connection. */
TEST(recv, first, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_message message;
    message.type = RAFT_REQUEST_VOTE;
    /* Fixed: `term` (and `pre_vote`) were left uninitialized, yet the term is
     * serialized by the peer and compared in recvCb -- reading indeterminate
     * memory. Initialize every field explicitly. */
    message.request_vote.term = 3;
    message.request_vote.candidate_id = 2;
    message.request_vote.last_log_index = 123;
    message.request_vote.last_log_term = 2;
    message.request_vote.disrupt_leader = false;
    message.request_vote.pre_vote = false;
    PEER_SEND(&message);
    RECV(&message);
    return MUNIT_OK;
}
/* Receive a first message, then another one over the same connection. */
TEST(recv, second, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_message message;
    message.type = RAFT_REQUEST_VOTE;
    /* Fixed: `term` (and `pre_vote`) were left uninitialized, yet the term is
     * serialized by the peer and compared in recvCb -- reading indeterminate
     * memory. Initialize every field explicitly. */
    message.request_vote.term = 3;
    message.request_vote.candidate_id = 2;
    message.request_vote.last_log_index = 123;
    message.request_vote.last_log_term = 2;
    message.request_vote.disrupt_leader = true;
    message.request_vote.pre_vote = false;
    PEER_SEND(&message);
    RECV(&message);
    PEER_SEND(&message);
    RECV(&message);
    return MUNIT_OK;
}
/* Receive a RequestVote result message. */
TEST(recv, requestVoteResult, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Designated initializer: same field values as before, remaining fields
     * zeroed. */
    struct raft_message message = {
        .type = RAFT_REQUEST_VOTE_RESULT,
        .request_vote_result = {.term = 3,
                                .vote_granted = true,
                                .pre_vote = false},
    };
    PEER_SEND(&message);
    RECV(&message);
    return MUNIT_OK;
}
/* Receive an AppendEntries message with two entries. */
TEST(recv, appendEntries, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_entry entries[2];
    struct raft_message message;
    uint8_t data1[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    uint8_t data2[8] = {8, 7, 6, 5, 4, 3, 2, 1};
    /* Fixed: each entry's `term` was left uninitialized, yet it is serialized
     * by the peer and compared in recvCb -- reading indeterminate memory. */
    entries[0].type = RAFT_COMMAND;
    entries[0].term = 1;
    entries[0].buf.base = data1;
    entries[0].buf.len = sizeof data1;
    entries[1].type = RAFT_COMMAND;
    entries[1].term = 1;
    entries[1].buf.base = data2;
    entries[1].buf.len = sizeof data2;
    message.type = RAFT_APPEND_ENTRIES;
    /* Fixed: initialize the header fields that get serialized along with the
     * entries instead of leaving them indeterminate. */
    message.append_entries.term = 1;
    message.append_entries.prev_log_index = 0;
    message.append_entries.prev_log_term = 0;
    message.append_entries.leader_commit = 0;
    message.append_entries.entries = entries;
    message.append_entries.n_entries = 2;
    PEER_SEND(&message);
    RECV(&message);
    return MUNIT_OK;
}
/* Receive an AppendEntries message with no entries (i.e. a heartbeat). */
TEST(recv, heartbeat, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_message message;
    message.type = RAFT_APPEND_ENTRIES;
    /* Fixed: initialize the header fields that get serialized on send instead
     * of leaving them indeterminate. */
    message.append_entries.term = 1;
    message.append_entries.prev_log_index = 0;
    message.append_entries.prev_log_term = 0;
    message.append_entries.leader_commit = 0;
    message.append_entries.entries = NULL;
    message.append_entries.n_entries = 0;
    PEER_SEND(&message);
    RECV(&message);
    return MUNIT_OK;
}
/* Receive an AppendEntries result message from the peer. */
TEST(recv, appendEntriesResult, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Designated initializer: same field values as before, remaining fields
     * zeroed. */
    struct raft_message message = {
        .type = RAFT_APPEND_ENTRIES_RESULT,
        .append_entries_result = {.term = 3,
                                  .rejected = 0,
                                  .last_log_index = 123},
    };
    PEER_SEND(&message);
    RECV(&message);
    return MUNIT_OK;
}
/* Receive an InstallSnapshot message. */
TEST(recv, installSnapshot, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_message message;
    uint8_t snapshot_data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    int rv;

    message.type = RAFT_INSTALL_SNAPSHOT;
    message.install_snapshot.term = 2;
    message.install_snapshot.last_index = 123;
    message.install_snapshot.last_term = 1;

    /* Snapshot payload. */
    message.install_snapshot.data.base = snapshot_data;
    message.install_snapshot.data.len = sizeof snapshot_data;

    /* Single-server configuration embedded in the snapshot. */
    raft_configuration_init(&message.install_snapshot.conf);
    rv = raft_configuration_add(&message.install_snapshot.conf, 1, "1",
                                RAFT_VOTER);
    munit_assert_int(rv, ==, 0);

    PEER_SEND(&message);
    RECV(&message);
    raft_configuration_close(&message.install_snapshot.conf);
    return MUNIT_OK;
}
/* Receive a TimeoutNow message. */
TEST(recv, timeoutNow, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Designated initializer: same field values as before, remaining fields
     * zeroed. */
    struct raft_message message = {
        .type = RAFT_TIMEOUT_NOW,
        .timeout_now = {.term = 3,
                        .last_log_index = 123,
                        .last_log_term = 2},
    };
    PEER_SEND(&message);
    RECV(&message);
    return MUNIT_OK;
}
/* The handshake fails because of an unexpected protocol version. */
TEST(recv, badProtocol, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    uint8_t handshake[] = {
        6, 6, 6, 0, 0, 0, 0, 0, /* Protocol */
        1, 0, 0, 0, 0, 0, 0, 0, /* Server ID */
        2, 0, 0, 0, 0, 0, 0, 0  /* Address length */
    };
    TCP_CLIENT_CONNECT(9001);
    TCP_CLIENT_SEND(handshake, sizeof handshake);
    /* Run a couple of loop iterations; no message must be delivered. */
    LOOP_RUN(2);
    return MUNIT_OK;
}
/* A message can't have zero length. */
TEST(recv, badSize, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    uint8_t header[] = {
        1, 0, 0, 0, 0, 0, 0, 0, /* Message type */
        0, 0, 0, 0, 0, 0, 0, 0  /* Message size */
    };
    PEER_HANDSHAKE;
    TCP_CLIENT_SEND(header, sizeof header);
    LOOP_RUN(2);
    return MUNIT_OK;
}
/* A message with a bad type causes the connection to be aborted. */
TEST(recv, badType, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    uint8_t header[] = {
        1, 2, 3, 4, 5, 6, 7, 8, /* Message type */
        0, 0, 0, 0, 0, 0, 0, 0  /* Message size */
    };
    PEER_HANDSHAKE;
    TCP_CLIENT_SEND(header, sizeof header);
    LOOP_RUN(2);
    return MUNIT_OK;
}
/* The backend is closed just before accepting a new connection. */
TEST(recv, closeBeforeAccept, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    uint8_t header[] = {
        1, 2, 3, 4, 5, 6, 7, 8, /* Message type */
        0, 0, 0, 0, 0, 0, 0, 0  /* Message size */
    };
    PEER_HANDSHAKE;
    TCP_CLIENT_SEND(header, sizeof header);
    /* Run a single iteration only, then close while the accept is pending. */
    LOOP_RUN(1);
    TEAR_DOWN_UV;
    return MUNIT_OK;
}
/* The backend is closed after receiving the header of an AppendEntries
 * message. */
TEST(recv, closeAfterAppendEntriesHeader, setUp, tearDown, 0, NULL)
{
    /* TODO */
    return MUNIT_SKIP;
}
raft-0.22.1/test/integration/test_uv_send.c 0000664 0000000 0000000 00000030266 14601504142 0020704 0 ustar 00root root 0000000 0000000 #include
#include "../lib/runner.h"
#include "../lib/tcp.h"
#include "../lib/uv.h"
/******************************************************************************
*
* Fixture with a libuv-based raft_io instance and some pre-set messages.
*
*****************************************************************************/
/* Number of pre-set messages available to each test. */
#define N_MESSAGES 5
/* Test fixture: the raft_io instance under test, a TCP server acting as peer,
 * and a batch of pre-initialized messages to send. */
struct fixture
{
    FIXTURE_UV_DEPS;
    FIXTURE_TCP_SERVER;
    FIXTURE_UV;
    struct raft_message messages[N_MESSAGES];
};
/******************************************************************************
 *
 * Helper macros
 *
 *****************************************************************************/
/* Expected completion status plus completion flag, shared with
 * sendCbAssertResult via req->data. */
struct result
{
    int status; /* Expected status passed to the send callback. */
    bool done;  /* Set by the callback when the request completes. */
};
/* Send completion callback: check the completion status against the expected
 * one recorded in req->data and flag the request as done. */
static void sendCbAssertResult(struct raft_io_send *req, int status)
{
    struct result *outcome = req->data;
    munit_assert_int(status, ==, outcome->status);
    outcome->done = true;
}
/* Get I'th fixture's message. */
#define MESSAGE(I) (&f->messages[I])
/* Submit a send request for the I'th fixture's message.
 *
 * Deliberately declares block-scope variables (no do/while wrapper) so that
 * SEND_WAIT(I) can later reference _result##I. */
#define SEND_SUBMIT(I, RV, STATUS)                                         \
    struct raft_io_send _req##I;                                           \
    struct result _result##I = {STATUS, false};                            \
    int _rv##I;                                                            \
    _req##I.data = &_result##I;                                            \
    _rv##I =                                                               \
        f->io.send(&f->io, &_req##I, &f->messages[I], sendCbAssertResult); \
    munit_assert_int(_rv##I, ==, RV)
/* Wait for the submit request of the I'th message to finish. */
#define SEND_WAIT(I) LOOP_RUN_UNTIL(&_result##I.done)
/* Submit a send request for the I'th fixture's message and wait for the
 * operation to successfully complete. */
#define SEND(I)                                     \
    do {                                            \
        SEND_SUBMIT(I, 0 /* rv */, 0 /* status */); \
        SEND_WAIT(I);                               \
    } while (0)
/* Submit a send request and assert that it fails synchronously with the
 * given error code and message. */
#define SEND_ERROR(I, RV, ERRMSG)                                    \
    do {                                                             \
        SEND_SUBMIT(I, RV, 0 /* status */);                          \
        /* munit_assert_string_equal(f->transport.errmsg, ERRMSG);*/ \
    } while (0)
/* Submit a send request and wait for the operation to fail with the given code
 * and message. */
#define SEND_FAILURE(I, STATUS, ERRMSG)                             \
    do {                                                            \
        SEND_SUBMIT(I, 0 /* rv */, STATUS);                         \
        SEND_WAIT(I);                                               \
        /*munit_assert_string_equal(f->transport.errmsg, ERRMSG);*/ \
    } while (0)
/******************************************************************************
*
* Set up and tear down.
*
*****************************************************************************/
/* Set up the fixture's dependencies: heap/loop and the TCP server peer. */
static void *setUpDeps(const MunitParameter params[], void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SETUP_UV_DEPS;
    SETUP_TCP_SERVER;
    f->io.version = 0; /* Magic value to avoid assuming that io.data is raft */
    f->io.data = f;
    return f;
}
/* Tear down everything set up by setUpDeps. */
static void tearDownDeps(void *data)
{
    struct fixture *f = data;
    TEAR_DOWN_TCP_SERVER;
    TEAR_DOWN_UV_DEPS;
    free(f);
}
/* Full setup: dependencies plus the raft_io instance under test, with the
 * fixture's messages pre-initialized as RequestVote messages addressed to the
 * TCP server. */
static void *setUp(const MunitParameter params[], void *user_data)
{
    struct fixture *f = setUpDeps(params, user_data);
    unsigned i;
    SETUP_UV;
    raft_uv_set_connect_retry_delay(&f->io, 1);
    for (i = 0; i < N_MESSAGES; i++) {
        struct raft_message *message = &f->messages[i];
        /* Fixed: zero the whole message first. The payload fields of the
         * pre-set request_vote messages get serialized on send, so they must
         * not be left indeterminate. */
        *message = (struct raft_message){0};
        message->type = RAFT_REQUEST_VOTE;
        message->server_id = 1;
        message->server_address = f->server.address;
    }
    return f;
}
/* Tear down the instance under test, then its dependencies. */
static void tearDown(void *data)
{
    struct fixture *f = data;
    TEAR_DOWN_UV;
    tearDownDeps(f);
}
/******************************************************************************
*
* raft_io->send()
*
*****************************************************************************/
SUITE(send)
/* The first time a request is sent to a server a connection attempt is
 * triggered. If the connection succeeds the request gets written out. */
TEST(send, first, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    SEND(0);
    return MUNIT_OK;
}
/* The second time a request is sent it re-uses the connection that was already
 * established */
TEST(send, second, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    SEND(0);
    SEND(0);
    return MUNIT_OK;
}
/* Submit a few send requests in parallel. */
TEST(send, parallel, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Both requests are in flight before either completes. */
    SEND_SUBMIT(0 /* message */, 0 /* rv */, 0 /* status */);
    SEND_SUBMIT(1 /* message */, 0 /* rv */, 0 /* status */);
    SEND_WAIT(0);
    SEND_WAIT(1);
    return MUNIT_OK;
}
/* Send a request vote result message. */
TEST(send, voteResult, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    MESSAGE(0)->type = RAFT_REQUEST_VOTE_RESULT;
    /* Fixed: initialize the payload fields that get serialized on send; the
     * fixture's setUp only pre-sets the request_vote union member. */
    MESSAGE(0)->request_vote_result.term = 1;
    MESSAGE(0)->request_vote_result.vote_granted = false;
    MESSAGE(0)->request_vote_result.pre_vote = false;
    SEND(0);
    return MUNIT_OK;
}
/* Send an append entries message. */
TEST(send, appendEntries, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_entry entries[2];
    /* Fixed: each entry's type and term were left uninitialized but get
     * serialized on send -- reading indeterminate memory. */
    entries[0].type = RAFT_COMMAND;
    entries[0].term = 1;
    entries[0].buf.base = raft_malloc(16);
    entries[0].buf.len = 16;
    entries[1].type = RAFT_COMMAND;
    entries[1].term = 1;
    entries[1].buf.base = raft_malloc(8);
    entries[1].buf.len = 8;
    MESSAGE(0)->type = RAFT_APPEND_ENTRIES;
    /* Fixed: initialize the header fields that get serialized too. */
    MESSAGE(0)->append_entries.term = 1;
    MESSAGE(0)->append_entries.prev_log_index = 0;
    MESSAGE(0)->append_entries.prev_log_term = 0;
    MESSAGE(0)->append_entries.leader_commit = 0;
    MESSAGE(0)->append_entries.entries = entries;
    MESSAGE(0)->append_entries.n_entries = 2;
    SEND(0);
    raft_free(entries[0].buf.base);
    raft_free(entries[1].buf.base);
    return MUNIT_OK;
}
/* Send an append entries message with zero entries (i.e. a heartbeat). */
TEST(send, heartbeat, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    MESSAGE(0)->type = RAFT_APPEND_ENTRIES;
    /* Fixed: initialize the header fields that get serialized on send instead
     * of leaving them indeterminate. */
    MESSAGE(0)->append_entries.term = 1;
    MESSAGE(0)->append_entries.prev_log_index = 0;
    MESSAGE(0)->append_entries.prev_log_term = 0;
    MESSAGE(0)->append_entries.leader_commit = 0;
    MESSAGE(0)->append_entries.entries = NULL;
    MESSAGE(0)->append_entries.n_entries = 0;
    SEND(0);
    return MUNIT_OK;
}
/* Send an append entries result message. */
TEST(send, appendEntriesResult, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    MESSAGE(0)->type = RAFT_APPEND_ENTRIES_RESULT;
    /* Fixed: initialize the payload fields that get serialized on send; the
     * fixture's setUp only pre-sets the request_vote union member. */
    MESSAGE(0)->append_entries_result.term = 1;
    MESSAGE(0)->append_entries_result.rejected = 0;
    MESSAGE(0)->append_entries_result.last_log_index = 0;
    SEND(0);
    return MUNIT_OK;
}
/* Send an install snapshot message. */
TEST(send, installSnapshot, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_install_snapshot *p = &MESSAGE(0)->install_snapshot;
    int rv;
    MESSAGE(0)->type = RAFT_INSTALL_SNAPSHOT;
    /* Fixed: term/last_index/last_term were never set but get serialized on
     * send -- reading indeterminate memory. */
    p->term = 1;
    p->last_index = 1;
    p->last_term = 1;
    raft_configuration_init(&p->conf);
    rv = raft_configuration_add(&p->conf, 1, "1", RAFT_VOTER);
    munit_assert_int(rv, ==, 0);
    p->data.len = 8;
    p->data.base = raft_malloc(p->data.len);
    munit_assert_ptr_not_null(p->data.base);
    SEND(0);
    raft_configuration_close(&p->conf);
    raft_free(p->data.base);
    return MUNIT_OK;
}
/* A connection attempt fails asynchronously after the connect function
 * returns. */
TEST(send, noConnection, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    MESSAGE(0)->server_address = "127.0.0.1:123456";
    SEND_SUBMIT(0 /* message */, 0 /* rv */, RAFT_CANCELED /* status */);
    /* Closing the backend cancels the pending request. */
    TEAR_DOWN_UV;
    return MUNIT_OK;
}
/* The message has an invalid IPv4 address. */
TEST(send, badAddress, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    MESSAGE(0)->server_address = "1";
    SEND_SUBMIT(0 /* message */, 0 /* rv */, RAFT_CANCELED /* status */);
    TEAR_DOWN_UV;
    return MUNIT_OK;
}
/* Make sure UvSend doesn't use a stale connection for a certain server id
 * by first sending a message to a valid address and then sending a message to
 * an invalid address, making sure the valid connection is not reused.
 * Afterwards assert that a send to the correct address still succeeds. */
TEST(send, changeToUnconnectedAddress, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    /* Send a message to a server and a connected address */
    SEND(0);
    /* Send a message to the same server, but update the address to an
     * unconnected address and assert it fails. */
    munit_assert_ullong(MESSAGE(0)->server_id, ==, MESSAGE(1)->server_id);
    MESSAGE(1)->server_address = "127.0.0.2:1";
    SEND_SUBMIT(1 /* message */, 0 /* rv */, RAFT_CANCELED /* status */);
    /* Send another message to the same server and connected address */
    munit_assert_ullong(MESSAGE(0)->server_id, ==, MESSAGE(2)->server_id);
    SEND(2);
    /* Send another message to the same server and connected address */
    munit_assert_ullong(MESSAGE(0)->server_id, ==, MESSAGE(3)->server_id);
    SEND(3);
    TEAR_DOWN_UV;
    return MUNIT_OK;
}
/* The message has an invalid type. */
TEST(send, badMessage, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    MESSAGE(0)->type = 666;
    SEND_ERROR(0, RAFT_MALFORMED, "");
    return MUNIT_OK;
}
/* Old send requests that have accumulated and could not yet be sent are
 * progressively evicted. */
TEST(send, evictOldPending, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    TCP_SERVER_STOP;
    SEND_SUBMIT(0 /* message */, 0 /* rv */, RAFT_NOCONNECTION /* status */);
    SEND_SUBMIT(1 /* message */, 0 /* rv */, RAFT_CANCELED /* status */);
    SEND_SUBMIT(2 /* message */, 0 /* rv */, RAFT_CANCELED /* status */);
    SEND_SUBMIT(3 /* message */, 0 /* rv */, RAFT_CANCELED /* status */);
    SEND_WAIT(0);
    TEAR_DOWN_UV;
    return MUNIT_OK;
}
/* After the connection is established the peer dies and then comes back a
 * little bit later. */
TEST(send, reconnectAfterWriteError, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    int socket;
    SEND(0);
    /* Kill the server side of the established connection. */
    socket = TcpServerAccept(&f->server);
    close(socket);
    SEND_FAILURE(0, RAFT_IOERR, "");
    SEND(0);
    return MUNIT_OK;
}
/* After the connection is established the peer dies and then comes back a
 * little bit later. At the time the peer died there where several writes
 * pending. */
TEST(send, reconnectAfterMultipleWriteErrors, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    int socket;
    /* Writing to a closed socket would otherwise raise SIGPIPE. */
    signal(SIGPIPE, SIG_IGN);
    SEND(0);
    socket = TcpServerAccept(&f->server);
    close(socket);
    SEND_SUBMIT(1 /* message */, 0 /* rv */, RAFT_IOERR /* status */);
    SEND_SUBMIT(2 /* message */, 0 /* rv */, RAFT_IOERR /* status */);
    SEND_WAIT(1);
    SEND_WAIT(2);
    SEND(3);
    return MUNIT_OK;
}
/* Heap-fault schedules: fail the Nth allocation, once. */
static char *oomHeapFaultDelay[] = {"0", "1", "2", "3", "4", NULL};
static char *oomHeapFaultRepeat[] = {"1", NULL};
static MunitParameterEnum oomParams[] = {
    {TEST_HEAP_FAULT_DELAY, oomHeapFaultDelay},
    {TEST_HEAP_FAULT_REPEAT, oomHeapFaultRepeat},
    {NULL, NULL},
};
/* Out of memory conditions. */
TEST(send, oom, setUp, tearDown, 0, oomParams)
{
    struct fixture *f = data;
    HEAP_FAULT_ENABLE;
    SEND_ERROR(0, RAFT_NOMEM, "");
    return MUNIT_OK;
}
/* Fail only the third allocation, after send() has returned. */
static char *oomAsyncHeapFaultDelay[] = {"2", NULL};
static char *oomAsyncHeapFaultRepeat[] = {"1", NULL};
static MunitParameterEnum oomAsyncParams[] = {
    {TEST_HEAP_FAULT_DELAY, oomAsyncHeapFaultDelay},
    {TEST_HEAP_FAULT_REPEAT, oomAsyncHeapFaultRepeat},
    {NULL, NULL},
};
/* Transient out of memory error happening after send() has returned. */
TEST(send, oomAsync, setUp, tearDown, 0, oomAsyncParams)
{
    struct fixture *f = data;
    SEND(0);
    return MUNIT_OK;
}
/* The backend gets closed while there is a pending write. */
TEST(send, closeDuringWrite, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    struct raft_entry entry;
    /* Fixed: the entry's type and term were left uninitialized but get
     * serialized on send -- reading indeterminate memory. */
    entry.type = RAFT_COMMAND;
    entry.term = 1;
    /* Set a very large message that is likely to fill the socket buffer.
     * TODO: figure a more deterministic way to choose the value. */
    entry.buf.len = 1024 * 1024 * 8;
    entry.buf.base = raft_malloc(entry.buf.len);
    munit_assert_ptr_not_null(entry.buf.base);
    MESSAGE(0)->type = RAFT_APPEND_ENTRIES;
    MESSAGE(0)->append_entries.term = 1;
    MESSAGE(0)->append_entries.prev_log_index = 0;
    MESSAGE(0)->append_entries.prev_log_term = 0;
    MESSAGE(0)->append_entries.leader_commit = 0;
    MESSAGE(0)->append_entries.entries = &entry;
    MESSAGE(0)->append_entries.n_entries = 1;
    SEND_SUBMIT(0 /* message */, 0 /* rv */, RAFT_CANCELED /* status */);
    TEAR_DOWN_UV;
    raft_free(entry.buf.base);
    return MUNIT_OK;
}
/* The backend gets closed while there is a pending connect request. */
TEST(send, closeDuringConnection, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    /* Submit but do not wait: the close cancels the in-flight connect. */
    SEND_SUBMIT(0 /* message */, 0 /* rv */, RAFT_CANCELED /* status */);
    TEAR_DOWN_UV;
    return MUNIT_OK;
}
raft-0.22.1/test/integration/test_uv_set_term.c 0000664 0000000 0000000 00000021661 14601504142 0021574 0 ustar 00root root 0000000 0000000 #include "../../include/raft/uv.h"
#include "../../src/byte.h"
#include "../lib/runner.h"
#include "../lib/uv.h"
/******************************************************************************
*
* Fixture with a libuv-based raft_io instance.
*
*****************************************************************************/
/* Test fixture: a libuv-based raft_io instance plus a flag tracking whether
 * it has finished closing. */
struct fixture
{
    FIXTURE_UV_DEPS;
    FIXTURE_UV;
    bool closed; /* Set by closeCb when the instance has closed. */
};
/******************************************************************************
*
* Helper macros
*
*****************************************************************************/
/* Close callback: record on the fixture that the raft_io instance finished
 * closing. */
static void closeCb(struct raft_io *io)
{
    struct fixture *fixture = io->data;
    fixture->closed = true;
}
/* Invoke raft_uv_init() and raft_io->init() (ID 1, address "1"), asserting
 * that no error occurs. */
#define INIT                                                          \
    do {                                                              \
        int _rv;                                                      \
        _rv = raft_uv_init(&f->io, &f->loop, f->dir, &f->transport);  \
        munit_assert_int(_rv, ==, 0);                                 \
        _rv = f->io.init(&f->io, 1, "1");                             \
        munit_assert_int(_rv, ==, 0);                                 \
    } while (0)
/* Invoke raft_io->close(), wait for the close callback, then release the
 * backend. */
#define CLOSE                                 \
    do {                                      \
        f->io.close(&f->io, closeCb);         \
        LOOP_RUN_UNTIL(&f->closed);           \
        raft_uv_close(&f->io);                \
    } while (0)
/* Invoke f->io->set_term() and assert that no error occurs. */
#define SET_TERM(TERM)                           \
    do {                                         \
        int _rv;                                 \
        _rv = f->io.set_term(&f->io, TERM);      \
        munit_assert_int(_rv, ==, 0);            \
    } while (0)
/* Invoke f->io->set_term() and assert that the given error code is returned
 * and the given error message set. */
#define SET_TERM_ERROR(TERM, RV, ERRMSG)                          \
    do {                                                          \
        int _rv;                                                  \
        _rv = f->io.set_term(&f->io, TERM);                       \
        munit_assert_int(_rv, ==, RV);                            \
        munit_assert_string_equal(f->io.errmsg_(&f->io), ERRMSG); \
    } while (0)
/* Write either the metadata1 or metadata2 file, filling its four 64-bit
 * little-endian words with the given values. */
#define WRITE_METADATA_FILE(N, FORMAT, VERSION, TERM, VOTED_FOR) \
    {                                                            \
        uint8_t buf[8 * 4];                                      \
        uint8_t *cursor = buf;                                   \
        char filename[strlen("metadataN") + 1];                  \
        sprintf(filename, "metadata%d", N);                      \
        bytePut64(&cursor, FORMAT);                              \
        bytePut64(&cursor, VERSION);                             \
        bytePut64(&cursor, TERM);                                \
        bytePut64(&cursor, VOTED_FOR);                           \
        DirWriteFile(f->dir, filename, buf, sizeof buf);         \
    }
/* Assert that the content of either the metadata1 or metadata2 file matches
 * the given values. The first word (format) is always expected to be 1. */
#define ASSERT_METADATA_FILE(N, VERSION, TERM, VOTED_FOR)        \
    {                                                            \
        uint8_t buf2[8 * 4];                                     \
        const uint8_t *cursor = buf2;                            \
        char filename[strlen("metadataN") + 1];                  \
        sprintf(filename, "metadata%d", N);                      \
        DirReadFile(f->dir, filename, buf2, sizeof buf2);        \
        munit_assert_ullong(byteGet64(&cursor), ==, 1);          \
        munit_assert_ullong(byteGet64(&cursor), ==, VERSION);    \
        munit_assert_ullong(byteGet64(&cursor), ==, TERM);       \
        munit_assert_ullong(byteGet64(&cursor), ==, VOTED_FOR);  \
    }
/******************************************************************************
*
* Set up and tear down.
*
*****************************************************************************/
/* Set up only the fixture's dependencies; the backend is not initialized,
 * so tests can write metadata files before INIT runs. */
static void *setUpDeps(const MunitParameter params[], void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SETUP_UV_DEPS;
    f->io.data = f;
    f->closed = false;
    return f;
}
/* Full setup: dependencies plus an initialized backend. */
static void *setUp(const MunitParameter params[], void *user_data)
{
    struct fixture *f = setUpDeps(params, user_data);
    INIT;
    return f;
}
/* Close the backend and tear down the dependencies. */
static void tearDown(void *data)
{
    struct fixture *f = data;
    CLOSE;
    TEAR_DOWN_UV_DEPS;
    free(f);
}
/******************************************************************************
*
* raft_io->set_term()
*
*****************************************************************************/
SUITE(set_term)
/* The very first time set_term() is called, the metadata1 file gets written. */
TEST(set_term, first, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    SET_TERM(1);
    ASSERT_METADATA_FILE(1, 1, 1, 0);
    munit_assert_false(DirHasFile(f->dir, "metadata2"));
    return MUNIT_OK;
}
/* The second time set_term() is called, the metadata2 file gets written. */
TEST(set_term, second, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    SET_TERM(1);
    SET_TERM(2);
    ASSERT_METADATA_FILE(1, 1, 1, 0);
    ASSERT_METADATA_FILE(2, 2, 2, 0);
    return MUNIT_OK;
}
/* The third time set_term() is called, the metadata1 file gets overwritten. */
TEST(set_term, third, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    SET_TERM(1);
    SET_TERM(2);
    SET_TERM(3);
    /* Writes alternate between the two files; metadata1 now holds the newest
     * version. */
    ASSERT_METADATA_FILE(1, 3, 3, 0);
    ASSERT_METADATA_FILE(2, 2, 2, 0);
    return MUNIT_OK;
}
/* The fourth time set_term() is called, the metadata2 file gets overwritten. */
TEST(set_term, fourth, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    SET_TERM(1);
    SET_TERM(2);
    SET_TERM(3);
    SET_TERM(4);
    ASSERT_METADATA_FILE(1, 3, 3, 0);
    ASSERT_METADATA_FILE(2, 4, 4, 0);
    return MUNIT_OK;
}
/* If the data directory has a single metadata1 file, the first time set_data()
 * is called, the second metadata file gets created. */
TEST(set_term, metadataOneExists, setUpDeps, tearDown, 0, NULL)
{
    struct fixture *f = data;
    WRITE_METADATA_FILE(1, /* Metadata file index                 */
                        1, /* Format                              */
                        1, /* Version                             */
                        1, /* Term                                */
                        0 /* Voted for                            */);
    INIT;
    SET_TERM(2);
    ASSERT_METADATA_FILE(1, 1, 1, 0);
    ASSERT_METADATA_FILE(2, 2, 2, 0);
    return MUNIT_OK;
}
/* The data directory has both metadata files, but metadata1 is greater. */
TEST(set_term, metadataOneIsGreater, setUpDeps, tearDown, 0, NULL)
{
    struct fixture *f = data;
    WRITE_METADATA_FILE(1, /* Metadata file index                 */
                        1, /* Format                              */
                        3, /* Version                             */
                        3, /* Term                                */
                        0 /* Voted for                            */);
    WRITE_METADATA_FILE(2, /* Metadata file index                 */
                        1, /* Format                              */
                        2, /* Version                             */
                        2, /* Term                                */
                        0 /* Voted for                            */);
    INIT;
    SET_TERM(4);
    /* The new term lands in metadata2, the file holding the older version. */
    ASSERT_METADATA_FILE(1 /* n */, 3 /* version */, 3 /* term */,
                         0 /* voted for */);
    ASSERT_METADATA_FILE(2 /* n */, 4 /* version */, 4 /* term */,
                         0 /* voted for */);
    return MUNIT_OK;
}
/* The data directory has both metadata files, but metadata2 is greater. */
TEST(set_term, metadataTwoIsGreater, setUpDeps, tearDown, 0, NULL)
{
    struct fixture *f = data;
    WRITE_METADATA_FILE(1, /* Metadata file index                 */
                        1, /* Format                              */
                        1, /* Version                             */
                        1, /* Term                                */
                        0 /* Voted for                            */);
    WRITE_METADATA_FILE(2, /* Metadata file index                 */
                        1, /* Format                              */
                        2, /* Version                             */
                        2, /* Term                                */
                        0 /* Voted for                            */);
    INIT;
    SET_TERM(2);
    /* The new term lands in metadata1, the file holding the older version. */
    ASSERT_METADATA_FILE(1 /* n */, 3 /* version */, 2 /* term */,
                         0 /* voted for */);
    ASSERT_METADATA_FILE(2 /* n */, 2 /* version */, 2 /* term */,
                         0 /* voted for */);
    return MUNIT_OK;
}
raft-0.22.1/test/integration/test_uv_snapshot_put.c 0000664 0000000 0000000 00000025736 14601504142 0022510 0 ustar 00root root 0000000 0000000 #include
#include "../lib/runner.h"
#include "../lib/tcp.h"
#include "../lib/uv.h"
#include "append_helpers.h"
/******************************************************************************
*
* Fixture with a libuv-based raft_io instance.
*
*****************************************************************************/
/* Test fixture: a libuv-based raft_io instance. */
struct fixture
{
    FIXTURE_UV_DEPS;
    FIXTURE_UV;
    bool closed; /* Close-tracking flag. */
    int count;   /* Generic counter available to tests. */
};
/******************************************************************************
 *
 * Helper macros
 *
 *****************************************************************************/
/* Expected snapshot attributes plus completion flag, shared with
 * snapshotGetCbAssertResult via req->data. */
struct snapshot
{
    raft_term term;   /* Expected snapshot term. */
    raft_index index; /* Expected snapshot index. */
    uint64_t data;    /* Expected payload value. */
    bool done;        /* Set by the callback when the get completes. */
};
/* Callback for raft_io->snapshot_put(): check the completion status against
 * the expected one in req->data, flag completion and release the snapshot's
 * configuration.
 *
 * NOTE(review): `struct result` here carries a third `data` member pointing at
 * the snapshot (see SNAPSHOT_PUT_REQ); presumably declared in
 * append_helpers.h -- confirm. */
static void snapshotPutCbAssertResult(struct raft_io_snapshot_put *req,
                                      int status)
{
    struct result *result = req->data;
    struct raft_snapshot *snapshot = result->data;
    munit_assert_int(status, ==, result->status);
    result->done = true;
    raft_configuration_close(&snapshot->configuration);
}
/* Callback for raft_io->snapshot_get(): check that the loaded snapshot
 * matches the expected term and index in req->data, flag completion and
 * release the snapshot's memory. */
static void snapshotGetCbAssertResult(struct raft_io_snapshot_get *req,
                                      struct raft_snapshot *snapshot,
                                      int status)
{
    struct snapshot *expect = req->data;
    munit_assert_int(status, ==, 0);
    munit_assert_ptr_not_null(snapshot);
    munit_assert_int(snapshot->term, ==, expect->term);
    /* Fixed: this used to compare snapshot->index against itself, which
     * always passes; compare against the expected index instead. */
    munit_assert_int(snapshot->index, ==, expect->index);
    expect->done = true;
    raft_configuration_close(&snapshot->configuration);
    raft_free(snapshot->bufs[0].base);
    raft_free(snapshot->bufs);
    raft_free(snapshot);
}
/* Submit a request to truncate the log at N. */
#define TRUNCATE(N)                            \
    {                                          \
        int _rv;                               \
        _rv = f->io.truncate(&f->io, N);       \
        munit_assert_int(_rv, ==, 0);          \
    }
/* Build a single-buffer snapshot at (term 1, INDEX) with a one-server
 * configuration and submit it via raft_io->snapshot_put(), asserting the
 * synchronous return value RV. Declares block-scope variables (no do/while
 * wrapper) so that callers can later reference _result. */
#define SNAPSHOT_PUT_REQ(TRAILING, INDEX, RV, STATUS)             \
    struct raft_snapshot _snapshot;                               \
    struct raft_buffer _snapshot_buf;                             \
    uint64_t _snapshot_data;                                      \
    struct raft_io_snapshot_put _req;                             \
    struct result _result = {STATUS, false, &_snapshot};          \
    int _rv;                                                      \
    _snapshot.term = 1;                                           \
    _snapshot.index = INDEX;                                      \
    raft_configuration_init(&_snapshot.configuration);            \
    _rv = raft_configuration_add(&_snapshot.configuration, 1, "1", \
                                 RAFT_STANDBY);                   \
    munit_assert_int(_rv, ==, 0);                                 \
    _snapshot.bufs = &_snapshot_buf;                              \
    _snapshot.n_bufs = 1;                                         \
    _snapshot_buf.base = &_snapshot_data;                         \
    _snapshot_buf.len = sizeof _snapshot_data;                    \
    _req.data = &_result;                                         \
    _rv = f->io.snapshot_put(&f->io, TRAILING, &_req, &_snapshot, \
                             snapshotPutCbAssertResult);          \
    munit_assert_int(_rv, ==, RV)
/* Submit a snapshot put request for the given snapshot and wait for the
 * operation to successfully complete. */
#define SNAPSHOT_PUT(TRAILING, INDEX)                                   \
    do {                                                                \
        SNAPSHOT_PUT_REQ(TRAILING, INDEX, 0 /* rv */, 0 /* status */);  \
        LOOP_RUN_UNTIL(&_result.done);                                  \
    } while (0)
/* Submit a snapshot put request and assert that it fails synchronously with
 * the given error code and message.
 *
 * Fixed: the parameter list was (SNAPSHOT, TRAILING, RV, ERRMSG) while the
 * underlying SNAPSHOT_PUT_REQ macro expects (TRAILING, INDEX, RV, STATUS),
 * so the first two arguments were silently forwarded in the wrong roles.
 * The parameters are now named after what they actually forward. */
#define SNAPSHOT_PUT_ERROR(TRAILING, INDEX, RV, ERRMSG)              \
    do {                                                             \
        SNAPSHOT_PUT_REQ(TRAILING, INDEX, RV, 0 /* status */);       \
        /* munit_assert_string_equal(f->transport.errmsg, ERRMSG);*/ \
    } while (0)
/* Submit a snapshot put request and wait for the operation to fail with the
 * given code and message.
 *
 * Fixed: SNAPSHOT_PUT_REQ takes four arguments (TRAILING, INDEX, RV, STATUS)
 * but was invoked with only two, which would not compile if this macro were
 * ever expanded. Default to trailing 0 and index 1, matching the simple
 * install cases in this suite. */
#define SNAPSHOT_PUT_FAILURE(STATUS, ERRMSG)                           \
    do {                                                               \
        SNAPSHOT_PUT_REQ(0 /* trailing */, 1 /* index */, 0 /* rv */, \
                         STATUS);                                      \
        LOOP_RUN_UNTIL(&_result.done);                                 \
        /*munit_assert_string_equal(f->transport.errmsg, ERRMSG);*/    \
    } while (0)
/* Use raft_io->snapshot_get to load the last snapshot and compare it with the
* given parameters. */
#define ASSERT_SNAPSHOT(TERM, INDEX, DATA) \
do { \
struct raft_io_snapshot_get _req; \
struct snapshot _expect = {TERM, INDEX, DATA, false}; \
int _rv; \
_req.data = &_expect; \
_rv = f->io.snapshot_get(&f->io, &_req, snapshotGetCbAssertResult); \
munit_assert_int(_rv, ==, 0); \
LOOP_RUN_UNTIL(&_expect.done); \
} while (0)
/******************************************************************************
*
* Set up and tear down.
*
*****************************************************************************/
/* Tear down only the libuv dependencies. Used directly as a teardown by
 * tests that close the raft_io instance themselves (e.g.
 * noSpaceRetryCancel), hence the NULL guard. */
static void tearDownDeps(void *data)
{
    struct fixture *f = data;
    if (f == NULL) {
        return;
    }
    TEAR_DOWN_UV_DEPS;
    free(f);
}
/* Create the fixture: set up the libuv dependencies and the raft_io
 * instance, and reset the per-test state. */
static void *setUp(const MunitParameter params[], void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SETUP_UV_DEPS;
    SETUP_UV;
    f->io.data = f;
    f->closed = false;
    /* Explicitly reset the entry-data counter, consistent with the other
     * uv fixtures (e.g. test_uv_truncate.c) instead of relying on the
     * allocator zeroing the memory. */
    f->count = 0;
    return f;
}
/* Tear down the raft_io instance, then its dependencies. */
static void tearDown(void *data)
{
    struct fixture *f = data;
    if (f == NULL) {
        return;
    }
    TEAR_DOWN_UV;
    tearDownDeps(f);
}
/******************************************************************************
*
* raft_io->snapshot_put
*
*****************************************************************************/
SUITE(snapshot_put)
/* Put the first snapshot. */
TEST(snapshot_put, first, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
SNAPSHOT_PUT(10, /* trailing */
1 /* index */
);
ASSERT_SNAPSHOT(1, 1, 1);
return MUNIT_OK;
}
/* If the number of closed entries is less than the given trailing amount, no
* segment is deleted. */
TEST(snapshot_put, entriesLessThanTrailing, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
unsigned i;
raft_uv_set_segment_size(
&f->io, 4096); /* Lower the number of block to force finalizing */
for (i = 0; i < 40; i++) {
APPEND(10, 8);
}
SNAPSHOT_PUT(128, /* trailing */
100 /* index */
);
munit_assert_true(DirHasFile(f->dir, "0000000000000001-0000000000000150"));
munit_assert_true(DirHasFile(f->dir, "0000000000000151-0000000000000300"));
return MUNIT_OK;
}
/* If the number of closed entries is greater than the given trailing amount,
* closed segments that are fully past the trailing amount get deleted. */
TEST(snapshot_put, entriesMoreThanTrailing, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
unsigned i;
raft_uv_set_segment_size(
&f->io, 4096); /* Lower the number of block to force finalizing */
for (i = 0; i < 40; i++) {
APPEND(10, 8);
}
SNAPSHOT_PUT(128, /* trailing */
280 /* index */
);
munit_assert_false(DirHasFile(f->dir, "0000000000000001-0000000000000150"));
munit_assert_true(DirHasFile(f->dir, "0000000000000151-0000000000000300"));
return MUNIT_OK;
}
/* Request to install a snapshot. */
TEST(snapshot_put, install, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
APPEND(4, 8);
SNAPSHOT_PUT(0, /* trailing */
1 /* index */
);
return MUNIT_OK;
}
/* Request to install a snapshot, no previous entry is present. */
TEST(snapshot_put, installWithoutPreviousEntries, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
SNAPSHOT_PUT(0, /* trailing */
1 /* index */
);
return MUNIT_OK;
}
/* Request to install a couple of snapshots in a row, no previous entry is
* present. */
TEST(snapshot_put,
installMultipleWithoutPreviousEntries,
setUp,
tearDown,
0,
NULL)
{
struct fixture *f = data;
SNAPSHOT_PUT(0, /* trailing */
1 /* index */
);
SNAPSHOT_PUT(0, /* trailing */
3 /* index */
);
SNAPSHOT_PUT(0, /* trailing */
1337 /* index */
);
return MUNIT_OK;
}
/* Request to install a couple of snapshots in a row, AppendEntries Requests
* happen before, meanwhile and after */
TEST(snapshot_put,
installMultipleAppendEntriesInBetween,
setUp,
tearDown,
0,
NULL)
{
struct fixture *f = data;
APPEND_SUBMIT(0, 256, 8);
APPEND_SUBMIT(1, 256, 8);
SNAPSHOT_PUT(0, /* trailing */
1 /* index */
);
APPEND_WAIT(0);
APPEND_WAIT(1);
APPEND_SUBMIT(2, 256, 8);
APPEND_SUBMIT(3, 256, 8);
SNAPSHOT_PUT(0, /* trailing */
100 /* index */
);
APPEND_WAIT(2);
APPEND_WAIT(3);
APPEND_SUBMIT(4, 256, 8);
APPEND_SUBMIT(5, 256, 8);
APPEND_WAIT(4);
APPEND_WAIT(5);
return MUNIT_OK;
}
/* A request to install a snapshot fails due to lack of disk space, the
* operation is retried until it's finally cancelled upon shutdown. */
TEST(snapshot_put, noSpaceRetryCancel, setUp, tearDownDeps, 0, DirTmpfsParams)
{
struct fixture *f = data;
unsigned i;
SKIP_IF_NO_FIXTURE;
#if defined(__powerpc64__)
/* XXX: fails on ppc64el */
TEAR_DOWN_UV;
return MUNIT_SKIP;
#endif
raft_uv_set_segment_size(&f->io, 4096 * 2);
raft_uv_set_disk_retry(&f->io, 10);
for (i = 0; i < 5; i++) {
APPEND(10, 8);
}
DirFill(f->dir, 32);
SNAPSHOT_PUT_REQ(128, /* trailing */
280 /* index */, 0 /* rv */, RAFT_CANCELED);
LOOP_RUN(2);
TEAR_DOWN_UV;
return MUNIT_OK;
}
/* A request to install a snapshot is retried due to lack of disk space and
* eventually succeeds when disk space is recovered. */
TEST(snapshot_put, noSpaceRetryResolved, setUp, tearDown, 0, DirTmpfsParams)
{
struct fixture *f = data;
unsigned i;
SKIP_IF_NO_FIXTURE;
#if defined(__powerpc64__)
/* XXX: fails on ppc64el */
return MUNIT_SKIP;
#endif
raft_uv_set_segment_size(&f->io, 4096 * 2);
raft_uv_set_disk_retry(&f->io, 10);
for (i = 0; i < 5; i++) {
APPEND(10, 8);
}
DirFill(f->dir, 32);
SNAPSHOT_PUT_REQ(128, /* trailing */
280 /* index */, 0 /* rv */, 0);
LOOP_RUN(2);
DirRemoveFile(f->dir, ".fill");
LOOP_RUN_UNTIL(&_result.done);
return MUNIT_OK;
}
raft-0.22.1/test/integration/test_uv_tcp_connect.c 0000664 0000000 0000000 00000026415 14601504142 0022253 0 ustar 00root root 0000000 0000000 #include "../../include/raft.h"
#include "../../include/raft/uv.h"
#include "../lib/addrinfo.h"
#include "../lib/heap.h"
#include "../lib/loop.h"
#include "../lib/runner.h"
#include "../lib/tcp.h"
/******************************************************************************
*
* Fixture with a TCP-based raft_uv_transport.
*
*****************************************************************************/
struct fixture
{
FIXTURE_HEAP;
FIXTURE_LOOP;
FIXTURE_TCP_SERVER;
struct raft_uv_transport transport;
bool closed;
};
/******************************************************************************
*
* Helper macros
*
*****************************************************************************/
struct result
{
int status;
bool done;
};
/* Transport close callback: flag the fixture so CLOSE_WAIT can return. */
static void closeCb(struct raft_uv_transport *transport)
{
    struct fixture *f = transport->data;
    f->closed = true;
}
/* Connect completion callback: check the status against the expectation
 * stored in req->data, dispose of the stream on success and mark the
 * request as finished. */
static void connectCbAssertResult(struct raft_uv_connect *req,
                                  struct uv_stream_s *stream,
                                  int status)
{
    struct result *expected = req->data;
    munit_assert_int(status, ==, expected->status);
    if (status == 0) {
        uv_close((struct uv_handle_s *)stream, (uv_close_cb)raft_free);
    }
    expected->done = true;
}
#define INIT \
do { \
int _rv; \
_rv = f->transport.init(&f->transport, 1, "127.0.0.1:9000"); \
munit_assert_int(_rv, ==, 0); \
f->transport.data = f; \
f->closed = false; \
} while (0)
#define CLOSE_SUBMIT \
munit_assert_false(f->closed); \
f->transport.close(&f->transport, closeCb);
#define CLOSE_WAIT LOOP_RUN_UNTIL(&f->closed)
#define CLOSE \
CLOSE_SUBMIT; \
CLOSE_WAIT
#define CONNECT_REQ(ID, ADDRESS, RV, STATUS) \
struct raft_uv_connect _req; \
struct result _result = {STATUS, false}; \
int _rv; \
_req.data = &_result; \
_rv = f->transport.connect(&f->transport, &_req, ID, ADDRESS, \
connectCbAssertResult); \
munit_assert_int(_rv, ==, RV)
/* Try to submit a connect request and assert that the given error code and
* message are returned. */
#define CONNECT_ERROR(ID, ADDRESS, RV, ERRMSG) \
{ \
CONNECT_REQ(ID, ADDRESS, RV /* rv */, 0 /* status */); \
munit_assert_string_equal(f->transport.errmsg, ERRMSG); \
}
/* Submit a connect request with the given parameters and wait for the operation
* to successfully complete. */
#define CONNECT(ID, ADDRESS) \
{ \
CONNECT_REQ(ID, ADDRESS, 0 /* rv */, 0 /* status */); \
LOOP_RUN_UNTIL(&_result.done); \
}
/* Submit a connect request with the given parameters and wait for the operation
* to fail with the given code and message. */
#define CONNECT_FAILURE(ID, ADDRESS, STATUS, ERRMSG) \
{ \
CONNECT_REQ(ID, ADDRESS, 0 /* rv */, STATUS); \
LOOP_RUN_UNTIL(&_result.done); \
munit_assert_string_equal(f->transport.errmsg, ERRMSG); \
}
/* Submit a connect request with the given parameters, close the transport after
* N loop iterations and assert that the request got canceled. */
#define CONNECT_CLOSE(ID, ADDRESS, N) \
{ \
CONNECT_REQ(ID, ADDRESS, 0 /* rv */, RAFT_CANCELED); \
LOOP_RUN(N); \
CLOSE_SUBMIT; \
munit_assert_false(_result.done); \
LOOP_RUN_UNTIL(&_result.done); \
CLOSE_WAIT; \
}
/******************************************************************************
*
* Set up and tear down.
*
*****************************************************************************/
/* Set up heap, loop, TCP server and the TCP transport. The transport is
 * created here but not yet initialized with an ID/address -- see INIT. */
static void *setUpDeps(const MunitParameter params[],
                       MUNIT_UNUSED void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    int rv;
    SET_UP_ADDRINFO;
    SET_UP_HEAP;
    SETUP_LOOP;
    SETUP_TCP_SERVER;
    f->transport.version = 1;
    rv = raft_uv_tcp_init(&f->transport, &f->loop);
    munit_assert_int(rv, ==, 0);
    return f;
}
/* Stop the loop and release transport, TCP server, loop and heap. Used
 * directly as a teardown by tests that close the transport themselves. */
static void tearDownDeps(void *data)
{
    struct fixture *f = data;
    LOOP_STOP;
    raft_uv_tcp_close(&f->transport);
    TEAR_DOWN_TCP_SERVER;
    TEAR_DOWN_LOOP;
    TEAR_DOWN_HEAP;
    TEAR_DOWN_ADDRINFO;
    free(f);
}
/* Create the fixture and initialize the transport with ID 1 and address
 * 127.0.0.1:9000 (see INIT). */
static void *setUp(const MunitParameter params[], void *user_data)
{
    struct fixture *f = setUpDeps(params, user_data);
    INIT;
    return f;
}
/* Close the transport, then tear down its dependencies. */
static void tearDown(void *data)
{
    struct fixture *f = data;
    CLOSE;
    tearDownDeps(f);
}
/******************************************************************************
*
* raft_uv_transport->connect()
*
*****************************************************************************/
#define BOGUS_ADDRESS "127.0.0.1:6666"
#define INVALID_ADDRESS "500.0.0.1:6666"
SUITE(tcp_connect)
/* Successfully connect to the peer by IP */
TEST(tcp_connect, first, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
CONNECT(2, TCP_SERVER_ADDRESS);
return MUNIT_OK;
}
/* Successfully connect to the peer by hostname */
TEST(tcp_connect, connectByName, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    char host_address[256]; /* Fixed typo: was "host_adress". */
    /* Use snprintf instead of sprintf to guard against truncation. */
    snprintf(host_address, sizeof host_address, "localhost:%d",
             TCP_SERVER_PORT);
    CONNECT(2, host_address);
    return MUNIT_OK;
}
/* Successfully connect to the peer by first IP */
TEST(tcp_connect, firstIP, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
const struct AddrinfoResult results[] = {{"127.0.0.1", TCP_SERVER_PORT},
{"192.0.2.0", 6666}};
AddrinfoInjectSetResponse(0, 2, results);
CONNECT(2, "any-host");
return MUNIT_OK;
}
/* Successfully connect to the peer by second IP */
TEST(tcp_connect, secondIP, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Fixed: the first port was written as the floating point literal
     * .6666, which implicitly converted to port 0; the intent is a
     * non-listening port (6666) so the connect attempt falls through to
     * the second address. */
    const struct AddrinfoResult results[] = {{"127.0.0.1", 6666},
                                             {"127.0.0.1", TCP_SERVER_PORT}};
    AddrinfoInjectSetResponse(0, 2, results);
    CONNECT(2, "any-host");
    return MUNIT_OK;
}
/* The peer has shutdown */
TEST(tcp_connect, refused, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
TCP_SERVER_STOP;
CONNECT_FAILURE(2, BOGUS_ADDRESS, RAFT_NOCONNECTION,
"uv_tcp_connect(): connection refused");
return MUNIT_OK;
}
static char *oomHeapFaultDelay[] = {"0", "1", "2", NULL};
static char *oomHeapFaultRepeat[] = {"1", NULL};
static MunitParameterEnum oomParams[] = {
{TEST_HEAP_FAULT_DELAY, oomHeapFaultDelay},
{TEST_HEAP_FAULT_REPEAT, oomHeapFaultRepeat},
{NULL, NULL},
};
/* Out of memory conditions. */
TEST(tcp_connect, oom, setUp, tearDown, 0, oomParams)
{
struct fixture *f = data;
HEAP_FAULT_ENABLE;
CONNECT_ERROR(2, BOGUS_ADDRESS, RAFT_NOMEM, "out of memory");
return MUNIT_OK;
}
/* The transport is closed immediately after a connect request as been
* submitted. The request's callback is invoked with RAFT_CANCELED. */
TEST(tcp_connect, closeImmediately, setUp, tearDownDeps, 0, NULL)
{
struct fixture *f = data;
CONNECT_CLOSE(2, TCP_SERVER_ADDRESS, 0);
return MUNIT_OK;
}
/* The transport gets closed during the dns lookup */
TEST(tcp_connect, closeDuringDnsLookup, setUp, tearDownDeps, 0, NULL)
{
struct fixture *f = data;
CONNECT_CLOSE(2, TCP_SERVER_ADDRESS, 1);
return MUNIT_OK;
}
/* The transport gets closed during the handshake. */
TEST(tcp_connect, closeDuringHandshake, setUp, tearDownDeps, 0, NULL)
{
struct fixture *f = data;
/* This test fails for libuv version >= 1.44.2 due to changes in uv_run
* whereby queueing and processing the write_cb happen in the same loop
* iteration, not leaving us a chance to close without going through a lot
* of hoops.
* https://github.com/libuv/libuv/pull/3598 */
unsigned incompatible_uv = (1 << 16) | (44 << 8) | 2;
if (uv_version() >= incompatible_uv) {
CLOSE;
return MUNIT_SKIP;
}
CONNECT_CLOSE(2, TCP_SERVER_ADDRESS, 2);
return MUNIT_OK;
}
static void checkCb(struct uv_check_s *check)
{
struct fixture *f = check->data;
CLOSE_SUBMIT;
uv_close((struct uv_handle_s *)check, NULL);
}
/* The transport gets closed right after a dns lookup failure, while the
* connection attempt is being aborted. */
TEST(tcp_connect, closeDuringDnsLookupAbort, setUp, tearDownDeps, 0, NULL)
{
struct fixture *f = data;
struct uv_check_s check;
int rv;
/* Use a check handle in order to close the transport in the same loop
* iteration where the dns failure lookup occurs */
rv = uv_check_init(&f->loop, &check);
munit_assert_int(rv, ==, 0);
check.data = f;
uv_check_start(&check, checkCb);
CONNECT_REQ(2, INVALID_ADDRESS, 0, RAFT_NOCONNECTION);
LOOP_RUN(1);
LOOP_RUN_UNTIL(&_result.done);
CLOSE_WAIT;
return MUNIT_OK;
}
/* The transport gets closed right after a connection failure, while the
* connection attempt is being aborted. */
TEST(tcp_connect, closeDuringConnectAbort, setUp, tearDownDeps, 0, NULL)
{
struct fixture *f = data;
struct uv_check_s check;
int rv;
/* Use a check handle in order to close the transport in the same loop
* iteration where the connection failure occurs. */
rv = uv_check_init(&f->loop, &check);
munit_assert_int(rv, ==, 0);
check.data = f;
CONNECT_REQ(2, BOGUS_ADDRESS, 0, RAFT_NOCONNECTION);
/* Successfull DNS lookup will initiate async connect */
LOOP_RUN(1);
uv_check_start(&check, checkCb);
LOOP_RUN(1);
LOOP_RUN_UNTIL(&_result.done);
CLOSE_WAIT;
return MUNIT_OK;
}
/* The transport gets closed right after the first connection attempt failed,
 * while doing a second connection attempt. */
TEST(tcp_connect, closeDuringSecondConnect, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    struct uv_check_s check;
    int rv;
    /* Fixed: the first port was the floating point literal .6666, which
     * implicitly converted to port 0; the intent is a non-listening port
     * (6666) so the first connect attempt fails and a second one starts. */
    const struct AddrinfoResult results[] = {{"127.0.0.1", 6666},
                                             {"127.0.0.1", TCP_SERVER_PORT}};
    AddrinfoInjectSetResponse(0, 2, results);
    /* Use a check handle in order to close the transport in the same loop
     * iteration where the second connection attempt occurs. */
    rv = uv_check_init(&f->loop, &check);
    munit_assert_int(rv, ==, 0);
    check.data = f;
    CONNECT_REQ(2, "any-host", 0, RAFT_CANCELED);
    /* Successful DNS lookup will initiate async connect */
    LOOP_RUN(1);
    uv_check_start(&check, checkCb);
    LOOP_RUN(1);
    LOOP_RUN_UNTIL(&_result.done);
    CLOSE_WAIT;
    return MUNIT_OK;
}
raft-0.22.1/test/integration/test_uv_tcp_listen.c 0000664 0000000 0000000 00000031522 14601504142 0022113 0 ustar 00root root 0000000 0000000 #include "../../include/raft.h"
#include "../../include/raft/uv.h"
#include "../../src/byte.h"
#include "../lib/addrinfo.h"
#include "../lib/heap.h"
#include "../lib/loop.h"
#include "../lib/runner.h"
#include "../lib/tcp.h"
/******************************************************************************
*
* Fixture with a TCP-based raft_uv_transport.
*
*****************************************************************************/
struct fixture
{
FIXTURE_HEAP;
FIXTURE_LOOP;
FIXTURE_TCP;
struct raft_uv_transport transport;
bool accepted;
bool closed;
struct
{
uint8_t buf[sizeof(uint64_t) + /* Protocol version */
sizeof(uint64_t) + /* Server ID */
sizeof(uint64_t) + /* Length of address */
sizeof(uint64_t) * 2 /* Address */];
size_t offset;
} handshake;
};
/******************************************************************************
*
* Helper macros
*
*****************************************************************************/
#define PEER_ID 2
#define PEER_ADDRESS "127.0.0.1:666"
/* Transport close callback: flag the fixture so CLOSE can stop looping. */
static void closeCb(struct raft_uv_transport *transport)
{
    struct fixture *f = transport->data;
    f->closed = true;
}
/* Accept callback: assert that the ID and address decoded from the peer's
 * handshake match what the test client sent, then drop the stream. */
static void acceptCb(struct raft_uv_transport *t,
                     raft_id id,
                     const char *address,
                     struct uv_stream_s *stream)
{
    struct fixture *f = t->data;
    munit_assert_int(id, ==, PEER_ID);
    munit_assert_string_equal(address, PEER_ADDRESS);
    f->accepted = true;
    uv_close((struct uv_handle_s *)stream, (uv_close_cb)raft_free);
}
#define INIT \
do { \
int _rv; \
f->transport.version = 1; \
_rv = raft_uv_tcp_init(&f->transport, &f->loop); \
munit_assert_int(_rv, ==, 0); \
const char *bind_addr = munit_parameters_get(params, "bind-address"); \
if (bind_addr && strlen(bind_addr)) { \
_rv = raft_uv_tcp_set_bind_address(&f->transport, bind_addr); \
munit_assert_int(_rv, ==, 0); \
} \
const char *address = munit_parameters_get(params, "address"); \
if (!address) { \
address = "127.0.0.1:9000"; \
} \
_rv = f->transport.init(&f->transport, 1, address); \
munit_assert_int(_rv, ==, 0); \
f->transport.data = f; \
f->closed = false; \
} while (0)
#define CLOSE \
do { \
f->transport.close(&f->transport, closeCb); \
LOOP_RUN_UNTIL(&f->closed); \
raft_uv_tcp_close(&f->transport); \
} while (0)
/******************************************************************************
*
* Set up and tear down.
*
*****************************************************************************/
static void *setUpDeps(const MunitParameter params[],
MUNIT_UNUSED void *user_data)
{
struct fixture *f = munit_malloc(sizeof *f);
SET_UP_ADDRINFO;
SET_UP_HEAP;
SETUP_LOOP;
SETUP_TCP;
return f;
}
static void tearDownDeps(void *data)
{
struct fixture *f = data;
TEAR_DOWN_TCP;
TEAR_DOWN_LOOP;
TEAR_DOWN_HEAP;
TEAR_DOWN_ADDRINFO;
free(f);
}
/* Create the fixture and pre-build a valid handshake buffer that a test
 * client can send: protocol version 1, the peer ID, and the peer address
 * in a field declared as 16 bytes long (two 64-bit words, see the
 * handshake.buf layout in struct fixture). */
static void *setUp(const MunitParameter params[], void *user_data)
{
    struct fixture *f = setUpDeps(params, user_data);
    uint8_t *cursor;
    /* test_tcp_listen(&f->tcp); */
    INIT;
    f->accepted = false;
    f->handshake.offset = 0;
    cursor = f->handshake.buf;
    bytePut64(&cursor, 1);       /* Protocol version */
    bytePut64(&cursor, PEER_ID); /* Server ID */
    bytePut64(&cursor, 16);      /* Declared address length (padded) */
    strcpy((char *)cursor, PEER_ADDRESS);
    return f;
}
static void tearDown(void *data)
{
struct fixture *f = data;
CLOSE;
tearDownDeps(f);
}
/******************************************************************************
*
* Helper macros
*
*****************************************************************************/
#define LISTEN(EXPECTED_RV) \
do { \
int rv; \
rv = f->transport.listen(&f->transport, acceptCb); \
munit_assert_int(rv, ==, EXPECTED_RV); \
} while (false)
/* Connect to the listening socket of the transport, creating a new connection
* that is waiting to be accepted. */
#define PEER_CONNECT TCP_CLIENT_CONNECT(9000)
/* Make the peer close the connection. */
#define PEER_CLOSE TCP_CLIENT_CLOSE
/* Make the connected client send handshake data. */
#define PEER_HANDSHAKE \
do { \
size_t n = sizeof f->handshake.buf; \
TCP_CLIENT_SEND(f->handshake.buf, n); \
} while (0)
/* Make the connected client send partial handshake data: only N bytes will be
* sent, starting from the offset of the last call. */
#define PEER_HANDSHAKE_PARTIAL(N) \
do { \
TCP_CLIENT_SEND(f->handshake.buf + f->handshake.offset, N); \
} while (0)
/* After a PEER_CONNECT() call, spin the event loop until the connected
* callback of the listening TCP handle gets called. */
#define LOOP_RUN_UNTIL_CONNECTED LOOP_RUN(1);
/* After a PEER_HANDSHAKE_PARTIAL() call, spin the event loop until the read
* callback gets called. */
#define LOOP_RUN_UNTIL_READ LOOP_RUN(1);
/* Spin the event loop until the accept callback gets eventually invoked. */
#define ACCEPT LOOP_RUN_UNTIL(&f->accepted);
/******************************************************************************
*
* Success scenarios.
*
*****************************************************************************/
SUITE(tcp_listen)
/* Parameters for listen address */
static char *validAddresses[] = {"127.0.0.1:9000", "localhost:9000", NULL};
static char *validBindAddresses[] = {
"", "127.0.0.1:9000", "localhost:9000", ":9000", "0.0.0.0:9000", NULL};
static MunitParameterEnum validListenParams[] = {
{"address", validAddresses},
{"bind-address", validBindAddresses},
{NULL, NULL},
};
/* If the handshake is successful, the accept callback is invoked. */
TEST(tcp_listen, success, setUp, tearDown, 0, validListenParams)
{
struct fixture *f = data;
LISTEN(0);
PEER_CONNECT;
PEER_HANDSHAKE;
ACCEPT;
return MUNIT_OK;
}
/* Parameters for invalid listen addresses */
static char *invalidAddresses[] = {"500.1.2.3:9000", "not-existing:9000",
"192.0.2.0:9000", NULL};
static char *invalidBindAddresses[] = {
"", "500.1.2.3:9000", "not-existing:9000", "192.0.2.0:9000", NULL};
static MunitParameterEnum invalidTcpListenParams[] = {
{"address", invalidAddresses},
{"bind-address", invalidBindAddresses},
{NULL, NULL},
};
/* Check error on invalid hostname specified */
TEST(tcp_listen, invalidAddress, setUp, tearDown, 0, invalidTcpListenParams)
{
struct fixture *f = data;
LISTEN(RAFT_IOERR);
return MUNIT_OK;
}
/* Check success with addrinfo resolve to mutiple IP and first one is used to
* connect */
TEST(tcp_listen, firstOfTwo, setUp, tearDown, 0, NULL)
{
const struct AddrinfoResult results[] = {{"127.0.0.1", 9000},
{"127.0.0.2", 9000}};
struct fixture *f = data;
AddrinfoInjectSetResponse(0, 2, results);
LISTEN(0);
PEER_CONNECT;
PEER_HANDSHAKE;
ACCEPT;
return MUNIT_OK;
}
/* Check success with addrinfo resolve to mutiple IP and second one is used to
* connect */
TEST(tcp_listen, secondOfTwo, setUp, tearDown, 0, NULL)
{
const struct AddrinfoResult results[] = {{"127.0.0.2", 9000},
{"127.0.0.1", 9000}};
struct fixture *f = data;
AddrinfoInjectSetResponse(0, 2, results);
LISTEN(0);
PEER_CONNECT;
PEER_HANDSHAKE;
ACCEPT;
return MUNIT_OK;
}
/* Simulate port already in use error by addrinfo response contain the same IP
* twice */
TEST(tcp_listen, alreadyBound, setUp, tearDown, 0, NULL)
{
/* We need to use the same endpoint three times as a simple duplicate will
* be skipped due to a glib strange behavior
* https://bugzilla.redhat.com/show_bug.cgi?id=496300 */
const struct AddrinfoResult results[] = {
{"127.0.0.1", 9000}, {"127.0.0.1", 9000}, {"127.0.0.1", 9000}};
struct fixture *f = data;
AddrinfoInjectSetResponse(0, 3, results);
LISTEN(RAFT_IOERR);
return MUNIT_OK;
}
/* Error in bind first IP address */
TEST(tcp_listen, cannotBindFirst, setUp, tearDown, 0, NULL)
{
const struct AddrinfoResult results[] = {{"192.0.2.0", 9000},
{"127.0.0.1", 9000}};
struct fixture *f = data;
AddrinfoInjectSetResponse(0, 2, results);
LISTEN(RAFT_IOERR);
return MUNIT_OK;
}
/* Error in bind of second IP address */
TEST(tcp_listen, cannotBindSecond, setUp, tearDown, 0, NULL)
{
const struct AddrinfoResult results[] = {{"127.0.0.1", 9000},
{"192.0.2.0", 9000}};
struct fixture *f = data;
AddrinfoInjectSetResponse(0, 2, results);
LISTEN(RAFT_IOERR);
return MUNIT_OK;
}
/* Check error on general dns server failure */
TEST(tcp_listen, resolveFailure, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
AddrinfoInjectSetResponse(EAI_FAIL, 0, NULL);
LISTEN(RAFT_IOERR);
return MUNIT_OK;
}
/* The client sends us a bad protocol version */
TEST(tcp_listen, badProtocol, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    LISTEN(0);
    /* Corrupt the 8-byte protocol version field of the handshake.
     * NOTE(review): memset truncates 999 to a single byte (0xE7); any
     * non-zero fill produces an invalid version here, but the literal is
     * misleading -- consider an explicit byte value. */
    memset(f->handshake.buf, 999, sizeof(uint64_t));
    PEER_CONNECT;
    PEER_HANDSHAKE;
    LOOP_RUN_UNTIL_CONNECTED;
    LOOP_RUN_UNTIL_READ;
    return MUNIT_OK;
}
/* Parameters for sending a partial handshake */
static char *partialHandshakeN[] = {"8", "16", "24", "32", NULL};
static MunitParameterEnum peerAbortParams[] = {
{"n", partialHandshakeN},
{NULL, NULL},
};
/* The peer closes the connection after having sent a partial handshake. */
TEST(tcp_listen, peerAbort, setUp, tearDown, 0, peerAbortParams)
{
struct fixture *f = data;
LISTEN(0);
const char *n = munit_parameters_get(params, "n");
PEER_CONNECT;
PEER_HANDSHAKE_PARTIAL(atoi(n));
LOOP_RUN_UNTIL_CONNECTED;
LOOP_RUN_UNTIL_READ;
PEER_CLOSE;
return MUNIT_OK;
}
/* TODO: skip "2" because it makes libuv crash, as it calls abort(). See also
* https://github.com/libuv/libuv/issues/1948 */
static char *oomHeapFaultDelay[] = {"0", "1", "3", NULL};
static char *oomHeapFaultRepeat[] = {"1", NULL};
static MunitParameterEnum oomParams[] = {
{TEST_HEAP_FAULT_DELAY, oomHeapFaultDelay},
{TEST_HEAP_FAULT_REPEAT, oomHeapFaultRepeat},
{NULL, NULL},
};
/* Out of memory conditions */
TEST(tcp_listen, oom, setUp, tearDown, 0, oomParams)
{
struct fixture *f = data;
LISTEN(0);
PEER_CONNECT;
PEER_HANDSHAKE;
HEAP_FAULT_ENABLE;
/* Run as much as possible. */
uv_run(&f->loop, UV_RUN_NOWAIT);
uv_run(&f->loop, UV_RUN_NOWAIT);
uv_run(&f->loop, UV_RUN_NOWAIT);
return MUNIT_OK;
}
/* Close the transport right after an incoming connection becomes pending, but
* it hasn't been accepted yet. */
TEST(tcp_listen, pending, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
LISTEN(0);
PEER_CONNECT;
return MUNIT_OK;
}
/* Close the transport right after an incoming connection gets accepted, and the
* peer hasn't sent handshake data yet. */
TEST(tcp_listen, closeBeforeHandshake, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
LISTEN(0);
PEER_CONNECT;
LOOP_RUN_UNTIL_CONNECTED;
return MUNIT_OK;
}
static MunitParameterEnum closeDuringHandshake[] = {
{"n", partialHandshakeN},
{NULL, NULL},
};
/* Close the transport right after the peer has started to send handshake data,
* but isn't done with it yet. */
TEST(tcp_listen, handshake, setUp, tearDown, 0, closeDuringHandshake)
{
struct fixture *f = data;
LISTEN(0);
const char *n_param = munit_parameters_get(params, "n");
PEER_CONNECT;
PEER_HANDSHAKE_PARTIAL(atoi(n_param));
LOOP_RUN_UNTIL_CONNECTED;
LOOP_RUN_UNTIL_READ;
return MUNIT_OK;
}
raft-0.22.1/test/integration/test_uv_truncate.c 0000664 0000000 0000000 00000027644 14601504142 0021606 0 ustar 00root root 0000000 0000000 #include "../lib/runner.h"
#include "../lib/uv.h"
/******************************************************************************
*
* Fixture
*
*****************************************************************************/
struct fixture
{
FIXTURE_UV_DEPS;
FIXTURE_UV;
int count; /* To generate deterministic entry data */
};
/******************************************************************************
*
* Helper macros
*
*****************************************************************************/
struct result
{
int status;
bool done;
};
/* Append completion callback: check the status against the expectation
 * stored in req->data and mark the request as finished. */
static void appendCbAssertResult(struct raft_io_append *req, int status)
{
    struct result *expected = req->data;
    munit_assert_int(status, ==, expected->status);
    expected->done = true;
}
/* Declare and fill the entries array for the append request identified by
* I. The array will have N entries, and each entry will have a data buffer of
* SIZE bytes.*/
#define ENTRIES(I, N, SIZE) \
struct raft_entry _entries##I[N]; \
uint8_t _entries_data##I[N * SIZE]; \
do { \
int _i; \
for (_i = 0; _i < N; _i++) { \
struct raft_entry *entry = &_entries##I[_i]; \
entry->term = 1; \
entry->type = RAFT_COMMAND; \
entry->buf.base = &_entries_data##I[_i * SIZE]; \
entry->buf.len = SIZE; \
entry->batch = NULL; \
munit_assert_ptr_not_null(entry->buf.base); \
memset(entry->buf.base, 0, entry->buf.len); \
f->count++; \
*(uint64_t *)entry->buf.base = f->count; \
} \
} while (0)
/* Submit an append request identified by I, with N_ENTRIES entries, each one of
* size ENTRY_SIZE). */
#define APPEND_SUBMIT(I, N_ENTRIES, ENTRY_SIZE) \
struct raft_io_append _req##I; \
struct result _result##I = {0, false}; \
int _rv##I; \
ENTRIES(I, N_ENTRIES, ENTRY_SIZE); \
_req##I.data = &_result##I; \
_rv##I = f->io.append(&f->io, &_req##I, _entries##I, N_ENTRIES, \
appendCbAssertResult); \
munit_assert_int(_rv##I, ==, 0)
/* Wait for the append request identified by I to complete. */
#define APPEND_WAIT(I) LOOP_RUN_UNTIL(&_result##I.done)
#define APPEND_EXPECT(I, STATUS) _result##I.status = STATUS
/* Submit an append request and wait for it to successfully complete. */
#define APPEND(N) \
do { \
APPEND_SUBMIT(9999, N, 8); \
APPEND_WAIT(9999); \
} while (0)
#define TRUNCATE(N) \
do { \
int rv_; \
rv_ = f->io.truncate(&f->io, N); \
munit_assert_int(rv_, ==, 0); \
} while (0)
/******************************************************************************
*
* Set up and tear down.
*
*****************************************************************************/
/* Create the fixture: libuv deps, raft_io instance, and reset the counter
 * used to generate deterministic entry data. */
static void *setUp(const MunitParameter params[], void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SETUP_UV_DEPS;
    SETUP_UV;
    f->count = 0;
    return f;
}
/* Tear down only the libuv dependencies: the raft_io instance itself is
 * shut down by ASSERT_ENTRIES inside each test. */
static void tearDownDeps(void *data)
{
    struct fixture *f = data;
    TEAR_DOWN_UV_DEPS;
    free(f);
}
/******************************************************************************
*
* Assertions
*
*****************************************************************************/
/* Shutdown the fixture's raft_io instance, then load all entries on disk using
 * a new, throw-away raft_io instance, and assert that there are N entries with
 * data matching the DATA varargs (each entry's first 8 bytes, as written by
 * the ENTRIES macro).
 *
 * Also asserts that no snapshot is present on disk. Since this macro runs
 * TEAR_DOWN_UV first, it must be the last raft_io interaction in the test.
 *
 * NOTE(review): the trailing semicolon after `while (0)` defeats the usual
 * do/while(0) guard in unbraced if/else contexts; all current callers use the
 * macro as a plain statement, so this is harmless today. */
#define ASSERT_ENTRIES(N, ...)                                                \
    TEAR_DOWN_UV;                                                             \
    do {                                                                      \
        struct uv_loop_s _loop;                                               \
        struct raft_uv_transport _transport;                                  \
        struct raft_io _io;                                                   \
        struct raft_tracer _tracer;                                           \
        raft_term _term;                                                      \
        raft_id _voted_for;                                                   \
        struct raft_snapshot *_snapshot;                                      \
        raft_index _start_index;                                              \
        struct raft_entry *_entries;                                          \
        size_t _i;                                                            \
        size_t _n;                                                            \
        void *_batch = NULL;                                                  \
        unsigned _data[N] = {__VA_ARGS__};                                    \
        int _rv;                                                              \
                                                                              \
        _rv = uv_loop_init(&_loop);                                           \
        munit_assert_int(_rv, ==, 0);                                         \
        _transport.version = 1;                                               \
        _rv = raft_uv_tcp_init(&_transport, &_loop);                          \
        munit_assert_int(_rv, ==, 0);                                         \
        _rv = raft_uv_init(&_io, &_loop, f->dir, &_transport);                \
        munit_assert_int(_rv, ==, 0);                                         \
        _tracer.emit = TracerEmit;                                            \
        _tracer.version = 2;                                                  \
        raft_uv_set_tracer(&_io, &_tracer);                                   \
        _rv = _io.init(&_io, 1, "1");                                         \
        munit_assert_int(_rv, ==, 0);                                         \
        _rv = _io.load(&_io, &_term, &_voted_for, &_snapshot, &_start_index,  \
                       &_entries, &_n);                                       \
        munit_assert_int(_rv, ==, 0);                                         \
        _io.close(&_io, NULL);                                                \
        uv_run(&_loop, UV_RUN_NOWAIT);                                        \
        raft_uv_close(&_io);                                                  \
        raft_uv_tcp_close(&_transport);                                       \
        uv_loop_close(&_loop);                                                \
                                                                              \
        munit_assert_ptr_null(_snapshot);                                     \
        munit_assert_int(_n, ==, N);                                          \
        for (_i = 0; _i < _n; _i++) {                                         \
            struct raft_entry *_entry = &_entries[_i];                        \
            uint64_t _value = *(uint64_t *)_entry->buf.base;                  \
            munit_assert_int(_entry->term, ==, 1);                            \
            munit_assert_int(_entry->type, ==, RAFT_COMMAND);                 \
            munit_assert_int(_value, ==, _data[_i]);                          \
            munit_assert_ptr_not_null(_entry->batch);                         \
        }                                                                     \
        for (_i = 0; _i < _n; _i++) {                                         \
            struct raft_entry *_entry = &_entries[_i];                        \
            if (_entry->batch != _batch) {                                    \
                _batch = _entry->batch;                                       \
                raft_free(_batch);                                            \
            }                                                                 \
        }                                                                     \
        raft_free(_entries);                                                  \
    } while (0);
/******************************************************************************
*
* raft_io->truncate()
*
*****************************************************************************/
SUITE(truncate)
/* If the index to truncate is at the start of a segment, that segment and all
 * subsequent ones are removed. */
TEST(truncate, wholeSegment, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    APPEND(3);   /* Entries with data 1, 2, 3. */
    TRUNCATE(1); /* Drop everything from index 1 onwards. */
    APPEND(1);   /* Data counter keeps growing, so this entry holds 4. */
    ASSERT_ENTRIES(1 /* n entries */, 4 /* entries data */);
    return MUNIT_OK;
}
/* The index to truncate is the same as the last appended entry: only that
 * last entry is dropped. */
TEST(truncate, sameAsLastIndex, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    APPEND(3);   /* Entries with data 1, 2, 3. */
    TRUNCATE(3); /* Drop only entry at index 3. */
    APPEND(1);   /* New entry holds 4. */
    ASSERT_ENTRIES(3 /* n entries */, 1, 2, 4 /* entries data */);
    return MUNIT_OK;
}
/* If the index to truncate is not at the start of a segment, that segment gets
 * truncated in place. */
TEST(truncate, partialSegment, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    APPEND(3);   /* Entries with data 1, 2, 3. */
    APPEND(1);   /* Entry with data 4. */
    TRUNCATE(2); /* Drop indexes 2..4. */
    APPEND(1);   /* New entry holds 5. */
    ASSERT_ENTRIES(2, /* n entries */
                   1, 5 /* entries data */
    );
    return MUNIT_OK;
}
/* The truncate request is issued while an append request is still pending:
 * truncation is applied after the in-flight append completes. */
TEST(truncate, pendingAppend, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    APPEND_SUBMIT(0, /* request ID */
                  3, /* n entries (data 1, 2, 3) */
                  8 /* entry size */
    );
    TRUNCATE(2 /* truncation index */);
    APPEND(1); /* New entry holds 4. */
    ASSERT_ENTRIES(2, /* n entries */
                   1, 4 /* entries data */
    );
    return MUNIT_OK;
}
/* Multiple truncate requests pending at the same time, interleaved with
 * pending appends: they are applied in submission order. */
TEST(truncate, multiplePending, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    APPEND_SUBMIT(0, /* request ID */
                  3, /* n entries (data 1, 2, 3) */
                  8 /* entry size */
    );
    TRUNCATE(2 /* truncation index */);
    APPEND_SUBMIT(1, /* request ID */
                  2, /* n entries (data 4, 5) */
                  8 /* entry size */
    );
    TRUNCATE(3 /* truncation index */);
    APPEND(1); /* New entry holds 6. */
    ASSERT_ENTRIES(3, /* n entries */
                   1, 4, 6 /* entries data */
    );
    return MUNIT_OK;
}
/* The truncate request gets canceled because we're closing: the pending
 * append is completed with RAFT_CANCELED. */
TEST(truncate, closing, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    APPEND_SUBMIT(0, /* request ID */
                  3, /* n entries */
                  8 /* entry size */
    );
    TRUNCATE(2 /* truncation index */);
    APPEND_EXPECT(0, /* request ID */
                  RAFT_CANCELED /* status */
    );
    TEAR_DOWN_UV;
    return MUNIT_OK;
}
/* Multiple truncate requests get canceled because we're closing: every
 * pending append is completed with RAFT_CANCELED. */
TEST(truncate, closingMultiple, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    APPEND_SUBMIT(0, /* request ID */
                  3, /* n entries */
                  8 /* entry size */
    );
    TRUNCATE(2 /* truncation index */);
    APPEND_SUBMIT(1, /* request ID */
                  2, /* n entries */
                  8 /* entry size */
    );
    TRUNCATE(3 /* truncation index */);
    APPEND_EXPECT(0, /* request ID */
                  RAFT_CANCELED /* status */
    );
    APPEND_EXPECT(1, /* request ID */
                  RAFT_CANCELED /* status */
    );
    TEAR_DOWN_UV;
    return MUNIT_OK;
}
raft-0.22.1/test/integration/test_uv_truncate_snapshot.c 0000664 0000000 0000000 00000027011 14601504142 0023511 0 ustar 00root root 0000000 0000000 #include "../lib/runner.h"
#include "../lib/uv.h"
/******************************************************************************
*
* Fixture
*
*****************************************************************************/
/* Test fixture: libuv-based raft_io instance plus its dependencies. */
struct fixture
{
    FIXTURE_UV_DEPS;
    FIXTURE_UV;
    int count; /* To generate deterministic entry data */
};
/******************************************************************************
*
* Helper macros
*
*****************************************************************************/
/* Maximum number of blocks a segment can have */
#define MAX_SEGMENT_BLOCKS 4
/* This block size should work fine for all file systems. */
#define SEGMENT_BLOCK_SIZE 4096
/* Default segment size */
#define SEGMENT_SIZE 4096 * MAX_SEGMENT_BLOCKS
struct result
{
int status;
bool done;
void *data;
};
/* Append completion callback: assert that the completion status matches the
 * one expected by the struct result attached to the request, and flag the
 * request as done so the test loop can stop waiting. */
static void appendCbAssertResult(struct raft_io_append *req, int status)
{
    struct result *result = req->data;
    munit_assert_int(status, ==, result->status);
    result->done = true;
}
/* Snapshot-put completion callback: same contract as appendCbAssertResult,
 * but for raft_io_snapshot_put requests. */
static void snapshotPutCbAssertResult(struct raft_io_snapshot_put *req,
                                      int status)
{
    struct result *result = req->data;
    munit_assert_int(status, ==, result->status);
    result->done = true;
}
/* Declare and fill the entries array for the append request identified by
 * I. The array will have N entries, and each entry will have a data buffer of
 * SIZE bytes, zeroed except for the first 8 bytes which hold the fixture's
 * monotonically increasing counter (so entry data is deterministic).
 *
 * NOTE(review): assumes SIZE >= sizeof(uint64_t); smaller sizes would make
 * the final counter store overflow the buffer. */
#define ENTRIES(I, N, SIZE)                                 \
    struct raft_entry _entries##I[N];                       \
    uint8_t _entries_data##I[N * SIZE];                     \
    do {                                                    \
        int _i;                                             \
        for (_i = 0; _i < N; _i++) {                        \
            struct raft_entry *entry = &_entries##I[_i];    \
            entry->term = 1;                                \
            entry->type = RAFT_COMMAND;                     \
            entry->buf.base = &_entries_data##I[_i * SIZE]; \
            entry->buf.len = SIZE;                          \
            entry->batch = NULL;                            \
            munit_assert_ptr_not_null(entry->buf.base);     \
            memset(entry->buf.base, 0, entry->buf.len);     \
            f->count++;                                     \
            *(uint64_t *)entry->buf.base = f->count;        \
        }                                                   \
    } while (0)
/* Submit an append request identified by I, with N_ENTRIES entries, each one
 * of size ENTRY_SIZE. The request object and its expected-result struct are
 * deliberately declared at the invocation scope (not inside a do/while) so
 * they stay alive until the asynchronous request completes. */
#define APPEND_SUBMIT(I, N_ENTRIES, ENTRY_SIZE)                      \
    struct raft_io_append _req##I;                                   \
    struct result _result##I = {0, false, NULL};                     \
    int _rv##I;                                                      \
    ENTRIES(I, N_ENTRIES, ENTRY_SIZE);                               \
    _req##I.data = &_result##I;                                      \
    _rv##I = f->io.append(&f->io, &_req##I, _entries##I, N_ENTRIES,  \
                          appendCbAssertResult);                     \
    munit_assert_int(_rv##I, ==, 0)
/* Submit a truncate request for all entries from index N onwards, asserting
 * that submission succeeds (completion is asynchronous and has no callback). */
#define TRUNCATE(N)                          \
    do {                                     \
        int rv_;                             \
        rv_ = f->io.truncate(&f->io, N);     \
        munit_assert_int(rv_, ==, 0);        \
    } while (0)
/******************************************************************************
*
* Set up and tear down.
*
*****************************************************************************/
/* Create the fixture, configuring small block/segment sizes so that tests can
 * fill whole segments with only a handful of entries. */
static void *setUp(const MunitParameter params[], void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SETUP_UV_DEPS;
    SETUP_UV;
    raft_uv_set_block_size(&f->io, SEGMENT_BLOCK_SIZE);
    raft_uv_set_segment_size(&f->io, SEGMENT_SIZE);
    f->count = 0;
    return f;
}
/* Tear down only the fixture's dependencies, assuming the raft_io instance
 * was already torn down by the test body (e.g. via ASSERT_ENTRIES). */
static void tearDownDeps(void *data)
{
    struct fixture *f = data;
    TEAR_DOWN_UV_DEPS;
    free(f);
}
/******************************************************************************
*
* Assertions
*
*****************************************************************************/
/* Shutdown the fixture's raft_io instance, then load all entries on disk using
 * a new, throw-away raft_io instance, and assert that there are N entries with
 * data matching the DATA varargs.
 *
 * Unlike the plain truncate-suite variant, a snapshot may legitimately be on
 * disk here, so it is released if present instead of being asserted NULL.
 * Since this macro runs TEAR_DOWN_UV first, it must be the last raft_io
 * interaction in the test. */
#define ASSERT_ENTRIES(N, ...)                                                \
    TEAR_DOWN_UV;                                                             \
    do {                                                                      \
        struct uv_loop_s _loop;                                               \
        struct raft_uv_transport _transport;                                  \
        struct raft_io _io;                                                   \
        struct raft_tracer _tracer;                                           \
        raft_term _term;                                                      \
        raft_id _voted_for;                                                   \
        struct raft_snapshot *_snap;                                          \
        raft_index _start_index;                                              \
        struct raft_entry *_entries;                                          \
        size_t _i;                                                            \
        size_t _n;                                                            \
        void *_batch = NULL;                                                  \
        unsigned _data[N] = {__VA_ARGS__};                                    \
        int _ret;                                                             \
                                                                              \
        _ret = uv_loop_init(&_loop);                                          \
        munit_assert_int(_ret, ==, 0);                                        \
        _transport.version = 1;                                               \
        _ret = raft_uv_tcp_init(&_transport, &_loop);                         \
        munit_assert_int(_ret, ==, 0);                                        \
        _ret = raft_uv_init(&_io, &_loop, f->dir, &_transport);               \
        munit_assert_int(_ret, ==, 0);                                        \
        _tracer.emit = TracerEmit;                                            \
        _tracer.version = 2;                                                  \
        raft_uv_set_tracer(&_io, &_tracer);                                   \
        _ret = _io.init(&_io, 1, "1");                                        \
        munit_assert_int(_ret, ==, 0);                                        \
        _ret = _io.load(&_io, &_term, &_voted_for, &_snap, &_start_index,     \
                        &_entries, &_n);                                      \
        munit_assert_int(_ret, ==, 0);                                        \
        _io.close(&_io, NULL);                                                \
        uv_run(&_loop, UV_RUN_NOWAIT);                                        \
        raft_uv_close(&_io);                                                  \
        raft_uv_tcp_close(&_transport);                                       \
        uv_loop_close(&_loop);                                                \
                                                                              \
        munit_assert_size(_n, ==, N);                                         \
        for (_i = 0; _i < _n; _i++) {                                         \
            struct raft_entry *_entry = &_entries[_i];                        \
            uint64_t _value = *(uint64_t *)_entry->buf.base;                  \
            munit_assert_int(_entry->term, ==, 1);                            \
            munit_assert_int(_entry->type, ==, RAFT_COMMAND);                 \
            munit_assert_int(_value, ==, _data[_i]);                          \
            munit_assert_ptr_not_null(_entry->batch);                         \
        }                                                                     \
        for (_i = 0; _i < _n; _i++) {                                         \
            struct raft_entry *_entry = &_entries[_i];                        \
            if (_entry->batch != _batch) {                                    \
                _batch = _entry->batch;                                       \
                raft_free(_batch);                                            \
            }                                                                 \
        }                                                                     \
        raft_free(_entries);                                                  \
        if (_snap != NULL) {                                                  \
            raft_configuration_close(&_snap->configuration);                  \
            munit_assert_int(_snap->n_bufs, ==, 1);                           \
            raft_free(_snap->bufs[0].base);                                   \
            raft_free(_snap->bufs);                                           \
            raft_free(_snap);                                                 \
        }                                                                     \
    } while (0);
/* Submit a snapshot_put request with the given TRAILING setting and snapshot
 * INDEX (term 1, single-server RAFT_STANDBY configuration, one 8-byte data
 * buffer). Assert that submission returns RV; the completion callback will
 * assert STATUS. Variables live at invocation scope so they survive until the
 * asynchronous request completes; pair with SNAPSHOT_CLEANUP(). */
#define SNAPSHOT_PUT_REQ(TRAILING, INDEX, RV, STATUS)              \
    struct raft_snapshot _snapshot;                                \
    struct raft_buffer _snapshot_buf;                              \
    uint64_t _snapshot_data;                                       \
    struct raft_io_snapshot_put _req;                              \
    struct result _result = {STATUS, false, NULL};                 \
    int _rv;                                                       \
    _snapshot.term = 1;                                            \
    _snapshot.index = INDEX;                                       \
    raft_configuration_init(&_snapshot.configuration);             \
    _rv = raft_configuration_add(&_snapshot.configuration, 1, "1", \
                                 RAFT_STANDBY);                    \
    munit_assert_int(_rv, ==, 0);                                  \
    _snapshot.bufs = &_snapshot_buf;                               \
    _snapshot.n_bufs = 1;                                          \
    _snapshot_buf.base = &_snapshot_data;                          \
    _snapshot_buf.len = sizeof _snapshot_data;                     \
    _req.data = &_result;                                          \
    _rv = f->io.snapshot_put(&f->io, TRAILING, &_req, &_snapshot,  \
                             snapshotPutCbAssertResult);           \
    munit_assert_int(_rv, ==, RV)

/* Release the configuration allocated by SNAPSHOT_PUT_REQ. */
#define SNAPSHOT_CLEANUP() raft_configuration_close(&_snapshot.configuration)
/******************************************************************************
*
* test interaction of raft_io->snapshot_put and raft_io->truncate()
*
*****************************************************************************/
SUITE(snapshot_truncate)
/* Fill up 3 segments worth of data (12 entries with data 1..12), then take a
 * snapshot. While the snapshot is taken, start a truncate request; both
 * operations serialize through internal barriers. */
TEST(snapshot_truncate, snapshotThenTruncate, setUp, tearDownDeps, 0, NULL)
{
    struct fixture *f = data;
    APPEND_SUBMIT(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE);
    APPEND_SUBMIT(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE);
    APPEND_SUBMIT(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE);
    /* Take a snapshot, this will use a uv_barrier. */
    SNAPSHOT_PUT_REQ(8192, 6, 0, 0);
    /* Truncate, this will use a uv_barrier too. */
    TRUNCATE(8);
    /* There's no truncate callback to wait for, loop for a while. */
    LOOP_RUN(1000);
    /* Check that truncate has done its job: entries 8..12 are gone. */
    ASSERT_ENTRIES(7, 1, 2, 3, 4, 5, 6, 7);
    SNAPSHOT_CLEANUP();
    return MUNIT_OK;
}
raft-0.22.1/test/lib/ 0000775 0000000 0000000 00000000000 14601504142 0014252 5 ustar 00root root 0000000 0000000 raft-0.22.1/test/lib/addrinfo.c 0000664 0000000 0000000 00000013035 14601504142 0016206 0 ustar 00root root 0000000 0000000 #include "addrinfo.h"
#include
#include
#include
#include
#include
bool addrinfo_mock_enabled = false;
enum addrinfo_mock_state { MockResultSet, MockResultReturned, SystemResult };
struct addrinfo_mock_data
{
enum addrinfo_mock_state state;
int rv;
struct addrinfo *result;
struct addrinfo_mock_data *next;
};
static struct addrinfo_mock_data *addrinfo_data;
/* Enable getaddrinfo/freeaddrinfo interception for the current test, asserting
 * that no previous test left it enabled or left tracked results behind. */
void AddrinfoInjectSetUp(MUNIT_UNUSED const MunitParameter params[])
{
    munit_assert_int(addrinfo_mock_enabled, ==, false);
    munit_assert_ptr((void *)addrinfo_data, ==, NULL);
    addrinfo_mock_enabled = true;
}
/* Disable interception, asserting that every getaddrinfo result handed out
 * during the test was released with freeaddrinfo (no leaks). */
void AddrinfoInjectTearDown(void)
{
    munit_assert_int(addrinfo_mock_enabled, ==, true);
    // If data is not freed the freeaddrinfo was not invoked.
    munit_assert_ptr((void *)addrinfo_data, ==, NULL);
    addrinfo_mock_enabled = false;
}
/* Queue an artificial response for the next getaddrinfo() call: either an
 * error code @rv, or (when rv is 0) an addrinfo list built from the given
 * ip/port pairs. The list is constructed back-to-front so results come out in
 * the order given. Requires interception to be enabled and any previously set
 * response to have been consumed already. */
void AddrinfoInjectSetResponse(int rv,
                               int num_results,
                               const struct AddrinfoResult *results)
{
    munit_assert_int(addrinfo_mock_enabled, ==, true);
    munit_assert(!addrinfo_data || addrinfo_data->state == MockResultReturned);
    /* Either an error is injected, or at least one result must be given. */
    munit_assert(rv || (num_results && results));
    struct addrinfo_mock_data *response =
        malloc(sizeof(struct addrinfo_mock_data));
    munit_assert_ptr((void *)response, !=, NULL);
    response->state = MockResultSet;
    response->rv = rv;
    response->result = NULL;
    for (int i = num_results - 1; i >= 0; --i) {
        struct sockaddr_in *addr_in = malloc(sizeof(struct sockaddr_in));
        munit_assert_ptr((void *)addr_in, !=, NULL);
        munit_assert_int(uv_ip4_addr(results[i].ip, results[i].port, addr_in),
                         ==, 0);
        struct addrinfo *ai = malloc(sizeof(struct addrinfo));
        munit_assert_ptr((void *)ai, !=, NULL);
        ai->ai_flags = 0;
        ai->ai_family = AF_INET;
        ai->ai_socktype = SOCK_STREAM;
        ai->ai_protocol = IPPROTO_TCP;
        ai->ai_addrlen = sizeof(struct sockaddr_in);
        ai->ai_addr = (struct sockaddr *)addr_in;
        ai->ai_canonname = NULL;
        ai->ai_next = response->result; /* Prepend to the list. */
        response->result = ai;
    }
    /* Push onto the head of the pending-responses stack. */
    response->next = addrinfo_data;
    addrinfo_data = response;
}
/* Call the real libc getaddrinfo(), located via dlsym(RTLD_NEXT) since this
 * file interposes the symbol with its own definition below. */
static int invoke_system_getaddrinfo(const char *node,
                                     const char *service,
                                     const struct addrinfo *hints,
                                     struct addrinfo **res)
{
    int (*system_getaddrinfo)(const char *node, const char *service,
                              const struct addrinfo *hints,
                              struct addrinfo **res);
    *(void **)(&system_getaddrinfo) = dlsym(RTLD_NEXT, "getaddrinfo");
    munit_assert_ptr(*(void **)&system_getaddrinfo, !=, NULL);
    return (*system_getaddrinfo)(node, service, hints, res);
}
/* Interposed getaddrinfo(): when mocking is disabled, forward to libc. When
 * enabled, either return an injected response (consuming it), or fall back to
 * the real call while recording the returned list so that the matching
 * freeaddrinfo() call can be verified on tear-down. */
int getaddrinfo(const char *node,
                const char *service,
                const struct addrinfo *hints,
                struct addrinfo **res)
{
    int rv;
    if (!addrinfo_mock_enabled) {
        return invoke_system_getaddrinfo(node, service, hints, res);
    }
    if (!addrinfo_data || addrinfo_data->state == SystemResult) {
        /* We have not injected response, invoke system function */
        rv = invoke_system_getaddrinfo(node, service, hints, res);
        if (!rv) {
            /* Store result for check on freeaddrinfo */
            struct addrinfo_mock_data *response =
                malloc(sizeof(struct addrinfo_mock_data));
            munit_assert_ptr((void *)response, !=, NULL);
            response->state = SystemResult;
            response->rv = rv;
            response->result = *res;
            response->next = addrinfo_data;
            addrinfo_data = response;
        }
        return rv;
    }
    if (addrinfo_data) {
        /* Consume the injected response at the head of the stack. */
        munit_assert_int(addrinfo_data->state, ==, MockResultSet);
        addrinfo_data->state = MockResultReturned;
        rv = addrinfo_data->rv;
        if (!rv) {
            /* Success: hand out the fabricated list; the tracking entry is
             * released later by freeaddrinfo(). */
            *res = addrinfo_data->result;
        } else {
            /* Injected error: no list to hand out, release tracking now. */
            *res = NULL;
            struct addrinfo_mock_data *response = addrinfo_data;
            munit_assert_ptr((void *)response->result, ==, NULL);
            addrinfo_data = response->next;
            free(response);
        }
        return rv;
    }
    /* Unreachable given the branches above, kept as a safe default. */
    return EAI_FAIL;
}
/* Call the real libc freeaddrinfo(), located via dlsym(RTLD_NEXT). */
static void invoke_system_freeaddrinfo(struct addrinfo *res)
{
    int (*system_freeaddrinfo)(struct addrinfo * res);
    *(void **)(&system_freeaddrinfo) = dlsym(RTLD_NEXT, "freeaddrinfo");
    munit_assert_ptr(*(void **)&system_freeaddrinfo, !=, NULL);
    (*system_freeaddrinfo)(res);
}
/* Interposed freeaddrinfo(): when mocking is disabled, forward to libc.
 * When enabled, find the tracking entry matching @res (it must exist — a
 * missing entry means the caller is freeing a list we never handed out),
 * unlink it, and release the list either via libc (system-produced) or
 * manually (fabricated by AddrinfoInjectSetResponse). */
void freeaddrinfo(struct addrinfo *res)
{
    struct addrinfo_mock_data **ptr;
    struct addrinfo_mock_data *response;
    // freeaddrinfo should not be invoked with a NULL pointer
    munit_assert_ptr((void *)res, !=, NULL);
    if (!addrinfo_mock_enabled) {
        invoke_system_freeaddrinfo(res);
        return;
    }
    /* Locate the tracking entry whose list is being freed. */
    for (ptr = &addrinfo_data; *ptr; ptr = &((*ptr)->next)) {
        if ((*ptr)->result == res) {
            break;
        }
    }
    response = *ptr;
    munit_assert_ptr((void *)response, !=, NULL);
    *ptr = response->next; /* Unlink from the tracking list. */
    if (response->state == SystemResult) {
        invoke_system_freeaddrinfo(response->result);
    } else {
        munit_assert_int(response->state, ==, MockResultReturned);
        /* Fabricated list: free each node and its sockaddr by hand. */
        res = response->result;
        while (res) {
            struct addrinfo *next = res->ai_next;
            free(res->ai_addr);
            free(res);
            res = next;
        }
    }
    free(response);
}
raft-0.22.1/test/lib/addrinfo.h 0000664 0000000 0000000 00000002377 14601504142 0016222 0 ustar 00root root 0000000 0000000 /* Support for getaddrinfo injection for test purpose
*
* Provide a local bound version to capture teh getaddrinfo/freeaddrinfo
* incovation The helper may operate in three different modes: a) Transparent
* forward calls to system getaddrinfo/freeaddrinfo function, if the
* SET_UP_ADDRINFO/TEAR_DOWN_ADDRINFO is not added to the test test case setup
* teardown. b) Check, if all results requested by getaddrinfo are freed using
* freeaddrinfo. Activated by adding the SET_UP_ADDRINFO/SET_UP_ADDRINFO macros
* to the test fixture. c) Inject artifical responses into the the getaddrinfo
* requests for test purpose additionally to b) by using
* AddrinfoInjectSetResponse before triggering the getaddrinfo calls.
*/
#ifndef TEST_ADDRINFO_H
#define TEST_ADDRINFO_H
#include "munit.h"
#define SET_UP_ADDRINFO AddrinfoInjectSetUp(params)
#define TEAR_DOWN_ADDRINFO AddrinfoInjectTearDown()
typedef struct AddrinfoResult
{
const char *ip;
const int port;
} AddrinfoResult_t;
void AddrinfoInjectSetResponse(int rv,
int num_results,
const struct AddrinfoResult *results);
void AddrinfoInjectSetUp(const MunitParameter params[]);
void AddrinfoInjectTearDown(void);
#endif // #ifndef TEST_ADDRINFO_H
raft-0.22.1/test/lib/aio.c 0000664 0000000 0000000 00000002714 14601504142 0015172 0 ustar 00root root 0000000 0000000 #include "aio.h"
#include
#include
#include
#include
#include "munit.h"
/* Fill the AIO subsystem resources by allocating a lot of events to the given
 * context, leaving only @n events available for subsequent io_setup calls.
 *
 * Returns 0 on success, or -1 if another process appears to be using the AIO
 * subsystem (which would make the calling test flaky) or if the reservation
 * raced with another process. */
int AioFill(aio_context_t *ctx, unsigned n)
{
    char buf[256];
    int fd;
    ssize_t n_read;
    int rv;
    int limit;
    int used;

    /* Figure out how many events are available. */
    fd = open("/proc/sys/fs/aio-max-nr", O_RDONLY);
    munit_assert_int(fd, !=, -1);
    /* Leave room for the terminator: read() does not NUL-terminate, and
     * atoi() requires a NUL-terminated string. */
    n_read = read(fd, buf, sizeof buf - 1);
    munit_assert_int(n_read, !=, -1);
    close(fd);
    buf[n_read] = '\0';
    limit = atoi(buf);
    munit_assert_int(limit, >, 0);

    /* Figure out how many events are in use. */
    fd = open("/proc/sys/fs/aio-nr", O_RDONLY);
    munit_assert_int(fd, !=, -1);
    n_read = read(fd, buf, sizeof buf - 1);
    munit_assert_int(n_read, !=, -1);
    close(fd);
    buf[n_read] = '\0';
    used = atoi(buf);
    munit_assert_int(used, >=, 0);

    /* Best effort check that nothing process is using AIO. Our own unit tests
     * case use up to 2 event slots at the time this function is called, so we
     * don't consider those. */
    if (used > 2) {
        return -1;
    }

    rv = syscall(__NR_io_setup, limit - used - n, ctx);
    if (rv != 0) {
        /* The `limit - used - n` calculation is racy and io_setup can fail
         * with EAGAIN if in meantime another proces has reserved some events */
        munit_assert_int(errno, ==, EAGAIN);
        return -1;
    }
    return 0;
}
/* Destroy the given AIO context, asserting that the io_destroy syscall
 * succeeds. */
void AioDestroy(aio_context_t ctx)
{
    int rv;
    rv = syscall(__NR_io_destroy, ctx);
    munit_assert_int(rv, ==, 0);
}
raft-0.22.1/test/lib/aio.h 0000664 0000000 0000000 00000001201 14601504142 0015165 0 ustar 00root root 0000000 0000000 /* Utilities around the Kernel AIO sub-system. */
#ifndef TEST_AIO_H
#define TEST_AIO_H
#include
/* Fill the AIO subsystem resources by allocating a lot of events to the given
* context, and leaving only @n events available for subsequent calls to
* @io_setup.
*
* Return -1 if it looks like there is another process already using the AIO
* subsystem, which would most probably make the calling test flaky because
* there won't be exactly @n events available anymore. */
int AioFill(aio_context_t *ctx, unsigned n);
/* Destroy the given AIO context. */
void AioDestroy(aio_context_t ctx);
#endif /* TEST_AIO_H */
raft-0.22.1/test/lib/cluster.c 0000664 0000000 0000000 00000124073 14601504142 0016106 0 ustar 00root root 0000000 0000000 #define TEST_CLUSTER_V1
#include "../../src/queue.h"
#include "cluster.h"
/* Defaults */
#define DEFAULT_ELECTION_TIMEOUT 100
#define DEFAULT_HEARTBEAT_TIMEOUT 50
#define DEFAULT_NETWORK_LATENCY 10
#define DEFAULT_SNAPSHOT_THRESHOLD 64
#define DEFAULT_SNAPSHOT_TRAILING 32
#define DEFAULT_DISK_LATENCY 10
#define DEFAULT_DISK_SIZE 256 /* In bytes */
/* Maximum number of log entries. */
#define MAX_LOG_ENTRIES 15
/* Track the event to to fire in a cluster step. */
struct step
{
raft_id id; /* Target server ID. */
struct raft_event event; /* Event to fire. */
union {
struct
{
struct raft_entry *batch;
unsigned n;
} entries;
struct
{
struct raft_buffer chunk;
} snapshot;
};
queue queue;
};
/* Mark @id1 as disconnected from @id2. */
struct disconnect
{
raft_id id1;
raft_id id2;
queue queue;
};
/* Initialize an empty disk with no persisted data. */
/* Initialize an empty disk with no persisted data. */
static void diskInit(struct test_disk *d)
{
    /* No term or vote has been persisted yet. */
    d->term = 0;
    d->voted_for = 0;

    /* The log is empty and starts at index 1, with no snapshot. */
    d->snapshot = NULL;
    d->start_index = 1;
    d->entries = NULL;
    d->n_entries = 0;

    /* Default storage capacity, in bytes. */
    d->size = DEFAULT_DISK_SIZE;
}
/* Release all memory used by the disk snapshot, if present. */
static void diskDestroySnapshotIfPresent(struct test_disk *d)
{
if (d->snapshot == NULL) {
return;
}
raft_configuration_close(&d->snapshot->metadata.configuration);
free(d->snapshot->data.base);
free(d->snapshot);
d->snapshot = NULL;
}
/* Release all memory used by the disk: every entry's data buffer, the entries
 * array itself, and the snapshot (if any). */
static void diskClose(struct test_disk *d)
{
    unsigned i;
    for (i = 0; i < d->n_entries; i++) {
        free(d->entries[i].buf.base);
    }
    free(d->entries);
    diskDestroySnapshotIfPresent(d);
}
/* Set the persisted term. */
static void diskSetTerm(struct test_disk *d, raft_term term)
{
    d->term = term;
}
/* Set the persisted vote. */
static void diskSetVote(struct test_disk *d, raft_id vote)
{
    d->voted_for = vote;
}
/* Return the remaining disk capacity, i.e. the configured size minus the bytes
 * consumed by the snapshot data (if any) and by every entry's data buffer.
 * Asserts that usage never exceeds the configured size. */
static unsigned short diskCapacity(struct test_disk *d)
{
    unsigned short capacity = d->size;
    unsigned i;
    if (d->snapshot != NULL) {
        munit_assert_ullong(d->snapshot->data.len, >, 0);
        munit_assert_ullong(capacity, >=, d->snapshot->data.len);
        capacity -= d->snapshot->data.len;
    }
    for (i = 0; i < d->n_entries; i++) {
        struct raft_entry *entry = &d->entries[i];
        munit_assert_ullong(entry->buf.len, >, 0);
        munit_assert_ullong(capacity, >=, entry->buf.len);
        capacity -= entry->buf.len;
    }
    return capacity;
}
/* Set the persisted snapshot, taking ownership of @snapshot and replacing any
 * previous one. Must fit in the remaining disk capacity. */
static void diskSetSnapshot(struct test_disk *d, struct test_snapshot *snapshot)
{
    diskDestroySnapshotIfPresent(d);
    munit_assert_ptr_not_null(snapshot->data.base);
    munit_assert_ullong(snapshot->data.len, >, 0);
    munit_assert_ullong(diskCapacity(d), >=, snapshot->data.len);
    d->snapshot = snapshot;
    /* If there are no entries, set the start index to the snapshot's last
     * index. */
    if (d->n_entries == 0) {
        d->start_index = snapshot->metadata.index + 1;
    }
}
/* Deep-copy entry @src into @dst: the copy owns a freshly allocated data
 * buffer and does not belong to any batch. */
static void entryCopy(const struct raft_entry *src, struct raft_entry *dst)
{
    size_t len = src->buf.len;

    dst->term = src->term;
    dst->type = src->type;
    dst->batch = NULL;
    dst->buf.len = len;
    dst->buf.base = munit_malloc(len);
    memcpy(dst->buf.base, src->buf.base, len);
}
/* Append a new entry to the log, growing the entries array and deep-copying
 * @entry. The entry must fit in the remaining disk capacity. */
static void diskAddEntry(struct test_disk *d, const struct raft_entry *entry)
{
    munit_assert_ullong(diskCapacity(d), >=, entry->buf.len);
    d->n_entries++;
    d->entries = realloc(d->entries, d->n_entries * sizeof *d->entries);
    munit_assert_ptr_not_null(d->entries);
    entryCopy(entry, &d->entries[d->n_entries - 1]);
}
/* Get the entry at the given index, or NULL if there is no entry stored at
 * that index. */
static const struct raft_entry *diskGetEntry(struct test_disk *d,
                                             raft_index index)
{
    unsigned i;
    if (index < d->start_index) {
        return NULL;
    }
    i = (unsigned)(index - d->start_index);
    /* Compare with `i >= n_entries` rather than `i > n_entries - 1`: the
     * latter underflows when n_entries is 0 (unsigned arithmetic), which
     * would make the check always pass and return a pointer past the end of
     * the (possibly empty) entries array. */
    if (i >= d->n_entries) {
        return NULL;
    }
    return &d->entries[i];
}
/* Deep copy configuration object @src to @dst, re-adding every server and
 * asserting success. @dst is initialized here. */
static void confCopy(const struct raft_configuration *src,
                     struct raft_configuration *dst)
{
    unsigned i;
    int rv;
    raft_configuration_init(dst);
    for (i = 0; i < src->n; i++) {
        struct raft_server *server = &src->servers[i];
        rv = raft_configuration_add(dst, server->id, server->address,
                                    server->role);
        munit_assert_int(rv, ==, 0);
    }
}
/* Copy snapshot metadata @src to @dst, deep-copying the configuration. */
static void snapshotCopy(const struct raft_snapshot_metadata *src,
                         struct raft_snapshot_metadata *dst)
{
    dst->index = src->index;
    dst->term = src->term;
    confCopy(&src->configuration, &dst->configuration);
    dst->configuration_index = src->configuration_index;
}
/* Load the metadata of the latest snapshot into @metadata (deep copy). A
 * snapshot must be present on disk. */
static void diskLoadSnapshotMetadata(struct test_disk *d,
                                     struct raft_snapshot_metadata *metadata)
{
    munit_assert_ptr_not_null(d->snapshot);
    snapshotCopy(&d->snapshot->metadata, metadata);
}
/* Load snapshot data into the given buffer (allocated with raft_malloc, owned
 * by the caller), asserting that index and term match the given metadata. */
static void diskLoadSnapshotData(struct test_disk *d,
                                 raft_index index,
                                 raft_term term,
                                 struct raft_buffer *data)
{
    munit_assert_ptr_not_null(d->snapshot);
    munit_assert_ullong(d->snapshot->metadata.index, ==, index);
    munit_assert_ullong(d->snapshot->metadata.term, ==, term);
    data->len = d->snapshot->data.len;
    data->base = raft_malloc(data->len);
    munit_assert_ptr_not_null(data->base);
    memcpy(data->base, d->snapshot->data.base, data->len);
}
/* Load all data persisted on the disk: term, vote, snapshot metadata (NULL if
 * no snapshot), start index, and a deep copy of all entries. The entries'
 * data buffers are packed into a single batch allocation, mirroring how the
 * real raft_io load returns them; the caller owns all returned memory. */
static void diskLoad(struct test_disk *d,
                     raft_term *term,
                     raft_id *voted_for,
                     struct raft_snapshot_metadata **metadata,
                     raft_index *start_index,
                     struct raft_entry **entries,
                     unsigned *n_entries)
{
    size_t size = 0; /* Total size of all entries data. */
    void *batch;
    uint8_t *cursor;
    unsigned i;
    *term = d->term;
    *voted_for = d->voted_for;
    if (d->snapshot != NULL) {
        *metadata = munit_malloc(sizeof **metadata);
        diskLoadSnapshotMetadata(d, *metadata);
    } else {
        *metadata = NULL;
    }
    *start_index = d->start_index;
    *n_entries = d->n_entries;
    if (*n_entries == 0) {
        *entries = NULL;
        return;
    }
    /* Calculate the total size of the entries content and allocate the
     * batch. */
    for (i = 0; i < d->n_entries; i++) {
        size += d->entries[i].buf.len;
    }
    batch = raft_malloc(size);
    munit_assert_ptr_not_null(batch);
    /* Copy the entries, pointing each buf into the shared batch. */
    *entries = raft_malloc(d->n_entries * sizeof **entries);
    munit_assert_ptr_not_null(*entries);
    cursor = batch;
    for (i = 0; i < d->n_entries; i++) {
        (*entries)[i].term = d->entries[i].term;
        (*entries)[i].type = d->entries[i].type;
        (*entries)[i].buf.base = cursor;
        (*entries)[i].buf.len = d->entries[i].buf.len;
        (*entries)[i].batch = batch;
        memcpy((*entries)[i].buf.base, d->entries[i].buf.base,
               d->entries[i].buf.len);
        cursor += d->entries[i].buf.len;
    }
}
/* Truncate all entries from the given index onwards. If there are no entries
 * at the given index, this is a no-op. */
static void diskTruncateEntries(struct test_disk *d, raft_index index)
{
    unsigned n = d->n_entries; /* Entry count before truncation. */
    unsigned i;
    if (index == d->start_index + n) {
        return;
    }
    munit_assert_ulong(index, >=, d->start_index);
    munit_assert_ulong(index, <=, d->start_index + n);
    /* Iterate up to the captured count: the loop decrements d->n_entries as
     * it frees, so using d->n_entries itself as the bound would stop halfway
     * through the tail, leaking the remaining buffers and leaving a wrong
     * count (serverTruncateEntries below snapshots the count for the same
     * reason). */
    for (i = (unsigned)(index - d->start_index); i < n; i++) {
        free(d->entries[i].buf.base);
        d->n_entries--;
    }
}
/* Custom emit tracer function which includes the server ID and cluster time.
 * Diagnostic messages with level > 3 go to stderr only; others are also
 * appended to the cluster's trace buffer (used by test assertions). Messages
 * starting with '>' open a new traced event and get a time/ID prefix;
 * anything else is treated as a continuation line. */
static void serverEmit(struct raft_tracer *t, int type, const void *data)
{
    struct test_server *server;
    struct test_cluster *cluster;
    const struct raft_tracer_info *info = data;
    char trace[1024];
    server = t->impl;
    cluster = server->cluster;
    if (cluster->in_tear_down) {
        /* Don't record or print anything while tearing down. */
        return;
    }
    if (type != RAFT_TRACER_DIAGNOSTIC) {
        return;
    }
    if (info->diagnostic.level > 3) {
        /* Verbose messages: print only, don't record. */
        fprintf(stderr, "TRACE: %llu > %s\n", server->raft.id,
                info->diagnostic.message);
        return;
    }
    if (info->diagnostic.message[0] == '>') {
        snprintf(trace, sizeof trace, "[%4lld] %llu %s", cluster->time,
                 server->raft.id, info->diagnostic.message);
    } else {
        snprintf(trace, sizeof trace, "         %s", info->diagnostic.message);
    }
    /* NOTE(review): no bounds check against the capacity of cluster->trace
     * here — a very chatty test could overflow it; confirm its size. */
    strcat(cluster->trace, trace);
    strcat(cluster->trace, "\n");
    fprintf(stderr, "%s\n", trace);
}
static void serverSeed(struct test_server *s);
/* Set the election timeout and the randomized election timeout (timeout +
 * delta), re-seeding the raft PRNG so the next randomized timeout drawn is
 * exactly the one requested. */
static void serverSetElectionTimeout(struct test_server *s,
                                     unsigned timeout,
                                     unsigned delta)
{
    munit_assert_uint(delta, <=, timeout);
    s->randomized_election_timeout = timeout;
    s->randomized_election_timeout += delta;
    s->raft.election_timeout = timeout;
    serverSeed(s);
    raft_set_election_timeout(&s->raft, timeout);
    /* The current timeout might have changed now. */
    s->timeout = raft_timeout(&s->raft);
}
/* Initialize a new server object with the given ID, wiring it to @cluster:
 * set up the tracer, initialize the raft instance, pick a per-ID randomized
 * election timeout delta (so servers time out in ID order), and initialize
 * the in-memory log and default latencies/thresholds. */
static void serverInit(struct test_server *s,
                       raft_id id,
                       struct test_cluster *cluster)
{
    unsigned delta;
    int rv;
    s->tracer.impl = s;
    s->tracer.version = 2;
    s->tracer.emit = serverEmit;
    s->randomized_election_timeout_prev = 0;
    /* The server's address is simply its ID rendered as a string. */
    sprintf(s->address, "%llu", id);
    rv = raft_init(&s->raft, NULL, NULL, id, s->address);
    munit_assert_int(rv, ==, 0);
    s->raft.tracer = &s->tracer;
    /* By default servers have their randomized timeout increasing
     * progressively, so they timeout in order. */
    switch (id) {
        case 1:
            delta = 0;
            break;
        case 2:
            delta = 30;
            break;
        case 3:
            delta = 60;
            break;
        case 4:
            delta = 80;
            break;
        case 5:
            delta = 90;
            break;
        default:
            delta = 90 + (unsigned)id;
            break;
    }
    serverSetElectionTimeout(s, DEFAULT_ELECTION_TIMEOUT, delta);
    raft_set_heartbeat_timeout(&s->raft, DEFAULT_HEARTBEAT_TIMEOUT);
    raft_set_install_snapshot_timeout(&s->raft, 50);
    raft_set_capacity_threshold(&s->raft, 64);
    /* Fixed-capacity in-memory copy of the log, starting at index 1. */
    s->log.start = 1;
    s->log.entries = munit_malloc(MAX_LOG_ENTRIES * sizeof *s->log.entries);
    s->log.n = 0;
    s->last_applied = 0;
    s->cluster = cluster;
    s->network_latency = DEFAULT_NETWORK_LATENCY;
    s->disk_latency = DEFAULT_DISK_LATENCY;
    s->snapshot.threshold = DEFAULT_SNAPSHOT_THRESHOLD;
    s->snapshot.trailing = DEFAULT_SNAPSHOT_TRAILING;
    s->snapshot.installing = false;
    s->running = false;
}
static int serverStep(struct test_server *s, struct raft_event *event);
/* Release the memory held by a pending RAFT_PERSISTED_ENTRIES step: the shared
 * batch buffer and the entries array itself. */
static void serverCancelEntries(struct test_server *s, struct step *step)
{
    (void)s;
    raft_free(step->entries.batch[0].batch);
    raft_free(step->entries.batch);
}
/* Release the memory held by a pending RAFT_PERSISTED_SNAPSHOT step: the
 * snapshot chunk and the copied configuration in the event's metadata. */
static void serverCancelSnapshot(struct test_server *s, struct step *step)
{
    struct raft_event *event = &step->event;
    (void)s;
    raft_free(step->snapshot.chunk.base);
    raft_configuration_close(&event->persisted_snapshot.metadata.configuration);
}
/* Release the memory used by a RAFT_RECEIVE event: the message-type-specific
 * payload (entries batch or snapshot configuration/data), then the message
 * itself. */
static void dropReceiveEvent(struct step *step)
{
    struct raft_event *event = &step->event;
    switch (event->receive.message->type) {
        case RAFT_APPEND_ENTRIES:
            if (event->receive.message->append_entries.n_entries > 0) {
                struct raft_entry *entries =
                    event->receive.message->append_entries.entries;
                raft_free(entries[0].buf.base);
                raft_free(entries);
            }
            break;
        case RAFT_INSTALL_SNAPSHOT:
            raft_configuration_close(
                &event->receive.message->install_snapshot.conf);
            raft_free(event->receive.message->install_snapshot.data.base);
            break;
        default:
            break;
    }
    free(event->receive.message);
}
/* Cancel all pending steps targeted to the given server, releasing any memory
 * they own, then drop all in-flight messages that this server has sent but
 * that have not been delivered yet. Each pass repeatedly scans the queue from
 * the head because QUEUE_FOREACH can't safely continue after a removal. */
static void serverCancelPending(struct test_server *s)
{
    /* First pass: steps whose target is this server. */
    while (1) {
        struct step *step = NULL;
        queue *head;
        QUEUE_FOREACH (head, &s->cluster->steps) {
            struct step *current;
            current = QUEUE_DATA(head, struct step, queue);
            if (current->id == s->raft.id) {
                step = current;
                break;
            }
        }
        if (step == NULL) {
            break;
        }
        /* Release event-type-specific resources before dropping the step. */
        switch (step->event.type) {
            case RAFT_PERSISTED_ENTRIES:
                serverCancelEntries(s, step);
                break;
            case RAFT_PERSISTED_SNAPSHOT:
                serverCancelSnapshot(s, step);
                break;
            case RAFT_RECEIVE:
                dropReceiveEvent(step);
                break;
            case RAFT_CONFIGURATION:
                raft_configuration_close(&step->event.configuration.conf);
                break;
            default:
                break;
        }
        QUEUE_REMOVE(&step->queue);
        free(step);
    }
    /* Second pass: undelivered messages originated by this server. */
    while (1) {
        struct step *step = NULL;
        queue *head;
        QUEUE_FOREACH (head, &s->cluster->send) {
            struct step *current;
            struct raft_message *message;
            current = QUEUE_DATA(head, struct step, queue);
            munit_assert_uint(current->event.type, ==, RAFT_RECEIVE);
            message = current->event.receive.message;
            if (message->server_id /* sender */ == s->raft.id) {
                step = current;
                break;
            }
        }
        if (step == NULL) {
            break;
        }
        QUEUE_REMOVE(&step->queue);
        dropReceiveEvent(step);
        free(step);
    }
}
/* Stop a running server: free its in-memory log, close the raft instance and
 * re-initialize the server object so it can be started again later (simulates
 * a restart; persisted state in s->disk is kept). */
static void serverStop(struct test_server *s)
{
    unsigned i;
    s->running = false;
    for (i = 0; i < s->log.n; i++) {
        free(s->log.entries[i].buf.base);
    }
    free(s->log.entries);
    /* Re-initialized the raft object. */
    raft_close(&s->raft, NULL);
    serverInit(s, s->raft.id, s->cluster);
}
/* Release all resources used by a server object. If the server is running it
 * is stopped first; note that serverStop() re-initializes the server, so the
 * free/raft_close below operate on the freshly re-initialized state. */
static void serverClose(struct test_server *s)
{
    if (s->running) {
        serverStop(s);
    }
    free(s->log.entries);
    raft_close(&s->raft, NULL);
    diskClose(&s->disk);
}
/* Seed raft's internal pseudo random number generator so that the
 * next time RandomWithinRange() is run it will return
 * exactly the value stored in s->randomized_election_timeout.
 *
 * This brute-forces candidate seeds until raft_random() yields the desired
 * value in [timeout, timeout * 2); the result is cached via
 * randomized_election_timeout_prev so repeated calls with an unchanged target
 * are cheap. NOTE(review): the search loop assumes some seed produces the
 * target value — it would spin forever otherwise. */
static void serverSeed(struct test_server *s)
{
    unsigned timeout = s->raft.election_timeout;
    if (s->randomized_election_timeout == s->randomized_election_timeout_prev) {
        goto done;
    }
    s->seed = s->raft.random;
    while (1) {
        unsigned random = s->seed;
        unsigned n = raft_random(&random, timeout, timeout * 2);
        if (n == s->randomized_election_timeout) {
            goto done;
        }
        s->seed = random;
    }
done:
    raft_seed(&s->raft, s->seed);
    s->randomized_election_timeout_prev = s->randomized_election_timeout;
}
/* Truncate all in-memory entries from the given index onwards. If there are no
 * entries at the given index, this is a no-op. The entry count is captured in
 * @n before the loop because the loop itself decrements s->log.n. */
static void serverTruncateEntries(struct test_server *s, raft_index index)
{
    unsigned i;
    unsigned n = s->log.n;
    if (index == s->log.start + n) {
        return;
    }
    munit_assert_ulong(index, >=, s->log.start);
    munit_assert_ulong(index, <=, s->log.start + n);
    for (i = (unsigned)(index - s->log.start); i < n; i++) {
        free(s->log.entries[i].buf.base);
        s->log.n--;
    }
}
/* Append a copy of the given entry to the in-memory log cache. */
static void serverAddEntry(struct test_server *s,
                           const struct raft_entry *entry)
{
    s->log.n++;
    /* The cache has a fixed capacity. */
    munit_assert_uint(s->log.n, <=, MAX_LOG_ENTRIES);
    entryCopy(entry, &s->log.entries[s->log.n - 1]);
}
static void copyEntries(const struct raft_entry *src,
struct raft_entry **dst,
const size_t n);
/* Handle a RAFT_UPDATE_ENTRIES update: sync the in-memory log cache and
 * schedule a RAFT_PERSISTED_ENTRIES step, to be completed after the
 * configured disk latency has elapsed. */
static void serverProcessEntries(struct test_server *s,
                                 raft_index first_index,
                                 struct raft_entry *entries,
                                 unsigned n)
{
    struct step *step = munit_malloc(sizeof *step);
    struct raft_event *event = &step->event;
    unsigned i;
    /* Drop cached entries that the new ones overwrite, then cache copies of
     * the new ones. */
    serverTruncateEntries(s, first_index);
    for (i = 0; i < n; i++) {
        serverAddEntry(s, &entries[i]);
    }
    /* Keep a private copy of the batch for the step, and release the batch
     * handed to us by raft. */
    copyEntries(entries, &step->entries.batch, n);
    step->entries.n = n;
    if (n > 0) {
        munit_assert_ptr_not_null(entries[0].batch);
        raft_free(entries[0].batch);
    }
    step->id = s->raft.id;
    event->time = s->cluster->time + s->disk_latency;
    event->type = RAFT_PERSISTED_ENTRIES;
    /* Index of the last entry being persisted. */
    event->persisted_entries.index = first_index + n - 1;
    QUEUE_PUSH(&s->cluster->steps, &step->queue);
}
/* Handle a RAFT_UPDATE_SNAPSHOT update: reset the in-memory log cache and
 * schedule a RAFT_PERSISTED_SNAPSHOT step, to be completed after the
 * configured disk latency has elapsed. */
static void serverProcessSnapshot(struct test_server *s,
                                  struct raft_snapshot_metadata *metadata,
                                  size_t offset,
                                  struct raft_buffer *chunk,
                                  bool last)
{
    struct step *step = munit_malloc(sizeof *step);
    /* Only one snapshot install may be in flight at a time. */
    munit_assert_false(s->snapshot.installing);
    s->snapshot.installing = true;
    step->id = s->raft.id;
    /* Update the in-memory log: drop all cached entries; the log now starts
     * right after the snapshot's last included index. */
    serverTruncateEntries(s, s->log.start);
    munit_assert_uint(s->log.n, ==, 0);
    s->log.start = metadata->index + 1;
    step->snapshot.chunk = *chunk;
    step->event.time = s->cluster->time + s->disk_latency;
    step->event.type = RAFT_PERSISTED_SNAPSHOT;
    step->event.persisted_snapshot.metadata = *metadata;
    step->event.persisted_snapshot.offset = offset;
    step->event.persisted_snapshot.last = last;
    QUEUE_PUSH(&s->cluster->steps, &step->queue);
}
static void serverFillAppendEntries(struct test_server *s,
struct raft_append_entries *args);
static void serverFillInstallSnapshot(struct test_server *s,
struct raft_install_snapshot *args);
/* Handle a RAFT_UPDATE_MESSAGES update: for each outgoing message, schedule a
 * RAFT_RECEIVE step addressed to the destination server, to be delivered after
 * the configured network latency has elapsed. */
static void serverProcessMessages(struct test_server *s,
                                  struct raft_message *messages,
                                  unsigned n)
{
    unsigned i;
    for (i = 0; i < n; i++) {
        struct step *step = munit_malloc(sizeof *step);
        struct raft_event *event = &step->event;
        struct raft_message *message = &messages[i];
        /* The step is addressed to the message's destination server. */
        step->id = message->server_id;
        event->time = s->cluster->time + s->network_latency;
        event->type = RAFT_RECEIVE;
        event->receive.message = munit_malloc(sizeof *event->receive.message);
        *event->receive.message = *message;
        /* Rewrite server_id/server_address to carry the sender's identity,
         * as seen by the receiver. */
        event->receive.message->server_id = s->raft.id;
        event->receive.message->server_address = s->address;
        switch (message->type) {
            case RAFT_APPEND_ENTRIES:
                /* Attach a copy of the entries from the sender's log cache. */
                serverFillAppendEntries(
                    s, &event->receive.message->append_entries);
                break;
            case RAFT_INSTALL_SNAPSHOT:
                /* Attach the snapshot data from the sender's disk. */
                serverFillInstallSnapshot(
                    s, &event->receive.message->install_snapshot);
                break;
            default:
                break;
        }
        QUEUE_PUSH(&s->cluster->send, &step->queue);
    }
}
/* Schedule a RAFT_SNAPSHOT step if the number of entries committed since the
 * last snapshot has reached the configured threshold. */
static void serverMaybeTakeSnapshot(struct test_server *s)
{
    struct step *step;
    struct raft *r = &s->raft;
    struct raft_event *event;
    raft_index last_snapshot_index = 0;
    if (s->disk.snapshot != NULL) {
        last_snapshot_index = s->disk.snapshot->metadata.index;
    }
    /* Not enough new committed entries yet. */
    if (raft_commit_index(r) - last_snapshot_index < s->snapshot.threshold) {
        return;
    }
    step = munit_malloc(sizeof *step);
    step->id = s->raft.id;
    event = &step->event;
    event->time = s->cluster->time;
    event->type = RAFT_SNAPSHOT;
    event->snapshot.metadata.index = raft_commit_index(r);
    event->snapshot.metadata.term = raft_current_term(r);
    /* XXX: assume there is no uncommitted configuration. */
    munit_assert_ullong(r->configuration_uncommitted_index, ==, 0);
    munit_assert_ullong(r->configuration_committed_index, >, 0);
    confCopy(&r->configuration, &event->snapshot.metadata.configuration);
    event->snapshot.metadata.configuration_index =
        r->configuration_committed_index;
    event->snapshot.trailing = s->snapshot.trailing;
    QUEUE_PUSH(&s->cluster->steps, &step->queue);
}
/* Handle a RAFT_UPDATE_COMMIT_INDEX update: schedule a RAFT_CONFIGURATION
 * step for every newly committed RAFT_CHANGE entry, then possibly take a
 * snapshot. */
static void serverProcessCommitIndex(struct test_server *s)
{
    struct step *step;
    const struct raft_entry *entry;
    raft_index commit_index = raft_commit_index(&s->raft);
    raft_index index;
    int rv;
    /* Walk the newly committed range. */
    for (index = s->last_applied + 1; index <= commit_index; index++) {
        entry = diskGetEntry(&s->disk, index);
        /* Only configuration changes need a follow-up event. */
        if (entry == NULL || entry->type != RAFT_CHANGE) {
            continue;
        }
        step = munit_malloc(sizeof *step);
        step->id = s->raft.id;
        step->event.time = s->cluster->time;
        step->event.type = RAFT_CONFIGURATION;
        step->event.configuration.index = index;
        rv = raft_configuration_decode(&entry->buf,
                                       &step->event.configuration.conf);
        munit_assert_int(rv, ==, 0);
        QUEUE_PUSH(&s->cluster->steps, &step->queue);
    }
    s->last_applied = commit_index;
    serverMaybeTakeSnapshot(s);
}
/* Fire the given event using raft_step() and process the resulting struct
 * raft_update object, dispatching every piece of requested work to the
 * matching handler. Return the raft_step() error code, if any. */
static int serverStep(struct test_server *s, struct raft_event *event)
{
    struct raft *r = &s->raft;
    struct raft_update update;
    int rv;
    event->capacity = diskCapacity(&s->disk);
    munit_assert_true(s->running);
    rv = raft_step(r, event, &update);
    if (rv != 0) {
        return rv;
    }
    /* Persist term and vote synchronously; entries, snapshots and messages
     * are scheduled as latency-delayed steps. */
    if (update.flags & RAFT_UPDATE_CURRENT_TERM) {
        diskSetTerm(&s->disk, raft_current_term(r));
    }
    if (update.flags & RAFT_UPDATE_VOTED_FOR) {
        diskSetVote(&s->disk, raft_voted_for(r));
    }
    if (update.flags & RAFT_UPDATE_ENTRIES) {
        serverProcessEntries(s, update.entries.index, update.entries.batch,
                             update.entries.n);
    }
    if (update.flags & RAFT_UPDATE_SNAPSHOT) {
        serverProcessSnapshot(s, &update.snapshot.metadata,
                              update.snapshot.offset, &update.snapshot.chunk,
                              update.snapshot.last);
    }
    if (update.flags & RAFT_UPDATE_MESSAGES) {
        serverProcessMessages(s, update.messages.batch, update.messages.n);
    }
    if (update.flags & RAFT_UPDATE_TIMEOUT) {
        s->timeout = raft_timeout(&s->raft);
    }
    if (update.flags & RAFT_UPDATE_COMMIT_INDEX) {
        serverProcessCommitIndex(s);
    }
    return 0;
}
/* Start the server by passing to raft_step() a RAFT_START event with the
 * current disk state. */
static void serverStart(struct test_server *s)
{
    struct raft_event event;
    unsigned i;
    int rv;
    s->running = true;
    serverSeed(s);
    event.time = s->cluster->time;
    event.type = RAFT_START;
    /* Load the persisted state from the fake disk. */
    diskLoad(&s->disk, &event.start.term, &event.start.voted_for,
             &event.start.metadata, &event.start.start_index,
             &event.start.entries, &event.start.n_entries);
    /* Populate the in-memory log cache with copies of the loaded entries. */
    s->log.start = event.start.start_index;
    s->log.n = event.start.n_entries;
    munit_assert_uint(s->log.n, <=, MAX_LOG_ENTRIES);
    for (i = 0; i < s->log.n; i++) {
        entryCopy(&event.start.entries[i], &s->log.entries[i]);
    }
    rv = serverStep(s, &event);
    munit_assert_int(rv, ==, 0);
    /* Release the resources loaded from disk. */
    if (event.start.metadata != NULL) {
        free(event.start.metadata);
    }
    if (event.start.entries != NULL) {
        raft_free(event.start.entries[0].batch);
        raft_free(event.start.entries);
    }
}
/* Fire a RAFT_TIMEOUT event, advancing the global cluster clock to the
 * server's next scheduled timeout. */
static void serverTimeout(struct test_server *s)
{
    struct raft_event event;
    int rv;
    s->cluster->time = s->timeout;
    event.time = s->cluster->time;
    event.type = RAFT_TIMEOUT;
    rv = serverStep(s, &event);
    munit_assert_int(rv, ==, 0);
}
/* Create a single batch of entries containing a copy of the given entries,
 * including their data. Use raft_malloc() since memory ownership is going to be
 * handed over to raft via raft_recv(). */
static void copyEntries(const struct raft_entry *src,
                        struct raft_entry **dst,
                        const size_t n)
{
    size_t total = 0;
    size_t j;
    uint8_t *payload;
    uint8_t *cursor;
    struct raft_entry *copies;

    if (n == 0) {
        *dst = NULL;
        return;
    }

    /* Total payload size of all entries, backing a single batch
     * allocation. */
    for (j = 0; j < n; j++) {
        total += src[j].buf.len;
    }
    payload = raft_malloc(total);
    munit_assert_ptr_not_null(payload);

    copies = raft_malloc(n * sizeof *copies);
    munit_assert_ptr_not_null(copies);

    /* Copy each entry, pointing its buffer into the shared batch. */
    cursor = payload;
    for (j = 0; j < n; j++) {
        copies[j].term = src[j].term;
        copies[j].type = src[j].type;
        copies[j].buf.base = cursor;
        copies[j].buf.len = src[j].buf.len;
        copies[j].batch = payload;
        memcpy(cursor, src[j].buf.base, src[j].buf.len);
        cursor += src[j].buf.len;
    }

    *dst = copies;
}
/* Use the log cache to populate the given AppendEntries message. */
static void serverFillAppendEntries(struct test_server *s,
                                    struct raft_append_entries *args)
{
    raft_index index = args->prev_log_index + 1;
    unsigned i;
    /* The requested range must be fully contained in the cache. */
    munit_assert_ullong(index, >=, s->log.start);
    munit_assert_ullong(index + args->n_entries, <=, s->log.start + s->log.n);
    if (args->n_entries == 0) {
        args->entries = NULL;
        return;
    }
    i = (unsigned)(index - s->log.start);
    copyEntries(&s->log.entries[i], &args->entries, args->n_entries);
}
/* Load from disk the data of the snapshot being sent. */
static void serverFillInstallSnapshot(struct test_server *s,
                                      struct raft_install_snapshot *args)
{
    struct raft_snapshot_metadata metadata;
    diskLoadSnapshotData(&s->disk, args->last_index, args->last_term,
                         &args->data);
    /* The configuration loaded into metadata is handed over to the message;
     * metadata itself is not released here. */
    diskLoadSnapshotMetadata(&s->disk, &metadata);
    args->conf = metadata.configuration;
    args->conf_index = metadata.configuration_index;
}
/* Complete a RAFT_PERSISTED_ENTRIES step: write the entries to the fake disk
 * and feed the event back to raft_step(). */
static void serverCompleteEntries(struct test_server *s, struct step *step)
{
    struct raft_event *event = &step->event;
    struct raft_entry *entries = step->entries.batch;
    raft_index index = event->persisted_entries.index;
    unsigned n = step->entries.n;
    unsigned i;
    int rv;
    /* Possibly truncate stale entries. */
    diskTruncateEntries(&s->disk, index - n + 1);
    for (i = 0; i < n; i++) {
        diskAddEntry(&s->disk, &entries[i]);
    }
    /* While a snapshot install is in flight the event is not delivered to
     * raft. NOTE(review): presumably raft must not observe entry writes that
     * raced with a snapshot install — confirm. */
    if (!s->snapshot.installing) {
        rv = serverStep(s, event);
        munit_assert_int(rv, ==, 0);
    }
    /* Release the private batch copy made by serverProcessEntries(). */
    if (n > 0) {
        raft_free(step->entries.batch[0].batch);
        raft_free(step->entries.batch);
    }
}
/* Complete a RAFT_PERSISTED_SNAPSHOT step: store the snapshot on the fake
 * disk and feed the event back to raft_step(). */
static void serverCompleteSnapshot(struct test_server *s, struct step *step)
{
    struct test_snapshot *snapshot = munit_malloc(sizeof *snapshot);
    struct raft_event *event = &step->event;
    int rv;
    s->snapshot.installing = false;
    snapshot->metadata.index = event->persisted_snapshot.metadata.index;
    snapshot->metadata.term = event->persisted_snapshot.metadata.term;
    confCopy(&event->persisted_snapshot.metadata.configuration,
             &snapshot->metadata.configuration);
    snapshot->metadata.configuration_index =
        event->persisted_snapshot.metadata.configuration_index;
    /* Ownership of the chunk buffer moves to the disk snapshot. */
    snapshot->data = step->snapshot.chunk;
    diskSetSnapshot(&s->disk, snapshot);
    rv = serverStep(s, event);
    munit_assert_int(rv, ==, 0);
}
/* Return true if the server with id1 is connected with the server with id2 */
static bool clusterAreConnected(struct test_cluster *c,
                                raft_id id1,
                                raft_id id2)
{
    queue *head;
    /* Scan the active disconnections: any matching id1 -> id2 entry means the
     * two servers are not connected. */
    QUEUE_FOREACH (head, &c->disconnect) {
        const struct disconnect *link =
            QUEUE_DATA(head, struct disconnect, queue);
        if (link->id1 == id1 && link->id2 == id2) {
            return false;
        }
    }
    return true;
}
/* Complete a RAFT_RECEIVE step: deliver the message to the destination
 * server, unless it stopped running or the two servers are disconnected. */
static void serverCompleteReceive(struct test_server *s, struct step *step)
{
    struct raft_event *event = &step->event;
    int rv;
    /* A stopped server silently drops incoming messages. */
    if (!s->running) {
        dropReceiveEvent(step);
        return;
    }
    /* Check if there's a disconnection. */
    if (!clusterAreConnected(s->cluster,
                             event->receive.message->server_id /* sender */,
                             s->raft.id /* receiver */)) {
        dropReceiveEvent(step);
        return;
    }
    rv = serverStep(s, event);
    munit_assert_int(rv, ==, 0);
    /* AppendEntries entries are copies made by serverFillAppendEntries():
     * release them. */
    if (event->receive.message->type == RAFT_APPEND_ENTRIES) {
        raft_free(event->receive.message->append_entries.entries);
    }
    free(event->receive.message);
}
/* Complete a RAFT_CONFIGURATION step, feeding the event back to
 * raft_step(). */
static void serverCompleteConfiguration(struct test_server *s,
                                        struct step *step)
{
    struct raft *r = &s->raft;
    struct raft_event *event = &step->event;
    raft_index commit_index = raft_commit_index(r);
    int rv;
    rv = serverStep(s, event);
    munit_assert_int(rv, ==, 0);
    /* The last call to raft_step() did not change the commit index. */
    munit_assert_ullong(raft_commit_index(r), ==, commit_index);
}
/* Complete a RAFT_SNAPSHOT step: store a snapshot with the requested metadata
 * and a dummy payload on the fake disk, then feed the event back to
 * raft_step(). */
static void serverCompleteTakeSnapshot(struct test_server *s, struct step *step)
{
    struct test_snapshot *snapshot = munit_malloc(sizeof *snapshot);
    struct raft_event *event = &step->event;
    int rv;
    /* XXX: this assumes that the current term is the term of the last committed
     * entry. */
    snapshot->metadata.index = event->snapshot.metadata.index;
    snapshot->metadata.term = event->snapshot.metadata.term;
    confCopy(&event->snapshot.metadata.configuration,
             &snapshot->metadata.configuration);
    /* NOTE(review): metadata.configuration_index is never assigned here
     * (unlike in serverCompleteSnapshot) — confirm the munit_malloc'd value
     * is acceptable for these tests. */
    /* The snapshot payload content is irrelevant for these tests. */
    snapshot->data.len = 8;
    snapshot->data.base = munit_malloc(snapshot->data.len);
    diskSetSnapshot(&s->disk, snapshot);
    rv = serverStep(s, event);
    munit_assert_int(rv, ==, 0);
}
/* Complete some event involving I/O or user actions, dispatching to the
 * handler matching the step's event type. */
static void serverComplete(struct test_server *s, struct step *step)
{
    munit_assert_ulong(s->raft.id, ==, step->id);
    /* Advance the global clock to the step's scheduled time. */
    s->cluster->time = step->event.time;
    QUEUE_REMOVE(&step->queue);
    switch (step->event.type) {
        case RAFT_PERSISTED_ENTRIES:
            serverCompleteEntries(s, step);
            break;
        case RAFT_PERSISTED_SNAPSHOT:
            serverCompleteSnapshot(s, step);
            break;
        case RAFT_RECEIVE:
            serverCompleteReceive(s, step);
            break;
        case RAFT_CONFIGURATION:
            serverCompleteConfiguration(s, step);
            break;
        case RAFT_SNAPSHOT:
            serverCompleteTakeSnapshot(s, step);
            break;
        default:
            munit_errorf("unexpected step type %d", step->event.type);
            break;
    }
    free(step);
}
/* Initialize the cluster: create TEST_CLUSTER_N_SERVERS servers with ids
 * 1..N, reset the global clock and the pending-work queues. */
void test_cluster_setup(const MunitParameter params[], struct test_cluster *c)
{
    unsigned i;
    (void)params;
    for (i = 0; i < TEST_CLUSTER_N_SERVERS; i++) {
        diskInit(&c->servers[i].disk);
        serverInit(&c->servers[i], i + 1, c);
    }
    c->time = 0;
    c->in_tear_down = false;
    QUEUE_INIT(&c->steps);
    QUEUE_INIT(&c->send);
    QUEUE_INIT(&c->disconnect);
}
/* Return the server with the given @id.
 *
 * Ids are 1-based: the server with id N lives at index N-1 of c->servers.
 * Passing id 0 would underflow the (unsigned) index computation, so both
 * bounds are asserted. */
static struct test_server *clusterGetServer(struct test_cluster *c, raft_id id)
{
    munit_assert_ulong(id, >=, 1);
    munit_assert_ulong(id, <=, TEST_CLUSTER_N_SERVERS);
    return &c->servers[id - 1];
}
/* Release all cluster resources, asserting that no step remains pending once
 * every server's outstanding work has been cancelled. */
void test_cluster_tear_down(struct test_cluster *c)
{
    unsigned i;
    c->in_tear_down = true;
    /* Cancel pending steps */
    for (i = 0; i < TEST_CLUSTER_N_SERVERS; i++) {
        struct test_server *server = &c->servers[i];
        serverCancelPending(server);
    }
    munit_assert_true(QUEUE_IS_EMPTY(&c->steps));
    munit_assert_true(QUEUE_IS_EMPTY(&c->send));
    /* Drop outstanding disconnections */
    while (!QUEUE_IS_EMPTY(&c->disconnect)) {
        struct disconnect *disconnect;
        queue *head;
        head = QUEUE_HEAD(&c->disconnect);
        disconnect = QUEUE_DATA(head, struct disconnect, queue);
        QUEUE_REMOVE(&disconnect->queue);
        free(disconnect);
    }
    for (i = 0; i < TEST_CLUSTER_N_SERVERS; i++) {
        serverClose(&c->servers[i]);
    }
}
/* Return the raft instance of the server with the given id. */
struct raft *test_cluster_raft(struct test_cluster *c, raft_id id)
{
    return &clusterGetServer(c, id)->raft;
}
/* Set the persisted term of the given server. Must be called before the
 * server is started. */
void test_cluster_set_term(struct test_cluster *c, raft_id id, raft_term term)
{
    struct test_server *server = clusterGetServer(c, id);
    munit_assert_false(server->running);
    diskSetTerm(&server->disk, term);
}
/* Set the persisted vote of the given server. Must be called before the
 * server is started. */
void test_cluster_set_vote(struct test_cluster *c, raft_id id, raft_id vote)
{
    struct test_server *server = clusterGetServer(c, id);
    munit_assert_false(server->running);
    diskSetVote(&server->disk, vote);
}
/* Set the persisted snapshot of the given server. Must be called before the
 * server is started. */
void test_cluster_set_snapshot(struct test_cluster *c,
                               raft_id id,
                               struct test_snapshot *snapshot)
{
    struct test_server *server = clusterGetServer(c, id);
    munit_assert_false(server->running);
    diskSetSnapshot(&server->disk, snapshot);
}
/* Add an entry to the ones persisted on the given server's disk. Must be
 * called before the server is started. */
void test_cluster_add_entry(struct test_cluster *c,
                            raft_id id,
                            const struct raft_entry *entry)
{
    struct test_server *server = clusterGetServer(c, id);
    munit_assert_false(server->running);
    diskAddEntry(&server->disk, entry);
}
/* Set a custom election timeout for the given server; the randomized timeout
 * becomes timeout + delta. */
void test_cluster_set_election_timeout(struct test_cluster *c,
                                       raft_id id,
                                       unsigned timeout,
                                       unsigned delta)
{
    struct test_server *server = clusterGetServer(c, id);
    serverSetElectionTimeout(server, timeout, delta);
}
/* Set the latency of messages sent by the given server. */
void test_cluster_set_network_latency(struct test_cluster *c,
                                      raft_id id,
                                      unsigned latency)
{
    struct test_server *server = clusterGetServer(c, id);
    server->network_latency = latency;
}
/* Set the number of committed entries after which the given server takes a
 * snapshot. */
void test_cluster_set_snapshot_threshold(struct test_cluster *c,
                                         raft_id id,
                                         unsigned threshold)
{
    struct test_server *server = clusterGetServer(c, id);
    server->snapshot.threshold = threshold;
}
/* Set the number of trailing entries the given server keeps after taking a
 * snapshot. */
void test_cluster_set_snapshot_trailing(struct test_cluster *c,
                                        raft_id id,
                                        unsigned trailing)
{
    struct test_server *server = clusterGetServer(c, id);
    server->snapshot.trailing = trailing;
}
/* Set the latency of disk writes issued by the given server. */
void test_cluster_set_disk_latency(struct test_cluster *c,
                                   raft_id id,
                                   unsigned latency)
{
    struct test_server *server = clusterGetServer(c, id);
    server->disk_latency = latency;
}
/* Start the server with the given id, using the state persisted on its
 * disk. */
void test_cluster_start(struct test_cluster *c, raft_id id)
{
    struct test_server *server = clusterGetServer(c, id);
    serverStart(server);
}
/* Stop the server with the given id, cancelling any of its in-flight steps
 * first. */
void test_cluster_stop(struct test_cluster *c, raft_id id)
{
    struct test_server *server = clusterGetServer(c, id);
    serverCancelPending(server);
    serverStop(server);
}
/* Submit a new entry to the given server, via a RAFT_SUBMIT event. Return the
 * raft_step() error code, if any. */
int test_cluster_submit(struct test_cluster *c,
                        raft_id id,
                        struct raft_entry *entry)
{
    struct test_server *server = clusterGetServer(c, id);
    struct raft_event event;
    event.time = c->time;
    event.type = RAFT_SUBMIT;
    event.submit.n = 1;
    event.submit.entries = entry;
    return serverStep(server, &event);
}
/* Start catching-up the server with the given catch_up_id, via a
 * RAFT_CATCH_UP event submitted to the server with the given id. */
void test_cluster_catch_up(struct test_cluster *c,
                           raft_id id,
                           raft_id catch_up_id)
{
    struct test_server *server = clusterGetServer(c, id);
    struct raft_event event;
    int rv;
    event.time = c->time;
    event.type = RAFT_CATCH_UP;
    event.catch_up.server_id = catch_up_id;
    rv = serverStep(server, &event);
    munit_assert_int(rv, ==, 0);
}
/* Fire a leadership transfer to the given transferee, via a RAFT_TRANSFER
 * event submitted to the server with the given id. */
void test_cluster_transfer(struct test_cluster *c,
                           raft_id id,
                           raft_id transferee)
{
    struct test_server *server = clusterGetServer(c, id);
    struct raft_event event;
    int rv;
    event.time = c->time;
    event.type = RAFT_TRANSFER;
    event.transfer.server_id = transferee;
    rv = serverStep(server, &event);
    munit_assert_int(rv, ==, 0);
}
/* Update the PRNG seed of each server, to match the expected randomized
 * election timeout. */
static void clusterSeed(struct test_cluster *c)
{
    unsigned i;
    for (i = 0; i < TEST_CLUSTER_N_SERVERS; i++) {
        struct test_server *server = &c->servers[i];
        serverSeed(server);
    }
}
/* Return the scheduled step with the lowest execution time, or NULL if the
 * steps queue is empty. */
static struct step *clusterGetStepWithEarliestExecution(struct test_cluster *c)
{
    struct step *earliest = NULL;
    queue *head;
    QUEUE_FOREACH (head, &c->steps) {
        struct step *candidate = QUEUE_DATA(head, struct step, queue);
        if (earliest == NULL || candidate->event.time < earliest->event.time) {
            earliest = candidate;
        }
    }
    return earliest;
}
/* Return the running server with the earliest raft_timeout() value, or NULL
 * if no server is running. */
static struct test_server *clusterGetServerWithEarliestTimeout(
    struct test_cluster *c)
{
    struct test_server *server = NULL;
    unsigned i;
    for (i = 0; i < TEST_CLUSTER_N_SERVERS; i++) {
        struct test_server *other = &c->servers[i];
        /* Stopped servers have no pending timeout. */
        if (!other->running) {
            continue;
        }
        if (server == NULL || other->timeout < server->timeout) {
            server = other;
        }
    }
    return server;
}
/* Consume the queue of pending messages to be sent, and enqueue them as
 * regular RAFT_RECEIVE events in the steps queue. Messages between
 * disconnected servers are dropped right away. */
static void clusterEnqueueReceives(struct test_cluster *c)
{
    while (!QUEUE_IS_EMPTY(&c->send)) {
        struct step *step;
        queue *head;
        struct raft_message *message;
        head = QUEUE_HEAD(&c->send);
        step = QUEUE_DATA(head, struct step, queue);
        munit_assert_uint(step->event.type, ==, RAFT_RECEIVE);
        QUEUE_REMOVE(&step->queue);
        message = step->event.receive.message;
        if (!clusterAreConnected(c, message->server_id /* sender */,
                                 step->id /* receiver */)) {
            dropReceiveEvent(step);
            free(step);
            continue;
        }
        QUEUE_PUSH(&c->steps, &step->queue);
    }
}
/* Advance the cluster by processing a single event: either the earliest
 * scheduled step or, if it comes first, the earliest server timeout. */
void test_cluster_step(struct test_cluster *c)
{
    struct test_server *server;
    struct step *step;
    clusterEnqueueReceives(c);
    clusterSeed(c);
    server = clusterGetServerWithEarliestTimeout(c);
    step = clusterGetStepWithEarliestExecution(c);
    /* NOTE(review): assumes at least one server is running — server is
     * dereferenced here without a NULL check; confirm callers guarantee
     * this. */
    if (step == NULL || server->timeout < step->event.time) {
        serverTimeout(server);
    } else {
        server = clusterGetServer(c, step->id);
        serverComplete(server, step);
    }
}
/* Let the given number of milliseconds elapse, processing any event that is
 * due before the target time. */
void test_cluster_elapse(struct test_cluster *c, unsigned msecs)
{
    raft_time time = c->time + msecs;
    while (1) {
        struct test_server *server;
        struct step *step;
        clusterEnqueueReceives(c);
        server = clusterGetServerWithEarliestTimeout(c);
        step = clusterGetStepWithEarliestExecution(c);
        /* If no server and no step is due to timeout/complete before the
         * target time, then we can jump directly to that time.
         * NOTE(review): assumes at least one server is running — server is
         * dereferenced here without a NULL check; confirm. */
        if (time <= server->timeout &&
            (step == NULL || time <= step->event.time)) {
            break;
        }
        /* Otherwise, process those events first. */
        test_cluster_step(c);
    }
    c->time = time;
}
/* Stop delivering messages sent from the server with id1 to the server with
 * id2. The disconnection is one-directional. */
void test_cluster_disconnect(struct test_cluster *c, raft_id id1, raft_id id2)
{
    struct disconnect *disconnect = munit_malloc(sizeof *disconnect);
    disconnect->id1 = id1;
    disconnect->id2 = id2;
    QUEUE_PUSH(&c->disconnect, &disconnect->queue);
}
/* Remove the first matching id1 -> id2 disconnection, if any, restoring
 * message delivery in that direction. */
void test_cluster_reconnect(struct test_cluster *c, raft_id id1, raft_id id2)
{
    queue *head;
    QUEUE_FOREACH (head, &c->disconnect) {
        struct disconnect *d = QUEUE_DATA(head, struct disconnect, queue);
        if (d->id1 == id1 && d->id2 == id2) {
            QUEUE_REMOVE(&d->queue);
            free(d);
            return;
        }
    }
}
/* Simulate a crash: mark the server as not running, without releasing any of
 * its resources. */
void test_cluster_kill(struct test_cluster *c, raft_id id)
{
    struct test_server *server = clusterGetServer(c, id);
    server->running = false;
}
/* Step the cluster until all of the expected trace output has been produced.
 * Return true if the produced trace matches the expected one, false on the
 * first mismatch or after too many steps. On failure, print both traces to
 * stderr. */
bool test_cluster_trace(struct test_cluster *c, const char *expected)
{
    size_t n1;
    size_t n2;
    size_t i;
    unsigned max_steps = 100; /* Give up after this many steps. */
consume:
    if (max_steps == 0) {
        goto mismatch;
    }
    max_steps -= 1;
    n1 = strlen(c->trace);
    n2 = strlen(expected);
    /* Find the first position where produced and expected output differ. */
    for (i = 0; i < n1 && i < n2; i++) {
        if (c->trace[i] != expected[i]) {
            break;
        }
    }
    /* Check if we produced more output than the expected one. */
    if (n1 > n2) {
        goto mismatch;
    }
    /* If there's more expected output, check that so far we're good, then
     * step and repeat. */
    if (n1 < n2) {
        if (i != n1) {
            goto mismatch;
        }
        /* Reset the trace buffer and consume the matched prefix. */
        c->trace[0] = 0;
        expected += i;
        test_cluster_step(c);
        goto consume;
    }
    munit_assert_ulong(n1, ==, n2);
    if (i != n1) {
        goto mismatch;
    }
    c->trace[0] = 0;
    return true;
mismatch:
    fprintf(stderr, "==> Expected:\n");
    fprintf(stderr, "%s\n", expected);
    fprintf(stderr, "==> Actual:\n");
    fprintf(stderr, "%s\n", c->trace);
    return false;
}
raft-0.22.1/test/lib/cluster.h 0000664 0000000 0000000 00000043045 14601504142 0016112 0 ustar 00root root 0000000 0000000 /* Setup and drive a test raft cluster. */
#ifndef TEST_CLUSTER_H
#define TEST_CLUSTER_H
#include
#include "../../include/raft.h"
#include "fsm.h"
#include "heap.h"
#include "macros.h"
#include "munit.h"
#include "snapshot.h"
#define FIXTURE_CLUSTER \
FIXTURE_HEAP; \
struct test_cluster cluster_
#define SETUP_CLUSTER() test_cluster_setup(params, &f->cluster_)
#define TEAR_DOWN_CLUSTER() test_cluster_tear_down(&f->cluster_)
/* Start the server with the given ID, using the state persisted on its disk. */
#define CLUSTER_START(ID) test_cluster_start(&f->cluster_, ID)
/* Stop the server with the given ID. */
#define CLUSTER_STOP(ID) test_cluster_stop(&f->cluster_, ID);
/* Step the cluster until all the expected output is consumed. Fail the test if
 * a mismatch is found. */
#define CLUSTER_TRACE(EXPECTED) \
if (!test_cluster_trace(&f->cluster_, EXPECTED)) { \
munit_error("trace does not match"); \
}
/* Step the cluster until the given amount of milliseconds has elapsed. */
#define CLUSTER_ELAPSE(MSECS) test_cluster_elapse(&f->cluster_, MSECS)
/* Disconnect the server with ID1 with the one with ID2. */
#define CLUSTER_DISCONNECT(ID1, ID2) \
test_cluster_disconnect(&f->cluster_, ID1, ID2)
/* Reconnect two servers. */
#define CLUSTER_RECONNECT(ID1, ID2) \
test_cluster_reconnect(&f->cluster_, ID1, ID2)
#define CLUSTER_SUBMIT__CHOOSER(...) \
GET_6TH_ARG(__VA_ARGS__, CLUSTER_SUBMIT__TYPE, CLUSTER_SUBMIT__TYPE, \
CLUSTER_SUBMIT__TYPE, CLUSTER_SUBMIT__RAW, )
/* Submit an entry */
#define CLUSTER_SUBMIT(...) CLUSTER_SUBMIT__CHOOSER(__VA_ARGS__)(__VA_ARGS__)
#define CLUSTER_SUBMIT__TYPE(ID, TYPE, ...) \
CLUSTER_SUBMIT__##TYPE(ID, __VA_ARGS__)
#define CLUSTER_SUBMIT__CHANGE(ID, N, N_VOTING, N_STANDBYS) \
do { \
struct raft_entry entry_; \
struct raft_configuration conf_; \
int rv_; \
CLUSTER_FILL_CONFIGURATION(&conf_, N, N_VOTING, N_STANDBYS); \
entry_.type = RAFT_CHANGE; \
entry_.term = raft_current_term(CLUSTER_RAFT(ID)); \
rv_ = raft_configuration_encode(&conf_, &entry_.buf); \
munit_assert_int(rv_, ==, 0); \
raft_configuration_close(&conf_); \
entry_.batch = entry_.buf.base; \
CLUSTER_SUBMIT__RAW(ID, &entry_); \
} while (0)
#define CLUSTER_SUBMIT__COMMAND(ID, SIZE) \
do { \
struct raft_entry entry_; \
entry_.type = RAFT_COMMAND; \
entry_.term = raft_current_term(CLUSTER_RAFT(ID)); \
entry_.buf.len = SIZE; \
entry_.buf.base = raft_malloc(entry_.buf.len); \
munit_assert_not_null(entry_.buf.base); \
entry_.batch = entry_.buf.base; \
CLUSTER_SUBMIT__RAW(ID, &entry_); \
} while (0)
#define CLUSTER_SUBMIT__RAW(ID, ENTRY) \
do { \
int rv__; \
rv__ = test_cluster_submit(&f->cluster_, ID, ENTRY); \
munit_assert_int(rv__, ==, 0); \
} while (0)
/* Set the persisted vote of the server with the given ID. Must be called before
 * starting the server. */
#define CLUSTER_SET_VOTE(ID, VOTE) \
test_cluster_set_vote(&f->cluster_, ID, VOTE);
/* Set the persisted term of the server with the given ID. Must be called before
 * starting the server. */
#define CLUSTER_SET_TERM(ID, TERM) test_cluster_set_term(&f->cluster_, ID, TERM)
/* Helper to populate a struct raft_configuration object CONF, adding N servers
* to it, among which N_VOTERS are voters and N_STANDBYS are standbys. */
#define CLUSTER_FILL_CONFIGURATION(CONF, N, N_VOTERS, N_STANDBYS) \
do { \
unsigned _i; \
int __rv; \
munit_assert_int(N, >=, 1); \
munit_assert_int(N_VOTERS, <=, N); \
raft_configuration_init(CONF); \
for (_i = 0; _i < N; _i++) { \
raft_id _id = _i + 1; \
int _role = RAFT_SPARE; \
char _address[64]; \
if (_i < N_VOTERS) { \
_role = RAFT_VOTER; \
} else if (N_STANDBYS > 0 && (int)(_i - N_VOTERS) < N_STANDBYS) { \
_role = RAFT_STANDBY; \
} \
sprintf(_address, "%llu", _id); \
__rv = raft_configuration_add(CONF, _id, _address, _role); \
munit_assert_int(__rv, ==, 0); \
} \
} while (0)
/* Set the persisted snapshot of the server with the given ID. Must be called
 * before starting the server. */
#define CLUSTER_SET_SNAPSHOT(ID, INDEX, TERM, CONF_N, CONF_N_VOTING, \
CONF_INDEX) \
do { \
struct test_snapshot *_snapshot = munit_malloc(sizeof *_snapshot); \
_snapshot->metadata.index = INDEX; \
_snapshot->metadata.term = TERM; \
CLUSTER_FILL_CONFIGURATION(&_snapshot->metadata.configuration, CONF_N, \
CONF_N_VOTING, 0); \
_snapshot->metadata.configuration_index = CONF_INDEX; \
_snapshot->data.len = 8; \
_snapshot->data.base = munit_malloc(_snapshot->data.len); \
test_cluster_set_snapshot(&f->cluster_, ID, _snapshot); \
} while (0)
#define CLUSTER_ADD_ENTRY__CHOOSER(...) \
GET_5TH_ARG(__VA_ARGS__, CLUSTER_ADD_ENTRY__TYPE, CLUSTER_ADD_ENTRY__TYPE, \
CLUSTER_ADD_ENTRY__RAW, )
#define CLUSTER_ADD_ENTRY(...) \
CLUSTER_ADD_ENTRY__CHOOSER(__VA_ARGS__)(__VA_ARGS__)
#define CLUSTER_ADD_ENTRY__TYPE(ID, TYPE, ...) \
CLUSTER_ADD_ENTRY__##TYPE(ID, __VA_ARGS__)
/* Add an entry to the ones persisted on the server with the given ID. This must
* be called before starting the cluster. */
#define CLUSTER_ADD_ENTRY__RAW(ID, ENTRY) \
test_cluster_add_entry(&f->cluster_, ID, ENTRY)
#define CLUSTER_ADD_ENTRY__RAFT_CHANGE(ID, CONF_N, CONF_N_VOTING) \
do { \
struct raft_configuration _configuration; \
struct raft_entry _entry; \
int _rv; \
\
CLUSTER_FILL_CONFIGURATION(&_configuration, CONF_N, CONF_N_VOTING, 0); \
_entry.type = RAFT_CHANGE; \
_entry.term = 1; \
_rv = raft_configuration_encode(&_configuration, &_entry.buf); \
munit_assert_int(_rv, ==, 0); \
raft_configuration_close(&_configuration); \
\
CLUSTER_ADD_ENTRY__RAW(ID, &_entry); \
\
raft_free(_entry.buf.base); \
} while (0);
#define CLUSTER_ADD_ENTRY__RAFT_COMMAND(ID, TERM, PAYLOAD) \
do { \
uint64_t _payload = PAYLOAD; \
struct raft_entry _entry; \
\
_entry.type = RAFT_COMMAND; \
_entry.term = TERM; \
_entry.buf.base = &_payload; \
_entry.buf.len = sizeof _payload; \
CLUSTER_ADD_ENTRY__RAW(ID, &_entry); \
} while (0);
/* Return the struct raft object with the given ID. */
#define CLUSTER_RAFT(ID) test_cluster_raft(&f->cluster_, ID)
/* Set the snapshot threshold of the server with the given ID. */
#define CLUSTER_SET_SNAPSHOT_THRESHOLD(ID, THRESHOLD) \
test_cluster_set_snapshot_threshold(&f->cluster_, ID, THRESHOLD)
/* Set the trailing entries to keep after taking a snapshot on the server with
* the given ID. */
#define CLUSTER_SET_SNAPSHOT_TRAILING(ID, TRAILING) \
test_cluster_set_snapshot_trailing(&f->cluster_, ID, TRAILING)
/* Set the network latency for outgoing messages sent by the server with the
* given ID. */
#define CLUSTER_SET_NETWORK_LATENCY(ID, MSECS) \
test_cluster_set_network_latency(&f->cluster_, ID, MSECS)
/* Set the disk I/O latency of server I. */
#define CLUSTER_SET_DISK_LATENCY(ID, MSECS) \
test_cluster_set_disk_latency(&f->cluster_, ID, MSECS)
#define CLUSTER_SET_ELECTION_TIMEOUT(ID, TIMEOUT, DELTA) \
test_cluster_set_election_timeout(&f->cluster_, ID, TIMEOUT, DELTA)
/* Test snapshot that is just persisted in-memory. */
struct test_snapshot
{
struct raft_snapshot_metadata metadata;
struct raft_buffer data;
};
/* Persisted state of a single node.
*
* The data contained in this struct is passed to raft_step() as RAFT_START
* event when starting a server, and is updated as the server makes progress. */
struct test_disk
{
raft_term term;
raft_id voted_for;
struct test_snapshot *snapshot;
raft_index start_index;
struct raft_entry *entries;
unsigned n_entries;
unsigned short size; /* Disk size in bytes */
};
/* Wrap a @raft instance and maintain disk and network state. */
struct test_cluster;
struct test_server
{
struct test_disk disk; /* Persisted data */
struct raft_tracer tracer; /* Custom tracer */
struct raft raft; /* Raft instance */
struct test_cluster *cluster; /* Parent cluster */
raft_index last_applied; /* Last processed committed index. */
raft_time timeout; /* Next scheduled timeout */
unsigned network_latency; /* Network latency */
unsigned disk_latency; /* Disk latency */
char address[8]; /* Server address */
struct
{
unsigned threshold; /* Number of entries before taking a snapshot. */
unsigned trailing; /* Number of entries to leave after a snapshot. */
bool installing; /* True if installing a snapshot */
} snapshot;
bool running; /* Whether the server is running */
struct
{
raft_index start;
struct raft_entry *entries;
unsigned n;
} log; /* Cache of the on-disk log. */
/* The randomized_election_timeout field stores the value that the raft
* instance will obtain the next time it calls RandomWithinRange() to obtain
* a random number in the [election_timeout, election_timeout * 2] range. We
* do that by passing raft_seed() a value that makes the pseudo-random
* number generator produce exactly randomized_election_timeout. That value
* is what we store in the seed field below. Since calculating the seed that
* matches the desired randomized_election_timeout is somewhat expensive, we
* also use randomized_election_timeout_prev to store the previous value of
* randomized_election_timeout, in order to re-use the same seed if nothing
* has changed.
*
* See serverSeed for more details. */
unsigned randomized_election_timeout;
unsigned randomized_election_timeout_prev;
unsigned seed;
};
#define TEST_CLUSTER_N_SERVERS 8
/* Cluster of test raft servers instances with fake disk and network I/O. */
struct test_cluster
{
struct test_server servers[TEST_CLUSTER_N_SERVERS]; /* Cluster servers */
raft_time time; /* Global time */
bool in_tear_down; /* Tearing down */
char trace[8192]; /* Captured messages */
void *steps[2]; /* Pending events */
void *send[2]; /* Pending messages */
void *disconnect[2]; /* Network faults */
};
void test_cluster_setup(const MunitParameter params[], struct test_cluster *c);
void test_cluster_tear_down(struct test_cluster *c);
/* Return the raft object with the given @id. */
struct raft *test_cluster_raft(struct test_cluster *c, raft_id id);
/* Set the persisted term of the given server to the given value. Must be called
 * before starting the server. */
void test_cluster_set_term(struct test_cluster *c, raft_id id, raft_term term);
/* Set the persisted vote of the given server to the given value. Must be called
 * before starting the server. */
void test_cluster_set_vote(struct test_cluster *c, raft_id id, raft_id term);
/* Set the last persisted snapshot of the given server. Must be called before
 * starting the server. */
void test_cluster_set_snapshot(struct test_cluster *c,
raft_id id,
struct test_snapshot *snapshot);
void test_cluster_add_entry(struct test_cluster *c,
raft_id id,
const struct raft_entry *entry);
/* Set a custom election timeout for the given server. Must be called
 * before starting the server. The randomized timeout will be set to timeout +
 * delta. */
void test_cluster_set_election_timeout(struct test_cluster *c,
raft_id id,
unsigned timeout,
unsigned delta);
/* Set the threshold for taking snapshots on the given server. */
void test_cluster_set_snapshot_threshold(struct test_cluster *c,
raft_id id,
unsigned threshold);
/* Set the number of entries to leave after a snapshot. */
void test_cluster_set_snapshot_trailing(struct test_cluster *c,
raft_id id,
unsigned trailing);
/* Set the network latency of messages sent by the given server. */
void test_cluster_set_network_latency(struct test_cluster *c,
raft_id id,
unsigned latency);
/* Set the network latency of disk writes issued by the given server. */
void test_cluster_set_disk_latency(struct test_cluster *c,
raft_id id,
unsigned latency);
/* Start the server with the given @id, using the current state persisted on its
* disk. */
void test_cluster_start(struct test_cluster *c, raft_id id);
/* Stop the server with the given @id. */
void test_cluster_stop(struct test_cluster *c, raft_id id);
/* Submit a new entry. */
int test_cluster_submit(struct test_cluster *c,
raft_id id,
struct raft_entry *entry);
/* Start to catch-up a server. */
void test_cluster_catch_up(struct test_cluster *c,
raft_id id,
raft_id catch_up_id);
/* Fire a leadership transfer. */
void test_cluster_transfer(struct test_cluster *c,
raft_id id,
raft_id transferee);
/* Advance the cluster by completing a single asynchronous operation or firing a
* timeout. */
void test_cluster_step(struct test_cluster *c);
/* Let the given number of milliseconds elapse. This requires that no event
* would be triggered by test_cluster_step() in the given time window. */
void test_cluster_elapse(struct test_cluster *c, unsigned msecs);
/* Stop delivering messages from id1 to id2 */
void test_cluster_disconnect(struct test_cluster *c, raft_id id1, raft_id id2);
/* Resume delivering messages from id1 to id2 */
void test_cluster_reconnect(struct test_cluster *c, raft_id id1, raft_id id2);
/* Crash a server and stop running it. */
void test_cluster_kill(struct test_cluster *c, raft_id id);
/* Compare the trace of all messages emitted by all servers with the given
* expected trace. If they don't match, print the last line which differs and
* return #false. */
bool test_cluster_trace(struct test_cluster *c, const char *expected);
#endif /* TEST_CLUSTER_H */
raft-0.22.1/test/lib/dir.c 0000664 0000000 0000000 00000022237 14601504142 0015202 0 ustar 00root root 0000000 0000000 #include "dir.h"
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#define SEP "/"
#define TEMPLATE "raft-test-XXXXXX"
#define TEST_DIR_TEMPLATE "./tmp/%s/raft-test-XXXXXX"
static char *dirAll[] = {"tmpfs", "ext4", "btrfs", "xfs", "zfs", NULL};
static char *dirTmpfs[] = {"tmpfs", NULL};
static char *dirAio[] = {"btrfs", "ext4", "xfs", NULL};
static char *dirNoAio[] = {"tmpfs", "zfs", NULL};
MunitParameterEnum DirTmpfsParams[] = {
{DIR_FS_PARAM, dirTmpfs},
{NULL, NULL},
};
MunitParameterEnum DirAllParams[] = {
{DIR_FS_PARAM, dirAll},
{NULL, NULL},
};
MunitParameterEnum DirAioParams[] = {
{DIR_FS_PARAM, dirAio},
{NULL, NULL},
};
MunitParameterEnum DirNoAioParams[] = {
{DIR_FS_PARAM, dirNoAio},
{NULL, NULL},
};
/* Create a temporary directory in the given parent directory.
 *
 * Return a heap-allocated path owned by the caller, or NULL when @parent is
 * NULL (i.e. the requested backing file system is not available). Aborts the
 * test on mkdtemp() failure. */
static char *dirMakeTemp(const char *parent)
{
    char *dir;
    if (parent == NULL) {
        return NULL;
    }
    /* Room for "<parent>/<template>" plus the NUL terminator. */
    dir = munit_malloc(strlen(parent) + strlen(SEP) + strlen(TEMPLATE) + 1);
    sprintf(dir, "%s%s%s", parent, SEP, TEMPLATE);
    /* mkdtemp() replaces the trailing XXXXXX in place with a unique name. */
    if (mkdtemp(dir) == NULL) {
        munit_error(strerror(errno));
    }
    return dir;
}
/* Create a temporary test directory backed by the file system selected via
 * the DIR_FS_PARAM munit parameter, dispatching to the matching
 * Dir<Fs>SetUp() helper. With no parameter set, fall back to a plain
 * directory under /tmp. Aborts the test on an unknown file system name. */
void *DirSetUp(MUNIT_UNUSED const MunitParameter params[],
               MUNIT_UNUSED void *user_data)
{
    const char *fs = munit_parameters_get(params, DIR_FS_PARAM);
    if (fs == NULL) {
        return dirMakeTemp("/tmp");
    } else if (strcmp(fs, "tmpfs") == 0) {
        return DirTmpfsSetUp(params, user_data);
    } else if (strcmp(fs, "ext4") == 0) {
        return DirExt4SetUp(params, user_data);
    } else if (strcmp(fs, "btrfs") == 0) {
        return DirBtrfsSetUp(params, user_data);
    } else if (strcmp(fs, "zfs") == 0) {
        return DirZfsSetUp(params, user_data);
    } else if (strcmp(fs, "xfs") == 0) {
        return DirXfsSetUp(params, user_data);
    }
    munit_errorf("Unsupported file system %s", fs);
    /* Not reached: munit_errorf() aborts the test. */
    return NULL;
}
/* Per-file-system setup helpers. Each reads the mount point prepared by
 * test/lib/fs.sh from the corresponding RAFT_TMP_* environment variable and
 * creates a temporary directory inside it. When the variable is unset (file
 * system not available) getenv() returns NULL and dirMakeTemp() returns NULL,
 * signalling the test must be skipped. */

/* Temporary test directory backed by tmpfs. */
void *DirTmpfsSetUp(MUNIT_UNUSED const MunitParameter params[],
                    MUNIT_UNUSED void *user_data)
{
    return dirMakeTemp(getenv("RAFT_TMP_TMPFS"));
}

/* Temporary test directory backed by ext4. */
void *DirExt4SetUp(MUNIT_UNUSED const MunitParameter params[],
                   MUNIT_UNUSED void *user_data)
{
    return dirMakeTemp(getenv("RAFT_TMP_EXT4"));
}

/* Temporary test directory backed by btrfs. */
void *DirBtrfsSetUp(MUNIT_UNUSED const MunitParameter params[],
                    MUNIT_UNUSED void *user_data)
{
    return dirMakeTemp(getenv("RAFT_TMP_BTRFS"));
}

/* Temporary test directory backed by zfs. */
void *DirZfsSetUp(MUNIT_UNUSED const MunitParameter params[],
                  MUNIT_UNUSED void *user_data)
{
    return dirMakeTemp(getenv("RAFT_TMP_ZFS"));
}

/* Temporary test directory backed by xfs. */
void *DirXfsSetUp(MUNIT_UNUSED const MunitParameter params[],
                  MUNIT_UNUSED void *user_data)
{
    return dirMakeTemp(getenv("RAFT_TMP_XFS"));
}
/* Wrapper around remove(), with a signature compatible with nftw(). Removes
 * both regular files and (empty) directories. */
static int dirRemoveFn(const char *path,
                       MUNIT_UNUSED const struct stat *sbuf,
                       MUNIT_UNUSED int type,
                       MUNIT_UNUSED struct FTW *ftwb)
{
    return remove(path);
}
/* Recursively delete @dir and everything it contains.
 *
 * The chmod() restores permissions first, so directories previously made
 * unexecutable/unwritable by DirMakeUnexecutable()/DirMakeUnwritable() can
 * still be traversed. FTW_DEPTH makes nftw() visit children before their
 * parent, so directories are empty by the time remove() sees them. */
static void dirRemove(char *dir)
{
    int rv;
    rv = chmod(dir, 0755);
    munit_assert_int(rv, ==, 0);
    rv = nftw(dir, dirRemoveFn, 10, FTW_DEPTH | FTW_MOUNT | FTW_PHYS);
    munit_assert_int(rv, ==, 0);
}
/* Return true if @dir exists. Any stat() failure other than ENOENT aborts
 * the test. */
static bool dirExists(const char *dir)
{
    struct stat sb;
    int rv;
    rv = stat(dir, &sb);
    if (rv == -1) {
        munit_assert_int(errno, ==, ENOENT);
        return false;
    }
    return true;
}
/* Munit tear-down hook: recursively remove the test directory created by
 * DirSetUp() (if it still exists) and release its path string. A NULL @data
 * means the directory was never created (file system unavailable). */
void DirTearDown(void *data)
{
    char *dir = data;
    if (dir == NULL) {
        return;
    }
    if (dirExists(dir)) {
        dirRemove(dir);
    }
    free(dir);
}
/* Join the given @dir and @filename into @path, separated by a slash.
 *
 * The caller must provide a @path buffer large enough to hold the result. */
static void joinPath(const char *dir, const char *filename, char *path)
{
    sprintf(path, "%s/%s", dir, filename);
}
/* Create (or overwrite) @filename inside @dir with the @n bytes of @buf.
 * Asserts that the file could be opened and that the full buffer was
 * written in a single write() call. */
void DirWriteFile(const char *dir,
                  const char *filename,
                  const void *buf,
                  const size_t n)
{
    char path[256];
    int fd;
    int rv;
    joinPath(dir, filename, path);
    fd = open(path, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
    munit_assert_int(fd, !=, -1);
    rv = write(fd, buf, n);
    munit_assert_int(rv, ==, n);
    close(fd);
}
/* Create @filename inside @dir with @n zero bytes.
 *
 * NOTE(review): relies on munit_malloc() returning zeroed memory (it is
 * documented to behave like calloc). */
void DirWriteFileWithZeros(const char *dir,
                           const char *filename,
                           const size_t n)
{
    void *buf = munit_malloc(n);
    DirWriteFile(dir, filename, buf, n);
    free(buf);
}
/* Overwrite @n bytes of an existing file with @buf.
 *
 * @whence == 0: overwrite the first @n bytes.
 * @whence > 0:  overwrite @n bytes starting at offset @whence.
 * @whence < 0:  overwrite @n bytes starting @whence bytes before the end.
 *
 * In every case the overwritten region must lie entirely within the current
 * file size; this is enforced with assertions so the file never grows. */
void DirOverwriteFile(const char *dir,
                      const char *filename,
                      const void *buf,
                      const size_t n,
                      const off_t whence)
{
    char path[256];
    int fd;
    int rv;
    off_t size;
    joinPath(dir, filename, path);
    fd = open(path, O_RDWR, S_IRUSR | S_IWUSR);
    munit_assert_int(fd, !=, -1);
    /* Get the size of the file */
    size = lseek(fd, 0, SEEK_END);
    if (whence == 0) {
        munit_assert_int(size, >=, n);
        lseek(fd, 0, SEEK_SET);
    } else if (whence > 0) {
        munit_assert_int(whence, <=, size);
        munit_assert_int(size - whence, >=, n);
        lseek(fd, whence, SEEK_SET);
    } else {
        munit_assert_int(-whence, <=, size);
        munit_assert_int(-whence, >=, n);
        lseek(fd, whence, SEEK_END);
    }
    rv = write(fd, buf, n);
    munit_assert_int(rv, ==, n);
    close(fd);
}
/* Truncate the given file, leaving only the first @n bytes. Asserts that
 * both the open and the ftruncate() succeed. */
void DirTruncateFile(const char *dir, const char *filename, const size_t n)
{
    char path[256];
    int fd;
    int rv;
    joinPath(dir, filename, path);
    fd = open(path, O_RDWR, S_IRUSR | S_IWUSR);
    munit_assert_int(fd, !=, -1);
    rv = ftruncate(fd, n);
    munit_assert_int(rv, ==, 0);
    rv = close(fd);
    munit_assert_int(rv, ==, 0);
}
/* Grow the given file to @n bytes, padding the tail with explicitly written
 * zero bytes (not a sparse hole). Asserts the file is not already larger
 * than @n.
 *
 * NOTE(review): relies on munit_malloc() returning zeroed memory for the
 * padding buffer. */
void DirGrowFile(const char *dir, const char *filename, const size_t n)
{
    char path[256];
    int fd;
    struct stat sb;
    void *buf;
    size_t size;
    int rv;
    joinPath(dir, filename, path);
    fd = open(path, O_RDWR, S_IRUSR | S_IWUSR);
    munit_assert_int(fd, !=, -1);
    rv = fstat(fd, &sb);
    munit_assert_int(rv, ==, 0);
    munit_assert_int(sb.st_size, <=, n);
    /* Fill with zeros. */
    lseek(fd, sb.st_size, SEEK_SET);
    size = n - sb.st_size;
    buf = munit_malloc(size);
    rv = write(fd, buf, size);
    munit_assert_int(rv, ==, size);
    free(buf);
    rv = close(fd);
    munit_assert_int(rv, ==, 0);
}
/* Rename a file in the given directory from @filename1 to @filename2,
 * asserting that the rename() succeeds. */
void DirRenameFile(const char *dir,
                   const char *filename1,
                   const char *filename2)
{
    char path1[256];
    char path2[256];
    int rv;
    joinPath(dir, filename1, path1);
    joinPath(dir, filename2, path2);
    rv = rename(path1, path2);
    munit_assert_int(rv, ==, 0);
}
/* Remove @filename from @dir, asserting that the unlink() succeeds. */
void DirRemoveFile(const char *dir, const char *filename)
{
    char path[256];
    int rv;
    joinPath(dir, filename, path);
    rv = unlink(path);
    munit_assert_int(rv, ==, 0);
}
/* Read exactly @n bytes from @filename in @dir into @buf.
 *
 * On open failure the error is reported via munit_logf() at MUNIT_LOG_ERROR
 * level, which aborts the test, so read() is never reached with fd == -1. */
void DirReadFile(const char *dir,
                 const char *filename,
                 void *buf,
                 const size_t n)
{
    char path[256];
    int fd;
    int rv;
    joinPath(dir, filename, path);
    fd = open(path, O_RDONLY);
    if (fd == -1) {
        munit_logf(MUNIT_LOG_ERROR, "read file '%s': %s", path,
                   strerror(errno));
    }
    rv = read(fd, buf, n);
    munit_assert_int(rv, ==, n);
    close(fd);
}
/* Strip all permissions from @dir (mode 0), so files inside it can no longer
 * be opened. Reverted by dirRemove() at tear down. */
void DirMakeUnexecutable(const char *dir)
{
    int rv;
    rv = chmod(dir, 0);
    munit_assert_int(rv, ==, 0);
}
/* Make @dir read+execute only for the owner (mode 0500), so new files can't
 * be created in it. Reverted by dirRemove() at tear down. */
void DirMakeUnwritable(const char *dir)
{
    int rv;
    rv = chmod(dir, 0500);
    munit_assert_int(rv, ==, 0);
}
/* Strip all permissions from @filename in @dir (mode 0), making it
 * unreadable. */
void DirMakeFileUnreadable(const char *dir, const char *filename)
{
    char path[256];
    int rv;
    joinPath(dir, filename, path);
    rv = chmod(path, 0);
    munit_assert_int(rv, ==, 0);
}
/* Return true if @dir contains an openable file named @filename.
 *
 * ENOENT and EACCES both count as "not present" (EACCES occurs when the
 * directory was made unexecutable); any other open error aborts the test. */
bool DirHasFile(const char *dir, const char *filename)
{
    char path[256];
    int fd;
    joinPath(dir, filename, path);
    fd = open(path, O_RDONLY);
    if (fd == -1) {
        munit_assert_true(errno == ENOENT || errno == EACCES);
        return false;
    }
    close(fd);
    return true;
}
/* Fill the file system backing @dir, leaving only @n bytes free.
 *
 * A hidden ".fill" file is allocated with posix_fallocate() so the space is
 * truly reserved. With @n == 0, additionally write until the disk is full
 * and assert that further writes fail with ENOSPC (fallocate rounds to block
 * granularity, so a few stray blocks may remain otherwise). */
void DirFill(const char *dir, const size_t n)
{
    char path[256];
    const char *filename = ".fill";
    struct statvfs fs;
    size_t size;
    int fd;
    int rv;
    rv = statvfs(dir, &fs);
    munit_assert_int(rv, ==, 0);
    /* Bytes currently available to unprivileged users. */
    size = fs.f_bsize * fs.f_bavail;
    if (n > 0) {
        munit_assert_int(size, >=, n);
    }
    joinPath(dir, filename, path);
    fd = open(path, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
    munit_assert_int(fd, !=, -1);
    rv = posix_fallocate(fd, 0, size - n);
    if (rv != 0) {
        /* posix_fallocate() returns the error instead of setting errno. */
        munit_logf(MUNIT_LOG_ERROR, "posix_fallocate %zu bytes: %s", size - n,
                   strerror(rv));
    }
    /* If n is zero, make sure any further write fails with ENOSPC */
    if (n == 0) {
        char buf[4096];
        int i;
        rv = lseek(fd, 0, SEEK_END);
        munit_assert_int(rv, !=, -1);
        for (i = 0; i < 40; i++) {
            rv = write(fd, buf, sizeof buf);
            if (rv < 0) {
                break;
            }
        }
        munit_assert_int(rv, ==, -1);
        munit_assert_int(errno, ==, ENOSPC);
    }
    close(fd);
}
raft-0.22.1/test/lib/dir.h 0000664 0000000 0000000 00000012147 14601504142 0015206 0 ustar 00root root 0000000 0000000 /* Test directory utilities.
*
* This module sports helpers to create temporary directories backed by various
* file systems, read/write files in them, check for the presence of files
* etc. */
#ifndef TEST_DIR_H
#define TEST_DIR_H
#include
#include "munit.h"
/* Munit parameter defining the file system type backing the temporary directory
* created by test_dir_setup().
*
* The various file systems must have been previously setup with the fs.sh
* script. */
#define DIR_FS_PARAM "dir-fs"
#define FIXTURE_DIR char *dir
#define SET_UP_DIR \
f->dir = DirSetUp(params, user_data); \
if (f->dir == NULL) { /* Fs not available, test must be skipped. */ \
free(f); \
return NULL; \
}
#define TEAR_DOWN_DIR DirTearDown(f->dir)
/* Contain a single DIR_FS_PARAM parameter set to all supported file system
* types. */
extern MunitParameterEnum DirAllParams[];
/* Contain a single DIR_FS_PARAM parameter set to tmpfs. */
extern MunitParameterEnum DirTmpfsParams[];
/* Contain a single DIR_FS_PARAM parameter set to all file systems with
* proper AIO support (i.e. NOWAIT works). */
extern MunitParameterEnum DirAioParams[];
/* Contain a single DIR_FS_PARAM parameter set to all file systems without
* proper AIO support (i.e. NOWAIT does not work). */
extern MunitParameterEnum DirNoAioParams[];
/* Create a temporary test directory.
*
 * Return a pointer to the path of the created directory. */
void *DirSetUp(const MunitParameter params[], void *user_data);
/* Create a temporary test directory backed by tmpfs.
*
 * Return a pointer to the path of the created directory, or NULL if no tmpfs file
* system is available. */
void *DirTmpfsSetUp(const MunitParameter params[], void *user_data);
/* Create a temporary test directory backed by ext4.
*
 * Return a pointer to the path of the created directory, or NULL if no ext4 file
* system is available. */
void *DirExt4SetUp(const MunitParameter params[], void *user_data);
/* Create a temporary test directory backed by btrfs.
*
 * Return a pointer to the path of the created directory, or NULL if no btrfs file
* system is available. */
void *DirBtrfsSetUp(const MunitParameter params[], void *user_data);
/* Create a temporary test directory backed by zfs.
*
 * Return a pointer to the path of the created directory, or NULL if no zfs file
* system is available. */
void *DirZfsSetUp(const MunitParameter params[], void *user_data);
/* Create a temporary test directory backed by xfs.
*
 * Return a pointer to the path of the created directory, or NULL if no xfs file
* system is available. */
void *DirXfsSetUp(const MunitParameter params[], void *user_data);
/* Recursively remove a temporary directory. */
void DirTearDown(void *data);
/* Write the given @buf to the given @filename in the given @dir. */
void DirWriteFile(const char *dir,
const char *filename,
const void *buf,
const size_t n);
/* Write the given @filename and fill it with zeros. */
void DirWriteFileWithZeros(const char *dir,
const char *filename,
const size_t n);
/* Overwrite @n bytes of the given file with the given @buf data.
*
* If @whence is zero, overwrite the first @n bytes of the file. If @whence is
* positive overwrite the @n bytes starting at offset @whence. If @whence is
* negative overwrite @n bytes starting at @whence bytes from the end of the
* file. */
void DirOverwriteFile(const char *dir,
const char *filename,
const void *buf,
const size_t n,
const off_t whence);
/* Truncate the given file, leaving only the first @n bytes. */
void DirTruncateFile(const char *dir, const char *filename, const size_t n);
/* Grow the given file to the given size, filling the new bytes with zeros. */
void DirGrowFile(const char *dir, const char *filename, const size_t n);
/* Rename a file in the given directory from filename1 to filename2. */
void DirRenameFile(const char *dir,
const char *filename1,
const char *filename2);
/* Remove a file. */
void DirRemoveFile(const char *dir, const char *filename);
/* Read into @buf the content of the given @filename in the given @dir. */
void DirReadFile(const char *dir,
const char *filename,
void *buf,
const size_t n);
/* Make the given directory not executable, so files can't be open. */
void DirMakeUnexecutable(const char *dir);
/* Make the given directory not writable. */
void DirMakeUnwritable(const char *dir);
/* Make the given file not readable. */
void DirMakeFileUnreadable(const char *dir, const char *filename);
/* Check if the given directory has the given file. */
bool DirHasFile(const char *dir, const char *filename);
/* Fill the underlying file system of the given dir, leaving only n bytes free.
*/
void DirFill(const char *dir, const size_t n);
#endif /* TEST_DIR_H */
raft-0.22.1/test/lib/fault.c 0000664 0000000 0000000 00000002450 14601504142 0015532 0 ustar 00root root 0000000 0000000 #include "fault.h"
#include "munit.h"
/* Initialize @f to an inert state: triggering not paused, no repetition
 * count (-1) and no delay configured (-1, i.e. the fault never fires). */
void FaultInit(struct Fault *f)
{
    f->paused = false;
    f->n = -1;
    f->countdown = -1;
}
/* Advance the fault state machine by one tick.
 *
 * Return true when the caller should inject a failure: that happens after
 * 'delay' ticks have elapsed, and then for 'repeat' consecutive ticks (or
 * forever when repeat is -1). Once the repetitions are exhausted the final
 * countdown decrement drives the counter negative, disabling the fault for
 * good. */
bool FaultTick(struct Fault *f)
{
    if (MUNIT_UNLIKELY(f->paused)) {
        return false;
    }
    /* If the initial delay parameter was set to -1, then never fail. This is
     * the most common case. */
    if (MUNIT_LIKELY(f->countdown < 0)) {
        return false;
    }
    /* If we did not yet reach 'delay' ticks, then just decrease the countdown.
     */
    if (f->countdown > 0) {
        f->countdown--;
        return false;
    }
    munit_assert_int(f->countdown, ==, 0);
    /* We reached 'delay' ticks, let's see how many times we have to trigger the
     * fault, if any. */
    if (f->n < 0) {
        /* Trigger the fault forever. */
        return true;
    }
    if (f->n > 0) {
        /* Trigger the fault at least this time. */
        f->n--;
        return true;
    }
    munit_assert_int(f->n, ==, 0);
    /* We reached 'repeat' ticks, let's stop triggering the fault. */
    f->countdown--;
    return false;
}
/* Configure @f to trigger after @delay ticks and repeat for @repeat ticks
 * (-1 for either value means never / forever respectively). */
void FaultConfig(struct Fault *f, int delay, int repeat)
{
    f->countdown = delay;
    f->n = repeat;
}
/* Temporarily suspend fault triggering: while paused, FaultTick() returns
 * false without consuming any ticks. */
void FaultPause(struct Fault *f)
{
    f->paused = true;
}

/* Resume fault triggering previously suspended with FaultPause(). */
void FaultResume(struct Fault *f)
{
    f->paused = false;
}
raft-0.22.1/test/lib/fault.h 0000664 0000000 0000000 00000001604 14601504142 0015537 0 ustar 00root root 0000000 0000000 /* Helper for test components supporting fault injection. */
#ifndef TEST_FAULT_H
#define TEST_FAULT_H
#include
/* Information about a fault that should occur in a component. Driven by
 * FaultTick(): the fault fires once countdown reaches zero, for n ticks. */
struct Fault
{
    int countdown; /* Trigger the fault when this counter gets to zero. */
    int n; /* Repeat the fault this many times. Default is -1 (forever). */
    bool paused; /* Pause fault triggering. */
};
/* Initialize a fault. */
void FaultInit(struct Fault *f);
/* Advance the counters of the fault. Return true if the fault should be
* triggered, false otherwise. */
bool FaultTick(struct Fault *f);
/* Configure the fault with the given values. */
void FaultConfig(struct Fault *f, int delay, int repeat);
/* Pause triggering configured faults. */
void FaultPause(struct Fault *f);
/* Resume triggering configured faults. */
void FaultResume(struct Fault *f);
#endif /* TEST_FAULT_H */
raft-0.22.1/test/lib/fs.sh 0000775 0000000 0000000 00000004453 14601504142 0015227 0 ustar 00root root 0000000 0000000 #!/bin/sh -e
# Setup loopback disk devices to test the raft I/O implementation against
# various file systems.
usage() {
echo "usage: $0 setup|teardown [types]"
}
if [ "${#}" -lt 1 ]; then
usage
exit 1
fi
cmd="${1}"
shift
# tmpfs needs no privileges, so it is always available.
types="tmpfs"
# Check if loop devices are available, we might be running inside an
# unprivileged container
if sudo losetup -f > /dev/null 2>&1; then
types="$types ext4"
if [ "$(sudo which mkfs.btrfs)" != "" ]; then
types="$types btrfs"
fi
if [ "$(sudo which mkfs.xfs)" != "" ]; then
types="$types xfs"
fi
if [ "$(sudo which zfs)" != "" ]; then
types="$types zfs"
fi
# Any remaining command line arguments override the detected list.
if [ "${#}" -gt 0 ]; then
types="${@}"
fi
fi
# "detect": print the RAFT_TMP_<FS>=<mountpoint> environment variables
# consumed by the Dir*SetUp() test helpers.
if [ "${cmd}" = "detect" ]; then
vars=""
for type in $types; do
vars="${vars}RAFT_TMP_$(echo ${type} | tr [a-z] [A-Z])=./tmp/${type} "
done
echo $vars
exit 0
fi
# "setup": create one mount per file system type under ./tmp, each backed by
# a loopback device (except tmpfs, mounted directly).
if [ "${cmd}" = "setup" ]; then
mkdir ./tmp
for type in $types; do
echo -n "Creating $type loop device mount..."
# Create the fs mount point
mkdir "./tmp/${type}"
if [ "$type" = "tmpfs" ]; then
# For tmpfs we don't need a loopback disk device.
sudo mount -t tmpfs -o size=32m tmpfs ./tmp/tmpfs
else
# Create a loopback disk device
dd if=/dev/zero of="./tmp/.${type}" bs=4096 count=86016 > /dev/null 2>&1
loop=$(sudo losetup -f)
sudo losetup "${loop}" "./tmp/.${type}"
# Initialize the file system
if [ "$type" = "zfs" ]; then
sudo zpool create raft "${loop}"
sudo zfs create -o mountpoint=$(pwd)/tmp/zfs raft/zfs
else
sudo mkfs.${type} "${loop}" > /dev/null 2>&1
sudo mount "${loop}" "./tmp/${type}"
fi
fi
sudo chown $USER "./tmp/${type}"
echo " done"
done
exit 0
fi
# "teardown": unmount everything, destroy zfs pools, detach loop devices and
# remove the backing files and mount points created by "setup".
if [ "${cmd}" = "teardown" ]; then
for type in $types; do
echo -n "Deleting $type loop device mount..."
sudo umount "./tmp/${type}"
rm -rf "./tmp/${type}"
if [ "$type" != "tmpfs" ]; then
# For zfs we need to destroy the pool
if [ "$type" = "zfs" ]; then
sudo zpool destroy raft
fi
# For regular file systems, remove the loopback disk device.
loop=$(sudo losetup -a | grep ".${type}" | cut -f 1 -d :)
sudo losetup -d "${loop}"
rm "./tmp/.${type}"
fi
echo " done"
done
rmdir ./tmp
exit 0
fi
usage
exit 1
raft-0.22.1/test/lib/fsm.c 0000664 0000000 0000000 00000012276 14601504142 0015213 0 ustar 00root root 0000000 0000000 #include "fsm.h"
#include "../../src/byte.h"
#include "munit.h"
/* In-memory implementation of the raft_fsm interface. The state is just two
 * integer registers, x and y. */
struct fsm
{
    int x; /* First register. */
    int y; /* Second register. */
    int lock; /* Set to 1 while a v2 snapshot is in flight. */
    void *data; /* Scratch allocation used to verify snapshot finalization. */
};
/* Command codes */
enum { SET_X = 1, SET_Y, ADD_X, ADD_Y };
/* Apply a command to the FSM.
 *
 * The buffer must be exactly 16 bytes: a 64-bit command code followed by a
 * 64-bit value. Return -1 on a malformed buffer or unknown command, 0 on
 * success. No result value is produced (*result is set to NULL). */
static int fsmApply(struct raft_fsm *fsm,
                    const struct raft_buffer *buf,
                    void **result)
{
    struct fsm *f = fsm->data;
    const uint8_t *cursor = buf->base;
    unsigned command;
    int value;
    if (buf->len != 16) {
        return -1;
    }
    command = (unsigned)byteGet64(&cursor);
    value = (int)byteGet64(&cursor);
    switch (command) {
        case SET_X:
            f->x = value;
            break;
        case SET_Y:
            f->y = value;
            break;
        case ADD_X:
            f->x += value;
            break;
        case ADD_Y:
            f->y += value;
            break;
        default:
            return -1;
    }
    *result = NULL;
    return 0;
}
/* Restore the FSM state from a snapshot buffer holding exactly two 64-bit
 * words (the values of x and y, in that order).
 *
 * Takes ownership of the buffer data and releases it. Always returns 0. */
static int fsmRestore(struct raft_fsm *fsm, struct raft_buffer *buf)
{
    struct fsm *f = fsm->data;
    const uint8_t *cursor = buf->base;
    munit_assert_ullong(buf->len, ==, sizeof(uint64_t) * 2);
    /* Cast explicitly, for consistency with fsmApply() and to keep the
     * -Wconversion build (see .dir-locals.el) warning-free. */
    f->x = (int)byteGet64(&cursor);
    f->y = (int)byteGet64(&cursor);
    raft_free(buf->base);
    return 0;
}
/* Encode the given @x and @y values into a freshly allocated single-buffer
 * snapshot (two 64-bit words).
 *
 * On success set *bufs/*n_bufs (ownership passes to the caller) and return
 * 0; return RAFT_NOMEM on allocation failure.
 *
 * NOTE(review): on the second allocation failing, the first allocation is
 * not released -- acceptable in fault-injection tests, where the harness
 * tolerates leaks on the error path. */
static int fsmEncodeSnapshot(int x,
                             int y,
                             struct raft_buffer *bufs[],
                             unsigned *n_bufs)
{
    struct raft_buffer *buf;
    uint8_t *cursor;
    *n_bufs = 1;
    *bufs = raft_malloc(sizeof **bufs);
    if (*bufs == NULL) {
        return RAFT_NOMEM;
    }
    buf = &(*bufs)[0];
    buf->len = sizeof(uint64_t) * 2;
    buf->base = raft_malloc(buf->len);
    if (buf->base == NULL) {
        return RAFT_NOMEM;
    }
    cursor = (*bufs)[0].base;
    bytePut64(&cursor, x);
    bytePut64(&cursor, y);
    return 0;
}
/* Snapshot callback for fsm->version 1: just encode the current x/y state,
 * with no locking or finalization step. */
static int fsmSnapshot_v1(struct raft_fsm *fsm,
                          struct raft_buffer *bufs[],
                          unsigned *n_bufs)
{
    struct fsm *f = fsm->data;
    return fsmEncodeSnapshot(f->x, f->y, bufs, n_bufs);
}
/* Snapshot callback for fsm->version >= 2, paired with fsmSnapshotFinalize.
 *
 * Asserts no other snapshot is in flight, takes the lock and allocates a
 * small scratch buffer whose release in the finalize step proves proper
 * cleanup. */
static int fsmSnapshot_v2(struct raft_fsm *fsm,
                          struct raft_buffer *bufs[],
                          unsigned *n_bufs)
{
    struct fsm *f = fsm->data;
    munit_assert_int(f->lock, ==, 0);
    f->lock = 1;
    f->data = raft_malloc(8); /* Detect proper cleanup in finalize */
    munit_assert_ptr_not_null(f->data);
    return fsmEncodeSnapshot(f->x, f->y, bufs, n_bufs);
}
/* Finalize a snapshot taken with fsmSnapshot_v2(): release the snapshot
 * buffers (if any), clear them, release the scratch allocation made when
 * the snapshot started and drop the lock. Asserts that a snapshot was
 * actually in progress. Always returns 0. */
static int fsmSnapshotFinalize(struct raft_fsm *fsm,
                               struct raft_buffer *bufs[],
                               unsigned *n_bufs)
{
    /* Note: bufs and n_bufs are genuinely used below, so no unused-parameter
     * suppression is needed. */
    struct fsm *f = fsm->data;
    if (*bufs != NULL) {
        for (unsigned i = 0; i < *n_bufs; ++i) {
            raft_free((*bufs)[i].base);
        }
        raft_free(*bufs);
    }
    *bufs = NULL;
    *n_bufs = 0;
    munit_assert_int(f->lock, ==, 1);
    f->lock = 0;
    munit_assert_ptr_not_null(f->data);
    raft_free(f->data);
    f->data = NULL;
    return 0;
}
void FsmInit(struct raft_fsm *fsm, int version)
{
struct fsm *f = munit_malloc(sizeof *fsm);
memset(fsm, 'x', sizeof(*fsm)); /* Fill with garbage */
f->x = 0;
f->y = 0;
f->lock = 0;
f->data = NULL;
fsm->version = version;
fsm->data = f;
fsm->apply = fsmApply;
fsm->snapshot = fsmSnapshot_v1;
fsm->restore = fsmRestore;
if (version > 1) {
fsm->snapshot = fsmSnapshot_v2;
fsm->snapshot_finalize = fsmSnapshotFinalize;
}
}
/* Release the private state allocated by FsmInit(). Does not touch the
 * raft_fsm struct itself, which is owned by the caller. */
void FsmClose(struct raft_fsm *fsm)
{
    struct fsm *f = fsm->data;
    free(f);
}
/* Encode a 16-byte command buffer: a 64-bit command code followed by a
 * 64-bit value (the format decoded by fsmApply()). Aborts the test on
 * allocation failure. */
static void fsmEncodeCommand(int command, int value, struct raft_buffer *buf)
{
    uint8_t *cursor;
    buf->base = raft_malloc(16);
    buf->len = 16;
    munit_assert_ptr_not_null(buf->base);
    cursor = buf->base;
    bytePut64(&cursor, command);
    bytePut64(&cursor, value);
}

/* Encode a command to set x to the given value. */
void FsmEncodeSetX(const int value, struct raft_buffer *buf)
{
    fsmEncodeCommand(SET_X, value, buf);
}

/* Encode a command to add the given value to x. */
void FsmEncodeAddX(const int value, struct raft_buffer *buf)
{
    fsmEncodeCommand(ADD_X, value, buf);
}

/* Encode a command to set y to the given value. */
void FsmEncodeSetY(const int value, struct raft_buffer *buf)
{
    fsmEncodeCommand(SET_Y, value, buf);
}

/* Encode a command to add the given value to y. */
void FsmEncodeAddY(const int value, struct raft_buffer *buf)
{
    fsmEncodeCommand(ADD_Y, value, buf);
}
/* Public wrapper around fsmEncodeSnapshot() that asserts success instead of
 * returning an error code. */
void FsmEncodeSnapshot(int x,
                       int y,
                       struct raft_buffer *bufs[],
                       unsigned *n_bufs)
{
    int rc;
    rc = fsmEncodeSnapshot(x, y, bufs, n_bufs);
    munit_assert_int(rc, ==, 0);
}
/* Return the current value of the x register. */
int FsmGetX(struct raft_fsm *fsm)
{
    struct fsm *f = fsm->data;
    return f->x;
}

/* Return the current value of the y register. */
int FsmGetY(struct raft_fsm *fsm)
{
    struct fsm *f = fsm->data;
    return f->y;
}
raft-0.22.1/test/lib/fsm.h 0000664 0000000 0000000 00000002275 14601504142 0015216 0 ustar 00root root 0000000 0000000 /* Test implementation of the raft_fsm interface, with fault injection.
*
* The test FSM supports only two commands: setting x and setting y. */
#ifndef TEST_FSM_H
#define TEST_FSM_H
#include "../../include/raft.h"
void FsmInit(struct raft_fsm *fsm, int version);
/* Same as FsmInit but with asynchronous snapshots */
void FsmInitAsync(struct raft_fsm *fsm, int version);
void FsmClose(struct raft_fsm *fsm);
/* Encode a command to set x to the given value. */
void FsmEncodeSetX(int value, struct raft_buffer *buf);
/* Encode a command to add the given value to x. */
void FsmEncodeAddX(int value, struct raft_buffer *buf);
/* Encode a command to set y to the given value. */
void FsmEncodeSetY(int value, struct raft_buffer *buf);
/* Encode a command to add the given value to y. */
void FsmEncodeAddY(int value, struct raft_buffer *buf);
/* Encode a snapshot of an FSM with the given values for x and y. */
void FsmEncodeSnapshot(int x,
int y,
struct raft_buffer *bufs[],
unsigned *n_bufs);
/* Return the current value of x or y. */
int FsmGetX(struct raft_fsm *fsm);
int FsmGetY(struct raft_fsm *fsm);
#endif /* TEST_FSM_H */
raft-0.22.1/test/lib/heap.c 0000664 0000000 0000000 00000006053 14601504142 0015337 0 ustar 00root root 0000000 0000000 #include "heap.h"
#include
#include "fault.h"
#include "munit.h"
/* Private state of the fault-injecting test heap. */
struct heap
{
    int n; /* Number of outstanding allocations. */
    size_t alignment; /* Value of last aligned alloc (single slot only). */
    struct Fault fault; /* Fault trigger. */
};
/* Initialize @h with no outstanding allocations and an inert fault. */
static void heapInit(struct heap *h)
{
    h->n = 0;
    h->alignment = 0;
    FaultInit(&h->fault);
}
/* malloc() hook: return NULL when the configured fault fires, otherwise
 * count the allocation and delegate to munit_malloc(). */
static void *heapMalloc(void *data, size_t size)
{
    struct heap *h = data;
    if (FaultTick(&h->fault)) {
        return NULL;
    }
    h->n++;
    return munit_malloc(size);
}
/* free() hook: decrement the outstanding-allocation count and release the
 * memory. Never fault-injected. */
static void heapFree(void *data, void *ptr)
{
    struct heap *h = data;
    h->n--;
    free(ptr);
}
/* calloc() hook: return NULL when the configured fault fires, otherwise
 * count the allocation and delegate to munit_calloc(). */
static void *heapCalloc(void *data, size_t nmemb, size_t size)
{
    struct heap *h = data;
    if (FaultTick(&h->fault)) {
        return NULL;
    }
    h->n++;
    return munit_calloc(nmemb, size);
}
/* realloc() hook with fault injection and allocation accounting.
 *
 * NOTE(review): the size == 0 assertion relies on glibc's realloc(ptr, 0)
 * returning NULL; also, in that case the freed allocation is not subtracted
 * from h->n -- confirm whether callers ever pass size 0. */
static void *heapRealloc(void *data, void *ptr, size_t size)
{
    struct heap *h = data;
    if (FaultTick(&h->fault)) {
        return NULL;
    }
    /* Increase the number of allocation only if ptr is NULL, since otherwise
     * realloc is a malloc plus a free. */
    if (ptr == NULL) {
        h->n++;
    }
    ptr = realloc(ptr, size);
    if (size == 0) {
        munit_assert_ptr_null(ptr);
    } else {
        munit_assert_ptr_not_null(ptr);
    }
    return ptr;
}
/* aligned_alloc() hook: fault-injected, counted, and remembers only the
 * most recent alignment so heapAlignedFree() can check it (i.e. a single
 * distinct alignment is supported at a time). */
static void *heapAlignedAlloc(void *data, size_t alignment, size_t size)
{
    struct heap *h = data;
    void *p;
    if (FaultTick(&h->fault)) {
        return NULL;
    }
    h->n++;
    p = aligned_alloc(alignment, size);
    munit_assert_ptr_not_null(p);
    h->alignment = alignment;
    return p;
}
/* aligned free hook: assert that the caller passes back the alignment used
 * by the last heapAlignedAlloc(), then release via heapFree(). */
static void heapAlignedFree(void *data, size_t alignment, void *ptr)
{
    struct heap *h = data;
    munit_assert_int(alignment, ==, h->alignment);
    heapFree(data, ptr);
}
/* Look up the munit parameter with the given @name and convert it to an
 * integer, defaulting to 0 when the parameter is not set. */
static int getIntParam(const MunitParameter params[], const char *name)
{
    const char *text = munit_parameters_get(params, name);
    if (text == NULL) {
        return 0;
    }
    return atoi(text);
}
/* Install the fault-injecting test heap as the global raft allocator.
 *
 * The fault delay/repeat come from the TEST_HEAP_FAULT_DELAY and
 * TEST_HEAP_FAULT_REPEAT munit parameters. Faults start out paused; they
 * only begin firing after HeapFaultEnable() is called. */
void HeapSetUp(const MunitParameter params[], struct raft_heap *h)
{
    struct heap *heap = munit_malloc(sizeof *heap);
    int delay = getIntParam(params, TEST_HEAP_FAULT_DELAY);
    int repeat = getIntParam(params, TEST_HEAP_FAULT_REPEAT);
    munit_assert_ptr_not_null(h);
    heapInit(heap);
    FaultConfig(&heap->fault, delay, repeat);
    h->data = heap;
    h->malloc = heapMalloc;
    h->free = heapFree;
    h->calloc = heapCalloc;
    h->realloc = heapRealloc;
    h->aligned_alloc = heapAlignedAlloc;
    h->aligned_free = heapAlignedFree;
    raft_heap_set(h);
    /* Keep faults dormant until the test explicitly enables them. */
    FaultPause(&heap->fault);
}
/* Restore the default raft allocator and release the test heap state.
 *
 * NOTE(review): the leak check on heap->n is commented out, so outstanding
 * allocations are currently tolerated silently -- presumably disabled on
 * purpose; confirm before re-enabling. */
void HeapTearDown(struct raft_heap *h)
{
    struct heap *heap = h->data;
    if (heap->n != 0) {
        // munit_errorf("memory leak: %d outstanding allocations", heap->n);
    }
    free(heap);
    raft_heap_set_default();
}
/* Re-configure the heap fault with the given @delay and @repeat values. */
void HeapFaultConfig(struct raft_heap *h, int delay, int repeat)
{
    struct heap *heap = h->data;
    FaultConfig(&heap->fault, delay, repeat);
}

/* Start triggering the faults configured at setup time (they are paused by
 * HeapSetUp()). */
void HeapFaultEnable(struct raft_heap *h)
{
    struct heap *heap = h->data;
    FaultResume(&heap->fault);
}
raft-0.22.1/test/lib/heap.h 0000664 0000000 0000000 00000002317 14601504142 0015343 0 ustar 00root root 0000000 0000000 /* Add support for fault injection and leak detection to stdlib's malloc()
* family. */
#ifndef TEST_HEAP_H
#define TEST_HEAP_H
#include "../../include/raft.h"
#include "munit.h"
/* Munit parameter defining after how many API calls the test raft_heap
* implementation should start failing and return errors. The default is -1,
* meaning that no failure will ever occur. */
#define TEST_HEAP_FAULT_DELAY "heap-fault-delay"
/* Munit parameter defining how many consecutive times API calls against the
* test raft_heap implementation should keep failing after they started
* failing. This parameter has an effect only if 'store-fail-delay' is 0 or
* greater. The default is 1, and -1 means "keep failing forever". */
#define TEST_HEAP_FAULT_REPEAT "heap-fault-repeat"
/* Macro helpers. */
#define FIXTURE_HEAP struct raft_heap heap
#define SET_UP_HEAP HeapSetUp(params, &f->heap)
#define TEAR_DOWN_HEAP HeapTearDown(&f->heap)
#define HEAP_FAULT_ENABLE HeapFaultEnable(&f->heap)
void HeapSetUp(const MunitParameter params[], struct raft_heap *h);
void HeapTearDown(struct raft_heap *h);
void HeapFaultConfig(struct raft_heap *h, int delay, int repeat);
void HeapFaultEnable(struct raft_heap *h);
#endif /* TEST_HEAP_H */
raft-0.22.1/test/lib/legacy.c 0000664 0000000 0000000 00000003243 14601504142 0015664 0 ustar 00root root 0000000 0000000 #include "legacy.h"
/* Randomize one timing aspect (@what) of the @i'th server in the fixture:
 * the election timeout, the disk latency (currently disabled) or the
 * network latency. */
static void randomize(struct raft_fixture *f, unsigned i, int what)
{
    struct raft *raft = raft_fixture_get(f, i);
    switch (what) {
        case RAFT_FIXTURE_TICK:
            /* TODO: provide an API to inspect how much time has elapsed since
             * the last election timer reset */
            if (raft->election_timer_start == raft->io->time(raft->io)) {
                raft_fixture_set_randomized_election_timeout(
                    f, i,
                    munit_rand_int_range(raft->election_timeout,
                                         raft->election_timeout * 2));
            }
            break;
        case RAFT_FIXTURE_DISK:
            /* XXX: Don't randomize disk latency like this, because we don't
             * want io->append() requests to complete out of order. */
            /*
            raft_fixture_set_disk_latency(f, i, munit_rand_int_range(10, 25));
            */
            break;
        case RAFT_FIXTURE_NETWORK:
            raft_fixture_set_network_latency(f, i,
                                             munit_rand_int_range(25, 50));
            break;
        default:
            munit_assert(0);
            break;
    }
}
/* Seed initial random timings (tick, disk, network) for every server in the
 * fixture. */
void cluster_randomize_init(struct raft_fixture *f)
{
    unsigned i;
    for (i = 0; i < raft_fixture_n(f); i++) {
        randomize(f, i, RAFT_FIXTURE_TICK);
        randomize(f, i, RAFT_FIXTURE_DISK);
        randomize(f, i, RAFT_FIXTURE_NETWORK);
    }
}
/* Fixture event hook: re-randomize the timing aspect matching the event's
 * type, for the server the event belongs to. */
void cluster_randomize(struct raft_fixture *f, struct raft_fixture_event *event)
{
    unsigned index = raft_fixture_event_server_index(event);
    int type = raft_fixture_event_type(event);
    randomize(f, index, type);
}
raft-0.22.1/test/lib/legacy.h 0000664 0000000 0000000 00000052011 14601504142 0015666 0 ustar 00root root 0000000 0000000 /* Setup and drive a test raft cluster. */
#ifndef TEST_LEGACY_H
#define TEST_LEGACY_H
#include
#include "../../include/raft.h"
#include "../../include/raft/fixture.h"
#include "fsm.h"
#include "heap.h"
#include "macros.h"
#include "munit.h"
#include "snapshot.h"
#define FIXTURE_CLUSTER \
FIXTURE_HEAP; \
struct raft_fsm fsms[RAFT_FIXTURE_MAX_SERVERS]; \
struct raft_fixture cluster
/* N is the default number of servers, but can be tweaked with the cluster-n
* parameter. */
#define SETUP_CLUSTER(DEFAULT_N) \
SET_UP_HEAP; \
do { \
unsigned _n = DEFAULT_N; \
bool _pre_vote = false; \
int _fsm_version = 2; \
unsigned _hb = 0; \
unsigned _i; \
int _rv; \
if (munit_parameters_get(params, CLUSTER_N_PARAM) != NULL) { \
_n = atoi(munit_parameters_get(params, CLUSTER_N_PARAM)); \
} \
if (munit_parameters_get(params, CLUSTER_PRE_VOTE_PARAM) != NULL) { \
_pre_vote = \
atoi(munit_parameters_get(params, CLUSTER_PRE_VOTE_PARAM)); \
} \
if (munit_parameters_get(params, CLUSTER_HEARTBEAT_PARAM) != NULL) { \
_hb = atoi(munit_parameters_get(params, CLUSTER_HEARTBEAT_PARAM)); \
} \
if (munit_parameters_get(params, CLUSTER_FSM_VERSION_PARAM) != NULL) { \
_fsm_version = \
atoi(munit_parameters_get(params, CLUSTER_FSM_VERSION_PARAM)); \
} \
munit_assert_int(_n, >, 0); \
_rv = raft_fixture_init(&f->cluster); \
munit_assert_int(_rv, ==, 0); \
for (_i = 0; _i < _n; _i++) { \
FsmInit(&f->fsms[_i], _fsm_version); \
_rv = raft_fixture_grow(&f->cluster, &f->fsms[_i]); \
munit_assert_int(_rv, ==, 0); \
} \
for (_i = 0; _i < _n; _i++) { \
raft_set_pre_vote(raft_fixture_get(&f->cluster, _i), _pre_vote); \
if (_hb) { \
raft_set_heartbeat_timeout(raft_fixture_get(&f->cluster, _i), \
_hb); \
} \
} \
} while (0)
#define TEAR_DOWN_CLUSTER \
do { \
unsigned i; \
raft_fixture_close(&f->cluster); \
for (i = 0; i < CLUSTER_N; i++) { \
FsmClose(&f->fsms[i]); \
} \
} while (0); \
TEAR_DOWN_HEAP;
/* Munit parameter for setting the number of servers */
#define CLUSTER_N_PARAM "cluster-n"
/* Munit parameter for setting the number of voting servers */
#define CLUSTER_N_VOTING_PARAM "cluster-n-voting"
/* Munit parameter for enabling pre-vote */
#define CLUSTER_PRE_VOTE_PARAM "cluster-pre-vote"
/* Munit parameter for setting HeartBeat timeout */
#define CLUSTER_HEARTBEAT_PARAM "cluster-heartbeat"
/* Munit parameter for setting fsm version */
#define CLUSTER_FSM_VERSION_PARAM "fsm-version"
/* Get the number of servers in the cluster. */
#define CLUSTER_N raft_fixture_n(&f->cluster)
/* Get the cluster time. */
#define CLUSTER_TIME raft_fixture_time(&f->cluster)
/* Index of the current leader, or CLUSTER_N if there's no leader. */
#define CLUSTER_LEADER raft_fixture_leader_index(&f->cluster)
/* True if the cluster has a leader.
 *
 * Parenthesized so that the macro expands safely inside larger expressions
 * (e.g. `!CLUSTER_HAS_LEADER` would otherwise parse as
 * `(!CLUSTER_LEADER) < CLUSTER_N` due to operator precedence). */
#define CLUSTER_HAS_LEADER (CLUSTER_LEADER < CLUSTER_N)
/* Get the struct raft object of the I'th server. */
#define CLUSTER_RAFT(I) raft_fixture_get(&f->cluster, I)
/* Get the state of the I'th server. */
#define CLUSTER_STATE(I) raft_state(raft_fixture_get(&f->cluster, I))
/* Get the current term of the I'th server. */
#define CLUSTER_TERM(I) raft_fixture_get(&f->cluster, I)->current_term
/* Get the struct fsm object of the I'th server. */
#define CLUSTER_FSM(I) &f->fsms[I]
/* Return the last applied index on the I'th server. */
#define CLUSTER_LAST_APPLIED(I) \
raft_last_applied(raft_fixture_get(&f->cluster, I))
/* Return the ID of the server the I'th server has voted for. */
#define CLUSTER_VOTED_FOR(I) raft_fixture_voted_for(&f->cluster, I)
/* Return a description of the last error occurred on the I'th server. */
#define CLUSTER_ERRMSG(I) raft_errmsg(CLUSTER_RAFT(I))
/* Populate the given configuration with all servers in the fixture. All servers
* will be voting. */
#define CLUSTER_CONFIGURATION(CONF) \
{ \
int rv_; \
rv_ = raft_fixture_configuration(&f->cluster, CLUSTER_N, CONF); \
munit_assert_int(rv_, ==, 0); \
}
/* Bootstrap all servers in the cluster. All servers will be voting, unless the
* cluster-n-voting parameter is used. */
#define CLUSTER_BOOTSTRAP \
{ \
unsigned n_ = CLUSTER_N; \
int rv_; \
struct raft_configuration configuration; \
if (munit_parameters_get(params, CLUSTER_N_VOTING_PARAM) != NULL) { \
n_ = atoi(munit_parameters_get(params, CLUSTER_N_VOTING_PARAM)); \
} \
rv_ = raft_fixture_configuration(&f->cluster, n_, &configuration); \
munit_assert_int(rv_, ==, 0); \
rv_ = raft_fixture_bootstrap(&f->cluster, &configuration); \
munit_assert_int(rv_, ==, 0); \
raft_configuration_close(&configuration); \
}
/* Bootstrap all servers in the cluster. Only the first N servers will be
* voting. */
#define CLUSTER_BOOTSTRAP_N_VOTING(N) \
{ \
int rv_; \
struct raft_configuration configuration_; \
rv_ = raft_fixture_configuration(&f->cluster, N, &configuration_); \
munit_assert_int(rv_, ==, 0); \
rv_ = raft_fixture_bootstrap(&f->cluster, &configuration_); \
munit_assert_int(rv_, ==, 0); \
raft_configuration_close(&configuration_); \
}
/* Start all servers in the test cluster. */
#define CLUSTER_START() \
{ \
int rc; \
rc = raft_fixture_start(&f->cluster); \
munit_assert_int(rc, ==, 0); \
}
/* Step the cluster. */
#define CLUSTER_STEP raft_fixture_step(&f->cluster);
/* Step the cluster N times. */
#define CLUSTER_STEP_N(N) \
{ \
unsigned i_; \
for (i_ = 0; i_ < N; i_++) { \
raft_fixture_step(&f->cluster); \
} \
}
/* Step until the given function becomes true. */
#define CLUSTER_STEP_UNTIL(FUNC, ARG, MSECS) \
{ \
bool done_; \
done_ = raft_fixture_step_until(&f->cluster, FUNC, ARG, MSECS); \
munit_assert_true(done_); \
}
/* Step the cluster until the given number of milliseconds have elapsed. */
#define CLUSTER_STEP_UNTIL_ELAPSED(MSECS) \
raft_fixture_step_until_elapsed(&f->cluster, MSECS)
/* Step the cluster until a leader is elected or #MAX_MSECS have elapsed. */
#define CLUSTER_STEP_UNTIL_HAS_LEADER(MAX_MSECS) \
{ \
bool done; \
done = raft_fixture_step_until_has_leader(&f->cluster, MAX_MSECS); \
munit_assert_true(done); \
munit_assert_true(CLUSTER_HAS_LEADER); \
}
/* Step the cluster until there's no leader or #MAX_MSECS have elapsed. */
#define CLUSTER_STEP_UNTIL_HAS_NO_LEADER(MAX_MSECS) \
{ \
bool done; \
done = raft_fixture_step_until_has_no_leader(&f->cluster, MAX_MSECS); \
munit_assert_true(done); \
munit_assert_false(CLUSTER_HAS_LEADER); \
}
/* Step the cluster until the given index was applied by the given server (or
* all if N) or #MAX_MSECS have elapsed. */
#define CLUSTER_STEP_UNTIL_APPLIED(I, INDEX, MAX_MSECS) \
{ \
bool done; \
done = \
raft_fixture_step_until_applied(&f->cluster, I, INDEX, MAX_MSECS); \
munit_assert_true(done); \
}
/* Step the cluster until the state of the server with the given index matches
* the given value, or #MAX_MSECS have elapsed. */
#define CLUSTER_STEP_UNTIL_STATE_IS(I, STATE, MAX_MSECS) \
{ \
bool done; \
done = raft_fixture_step_until_state_is(&f->cluster, I, STATE, \
MAX_MSECS); \
munit_assert_true(done); \
}
/* Step the cluster until the term of the server with the given index matches
* the given value, or #MAX_MSECS have elapsed. */
#define CLUSTER_STEP_UNTIL_TERM_IS(I, TERM, MAX_MSECS) \
{ \
bool done; \
done = \
raft_fixture_step_until_term_is(&f->cluster, I, TERM, MAX_MSECS); \
munit_assert_true(done); \
}
/* Step the cluster until server I has voted for server J, or #MAX_MSECS have
* elapsed. */
#define CLUSTER_STEP_UNTIL_VOTED_FOR(I, J, MAX_MSECS) \
{ \
bool done; \
done = \
raft_fixture_step_until_voted_for(&f->cluster, I, J, MAX_MSECS); \
munit_assert_true(done); \
}
/* Step the cluster until all messages from server I to server J have been
* delivered, or #MAX_MSECS elapse. */
#define CLUSTER_STEP_UNTIL_DELIVERED(I, J, MAX_MSECS) \
{ \
bool done; \
done = \
raft_fixture_step_until_delivered(&f->cluster, I, J, MAX_MSECS); \
munit_assert_true(done); \
}
/* Request to apply an FSM command to add the given value to x. */
#define CLUSTER_APPLY_ADD_X(I, REQ, VALUE, CB) \
{ \
struct raft_buffer buf_; \
struct raft *raft_; \
int rv_; \
FsmEncodeAddX(VALUE, &buf_); \
raft_ = raft_fixture_get(&f->cluster, I); \
rv_ = raft_apply(raft_, REQ, &buf_, 1, CB); \
munit_assert_int(rv_, ==, 0); \
}
/* Kill the I'th server. */
#define CLUSTER_KILL(I) raft_fixture_kill(&f->cluster, I);
/* Revive the I'th server */
#define CLUSTER_REVIVE(I) raft_fixture_revive(&f->cluster, I);
/* Kill the leader. */
#define CLUSTER_KILL_LEADER CLUSTER_KILL(CLUSTER_LEADER)
/* Kill a majority of servers, except the leader (if there is one). */
#define CLUSTER_KILL_MAJORITY \
{ \
size_t i2; \
size_t n; \
for (i2 = 0, n = 0; n < (CLUSTER_N / 2) + 1; i2++) { \
if (i2 == CLUSTER_LEADER) { \
continue; \
} \
CLUSTER_KILL(i2) \
n++; \
} \
}
/* Grow the cluster adding one server. */
#define CLUSTER_GROW \
{ \
int rv_; \
FsmInit(&f->fsms[CLUSTER_N], 2); \
rv_ = raft_fixture_grow(&f->cluster, &f->fsms[CLUSTER_N]); \
munit_assert_int(rv_, ==, 0); \
}
/* Add a new pristine server to the cluster, connected to all others. Then
* submit a request to add it to the configuration as an idle server. */
#define CLUSTER_ADD(REQ) \
{ \
int rc; \
struct raft *new_raft; \
CLUSTER_GROW; \
rc = raft_start(CLUSTER_RAFT(CLUSTER_N - 1)); \
munit_assert_int(rc, ==, 0); \
new_raft = CLUSTER_RAFT(CLUSTER_N - 1); \
rc = raft_add(CLUSTER_RAFT(CLUSTER_LEADER), REQ, new_raft->id, \
new_raft->address, NULL); \
munit_assert_int(rc, ==, 0); \
}
/* Assign the given role to the server that was added last. */
#define CLUSTER_ASSIGN(REQ, ROLE) \
do { \
unsigned _id; \
int _rv; \
_id = CLUSTER_N; /* Last server that was added. */ \
_rv = raft_assign(CLUSTER_RAFT(CLUSTER_LEADER), REQ, _id, ROLE, NULL); \
munit_assert_int(_rv, ==, 0); \
} while (0)
/* Ensure that the cluster can make progress from the current state.
*
* - If no leader is present, wait for one to be elected.
* - Submit a request to apply a new FSM command and wait for it to complete. */
#define CLUSTER_MAKE_PROGRESS \
{ \
struct raft_apply *req_ = munit_malloc(sizeof *req_); \
if (!(CLUSTER_HAS_LEADER)) { \
CLUSTER_STEP_UNTIL_HAS_LEADER(10000); \
} \
CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req_, 1, NULL); \
CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, req_->index, 3000); \
free(req_); \
}
/* Elect the I'th server. */
#define CLUSTER_ELECT(I) raft_fixture_elect(&f->cluster, I)
/* Start to elect the I'th server. */
#define CLUSTER_START_ELECT(I) raft_fixture_start_elect(&f->cluster, I)
/* Depose the current leader */
#define CLUSTER_DEPOSE raft_fixture_depose(&f->cluster)
/* Disconnect I from J. */
#define CLUSTER_DISCONNECT(I, J) raft_fixture_disconnect(&f->cluster, I, J)
/* Reconnect I to J. */
#define CLUSTER_RECONNECT(I, J) raft_fixture_reconnect(&f->cluster, I, J)
/* Saturate the connection from I to J. */
#define CLUSTER_SATURATE(I, J) raft_fixture_saturate(&f->cluster, I, J)
/* Saturate the connection from I to J and from J to I, in both directions. */
#define CLUSTER_SATURATE_BOTHWAYS(I, J) \
CLUSTER_SATURATE(I, J); \
CLUSTER_SATURATE(J, I)
/* Desaturate the connection between I and J, making messages flow again. */
#define CLUSTER_DESATURATE(I, J) raft_fixture_desaturate(&f->cluster, I, J)
/* Desaturate the connection from I to J and from J to I, in both directions. */
#define CLUSTER_DESATURATE_BOTHWAYS(I, J) \
CLUSTER_DESATURATE(I, J); \
CLUSTER_DESATURATE(J, I)
/* Set the network latency of outgoing messages of server I. */
#define CLUSTER_SET_NETWORK_LATENCY(I, MSECS) \
raft_fixture_set_network_latency(&f->cluster, I, MSECS)
/* Set the disk I/O latency of server I. */
#define CLUSTER_SET_DISK_LATENCY(I, MSECS) \
raft_fixture_set_disk_latency(&f->cluster, I, MSECS)
/* Set the term persisted on the I'th server. This must be called before
* starting the cluster. */
#define CLUSTER_SET_TERM(I, TERM) raft_fixture_set_term(&f->cluster, I, TERM)
/* Set the snapshot persisted on the I'th server. This must be called before
* starting the cluster. */
#define CLUSTER_SET_SNAPSHOT(I, LAST_INDEX, LAST_TERM, CONF_INDEX, X, Y) \
{ \
struct raft_configuration configuration_; \
struct raft_snapshot *snapshot_; \
CLUSTER_CONFIGURATION(&configuration_); \
CREATE_SNAPSHOT(snapshot_, LAST_INDEX, LAST_TERM, configuration_, \
CONF_INDEX, X, Y); \
raft_fixture_set_snapshot(&f->cluster, I, snapshot_); \
}
/* Add an entry to the ones persisted on the I'th server. This must be called
* before starting the cluster. */
#define CLUSTER_ADD_ENTRY(I, ENTRY) \
raft_fixture_add_entry(&f->cluster, I, ENTRY)
/* Make an I/O error occur on the I'th server after @DELAY operations. */
#define CLUSTER_IO_FAULT(I, DELAY, REPEAT) \
raft_fixture_io_fault(&f->cluster, I, DELAY, REPEAT)
/* Return the number of messages sent by the given server. */
#define CLUSTER_N_SEND(I, TYPE) raft_fixture_n_send(&f->cluster, I, TYPE)
/* Return the number of messages received by the given server. */
#define CLUSTER_N_RECV(I, TYPE) raft_fixture_n_recv(&f->cluster, I, TYPE)
/* Set a fixture hook that randomizes election timeouts, disk latency and
* network latency. */
#define CLUSTER_RANDOMIZE \
cluster_randomize_init(&f->cluster); \
raft_fixture_hook(&f->cluster, cluster_randomize)
void cluster_randomize_init(struct raft_fixture *f);
void cluster_randomize(struct raft_fixture *f,
struct raft_fixture_event *event);
#endif /* TEST_LEGACY_H */
raft-0.22.1/test/lib/loop.c 0000664 0000000 0000000 00000000230 14601504142 0015362 0 ustar 00root root 0000000 0000000 #include "loop.h"
/* uv_walk() callback used when closing the test loop fails: log the type of
 * each handle that is still open, to make the failure easier to debug. */
void test_loop_walk_cb(uv_handle_t *handle, void *arg)
{
    (void)arg; /* unused */
    munit_logf(MUNIT_LOG_INFO, "handle %d", handle->type);
}
raft-0.22.1/test/lib/loop.h 0000664 0000000 0000000 00000013603 14601504142 0015377 0 ustar 00root root 0000000 0000000 /* Add support for using the libuv loop in tests. */
#ifndef TEST_LOOP_H
#define TEST_LOOP_H
#include
#include "../../include/raft.h"
#include "munit.h"
/* Max n. of loop iterations ran by a single function call */
#define LOOP_MAX_RUN 20
#define FIXTURE_LOOP struct uv_loop_s loop
/* Older libuv versions might try to free() memory that was not allocated. */
#if HAVE_DECL_UV_FS_O_CREAT
#define LOOP_REPLACE_ALLOCATOR \
_rv = uv_replace_allocator(raft_malloc, raft_realloc, raft_calloc, \
raft_free); \
munit_assert_int(_rv, ==, 0)
#else
#define LOOP_REPLACE_ALLOCATOR
#endif
#define SETUP_LOOP \
{ \
int _rv; \
LOOP_REPLACE_ALLOCATOR; \
_rv = uv_loop_init(&f->loop); \
munit_assert_int(_rv, ==, 0); \
}
#define TEAR_DOWN_LOOP \
{ \
int rv_; \
int alive_ = uv_loop_alive(&f->loop); \
if (alive_ != 0) { \
LOOP_STOP; \
} \
rv_ = uv_loop_close(&f->loop); \
if (rv_ != 0) { \
uv_walk(&f->loop, test_loop_walk_cb, NULL); \
munit_errorf("uv_loop_close: %s (%d)", uv_strerror(rv_), rv_); \
} \
rv_ = uv_replace_allocator(malloc, realloc, calloc, free); \
munit_assert_int(rv_, ==, 0); \
}
/* Run the loop until there are no pending active handles or the given amount of
* iterations is reached. */
#define LOOP_RUN(N) \
{ \
unsigned i__; \
int rv__; \
for (i__ = 0; i__ < N; i__++) { \
rv__ = uv_run(&f->loop, UV_RUN_ONCE); \
if (rv__ < 0) { \
munit_errorf("uv_run: %s (%d)", uv_strerror(rv__), rv__); \
} \
if (rv__ == 0) { \
break; \
} \
} \
}
/* Run the loop until the value stored through the given boolean pointer is
* true.
*
* If the loop exhausts all active handles or if #LOOP_MAX_RUN is reached, the
* test fails. */
#define LOOP_RUN_UNTIL(CONDITION) \
{ \
unsigned __i; \
int __rv; \
for (__i = 0; __i < LOOP_MAX_RUN; __i++) { \
if (*(CONDITION)) { \
break; \
} \
__rv = uv_run(&f->loop, UV_RUN_ONCE); \
if (__rv < 0) { \
munit_errorf("uv_run: %s (%d)", uv_strerror(__rv), __rv); \
} \
if (__rv == 0) { \
if (*(CONDITION)) { \
break; \
} \
munit_errorf("uv_run: stopped after %u iterations", __i + 1); \
} \
} \
if (!*(CONDITION)) { \
munit_errorf("uv_run: condition not met in %d iterations", \
LOOP_MAX_RUN); \
} \
}
/* Run the loop until there are no pending active handles.
*
* If there are still pending active handles after LOOP_MAX_RUN iterations, the
* test will fail.
*
* This is meant to be used in tear down functions. */
#define LOOP_STOP \
{ \
int alive__; \
LOOP_RUN(LOOP_MAX_RUN); \
alive__ = uv_loop_alive(&f->loop); \
if (alive__ != 0) { \
munit_error("loop has still pending active handles"); \
} \
}
void test_loop_walk_cb(uv_handle_t *handle, void *arg);
#endif /* TEST_LOOP_H */
raft-0.22.1/test/lib/macros.h 0000664 0000000 0000000 00000000601 14601504142 0015704 0 ustar 00root root 0000000 0000000 /**
* Miscellaneous test macros.
*/
#ifndef TEST_MACROS_H_
#define TEST_MACROS_H_
#define GET_2ND_ARG(arg1, arg2, ...) arg2
#define GET_3RD_ARG(arg1, arg2, arg3, ...) arg3
#define GET_4TH_ARG(arg1, arg2, arg3, arg4, ...) arg4
#define GET_5TH_ARG(arg1, arg2, arg3, arg4, arg5, ...) arg5
#define GET_6TH_ARG(arg1, arg2, arg3, arg4, arg5, arg6, ...) arg6
#endif /* TEST_MACROS_H_ */
raft-0.22.1/test/lib/munit.c 0000664 0000000 0000000 00000205646 14601504142 0015567 0 ustar 00root root 0000000 0000000 /* Copyright (c) 2013-2018 Evan Nemerson
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/*** Configuration ***/
/* This is just where the output from the test goes. It's really just
* meant to let you choose stdout or stderr, but if anyone really want
* to direct it to a file let me know, it would be fairly easy to
* support. */
#if !defined(MUNIT_OUTPUT_FILE)
# define MUNIT_OUTPUT_FILE stdout
#endif
/* This is a bit more useful; it tells µnit how to format the seconds in
* timed tests. If your tests run for longer you might want to reduce
* it, and if your computer is really fast and your tests are tiny you
* can increase it. */
#if !defined(MUNIT_TEST_TIME_FORMAT)
# define MUNIT_TEST_TIME_FORMAT "0.8f"
#endif
/* If you have long test names you might want to consider bumping
* this. The result information takes 43 characters. */
#if !defined(MUNIT_TEST_NAME_LEN)
# define MUNIT_TEST_NAME_LEN 37
#endif
/* If you don't like the timing information, you can disable it by
* defining MUNIT_DISABLE_TIMING. */
#if !defined(MUNIT_DISABLE_TIMING)
# define MUNIT_ENABLE_TIMING
#endif
/*** End configuration ***/
#if defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE < 200809L)
# undef _POSIX_C_SOURCE
#endif
#if !defined(_POSIX_C_SOURCE)
# define _POSIX_C_SOURCE 200809L
#endif
/* Solaris freaks out if you try to use a POSIX or SUS standard without
* the "right" C standard. */
#if defined(_XOPEN_SOURCE)
# undef _XOPEN_SOURCE
#endif
#if defined(__STDC_VERSION__)
# if __STDC_VERSION__ >= 201112L
# define _XOPEN_SOURCE 700
# elif __STDC_VERSION__ >= 199901L
# define _XOPEN_SOURCE 600
# endif
#endif
/* Because, according to Microsoft, POSIX is deprecated. You've got
* to appreciate the chutzpah. */
#if defined(_MSC_VER) && !defined(_CRT_NONSTDC_NO_DEPRECATE)
# define _CRT_NONSTDC_NO_DEPRECATE
#endif
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
# include
#elif defined(_WIN32)
/* https://msdn.microsoft.com/en-us/library/tf4dy80a.aspx */
#endif
#include
#include
#include
#include
#include
#include
#include
#include
#if !defined(MUNIT_NO_NL_LANGINFO) && !defined(_WIN32)
#define MUNIT_NL_LANGINFO
#include
#include
#include
#endif
#if !defined(_WIN32)
# include
# include
# include
#else
# include
# include
# include
# if !defined(STDERR_FILENO)
# define STDERR_FILENO _fileno(stderr)
# endif
#endif
#include "munit.h"
#define MUNIT_STRINGIFY(x) #x
#define MUNIT_XSTRINGIFY(x) MUNIT_STRINGIFY(x)
#if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_CC) || defined(__IBMCPP__)
# define MUNIT_THREAD_LOCAL __thread
#elif (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201102L)) || defined(_Thread_local)
# define MUNIT_THREAD_LOCAL _Thread_local
#elif defined(_WIN32)
# define MUNIT_THREAD_LOCAL __declspec(thread)
#endif
/* MSVC 12.0 will emit a warning at /W4 for code like 'do { ... }
* while (0)', or 'do { ... } while (true)'. I'm pretty sure nobody
* at Microsoft compiles with /W4. */
#if defined(_MSC_VER) && (_MSC_VER <= 1800)
#pragma warning(disable: 4127)
#endif
#if defined(_WIN32) || defined(__EMSCRIPTEN__)
# define MUNIT_NO_FORK
#endif
#if defined(__EMSCRIPTEN__)
# define MUNIT_NO_BUFFER
#endif
/*** Logging ***/
static MunitLogLevel munit_log_level_visible = MUNIT_LOG_INFO;
static MunitLogLevel munit_log_level_fatal = MUNIT_LOG_ERROR;
#if defined(MUNIT_THREAD_LOCAL)
static MUNIT_THREAD_LOCAL bool munit_error_jmp_buf_valid = false;
static MUNIT_THREAD_LOCAL jmp_buf munit_error_jmp_buf;
#endif
#if defined(MUNIT_THREAD_LOCAL) && defined(MUNIT_ALWAYS_TEAR_DOWN)
static MUNIT_THREAD_LOCAL bool munit_tear_down_jmp_buf_valid = false;
static MUNIT_THREAD_LOCAL jmp_buf munit_tear_down_jmp_buf;
#endif
/* At certain warning levels, mingw will trigger warnings about
* suggesting the format attribute, which we've explicitly *not* set
* because it will then choke on our attempts to use the MS-specific
* I64 modifier for size_t (which we have to use since MSVC doesn't
* support the C99 z modifier). */
#if defined(__MINGW32__) || defined(__MINGW64__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wsuggest-attribute=format"
#endif
MUNIT_PRINTF(5,0)
/* Core logging routine: write "<Level>: [<file>:<line>: ]<message>\n" to FP.
 *
 * Messages below the visibility threshold are dropped. An unrecognized
 * LEVEL is itself reported as an error (which may not return). */
static void
munit_logf_exv(MunitLogLevel level, FILE* fp, const char* filename, int line, const char* format, va_list ap) {
  const char* label = NULL;

  if (level < munit_log_level_visible)
    return;

  if (level == MUNIT_LOG_DEBUG)
    label = "Debug";
  else if (level == MUNIT_LOG_INFO)
    label = "Info";
  else if (level == MUNIT_LOG_WARNING)
    label = "Warning";
  else if (level == MUNIT_LOG_ERROR)
    label = "Error";

  if (label == NULL) {
    munit_logf_ex(MUNIT_LOG_ERROR, filename, line, "Invalid log level (%d)", level);
    return;
  }

  fputs(label, fp);
  fputs(": ", fp);
  if (filename != NULL)
    fprintf(fp, "%s:%d: ", filename, line);
  vfprintf(fp, format, ap);
  fputc('\n', fp);
}
MUNIT_PRINTF(3,4)
/* Variadic convenience wrapper around munit_logf_exv() for internal use;
 * logs to FP with no filename/line information. */
static void
munit_logf_internal(MunitLogLevel level, FILE* fp, const char* format, ...) {
  va_list ap;
  va_start(ap, format);
  munit_logf_exv(level, fp, NULL, 0, format, ap);
  va_end(ap);
}
/* Log a pre-formatted MESSAGE at LEVEL to FP, with no filename/line info. */
static void
munit_log_internal(MunitLogLevel level, FILE* fp, const char* message) {
  munit_logf_internal(level, fp, "%s", message);
}
/* Public logging entry point: format and print a message at LEVEL to stderr,
 * then, if LEVEL reaches the fatal threshold, unwind the current test via
 * longjmp() (when a jump buffer is armed) or abort() the process. */
void
munit_logf_ex(MunitLogLevel level, const char* filename, int line, const char* format, ...) {
  va_list ap;
  va_start(ap, format);
  munit_logf_exv(level, stderr, filename, line, format, ap);
  va_end(ap);
  /* Fatal levels never return to the caller. */
  if (level >= munit_log_level_fatal) {
#if defined(MUNIT_THREAD_LOCAL)
    if (munit_error_jmp_buf_valid)
      longjmp(munit_error_jmp_buf, 1);
#endif
    abort();
  }
}
/* Report a test error at FILENAME:LINE and terminate the current test.
 * Preference order: the tear-down jump buffer (so tear-down still runs when
 * MUNIT_ALWAYS_TEAR_DOWN is enabled), then the error jump buffer, and
 * finally abort() when no jump buffer is armed. Never returns. */
void
munit_errorf_ex(const char* filename, int line, const char* format, ...) {
  va_list ap;
  va_start(ap, format);
  munit_logf_exv(MUNIT_LOG_ERROR, stderr, filename, line, format, ap);
  va_end(ap);
#if defined(MUNIT_THREAD_LOCAL) && defined(MUNIT_ALWAYS_TEAR_DOWN)
  if (munit_tear_down_jmp_buf_valid)
    longjmp(munit_tear_down_jmp_buf, 1);
#endif
#if defined(MUNIT_THREAD_LOCAL)
  if (munit_error_jmp_buf_valid)
    longjmp(munit_error_jmp_buf, 1);
#endif
  abort();
}
#if defined(__MINGW32__) || defined(__MINGW64__)
#pragma GCC diagnostic pop
#endif
#if !defined(MUNIT_STRERROR_LEN)
# define MUNIT_STRERROR_LEN 80
#endif
/* Log MSG followed by a textual description and the numeric value of the
 * current errno. Uses strerror_r()/strerror_s() where available, falling
 * back to plain strerror() on platforms without a reentrant variant. */
static void
munit_log_errno(MunitLogLevel level, FILE* fp, const char* msg) {
#if defined(MUNIT_NO_STRERROR_R) || (defined(__MINGW32__) && !defined(MINGW_HAS_SECURE_API))
  munit_logf_internal(level, fp, "%s: %s (%d)", msg, strerror(errno), errno);
#else
  char munit_error_str[MUNIT_STRERROR_LEN];
  munit_error_str[0] = '\0';
#if !defined(_WIN32)
  strerror_r(errno, munit_error_str, MUNIT_STRERROR_LEN);
#else
  strerror_s(munit_error_str, MUNIT_STRERROR_LEN, errno);
#endif
  munit_logf_internal(level, fp, "%s: %s (%d)", msg, munit_error_str, errno);
#endif
}
/*** Memory allocation ***/
/* Allocate SIZE bytes of zero-initialized memory, reporting a fatal test
 * error (attributed to FILENAME:LINE) if the allocation fails. A SIZE of
 * zero yields NULL without touching the allocator. */
void*
munit_malloc_ex(const char* filename, int line, size_t size) {
  void* ptr = NULL;

  if (size != 0) {
    ptr = calloc(1, size);
    if (MUNIT_UNLIKELY(ptr == NULL)) {
      munit_logf_ex(MUNIT_LOG_ERROR, filename, line, "Failed to allocate %" MUNIT_SIZE_MODIFIER "u bytes.", size);
    }
  }

  return ptr;
}
/*** Timer code ***/
#if defined(MUNIT_ENABLE_TIMING)
#define psnip_uint64_t munit_uint64_t
#define psnip_uint32_t munit_uint32_t
/* Code copied from portable-snippets
* . If you need to
* change something, please do it there so we can keep the code in
* sync. */
/* Clocks (v1)
* Portable Snippets - https://github.com/nemequ/portable-snippets
* Created by Evan Nemerson
*
* To the extent possible under law, the authors have waived all
* copyright and related or neighboring rights to this code. For
* details, see the Creative Commons Zero 1.0 Universal license at
* https://creativecommons.org/publicdomain/zero/1.0/
*/
#if !defined(PSNIP_CLOCK_H)
#define PSNIP_CLOCK_H
#if !defined(psnip_uint64_t)
# include "../exact-int/exact-int.h"
#endif
#if !defined(PSNIP_CLOCK_STATIC_INLINE)
# if defined(__GNUC__)
# define PSNIP_CLOCK__COMPILER_ATTRIBUTES __attribute__((__unused__))
# else
# define PSNIP_CLOCK__COMPILER_ATTRIBUTES
# endif
# define PSNIP_CLOCK__FUNCTION PSNIP_CLOCK__COMPILER_ATTRIBUTES static
#endif
enum PsnipClockType {
/* This clock provides the current time, in units since 1970-01-01
* 00:00:00 UTC not including leap seconds. In other words, UNIX
* time. Keep in mind that this clock doesn't account for leap
* seconds, and can go backwards (think NTP adjustments). */
PSNIP_CLOCK_TYPE_WALL = 1,
/* The CPU time is a clock which increases only when the current
* process is active (i.e., it doesn't increment while blocking on
* I/O). */
PSNIP_CLOCK_TYPE_CPU = 2,
/* Monotonic time is always running (unlike CPU time), but it only
ever moves forward unless you reboot the system. Things like NTP
adjustments have no effect on this clock. */
PSNIP_CLOCK_TYPE_MONOTONIC = 3
};
struct PsnipClockTimespec {
psnip_uint64_t seconds;
psnip_uint64_t nanoseconds;
};
/* Methods we support: */
#define PSNIP_CLOCK_METHOD_CLOCK_GETTIME 1
#define PSNIP_CLOCK_METHOD_TIME 2
#define PSNIP_CLOCK_METHOD_GETTIMEOFDAY 3
#define PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER 4
#define PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME 5
#define PSNIP_CLOCK_METHOD_CLOCK 6
#define PSNIP_CLOCK_METHOD_GETPROCESSTIMES 7
#define PSNIP_CLOCK_METHOD_GETRUSAGE 8
#define PSNIP_CLOCK_METHOD_GETSYSTEMTIMEPRECISEASFILETIME 9
#define PSNIP_CLOCK_METHOD_GETTICKCOUNT64 10
#include
#if defined(HEDLEY_UNREACHABLE)
# define PSNIP_CLOCK_UNREACHABLE() HEDLEY_UNREACHABLE()
#else
# define PSNIP_CLOCK_UNREACHABLE() assert(0)
#endif
/* Choose an implementation */
/* #undef PSNIP_CLOCK_WALL_METHOD */
/* #undef PSNIP_CLOCK_CPU_METHOD */
/* #undef PSNIP_CLOCK_MONOTONIC_METHOD */
/* We want to be able to detect the libc implementation, so we include
( isn't available everywhere). */
#if defined(__unix__) || defined(__unix) || defined(__linux__)
# include
# include
#endif
#if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0)
/* These are known to work without librt. If you know of others
* please let us know so we can add them. */
# if \
(defined(__GLIBC__) && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 17))) || \
(defined(__FreeBSD__))
# define PSNIP_CLOCK_HAVE_CLOCK_GETTIME
# elif !defined(PSNIP_CLOCK_NO_LIBRT)
# define PSNIP_CLOCK_HAVE_CLOCK_GETTIME
# endif
#endif
#if defined(_WIN32)
# if !defined(PSNIP_CLOCK_CPU_METHOD)
# define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_GETPROCESSTIMES
# endif
# if !defined(PSNIP_CLOCK_MONOTONIC_METHOD)
# define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER
# endif
#endif
#if defined(__MACH__) && !defined(__gnu_hurd__)
# if !defined(PSNIP_CLOCK_MONOTONIC_METHOD)
# define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME
# endif
#endif
#if defined(PSNIP_CLOCK_HAVE_CLOCK_GETTIME)
# include
# if !defined(PSNIP_CLOCK_WALL_METHOD)
# if defined(CLOCK_REALTIME_PRECISE)
# define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME
# define PSNIP_CLOCK_CLOCK_GETTIME_WALL CLOCK_REALTIME_PRECISE
# elif !defined(__sun)
# define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME
# define PSNIP_CLOCK_CLOCK_GETTIME_WALL CLOCK_REALTIME
# endif
# endif
# if !defined(PSNIP_CLOCK_CPU_METHOD)
# if defined(_POSIX_CPUTIME) || defined(CLOCK_PROCESS_CPUTIME_ID)
# define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME
# define PSNIP_CLOCK_CLOCK_GETTIME_CPU CLOCK_PROCESS_CPUTIME_ID
# elif defined(CLOCK_VIRTUAL)
# define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME
# define PSNIP_CLOCK_CLOCK_GETTIME_CPU CLOCK_VIRTUAL
# endif
# endif
# if !defined(PSNIP_CLOCK_MONOTONIC_METHOD)
# if defined(CLOCK_MONOTONIC_RAW)
# define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME
# define PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC CLOCK_MONOTONIC
# elif defined(CLOCK_MONOTONIC_PRECISE)
# define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME
# define PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC CLOCK_MONOTONIC_PRECISE
# elif defined(_POSIX_MONOTONIC_CLOCK) || defined(CLOCK_MONOTONIC)
# define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME
# define PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC CLOCK_MONOTONIC
# endif
# endif
#endif
#if defined(_POSIX_VERSION) && (_POSIX_VERSION >= 200112L)
# if !defined(PSNIP_CLOCK_WALL_METHOD)
# define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_GETTIMEOFDAY
# endif
#endif
#if !defined(PSNIP_CLOCK_WALL_METHOD)
# define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_TIME
#endif
#if !defined(PSNIP_CLOCK_CPU_METHOD)
# define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_CLOCK
#endif
/* Primarily here for testing. */
#if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) && defined(PSNIP_CLOCK_REQUIRE_MONOTONIC)
# error No monotonic clock found.
#endif
/* Implementations */
#if \
(defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \
(defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \
(defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \
(defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK)) || \
(defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK)) || \
(defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK)) || \
(defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_TIME)) || \
(defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_TIME)) || \
(defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_TIME))
# include
#endif
#if \
(defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY)) || \
(defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY)) || \
(defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY))
# include
#endif
#if \
(defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES)) || \
(defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES)) || \
(defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES)) || \
(defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64)) || \
(defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64)) || \
(defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64))
# include
#endif
#if \
(defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE)) || \
(defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE)) || \
(defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE))
# include
# include
#endif
#if \
(defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME)) || \
(defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME)) || \
(defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME))
# include <CoreServices/CoreServices.h>
# include <mach/mach.h>
# include <mach/mach_time.h>
#endif
/*** Implementations ***/
#define PSNIP_CLOCK_NSEC_PER_SEC ((psnip_uint32_t) (1000000000ULL))
#if \
(defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \
(defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \
(defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME))
/* Query the resolution of clock `clk_id` and convert it to a precision
 * expressed in ticks per second (e.g. 1000000000 for a 1 ns clock).
 * Returns 0 when the resolution cannot be determined.
 *
 * Fix: guard against division by zero for coarse clocks whose reported
 * resolution has tv_nsec == 0 (i.e. whole-second resolution). */
PSNIP_CLOCK__FUNCTION psnip_uint32_t
psnip_clock__clock_getres (clockid_t clk_id) {
    struct timespec res;
    int r;
    r = clock_getres(clk_id, &res);
    if (r != 0)
        return 0;
    /* Whole-second (or worse) resolution: at most 1 tick per second. */
    if (res.tv_nsec == 0)
        return (res.tv_sec > 0) ? 1 : 0;
    return (psnip_uint32_t) (PSNIP_CLOCK_NSEC_PER_SEC / res.tv_nsec);
}
/* Read clock `clk_id` into `res`.  Returns 0 on success, or -10 when
 * the underlying clock_gettime() call fails. */
PSNIP_CLOCK__FUNCTION int
psnip_clock__clock_gettime (clockid_t clk_id, struct PsnipClockTimespec* res) {
    struct timespec now;
    if (clock_gettime(clk_id, &now) != 0)
        return -10;
    res->seconds = (psnip_uint64_t) now.tv_sec;
    res->nanoseconds = (psnip_uint64_t) now.tv_nsec;
    return 0;
}
#endif
/* Ticks-per-second precision of the wall clock, or 0 when no wall-clock
 * method is available.  The compiled branch depends on the
 * PSNIP_CLOCK_WALL_METHOD selected earlier in this file. */
PSNIP_CLOCK__FUNCTION psnip_uint32_t
psnip_clock_wall_get_precision (void) {
#if !defined(PSNIP_CLOCK_WALL_METHOD)
return 0;
#elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME
return psnip_clock__clock_getres(PSNIP_CLOCK_CLOCK_GETTIME_WALL);
#elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY
/* gettimeofday() reports microseconds: 1e6 ticks per second. */
return 1000000;
#elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_TIME
/* time() has one-second resolution. */
return 1;
#else
return 0;
#endif
}
/* Read the wall clock into `res`.  Returns 0 on success, or a negative
 * error code; -2 means no wall-clock method is available. */
PSNIP_CLOCK__FUNCTION int
psnip_clock_wall_get_time (struct PsnipClockTimespec* res) {
/* Silence unused-parameter warnings on configurations where no branch
 * below touches `res`. */
(void) res;
#if !defined(PSNIP_CLOCK_WALL_METHOD)
return -2;
#elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME
return psnip_clock__clock_gettime(PSNIP_CLOCK_CLOCK_GETTIME_WALL, res);
#elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_TIME
res->seconds = time(NULL);
res->nanoseconds = 0;
#elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY
struct timeval tv;
if (gettimeofday(&tv, NULL) != 0)
return -6;
res->seconds = tv.tv_sec;
/* gettimeofday() yields microseconds; scale up to nanoseconds. */
res->nanoseconds = tv.tv_usec * 1000;
#else
return -2;
#endif
return 0;
}
/* Ticks-per-second precision of the per-process CPU clock, or 0 when no
 * CPU-clock method is available. */
PSNIP_CLOCK__FUNCTION psnip_uint32_t
psnip_clock_cpu_get_precision (void) {
#if !defined(PSNIP_CLOCK_CPU_METHOD)
return 0;
#elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME
return psnip_clock__clock_getres(PSNIP_CLOCK_CLOCK_GETTIME_CPU);
#elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK
return CLOCKS_PER_SEC;
#elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES
/* GetProcessTimes() reports in 100 ns units. */
return PSNIP_CLOCK_NSEC_PER_SEC / 100;
#else
return 0;
#endif
}
/* Read the per-process CPU clock into `res`.  Returns 0 on success, or
 * a negative error code; -2 means no CPU-clock method is available.
 * The compiled branch depends on PSNIP_CLOCK_CPU_METHOD. */
PSNIP_CLOCK__FUNCTION int
psnip_clock_cpu_get_time (struct PsnipClockTimespec* res) {
#if !defined(PSNIP_CLOCK_CPU_METHOD)
    (void) res;
    return -2;
#elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME
    return psnip_clock__clock_gettime(PSNIP_CLOCK_CLOCK_GETTIME_CPU, res);
#elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK
    clock_t t = clock();
    if (t == ((clock_t) -1))
        return -5;
    res->seconds = t / CLOCKS_PER_SEC;
    res->nanoseconds = (t % CLOCKS_PER_SEC) * (PSNIP_CLOCK_NSEC_PER_SEC / CLOCKS_PER_SEC);
#elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES
    FILETIME CreationTime, ExitTime, KernelTime, UserTime;
    LARGE_INTEGER date, adjust;
    if (!GetProcessTimes(GetCurrentProcess(), &CreationTime, &ExitTime, &KernelTime, &UserTime))
        return -7;
    /* http://www.frenk.com/2009/12/convert-filetime-to-unix-timestamp/ */
    date.HighPart = UserTime.dwHighDateTime;
    date.LowPart = UserTime.dwLowDateTime;
    adjust.QuadPart = 11644473600000 * 10000;
    date.QuadPart -= adjust.QuadPart;
    res->seconds = date.QuadPart / 10000000;
    res->nanoseconds = (date.QuadPart % 10000000) * (PSNIP_CLOCK_NSEC_PER_SEC / 100);
#elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE
    struct rusage usage;
    if (getrusage(RUSAGE_SELF, &usage) != 0)
        return -8;
    res->seconds = usage.ru_utime.tv_sec;
    /* Fix: the original read `tv.tv_usec`, an undeclared variable; the
     * microseconds live in usage.ru_utime. */
    res->nanoseconds = usage.ru_utime.tv_usec * 1000;
#else
    (void) res;
    return -2;
#endif
    return 0;
}
/* Ticks-per-second precision of the monotonic clock, or 0 when no
 * monotonic method is available. */
PSNIP_CLOCK__FUNCTION psnip_uint32_t
psnip_clock_monotonic_get_precision (void) {
#if !defined(PSNIP_CLOCK_MONOTONIC_METHOD)
return 0;
#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME
return psnip_clock__clock_getres(PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC);
#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME
/* mach_timebase_info() is queried once and cached; numer/denom convert
 * mach ticks to nanoseconds. */
static mach_timebase_info_data_t tbi = { 0, };
if (tbi.denom == 0)
mach_timebase_info(&tbi);
return (psnip_uint32_t) (tbi.numer / tbi.denom);
#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64
/* GetTickCount64() has millisecond resolution. */
return 1000;
#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER
LARGE_INTEGER Frequency;
QueryPerformanceFrequency(&Frequency);
/* Clamp to nanosecond precision, the finest this API expresses. */
return (psnip_uint32_t) ((Frequency.QuadPart > PSNIP_CLOCK_NSEC_PER_SEC) ? PSNIP_CLOCK_NSEC_PER_SEC : Frequency.QuadPart);
#else
return 0;
#endif
}
/* Read the monotonic clock into `res`.  Returns 0 on success, or a
 * negative error code; -2 means no monotonic method is available.
 * The compiled branch depends on PSNIP_CLOCK_MONOTONIC_METHOD. */
PSNIP_CLOCK__FUNCTION int
psnip_clock_monotonic_get_time (struct PsnipClockTimespec* res) {
#if !defined(PSNIP_CLOCK_MONOTONIC_METHOD)
    (void) res;
    return -2;
#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME
    return psnip_clock__clock_gettime(PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC, res);
#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME
    psnip_uint64_t nsec = mach_absolute_time();
    static mach_timebase_info_data_t tbi = { 0, };
    if (tbi.denom == 0)
        mach_timebase_info(&tbi);
    nsec *= ((psnip_uint64_t) tbi.numer) / ((psnip_uint64_t) tbi.denom);
    res->seconds = nsec / PSNIP_CLOCK_NSEC_PER_SEC;
    res->nanoseconds = nsec % PSNIP_CLOCK_NSEC_PER_SEC;
#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER
    LARGE_INTEGER t, f;
    if (QueryPerformanceCounter(&t) == 0)
        return -12;
    QueryPerformanceFrequency(&f);
    res->seconds = t.QuadPart / f.QuadPart;
    res->nanoseconds = t.QuadPart % f.QuadPart;
    /* Scale the sub-second remainder from counter ticks to nanoseconds. */
    if (f.QuadPart > PSNIP_CLOCK_NSEC_PER_SEC)
        res->nanoseconds /= f.QuadPart / PSNIP_CLOCK_NSEC_PER_SEC;
    else
        res->nanoseconds *= PSNIP_CLOCK_NSEC_PER_SEC / f.QuadPart;
#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64
    const ULONGLONG msec = GetTickCount64();
    res->seconds = msec / 1000;
    /* Fix: the original read the undeclared variable `sec` and stored
     * milliseconds; convert the sub-second remainder to nanoseconds. */
    res->nanoseconds = (msec % 1000) * (PSNIP_CLOCK_NSEC_PER_SEC / 1000);
#else
    (void) res;
    return -2;
#endif
    return 0;
}
/* Returns the number of ticks per second for the specified clock.
* For example, a clock with millisecond precision would return 1000,
* and a clock with 1 second (such as the time() function) would
* return 1.
*
* If the requested clock isn't available, it will return 0.
* Hopefully this will be rare, but if it happens to you please let us
* know so we can work on finding a way to support your system.
*
* Note that different clocks on the same system often have
* different precisions.
*/
/* Dispatch to the precision query for the requested clock type.
 * Returns ticks per second, or 0 when the clock is unavailable. */
PSNIP_CLOCK__FUNCTION psnip_uint32_t
psnip_clock_get_precision (enum PsnipClockType clock_type) {
    if (clock_type == PSNIP_CLOCK_TYPE_MONOTONIC)
        return psnip_clock_monotonic_get_precision ();
    if (clock_type == PSNIP_CLOCK_TYPE_CPU)
        return psnip_clock_cpu_get_precision ();
    if (clock_type == PSNIP_CLOCK_TYPE_WALL)
        return psnip_clock_wall_get_precision ();
    PSNIP_CLOCK_UNREACHABLE();
    return 0;
}
/* Set the provided timespec to the requested time. Returns 0 on
* success, or a negative value on failure. */
/* Fill `res` with the current value of the requested clock.  Returns 0
 * on success, a negative value on failure (-1 for an unknown type). */
PSNIP_CLOCK__FUNCTION int
psnip_clock_get_time (enum PsnipClockType clock_type, struct PsnipClockTimespec* res) {
    assert(res != NULL);
    if (clock_type == PSNIP_CLOCK_TYPE_MONOTONIC)
        return psnip_clock_monotonic_get_time (res);
    if (clock_type == PSNIP_CLOCK_TYPE_CPU)
        return psnip_clock_cpu_get_time (res);
    if (clock_type == PSNIP_CLOCK_TYPE_WALL)
        return psnip_clock_wall_get_time (res);
    return -1;
}
#endif /* !defined(PSNIP_CLOCK_H) */
/* Nanoseconds elapsed between `start` and `end`.  The nanosecond
 * fields are handled with an explicit borrow so the unsigned
 * arithmetic never wraps when end->nanoseconds < start->nanoseconds. */
static psnip_uint64_t
munit_clock_get_elapsed(struct PsnipClockTimespec* start, struct PsnipClockTimespec* end) {
    psnip_uint64_t elapsed =
        (end->seconds - start->seconds) * PSNIP_CLOCK_NSEC_PER_SEC;
    if (end->nanoseconds >= start->nanoseconds)
        elapsed += end->nanoseconds - start->nanoseconds;
    else
        elapsed -= start->nanoseconds - end->nanoseconds;
    return elapsed;
}
#else
# include <time.h>
#endif /* defined(MUNIT_ENABLE_TIMING) */
/*** PRNG stuff ***/
/* This is (unless I screwed up, which is entirely possible) the
* version of PCG with 32-bit state. It was chosen because it has a
* small enough state that we should reliably be able to use CAS
* instead of requiring a lock for thread-safety.
*
* If I did screw up, I probably will not bother changing it unless
* there is a significant bias. It's really not important this be
* particularly strong, as long as it is fairly random it's much more
* important that it be reproducible, so bug reports have a better
* chance of being reproducible. */
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) && !defined(__EMSCRIPTEN__) && (!defined(__GNUC_MINOR__) || (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ > 8))
# define HAVE_STDATOMIC
#elif defined(__clang__)
# if __has_extension(c_atomic)
# define HAVE_CLANG_ATOMICS
# endif
#endif
/* Workaround for http://llvm.org/bugs/show_bug.cgi?id=26911 */
#if defined(__clang__) && defined(_WIN32)
# undef HAVE_STDATOMIC
# if defined(__c2__)
# undef HAVE_CLANG_ATOMICS
# endif
#endif
#if defined(_OPENMP)
# define ATOMIC_UINT32_T uint32_t
# define ATOMIC_UINT32_INIT(x) (x)
#elif defined(HAVE_STDATOMIC)
# include <stdatomic.h>
# define ATOMIC_UINT32_T _Atomic uint32_t
# define ATOMIC_UINT32_INIT(x) ATOMIC_VAR_INIT(x)
#elif defined(HAVE_CLANG_ATOMICS)
# define ATOMIC_UINT32_T _Atomic uint32_t
# define ATOMIC_UINT32_INIT(x) (x)
#elif defined(_WIN32)
# define ATOMIC_UINT32_T volatile LONG
# define ATOMIC_UINT32_INIT(x) (x)
#else
# define ATOMIC_UINT32_T volatile uint32_t
# define ATOMIC_UINT32_INIT(x) (x)
#endif
static ATOMIC_UINT32_T munit_rand_state = ATOMIC_UINT32_INIT(42);
#if defined(_OPENMP)
/* OpenMP fallback atomic store: the critical section named
 * `munit_atomics` serializes all PRNG state access.  The pragma binds
 * to the single assignment statement below. */
static inline void
munit_atomic_store(ATOMIC_UINT32_T* dest, ATOMIC_UINT32_T value) {
#pragma omp critical (munit_atomics)
*dest = value;
}
/* OpenMP fallback atomic load, serialized by the same critical
 * section as the other munit_atomic_* helpers.
 * NOTE(review): `ret` is declared `int` but the function returns
 * uint32_t -- value-preserving on 32-bit int platforms, but confirm
 * before relying on values >= 2^31. */
static inline uint32_t
munit_atomic_load(ATOMIC_UINT32_T* src) {
int ret;
#pragma omp critical (munit_atomics)
ret = *src;
return ret;
}
/* OpenMP fallback compare-and-swap: atomically replace *dest with
 * `desired` when it equals *expected.  Returns nonzero (true) on
 * success, 0 otherwise.  Unlike C11 CAS, *expected is NOT updated on
 * failure; callers reload the state themselves. */
static inline uint32_t
munit_atomic_cas(ATOMIC_UINT32_T* dest, ATOMIC_UINT32_T* expected, ATOMIC_UINT32_T desired) {
bool ret;
#pragma omp critical (munit_atomics)
{
if (*dest == *expected) {
*dest = desired;
ret = true;
} else {
ret = false;
}
}
return ret;
}
#elif defined(HAVE_STDATOMIC)
# define munit_atomic_store(dest, value) atomic_store(dest, value)
# define munit_atomic_load(src) atomic_load(src)
# define munit_atomic_cas(dest, expected, value) atomic_compare_exchange_weak(dest, expected, value)
#elif defined(HAVE_CLANG_ATOMICS)
# define munit_atomic_store(dest, value) __c11_atomic_store(dest, value, __ATOMIC_SEQ_CST)
# define munit_atomic_load(src) __c11_atomic_load(src, __ATOMIC_SEQ_CST)
# define munit_atomic_cas(dest, expected, value) __c11_atomic_compare_exchange_weak(dest, expected, value, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
#elif defined(__GNUC__) && (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)
# define munit_atomic_store(dest, value) __atomic_store_n(dest, value, __ATOMIC_SEQ_CST)
# define munit_atomic_load(src) __atomic_load_n(src, __ATOMIC_SEQ_CST)
# define munit_atomic_cas(dest, expected, value) __atomic_compare_exchange_n(dest, expected, value, true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
#elif defined(__GNUC__) && (__GNUC__ >= 4)
# define munit_atomic_store(dest,value) do { *(dest) = (value); } while (0)
# define munit_atomic_load(src) (*(src))
# define munit_atomic_cas(dest, expected, value) __sync_bool_compare_and_swap(dest, *expected, value)
#elif defined(_WIN32) /* Untested */
# define munit_atomic_store(dest,value) do { *(dest) = (value); } while (0)
# define munit_atomic_load(src) (*(src))
# define munit_atomic_cas(dest, expected, value) InterlockedCompareExchange((dest), (value), *(expected))
#else
# warning No atomic implementation, PRNG will not be thread-safe
# define munit_atomic_store(dest, value) do { *(dest) = (value); } while (0)
# define munit_atomic_load(src) (*(src))
/* Non-atomic last-resort compare-and-swap (used only when no atomic
 * primitive is available; see the #warning above).  Replaces *dest
 * with `desired` when it equals *expected and reports success. */
static inline bool
munit_atomic_cas(ATOMIC_UINT32_T* dest, ATOMIC_UINT32_T* expected, ATOMIC_UINT32_T desired) {
    const bool matched = (*dest == *expected);
    if (matched)
        *dest = desired;
    return matched;
}
#endif
#define MUNIT_PRNG_MULTIPLIER (747796405U)
#define MUNIT_PRNG_INCREMENT (1729U)
/* Advance the PCG LCG state by one step (state' = state * a + c,
 * modulo 2^32 via unsigned wrap-around). */
static munit_uint32_t
munit_rand_next_state(munit_uint32_t state) {
    const munit_uint32_t scaled = state * MUNIT_PRNG_MULTIPLIER;
    return scaled + MUNIT_PRNG_INCREMENT;
}
/* PCG output permutation: derive a scrambled 32-bit value from a raw
 * LCG state (variable xorshift selected by the top bits, multiply,
 * then a final xorshift). */
static munit_uint32_t
munit_rand_from_state(munit_uint32_t state) {
    const munit_uint32_t shift = (state >> 28) + 4;
    munit_uint32_t out = (state ^ (state >> shift)) * (277803737U);
    out ^= out >> 22;
    return out;
}
/* Reseed the global PRNG.  The seed is stepped through the LCG once
 * (after adding the increment) so that similar seeds do not yield
 * similar initial states. */
void
munit_rand_seed(munit_uint32_t seed) {
    munit_atomic_store(&munit_rand_state,
                       munit_rand_next_state(seed + MUNIT_PRNG_INCREMENT));
}
/* Produce a fresh seed from the current time: wall-clock nanoseconds
 * when timing support is compiled in, otherwise time().  The raw time
 * is run through one LCG step plus the output permutation to spread
 * low-entropy inputs. */
static munit_uint32_t
munit_rand_generate_seed(void) {
munit_uint32_t seed, state;
#if defined(MUNIT_ENABLE_TIMING)
struct PsnipClockTimespec wc = { 0, 0 };
psnip_clock_get_time(PSNIP_CLOCK_TYPE_WALL, &wc);
seed = (munit_uint32_t) wc.nanoseconds;
#else
seed = (munit_uint32_t) time(NULL);
#endif
state = munit_rand_next_state(seed + MUNIT_PRNG_INCREMENT);
return munit_rand_from_state(state);
}
/* Draw one pseudo-random 32-bit value from a caller-held state,
 * advancing *state in place (PCG: output is derived from the state
 * BEFORE the step). */
static munit_uint32_t
munit_rand_state_uint32(munit_uint32_t* state) {
    const munit_uint32_t current = *state;
    *state = munit_rand_next_state(current);
    return munit_rand_from_state(current);
}
/* Thread-safe draw of one pseudo-random 32-bit value from the global
 * PRNG state: a CAS retry loop guarantees each successful caller
 * consumes a distinct state step. */
munit_uint32_t
munit_rand_uint32(void) {
munit_uint32_t old, state;
do {
old = munit_atomic_load(&munit_rand_state);
state = munit_rand_next_state(old);
} while (!munit_atomic_cas(&munit_rand_state, &old, state));
return munit_rand_from_state(old);
}
/* Fill `size` bytes of `data` with pseudo-random bytes drawn from
 * *state: whole 32-bit words first, then one extra draw for any
 * trailing partial word. */
static void
munit_rand_state_memory(munit_uint32_t* state, size_t size, munit_uint8_t data[MUNIT_ARRAY_PARAM(size)]) {
    munit_uint8_t* out = data;
    size_t whole = size / sizeof(munit_uint32_t);
    const size_t tail = size % sizeof(munit_uint32_t);
    munit_uint32_t chunk;
    for ( ; whole > 0 ; whole--) {
        chunk = munit_rand_state_uint32(state);
        memcpy(out, &chunk, sizeof(munit_uint32_t));
        out += sizeof(munit_uint32_t);
    }
    if (tail != 0) {
        chunk = munit_rand_state_uint32(state);
        memcpy(out, &chunk, tail);
    }
}
/* Thread-safe version of munit_rand_state_memory(): works on a local
 * copy of the global state and commits it with CAS, retrying (and
 * regenerating the bytes) if another thread advanced the state. */
void
munit_rand_memory(size_t size, munit_uint8_t data[MUNIT_ARRAY_PARAM(size)]) {
munit_uint32_t old, state;
do {
state = old = munit_atomic_load(&munit_rand_state);
munit_rand_state_memory(&state, size, data);
} while (!munit_atomic_cas(&munit_rand_state, &old, state));
}
/* Uniform value in [0, max] (inclusive) drawn from *state, with the
 * raw PRNG output XORed with `salt` so distinct call sites draw
 * distinct sequences.  Rejection sampling (discard x < min) removes
 * modulo bias.
 * NOTE(review): `min` is computed as 2^32 % max before `max` is
 * incremented below, i.e. not 2^32 % (max + 1); this looks like a
 * slight residual bias for some `max` values -- confirm against
 * upstream munit before changing. */
static munit_uint32_t
munit_rand_state_at_most(munit_uint32_t* state, munit_uint32_t salt, munit_uint32_t max) {
/* We want (UINT32_MAX + 1) % max, which in unsigned arithmetic is the same
 * as (UINT32_MAX + 1 - max) % max = -max % max. We compute -max using not
 * to avoid compiler warnings.
 */
const munit_uint32_t min = (~max + 1U) % max;
munit_uint32_t x;
/* Full 32-bit range: every output is acceptable, no rejection needed. */
if (max == (~((munit_uint32_t) 0U)))
return munit_rand_state_uint32(state) ^ salt;
max++;
do {
x = munit_rand_state_uint32(state) ^ salt;
} while (x < min);
return x % max;
}
/* Thread-safe wrapper around munit_rand_state_at_most(): operates on a
 * copy of the global state and commits it with a CAS retry loop. */
static munit_uint32_t
munit_rand_at_most(munit_uint32_t salt, munit_uint32_t max) {
munit_uint32_t old, state;
munit_uint32_t retval;
do {
state = old = munit_atomic_load(&munit_rand_state);
retval = munit_rand_state_at_most(&state, salt, max);
} while (!munit_atomic_cas(&munit_rand_state, &old, state));
return retval;
}
int
munit_rand_int_range(int min, int max) {
munit_uint64_t range = (munit_uint64_t) max - (munit_uint64_t) min;
if (min > max)
return munit_rand_int_range(max, min);
if (range > (~((munit_uint32_t) 0U)))
range = (~((munit_uint32_t) 0U));
return min + munit_rand_at_most(0, (munit_uint32_t) range);
}
/* Uniform random double in [0, 1), thread-safe via the usual CAS retry
 * loop over the global PRNG state. */
double
munit_rand_double(void) {
munit_uint32_t old, state;
double retval = 0.0;
do {
state = old = munit_atomic_load(&munit_rand_state);
/* See http://mumble.net/~campbell/tmp/random_real.c for how to do
 * this right. Patches welcome if you feel that this is too
 * biased. */
retval = munit_rand_state_uint32(&state) / ((~((munit_uint32_t) 0U)) + 1.0);
} while (!munit_atomic_cas(&munit_rand_state, &old, state));
return retval;
}
/*** Test suite handling ***/
/* Aggregated outcome counters for one test (or a whole run), plus --
 * when timing is compiled in -- accumulated clock time in
 * nanoseconds. */
typedef struct {
unsigned int successful;
unsigned int skipped;
unsigned int failed;
unsigned int errored;
#if defined(MUNIT_ENABLE_TIMING)
/* Total CPU and wall-clock time, in nanoseconds. */
munit_uint64_t cpu_clock;
munit_uint64_t wall_clock;
#endif
} MunitReport;
/* All state needed to run a suite: what to run (prefix, suite, test
 * name filters), how to run it (seed, iterations, parameters, fork),
 * the accumulated report, and output options. */
typedef struct {
const char* prefix;
const MunitSuite* suite;
/* NULL-terminated list of test-name filters from the CLI. */
const char** tests;
munit_uint32_t seed;
unsigned int iterations;
/* Parameter values supplied on the CLI. */
MunitParameter* parameters;
/* When true, pick one random value per wildcard parameter instead of
 * running every combination. */
bool single_parameter_mode;
void* user_data;
MunitReport report;
bool colorize;
bool fork;
bool show_stderr;
bool fatal_failures;
} MunitTestRunner;
/* Look up `key` in a NULL-terminated parameter array.  Returns the
 * matching value, or NULL when `params` is NULL or the key is absent. */
const char*
munit_parameters_get(const MunitParameter params[], const char* key) {
    const MunitParameter* p = params;
    while (p != NULL && p->name != NULL) {
        if (strcmp(p->name, key) == 0)
            return p->value;
        p++;
    }
    return NULL;
}
#if defined(MUNIT_ENABLE_TIMING)
/* Print a duration given in nanoseconds to `fp` as fractional
 * seconds, using the project-wide time format. */
static void
munit_print_time(FILE* fp, munit_uint64_t nanoseconds) {
    const double seconds = ((double) nanoseconds) / ((double) PSNIP_CLOCK_NSEC_PER_SEC);
    fprintf(fp, "%" MUNIT_TEST_TIME_FORMAT, seconds);
}
#endif
/* Add a parameter to an array of parameters. */
/* Append a (name, value) pair to the NULL-terminated parameter array
 * *params, growing it by one entry.  *params_size is the count of real
 * entries (the terminator pair is kept one past it).  Returns MUNIT_OK
 * on success, MUNIT_ERROR on allocation failure.
 *
 * Fix: the original assigned realloc() straight back to *params, so an
 * allocation failure overwrote the pointer with NULL and leaked the
 * old array.  Use a temporary and leave *params intact on failure. */
static MunitResult
munit_parameters_add(size_t* params_size, MunitParameter* params[MUNIT_ARRAY_PARAM(*params_size)], char* name, char* value) {
    MunitParameter* grown =
        realloc(*params, sizeof(MunitParameter) * (*params_size + 2));
    if (grown == NULL)
        return MUNIT_ERROR;    /* old array still owned by the caller */
    *params = grown;
    (*params)[*params_size].name = name;
    (*params)[*params_size].value = value;
    (*params_size)++;
    /* Keep the array NULL-terminated. */
    (*params)[*params_size].name = NULL;
    (*params)[*params_size].value = NULL;
    return MUNIT_OK;
}
/* Concatenate two strings, but just return one of the components
* unaltered if the other is NULL or "". */
/* Concatenate `prefix` and `suffix`.  When either component is NULL or
 * empty the OTHER component pointer is returned unaltered (no copy);
 * when both are empty the result is NULL; otherwise a freshly
 * malloc()ed string is returned.  On return *len (when non-NULL) holds
 * the result length.  Pair with munit_maybe_free_concat(), which frees
 * the result only when it is a heap copy.
 * Note: malloc() failure is not checked here (matches original). */
static char*
munit_maybe_concat(size_t* len, char* prefix, char* suffix) {
    const size_t prefix_len = (prefix != NULL) ? strlen(prefix) : 0;
    const size_t suffix_len = (suffix != NULL) ? strlen(suffix) : 0;
    char* out;
    size_t out_len;
    if (prefix_len == 0 && suffix_len == 0) {
        out = NULL;
        out_len = 0;
    } else if (prefix_len == 0) {
        out = suffix;
        out_len = suffix_len;
    } else if (suffix_len == 0) {
        out = prefix;
        out_len = prefix_len;
    } else {
        out_len = prefix_len + suffix_len;
        out = malloc(out_len + 1);
        memcpy(out, prefix, prefix_len);
        memcpy(out + prefix_len, suffix, suffix_len);
        out[out_len] = 0;
    }
    if (len != NULL)
        *len = out_len;
    return out;
}
/* Possibly free a string returned by munit_maybe_concat. */
/* Free `s` only when it is a heap copy produced by
 * munit_maybe_concat(), i.e. when it is neither of the two components
 * that function may return unaltered. */
static void
munit_maybe_free_concat(char* s, const char* prefix, const char* suffix) {
    if (s == prefix || s == suffix)
        return;
    free(s);
}
/* Cheap string hash function, just used to salt the PRNG. */
static munit_uint32_t
munit_str_hash(const char* name) {
const char *p;
munit_uint32_t h = 5381U;
for (p = name; *p != '\0'; p++)
h = (h << 5) + h + *p;
return h;
}
/* Copy everything readable from fd `from` to fd `to` in fixed-size
 * chunks, until read() reports EOF or an error.  A failed write()
 * abandons the current chunk but keeps draining `from` (matching the
 * original behaviour). */
static void
munit_splice(int from, int to) {
    munit_uint8_t buf[1024];
#if !defined(_WIN32)
    ssize_t len;
    ssize_t total;
    ssize_t chunk;
#else
    int len;
    int total;
    int chunk;
#endif
    while ((len = read(from, buf, sizeof(buf))) > 0) {
        total = 0;
        while (total < len) {
            chunk = write(to, buf + total, len - total);
            if (chunk < 0)
                break;
            total += chunk;
        }
    }
}
/* This is the part that should be handled in the child process */
/* Run `test` for the configured number of iterations, accumulating the
 * outcome (and, when timing is enabled, elapsed wall/CPU time) into
 * `report`.  This is the work that executes inside the child process
 * when forking is enabled.  Returns the result of the last iteration
 * executed; iteration stops early on the first non-OK result. */
static MunitResult
munit_test_runner_exec(MunitTestRunner* runner, const MunitTest* test, const MunitParameter params[], MunitReport* report) {
unsigned int iterations = runner->iterations;
MunitResult result = MUNIT_FAIL;
#if defined(MUNIT_ENABLE_TIMING)
struct PsnipClockTimespec wall_clock_begin = { 0, 0 }, wall_clock_end = { 0, 0 };
struct PsnipClockTimespec cpu_clock_begin = { 0, 0 }, cpu_clock_end = { 0, 0 };
#endif
unsigned int i = 0;
/* Single-iteration tests always run once; otherwise fall back to the
 * suite-wide iteration count when the runner does not specify one. */
if ((test->options & MUNIT_TEST_OPTION_SINGLE_ITERATION) == MUNIT_TEST_OPTION_SINGLE_ITERATION)
iterations = 1;
else if (iterations == 0)
iterations = runner->suite->iterations;
munit_rand_seed(runner->seed);
do {
/* Per-iteration fixture: setup() output becomes the test's user data. */
void* data = (test->setup == NULL) ? runner->user_data : test->setup(params, runner->user_data);
#if defined(MUNIT_ENABLE_TIMING)
psnip_clock_get_time(PSNIP_CLOCK_TYPE_WALL, &wall_clock_begin);
psnip_clock_get_time(PSNIP_CLOCK_TYPE_CPU, &cpu_clock_begin);
#endif
#if defined(MUNIT_THREAD_LOCAL) && defined(MUNIT_ALWAYS_TEAR_DOWN)
/* Arrange for tear_down() to run even when the test body longjmps out
 * on an assertion failure. */
if (test->tear_down != NULL) {
if (MUNIT_UNLIKELY(setjmp(munit_tear_down_jmp_buf) != 0)) {
test->tear_down(data);
longjmp(munit_error_jmp_buf, 1);
} else {
munit_tear_down_jmp_buf_valid = true;
}
}
#endif
result = test->test(params, data);
#if defined(MUNIT_ENABLE_TIMING)
psnip_clock_get_time(PSNIP_CLOCK_TYPE_WALL, &wall_clock_end);
psnip_clock_get_time(PSNIP_CLOCK_TYPE_CPU, &cpu_clock_end);
#endif
if (test->tear_down != NULL)
test->tear_down(data);
if (MUNIT_LIKELY(result == MUNIT_OK)) {
report->successful++;
#if defined(MUNIT_ENABLE_TIMING)
/* Only successful iterations contribute to the timing totals. */
report->wall_clock += munit_clock_get_elapsed(&wall_clock_begin, &wall_clock_end);
report->cpu_clock += munit_clock_get_elapsed(&cpu_clock_begin, &cpu_clock_end);
#endif
} else {
switch ((int) result) {
case MUNIT_SKIP:
report->skipped++;
break;
case MUNIT_FAIL:
report->failed++;
break;
case MUNIT_ERROR:
report->errored++;
break;
default:
break;
}
/* Any non-OK result stops further iterations. */
break;
}
} while (++i < iterations);
return result;
}
#if defined(MUNIT_EMOTICON)
# define MUNIT_RESULT_STRING_OK ":)"
# define MUNIT_RESULT_STRING_SKIP ":|"
# define MUNIT_RESULT_STRING_FAIL ":("
# define MUNIT_RESULT_STRING_ERROR ":o"
# define MUNIT_RESULT_STRING_TODO ":/"
#else
# define MUNIT_RESULT_STRING_OK "OK "
# define MUNIT_RESULT_STRING_SKIP "SKIP "
# define MUNIT_RESULT_STRING_FAIL "FAIL "
# define MUNIT_RESULT_STRING_ERROR "ERROR"
# define MUNIT_RESULT_STRING_TODO "TODO "
#endif
/* Write `string` to the output file, wrapped in an ANSI foreground
 * color escape (digit `color`) when colorized output is enabled. */
static void
munit_test_runner_print_color(const MunitTestRunner* runner, const char* string, char color) {
    if (!runner->colorize) {
        fputs(string, MUNIT_OUTPUT_FILE);
        return;
    }
    fprintf(MUNIT_OUTPUT_FILE, "\x1b[3%cm%s\x1b[39m", color, string);
}
#if !defined(MUNIT_NO_BUFFER)
/* Redirect the process's stderr into `stderr_buf` so test output can
 * be captured.  Returns a dup() of the original stderr fd, or -1 when
 * `stderr_buf` is NULL (nothing to restore).  Pass the return value to
 * munit_restore_stderr() to undo the redirection.  Exits the process
 * if the buffer has no usable file descriptor. */
static int
munit_replace_stderr(FILE* stderr_buf) {
if (stderr_buf != NULL) {
const int orig_stderr = dup(STDERR_FILENO);
int errfd = fileno(stderr_buf);
if (MUNIT_UNLIKELY(errfd == -1)) {
exit(EXIT_FAILURE);
}
dup2(errfd, STDERR_FILENO);
return orig_stderr;
}
return -1;
}
/* Undo munit_replace_stderr(): restore the saved stderr fd and close
 * the duplicate.  A value of -1 means stderr was never redirected. */
static void
munit_restore_stderr(int orig_stderr) {
    if (orig_stderr == -1)
        return;
    dup2(orig_stderr, STDERR_FILENO);
    close(orig_stderr);
}
#endif /* !defined(MUNIT_NO_BUFFER) */
/* Run a test with the specified parameters. */
/* Run one test with a fully-resolved parameter set: print the
 * parameter line, capture stderr into a temp file, execute the test
 * either in a forked child (report shipped back over a pipe) or
 * in-process, then print the result line and dump the captured stderr
 * on failure (or when --show-stderr is set). */
static void
munit_test_runner_run_test_with_params(MunitTestRunner* runner, const MunitTest* test, const MunitParameter params[]) {
MunitResult result = MUNIT_OK;
MunitReport report = {
0, 0, 0, 0,
#if defined(MUNIT_ENABLE_TIMING)
0, 0
#endif
};
unsigned int output_l;
bool first;
const MunitParameter* param;
FILE* stderr_buf;
#if !defined(MUNIT_NO_FORK)
int pipefd[2];
pid_t fork_pid;
ssize_t bytes_written = 0;
ssize_t write_res;
ssize_t bytes_read = 0;
ssize_t read_res;
int status = 0;
pid_t changed_pid;
#endif
/* Print "  name=value, ..." padded to the fixed column width so the
 * result markers line up. */
if (params != NULL) {
output_l = 2;
fputs("  ", MUNIT_OUTPUT_FILE);
first = true;
for (param = params ; param != NULL && param->name != NULL ; param++) {
if (!first) {
fputs(", ", MUNIT_OUTPUT_FILE);
output_l += 2;
} else {
first = false;
}
output_l += fprintf(MUNIT_OUTPUT_FILE, "%s=%s", param->name, param->value);
}
while (output_l++ < MUNIT_TEST_NAME_LEN) {
fputc(' ', MUNIT_OUTPUT_FILE);
}
}
fflush(MUNIT_OUTPUT_FILE);
/* Temp file that will receive everything the test writes to stderr. */
stderr_buf = NULL;
#if !defined(_WIN32) || defined(__MINGW32__)
stderr_buf = tmpfile();
#else
tmpfile_s(&stderr_buf);
#endif
if (stderr_buf == NULL) {
munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to create buffer for stderr");
result = MUNIT_ERROR;
goto print_result;
}
#if !defined(MUNIT_NO_FORK)
if (runner->fork) {
pipefd[0] = -1;
pipefd[1] = -1;
/* The child sends its MunitReport back through this pipe. */
if (pipe(pipefd) != 0) {
munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to create pipe");
result = MUNIT_ERROR;
goto print_result;
}
fork_pid = fork();
if (fork_pid == 0) {
/* Child: run the test, write the raw report struct to the pipe,
 * then exit. */
int orig_stderr;
close(pipefd[0]);
orig_stderr = munit_replace_stderr(stderr_buf);
munit_test_runner_exec(runner, test, params, &report);
/* Note that we don't restore stderr. This is so we can buffer
 * things written to stderr later on (such as by
 * asan/tsan/ubsan, valgrind, etc.) */
close(orig_stderr);
do {
write_res = write(pipefd[1], ((munit_uint8_t*) (&report)) + bytes_written, sizeof(report) - bytes_written);
if (write_res < 0) {
if (stderr_buf != NULL) {
munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to write to pipe");
}
exit(EXIT_FAILURE);
}
bytes_written += write_res;
} while ((size_t) bytes_written < sizeof(report));
if (stderr_buf != NULL)
fclose(stderr_buf);
close(pipefd[1]);
exit(EXIT_SUCCESS);
} else if (fork_pid == -1) {
close(pipefd[0]);
close(pipefd[1]);
if (stderr_buf != NULL) {
munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to fork");
}
report.errored++;
result = MUNIT_ERROR;
} else {
/* Parent: read the report back, then reap the child and classify
 * abnormal exits (short read, bad status, signal, stop). */
close(pipefd[1]);
do {
read_res = read(pipefd[0], ((munit_uint8_t*) (&report)) + bytes_read, sizeof(report) - bytes_read);
if (read_res < 1)
break;
bytes_read += read_res;
} while (bytes_read < (ssize_t) sizeof(report));
changed_pid = waitpid(fork_pid, &status, 0);
if (MUNIT_LIKELY(changed_pid == fork_pid) && MUNIT_LIKELY(WIFEXITED(status))) {
if (bytes_read != sizeof(report)) {
munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child exited unexpectedly with status %d", WEXITSTATUS(status));
report.errored++;
} else if (WEXITSTATUS(status) != EXIT_SUCCESS) {
munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child exited with status %d", WEXITSTATUS(status));
report.errored++;
}
} else {
if (WIFSIGNALED(status)) {
#if defined(_XOPEN_VERSION) && (_XOPEN_VERSION >= 700)
munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child killed by signal %d (%s)", WTERMSIG(status), strsignal(WTERMSIG(status)));
#else
munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child killed by signal %d", WTERMSIG(status));
#endif
} else if (WIFSTOPPED(status)) {
munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child stopped by signal %d", WSTOPSIG(status));
}
report.errored++;
}
close(pipefd[0]);
/* NOTE(review): the child was already reaped by the waitpid() above;
 * this second call presumably returns -1/ECHILD -- confirm before
 * removing. */
waitpid(fork_pid, NULL, 0);
}
} else
#endif
{
/* Non-forking path: run in-process, with setjmp-based assertion
 * recovery when thread-local storage is available. */
#if !defined(MUNIT_NO_BUFFER)
const volatile int orig_stderr = munit_replace_stderr(stderr_buf);
#endif
#if defined(MUNIT_THREAD_LOCAL)
if (MUNIT_UNLIKELY(setjmp(munit_error_jmp_buf) != 0)) {
result = MUNIT_FAIL;
report.failed++;
} else {
munit_error_jmp_buf_valid = true;
result = munit_test_runner_exec(runner, test, params, &report);
}
#else
result = munit_test_runner_exec(runner, test, params, &report);
#endif
#if !defined(MUNIT_NO_BUFFER)
munit_restore_stderr(orig_stderr);
#endif
/* Here just so that the label is used on Windows and we don't get
 * a warning */
goto print_result;
}
print_result:
/* Classify the aggregated report into a single displayed result and
 * fold it into the runner-wide totals.  TODO tests invert success. */
fputs("[ ", MUNIT_OUTPUT_FILE);
if ((test->options & MUNIT_TEST_OPTION_TODO) == MUNIT_TEST_OPTION_TODO) {
if (report.failed != 0 || report.errored != 0 || report.skipped != 0) {
munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_TODO, '3');
result = MUNIT_OK;
} else {
munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_ERROR, '1');
if (MUNIT_LIKELY(stderr_buf != NULL))
munit_log_internal(MUNIT_LOG_ERROR, stderr_buf, "Test marked TODO, but was successful.");
runner->report.failed++;
result = MUNIT_ERROR;
}
} else if (report.failed > 0) {
munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_FAIL, '1');
runner->report.failed++;
result = MUNIT_FAIL;
} else if (report.errored > 0) {
munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_ERROR, '1');
runner->report.errored++;
result = MUNIT_ERROR;
} else if (report.skipped > 0) {
munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_SKIP, '3');
runner->report.skipped++;
result = MUNIT_SKIP;
} else if (report.successful > 1) {
/* Multiple successful iterations: print per-iteration averages and
 * then the totals. */
munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_OK, '2');
#if defined(MUNIT_ENABLE_TIMING)
fputs(" ] [ ", MUNIT_OUTPUT_FILE);
munit_print_time(MUNIT_OUTPUT_FILE, report.wall_clock / report.successful);
fputs(" / ", MUNIT_OUTPUT_FILE);
munit_print_time(MUNIT_OUTPUT_FILE, report.cpu_clock / report.successful);
fprintf(MUNIT_OUTPUT_FILE, " CPU ]\n  %-" MUNIT_XSTRINGIFY(MUNIT_TEST_NAME_LEN) "s Total: [ ", "");
munit_print_time(MUNIT_OUTPUT_FILE, report.wall_clock);
fputs(" / ", MUNIT_OUTPUT_FILE);
munit_print_time(MUNIT_OUTPUT_FILE, report.cpu_clock);
fputs(" CPU", MUNIT_OUTPUT_FILE);
#endif
runner->report.successful++;
result = MUNIT_OK;
} else if (report.successful > 0) {
munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_OK, '2');
#if defined(MUNIT_ENABLE_TIMING)
fputs(" ] [ ", MUNIT_OUTPUT_FILE);
munit_print_time(MUNIT_OUTPUT_FILE, report.wall_clock);
fputs(" / ", MUNIT_OUTPUT_FILE);
munit_print_time(MUNIT_OUTPUT_FILE, report.cpu_clock);
fputs(" CPU", MUNIT_OUTPUT_FILE);
#endif
runner->report.successful++;
result = MUNIT_OK;
}
fputs(" ]\n", MUNIT_OUTPUT_FILE);
/* Replay captured stderr when the test did not pass (or when the user
 * asked for it unconditionally). */
if (stderr_buf != NULL) {
if (result == MUNIT_FAIL || result == MUNIT_ERROR || runner->show_stderr) {
fflush(MUNIT_OUTPUT_FILE);
rewind(stderr_buf);
munit_splice(fileno(stderr_buf), STDERR_FILENO);
fflush(stderr);
}
fclose(stderr_buf);
}
}
/* Recursively expand the wildcard parameters starting at entry `p` of
 * `params`, running the test once for every combination of values.
 * `params` is the full parameter array being filled in; each recursion
 * level assigns one parameter and recurses into the next until the
 * NULL terminator is reached, at which point the test runs. */
static void
munit_test_runner_run_test_wild(MunitTestRunner* runner,
const MunitTest* test,
const char* test_name,
MunitParameter* params,
MunitParameter* p) {
const MunitParameterEnum* pe;
char** values;
MunitParameter* next;
/* Find the enum listing the possible values for this parameter.  This
 * compares name POINTERS, not string contents: wildcard entries are
 * created from pe->name itself in munit_test_runner_run_test(). */
for (pe = test->parameters ; pe != NULL && pe->name != NULL ; pe++) {
if (p->name == pe->name)
break;
}
/* NOTE(review): when the name is not found the loop leaves pe at the
 * terminator (pe->name == NULL), not NULL, so this guard never fires;
 * callers only pass names taken from test->parameters, so the lookup
 * presumably always succeeds -- confirm before relying on it. */
if (pe == NULL)
return;
for (values = pe->values ; *values != NULL ; values++) {
next = p + 1;
p->value = *values;
/* Terminator reached: every wildcard is bound, run the test. */
if (next->name == NULL) {
munit_test_runner_run_test_with_params(runner, test, params);
} else {
munit_test_runner_run_test_wild(runner, test, test_name, params, next);
}
if (runner->fatal_failures && (runner->report.failed != 0 || runner->report.errored != 0))
break;
}
}
/* Run a single test, with every combination of parameters
 * requested.
 *
 * Parameters with a value fixed on the CLI are run with that value;
 * parameters left open ("wildcards") are either expanded into every
 * combination (via munit_test_runner_run_test_wild) or, with --single,
 * resolved to one pseudo-random value. */
static void
munit_test_runner_run_test(MunitTestRunner* runner,
                           const MunitTest* test,
                           const char* prefix) {
  char* test_name = munit_maybe_concat(NULL, (char*) prefix, (char*) test->name);
  /* The array of parameters to pass to
   * munit_test_runner_run_test_with_params */
  MunitParameter* params = NULL;
  size_t params_l = 0;
  /* Wildcard parameters are parameters which have possible values
   * specified in the test, but no specific value was passed to the
   * CLI. That means we want to run the test once for every
   * possible combination of parameter values or, if --single was
   * passed to the CLI, a single time with a random set of
   * parameters. */
  MunitParameter* wild_params = NULL;
  size_t wild_params_l = 0;
  const MunitParameterEnum* pe;
  const MunitParameter* cli_p;
  bool filled;
  unsigned int possible;
  char** vals;
  size_t first_wild;
  const MunitParameter* wp;
  int pidx;

  /* Re-seed for every test so each test sees a reproducible stream. */
  munit_rand_seed(runner->seed);
  fprintf(MUNIT_OUTPUT_FILE, "%-" MUNIT_XSTRINGIFY(MUNIT_TEST_NAME_LEN) "s", test_name);

  if (test->parameters == NULL) {
    /* No parameters. Simple, nice. */
    munit_test_runner_run_test_with_params(runner, test, NULL);
  } else {
    fputc('\n', MUNIT_OUTPUT_FILE);

    for (pe = test->parameters ; pe != NULL && pe->name != NULL ; pe++) {
      /* Did we received a value for this parameter from the CLI? */
      filled = false;
      for (cli_p = runner->parameters ; cli_p != NULL && cli_p->name != NULL ; cli_p++) {
        if (strcmp(cli_p->name, pe->name) == 0) {
          if (MUNIT_UNLIKELY(munit_parameters_add(&params_l, &params, pe->name, cli_p->value) != MUNIT_OK))
            goto cleanup;
          filled = true;
          break;
        }
      }
      if (filled)
        continue;

      /* Nothing from CLI, is the enum NULL/empty? We're not a
       * fuzzer… */
      if (pe->values == NULL || pe->values[0] == NULL)
        continue;

      /* If --single was passed to the CLI, choose a value from the
       * list of possibilities randomly. */
      if (runner->single_parameter_mode) {
        possible = 0;
        for (vals = pe->values ; *vals != NULL ; vals++)
          possible++;
        /* We want the tests to be reproducible, even if you're only
         * running a single test, but we don't want every test with
         * the same number of parameters to choose the same parameter
         * number, so use the test name as a primitive salt. */
        pidx = munit_rand_at_most(munit_str_hash(test_name), possible - 1);
        if (MUNIT_UNLIKELY(munit_parameters_add(&params_l, &params, pe->name, pe->values[pidx]) != MUNIT_OK))
          goto cleanup;
      } else {
        /* We want to try every permutation. Put in a placeholder
         * entry, we'll iterate through them later. */
        if (MUNIT_UNLIKELY(munit_parameters_add(&wild_params_l, &wild_params, pe->name, NULL) != MUNIT_OK))
          goto cleanup;
      }
    }

    if (wild_params_l != 0) {
      /* Seed each wildcard slot in `params` with its first possible
       * value; munit_test_runner_run_test_wild then iterates through
       * all combinations starting at index `first_wild`. */
      first_wild = params_l;
      for (wp = wild_params ; wp != NULL && wp->name != NULL ; wp++) {
        for (pe = test->parameters ; pe != NULL && pe->name != NULL && pe->values != NULL ; pe++) {
          if (strcmp(wp->name, pe->name) == 0) {
            if (MUNIT_UNLIKELY(munit_parameters_add(&params_l, &params, pe->name, pe->values[0]) != MUNIT_OK))
              goto cleanup;
          }
        }
      }
      munit_test_runner_run_test_wild(runner, test, test_name, params, params + first_wild);
    } else {
      munit_test_runner_run_test_with_params(runner, test, params);
    }

  cleanup:
    free(params);
    free(wild_params);
  }

  munit_maybe_free_concat(test_name, prefix, test->name);
}
/* Recurse through the suite and run all the tests. If a list of
 * tests to run was provided on the command line, run only those
 * tests. */
static void
munit_test_runner_run_suite(MunitTestRunner* runner,
                            const MunitSuite* suite,
                            const char* prefix) {
  size_t pre_l;
  /* Full prefix for this suite: parent prefix + this suite's prefix. */
  char* pre = munit_maybe_concat(&pre_l, (char*) prefix, (char*) suite->prefix);
  const MunitTest* test;
  const char** test_name;
  const MunitSuite* child_suite;

  /* Run the tests. */
  for (test = suite->tests ; test != NULL && test->test != NULL ; test++) {
    if (runner->tests != NULL) { /* Specific tests were requested on the CLI */
      for (test_name = runner->tests ; test_name != NULL && *test_name != NULL ; test_name++) {
        /* A requested name matches when it starts with this suite's
         * prefix and the remainder is a prefix of the test's name. */
        if ((pre_l == 0 || strncmp(pre, *test_name, pre_l) == 0) &&
            strncmp(test->name, *test_name + pre_l, strlen(*test_name + pre_l)) == 0) {
          munit_test_runner_run_test(runner, test, pre);
          /* --fatal-failures: bail out on the first failure or error. */
          if (runner->fatal_failures && (runner->report.failed != 0 || runner->report.errored != 0))
            goto cleanup;
        }
      }
    } else { /* Run all tests */
      munit_test_runner_run_test(runner, test, pre);
    }
  }

  if (runner->fatal_failures && (runner->report.failed != 0 || runner->report.errored != 0))
    goto cleanup;

  /* Run any child suites. */
  for (child_suite = suite->suites ; child_suite != NULL && child_suite->prefix != NULL ; child_suite++) {
    munit_test_runner_run_suite(runner, child_suite, pre);
  }

 cleanup:
  munit_maybe_free_concat(pre, prefix, suite->prefix);
}
/* Run the whole suite tree held by the runner, starting with no name
 * prefix. */
static void
munit_test_runner_run(MunitTestRunner* runner) {
  munit_test_runner_run_suite(runner, runner->suite, NULL);
}
/* Print the --help text: usage line, the built-in options, version
 * information and the help of any custom arguments. */
static void
munit_print_help(int argc, char* const argv[MUNIT_ARRAY_PARAM(argc + 1)], void* user_data, const MunitArgument arguments[]) {
  const MunitArgument* arg;
  (void) argc;
  printf("USAGE: %s [OPTIONS...] [TEST...]\n\n", argv[0]);
  puts(" --seed SEED\n"
       " Value used to seed the PRNG. Must be a 32-bit integer in decimal\n"
       " notation with no separators (commas, decimals, spaces, etc.), or\n"
       " hexadecimal prefixed by \"0x\".\n"
       " --iterations N\n"
       " Run each test N times. 0 means the default number.\n"
       " --param name value\n"
       " A parameter key/value pair which will be passed to any test with\n"
       " takes a parameter of that name. If not provided, the test will be\n"
       " run once for each possible parameter value.\n"
       " --list Write a list of all available tests.\n"
       " --list-params\n"
       " Write a list of all available tests and their possible parameters.\n"
       " --single Run each parameterized test in a single configuration instead of\n"
       " every possible combination\n"
       " --log-visible debug|info|warning|error\n"
       " --log-fatal debug|info|warning|error\n"
       " Set the level at which messages of different severities are visible,\n"
       " or cause the test to terminate.\n"
#if !defined(MUNIT_NO_FORK)
       " --no-fork Do not execute tests in a child process. If this option is supplied\n"
       " and a test crashes (including by failing an assertion), no further\n"
       " tests will be performed.\n"
#endif
       " --fatal-failures\n"
       " Stop executing tests as soon as a failure is found.\n"
       " --show-stderr\n"
       " Show data written to stderr by the tests, even if the test succeeds.\n"
       " --color auto|always|never\n"
       " Colorize (or don't) the output.\n"
       /* 12345678901234567890123456789012345678901234567890123456789012345678901234567890 */
       " --help Print this help message and exit.\n");
  /* Print the program name: "µnit" when the locale's codeset can
   * represent it, plain "munit" otherwise (or when nl_langinfo is not
   * available). */
#if defined(MUNIT_NL_LANGINFO)
  setlocale(LC_ALL, "");
  fputs((strcasecmp("UTF-8", nl_langinfo(CODESET)) == 0) ? "µnit" : "munit", stdout);
#else
  puts("munit");
#endif
  /* Version is packed as (major << 16) | (minor << 8) | revision. */
  printf(" %d.%d.%d\n"
         "Full documentation at: https://nemequ.github.io/munit/\n",
         (MUNIT_CURRENT_VERSION >> 16) & 0xff,
         (MUNIT_CURRENT_VERSION >> 8) & 0xff,
         (MUNIT_CURRENT_VERSION >> 0) & 0xff);
  /* Let each custom argument print its own help. */
  for (arg = arguments ; arg != NULL && arg->name != NULL ; arg++)
    arg->write_help(arg, user_data);
}
/* Linear search of the NULL-terminated `arguments` array for an entry
 * whose name matches `name`; returns NULL when there is no match (or
 * when `arguments` itself is NULL). */
static const MunitArgument*
munit_arguments_find(const MunitArgument arguments[], const char* name) {
  const MunitArgument* cursor = arguments;

  while (cursor != NULL && cursor->name != NULL) {
    if (strcmp(cursor->name, name) == 0)
      return cursor;
    cursor++;
  }

  return NULL;
}
/* Print the full name of every test in `suite`, recursing into child
 * suites, with each name prefixed by the accumulated suite prefix.
 * When `show_params` is true, also list each test's parameters and
 * their possible values. */
static void
munit_suite_list_tests(const MunitSuite* suite, bool show_params, const char* prefix) {
  size_t pre_l;
  /* Accumulated prefix: parent prefix + this suite's prefix. */
  char* pre = munit_maybe_concat(&pre_l, (char*) prefix, (char*) suite->prefix);
  const MunitTest* test;
  const MunitParameterEnum* params;
  bool first;
  char** val;
  const MunitSuite* child_suite;

  for (test = suite->tests ;
       test != NULL && test->name != NULL ;
       test++) {
    if (pre != NULL)
      fputs(pre, stdout);
    puts(test->name);

    if (show_params) {
      for (params = test->parameters ;
           params != NULL && params->name != NULL ;
           params++) {
        fprintf(stdout, " - %s: ", params->name);
        if (params->values == NULL) {
          /* No value list means the parameter accepts any value. */
          puts("Any");
        } else {
          /* Print the possible values as a comma-separated list. */
          first = true;
          for (val = params->values ;
               *val != NULL ;
               val++ ) {
            if(!first) {
              fputs(", ", stdout);
            } else {
              first = false;
            }
            fputs(*val, stdout);
          }
          putc('\n', stdout);
        }
      }
    }
  }

  /* Recurse into child suites with the accumulated prefix. */
  for (child_suite = suite->suites ; child_suite != NULL && child_suite->prefix != NULL ; child_suite++) {
    munit_suite_list_tests(child_suite, show_params, pre);
  }

  munit_maybe_free_concat(pre, prefix, suite->prefix);
}
/* Return true when `stream` is attached to a terminal that can be
 * expected to understand ANSI color escape sequences. */
static bool
munit_stream_supports_ansi(FILE *stream) {
#if !defined(_WIN32)
  /* On POSIX any tty is assumed to support ANSI escapes. */
  return isatty(fileno(stream));
#else

#if !defined(__MINGW32__)
  size_t ansicon_size = 0;
#endif

  /* On Windows, only claim ANSI support for a tty when the ANSICON
   * environment variable is set (i.e. running under ANSICON). */
  if (isatty(fileno(stream))) {
#if !defined(__MINGW32__)
    getenv_s(&ansicon_size, NULL, 0, "ANSICON");
    return ansicon_size != 0;
#else
    return getenv("ANSICON") != NULL;
#endif
  }
  return false;
#endif
}
int
munit_suite_main_custom(const MunitSuite* suite, void* user_data,
int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)],
const MunitArgument arguments[]) {
int result = EXIT_FAILURE;
MunitTestRunner runner;
size_t parameters_size = 0;
size_t tests_size = 0;
int arg;
char* envptr;
unsigned long ts;
char* endptr;
unsigned long long iterations;
MunitLogLevel level;
const MunitArgument* argument;
const char** runner_tests;
unsigned int tests_run;
unsigned int tests_total;
runner.prefix = NULL;
runner.suite = NULL;
runner.tests = NULL;
runner.seed = 0;
runner.iterations = 0;
runner.parameters = NULL;
runner.single_parameter_mode = false;
runner.user_data = NULL;
runner.report.successful = 0;
runner.report.skipped = 0;
runner.report.failed = 0;
runner.report.errored = 0;
#if defined(MUNIT_ENABLE_TIMING)
runner.report.cpu_clock = 0;
runner.report.wall_clock = 0;
#endif
runner.colorize = false;
#if !defined(_WIN32)
runner.fork = true;
#else
runner.fork = false;
#endif
runner.show_stderr = false;
runner.fatal_failures = false;
runner.suite = suite;
runner.user_data = user_data;
runner.seed = munit_rand_generate_seed();
runner.colorize = munit_stream_supports_ansi(MUNIT_OUTPUT_FILE);
for (arg = 1 ; arg < argc ; arg++) {
if (strncmp("--", argv[arg], 2) == 0) {
if (strcmp("seed", argv[arg] + 2) == 0) {
if (arg + 1 >= argc) {
munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires an argument", argv[arg]);
goto cleanup;
}
envptr = argv[arg + 1];
ts = strtoul(argv[arg + 1], &envptr, 0);
if (*envptr != '\0' || ts > (~((munit_uint32_t) 0U))) {
munit_logf_internal(MUNIT_LOG_ERROR, stderr, "invalid value ('%s') passed to %s", argv[arg + 1], argv[arg]);
goto cleanup;
}
runner.seed = (munit_uint32_t) ts;
arg++;
} else if (strcmp("iterations", argv[arg] + 2) == 0) {
if (arg + 1 >= argc) {
munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires an argument", argv[arg]);
goto cleanup;
}
endptr = argv[arg + 1];
iterations = strtoul(argv[arg + 1], &endptr, 0);
if (*endptr != '\0' || iterations > UINT_MAX) {
munit_logf_internal(MUNIT_LOG_ERROR, stderr, "invalid value ('%s') passed to %s", argv[arg + 1], argv[arg]);
goto cleanup;
}
runner.iterations = (unsigned int) iterations;
arg++;
} else if (strcmp("param", argv[arg] + 2) == 0) {
if (arg + 2 >= argc) {
munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires two arguments", argv[arg]);
goto cleanup;
}
runner.parameters = realloc(runner.parameters, sizeof(MunitParameter) * (parameters_size + 2));
if (runner.parameters == NULL) {
munit_log_internal(MUNIT_LOG_ERROR, stderr, "failed to allocate memory");
goto cleanup;
}
runner.parameters[parameters_size].name = (char*) argv[arg + 1];
runner.parameters[parameters_size].value = (char*) argv[arg + 2];
parameters_size++;
runner.parameters[parameters_size].name = NULL;
runner.parameters[parameters_size].value = NULL;
arg += 2;
} else if (strcmp("color", argv[arg] + 2) == 0) {
if (arg + 1 >= argc) {
munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires an argument", argv[arg]);
goto cleanup;
}
if (strcmp(argv[arg + 1], "always") == 0)
runner.colorize = true;
else if (strcmp(argv[arg + 1], "never") == 0)
runner.colorize = false;
else if (strcmp(argv[arg + 1], "auto") == 0)
runner.colorize = munit_stream_supports_ansi(MUNIT_OUTPUT_FILE);
else {
munit_logf_internal(MUNIT_LOG_ERROR, stderr, "invalid value ('%s') passed to %s", argv[arg + 1], argv[arg]);
goto cleanup;
}
arg++;
} else if (strcmp("help", argv[arg] + 2) == 0) {
munit_print_help(argc, argv, user_data, arguments);
result = EXIT_SUCCESS;
goto cleanup;
} else if (strcmp("single", argv[arg] + 2) == 0) {
runner.single_parameter_mode = true;
} else if (strcmp("show-stderr", argv[arg] + 2) == 0) {
runner.show_stderr = true;
#if !defined(_WIN32)
} else if (strcmp("no-fork", argv[arg] + 2) == 0) {
runner.fork = false;
#endif
} else if (strcmp("fatal-failures", argv[arg] + 2) == 0) {
runner.fatal_failures = true;
} else if (strcmp("log-visible", argv[arg] + 2) == 0 ||
strcmp("log-fatal", argv[arg] + 2) == 0) {
if (arg + 1 >= argc) {
munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires an argument", argv[arg]);
goto cleanup;
}
if (strcmp(argv[arg + 1], "debug") == 0)
level = MUNIT_LOG_DEBUG;
else if (strcmp(argv[arg + 1], "info") == 0)
level = MUNIT_LOG_INFO;
else if (strcmp(argv[arg + 1], "warning") == 0)
level = MUNIT_LOG_WARNING;
else if (strcmp(argv[arg + 1], "error") == 0)
level = MUNIT_LOG_ERROR;
else {
munit_logf_internal(MUNIT_LOG_ERROR, stderr, "invalid value ('%s') passed to %s", argv[arg + 1], argv[arg]);
goto cleanup;
}
if (strcmp("log-visible", argv[arg] + 2) == 0)
munit_log_level_visible = level;
else
munit_log_level_fatal = level;
arg++;
} else if (strcmp("list", argv[arg] + 2) == 0) {
munit_suite_list_tests(suite, false, NULL);
result = EXIT_SUCCESS;
goto cleanup;
} else if (strcmp("list-params", argv[arg] + 2) == 0) {
munit_suite_list_tests(suite, true, NULL);
result = EXIT_SUCCESS;
goto cleanup;
} else {
argument = munit_arguments_find(arguments, argv[arg] + 2);
if (argument == NULL) {
munit_logf_internal(MUNIT_LOG_ERROR, stderr, "unknown argument ('%s')", argv[arg]);
goto cleanup;
}
if (!argument->parse_argument(suite, user_data, &arg, argc, argv))
goto cleanup;
}
} else {
runner_tests = realloc((void*) runner.tests, sizeof(char*) * (tests_size + 2));
if (runner_tests == NULL) {
munit_log_internal(MUNIT_LOG_ERROR, stderr, "failed to allocate memory");
goto cleanup;
}
runner.tests = runner_tests;
runner.tests[tests_size++] = argv[arg];
runner.tests[tests_size] = NULL;
}
}
fflush(stderr);
fprintf(MUNIT_OUTPUT_FILE, "Running test suite with seed 0x%08" PRIx32 "...\n", runner.seed);
munit_test_runner_run(&runner);
tests_run = runner.report.successful + runner.report.failed + runner.report.errored;
tests_total = tests_run + runner.report.skipped;
if (tests_run == 0) {
fprintf(stderr, "No tests run, %d (100%%) skipped.\n", runner.report.skipped);
} else {
fprintf(MUNIT_OUTPUT_FILE, "%d of %d (%0.0f%%) tests successful, %d (%0.0f%%) test skipped.\n",
runner.report.successful, tests_run,
(((double) runner.report.successful) / ((double) tests_run)) * 100.0,
runner.report.skipped,
(((double) runner.report.skipped) / ((double) tests_total)) * 100.0);
}
if (runner.report.failed == 0 && runner.report.errored == 0) {
result = EXIT_SUCCESS;
}
cleanup:
free(runner.parameters);
free((void*) runner.tests);
return result;
}
/* Convenience wrapper around munit_suite_main_custom() for the common
 * case of no custom command-line arguments. */
int
munit_suite_main(const MunitSuite* suite, void* user_data,
                 int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)]) {
  return munit_suite_main_custom(suite, user_data, argc, argv, NULL);
}
raft-0.22.1/test/lib/munit.h 0000664 0000000 0000000 00000042213 14601504142 0015561 0 ustar 00root root 0000000 0000000 /* µnit Testing Framework
* Copyright (c) 2013-2017 Evan Nemerson
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#if !defined(MUNIT_H)
#define MUNIT_H
#include <stdarg.h> /* va_list for the logging/error variadic functions */
#include <stdlib.h> /* size_t, malloc/free used by the allocation helpers */
#define MUNIT_VERSION(major, minor, revision) \
(((major) << 16) | ((minor) << 8) | (revision))
#define MUNIT_CURRENT_VERSION MUNIT_VERSION(0, 4, 1)
#if defined(_MSC_VER) && (_MSC_VER < 1600)
# define munit_int8_t __int8
# define munit_uint8_t unsigned __int8
# define munit_int16_t __int16
# define munit_uint16_t unsigned __int16
# define munit_int32_t __int32
# define munit_uint32_t unsigned __int32
# define munit_int64_t __int64
# define munit_uint64_t unsigned __int64
#else
# include <stdint.h> /* int8_t..uint64_t used by the munit_*_t aliases below */
# define munit_int8_t int8_t
# define munit_uint8_t uint8_t
# define munit_int16_t int16_t
# define munit_uint16_t uint16_t
# define munit_int32_t int32_t
# define munit_uint32_t uint32_t
# define munit_int64_t int64_t
# define munit_uint64_t uint64_t
#endif
#if defined(_MSC_VER) && (_MSC_VER < 1800)
# if !defined(PRIi8)
# define PRIi8 "i"
# endif
# if !defined(PRIi16)
# define PRIi16 "i"
# endif
# if !defined(PRIi32)
# define PRIi32 "i"
# endif
# if !defined(PRIi64)
# define PRIi64 "I64i"
# endif
# if !defined(PRId8)
# define PRId8 "d"
# endif
# if !defined(PRId16)
# define PRId16 "d"
# endif
# if !defined(PRId32)
# define PRId32 "d"
# endif
# if !defined(PRId64)
# define PRId64 "I64d"
# endif
# if !defined(PRIx8)
# define PRIx8 "x"
# endif
# if !defined(PRIx16)
# define PRIx16 "x"
# endif
# if !defined(PRIx32)
# define PRIx32 "x"
# endif
# if !defined(PRIx64)
# define PRIx64 "I64x"
# endif
# if !defined(PRIu8)
# define PRIu8 "u"
# endif
# if !defined(PRIu16)
# define PRIu16 "u"
# endif
# if !defined(PRIu32)
# define PRIu32 "u"
# endif
# if !defined(PRIu64)
# define PRIu64 "I64u"
# endif
# if !defined(bool)
# define bool int
# endif
# if !defined(true)
# define true (!0)
# endif
# if !defined(false)
# define false (!!0)
# endif
#else
  /* Non-ancient compilers: get bool and the PRI* format macros from the
   * standard headers instead of the MSVC fallbacks above. */
# include <stdbool.h>
# include <inttypes.h>
#endif
#if defined(__cplusplus)
extern "C" {
#endif
#if defined(__GNUC__)
# define MUNIT_LIKELY(expr) (__builtin_expect ((expr), 1))
# define MUNIT_UNLIKELY(expr) (__builtin_expect ((expr), 0))
# define MUNIT_UNUSED __attribute__((__unused__))
#else
# define MUNIT_LIKELY(expr) (expr)
# define MUNIT_UNLIKELY(expr) (expr)
# define MUNIT_UNUSED
#endif
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && !defined(__PGI)
# define MUNIT_ARRAY_PARAM(name) name
#else
# define MUNIT_ARRAY_PARAM(name)
#endif
#if !defined(_WIN32)
# define MUNIT_SIZE_MODIFIER "z"
# define MUNIT_CHAR_MODIFIER "hh"
# define MUNIT_SHORT_MODIFIER "h"
#else
# if defined(_M_X64) || defined(__amd64__)
# define MUNIT_SIZE_MODIFIER "I64"
# else
# define MUNIT_SIZE_MODIFIER ""
# endif
# define MUNIT_CHAR_MODIFIER ""
# define MUNIT_SHORT_MODIFIER ""
#endif
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
# define MUNIT_NO_RETURN _Noreturn
#elif defined(__GNUC__)
# define MUNIT_NO_RETURN __attribute__((__noreturn__))
#elif defined(_MSC_VER)
# define MUNIT_NO_RETURN __declspec(noreturn)
#else
# define MUNIT_NO_RETURN
#endif
#if defined(_MSC_VER) && (_MSC_VER >= 1500)
# define MUNIT__PUSH_DISABLE_MSVC_C4127 __pragma(warning(push)) __pragma(warning(disable:4127))
# define MUNIT__POP_DISABLE_MSVC_C4127 __pragma(warning(pop))
#else
# define MUNIT__PUSH_DISABLE_MSVC_C4127
# define MUNIT__POP_DISABLE_MSVC_C4127
#endif
typedef enum {
MUNIT_LOG_DEBUG,
MUNIT_LOG_INFO,
MUNIT_LOG_WARNING,
MUNIT_LOG_ERROR
} MunitLogLevel;
#if defined(__GNUC__) && !defined(__MINGW32__)
# define MUNIT_PRINTF(string_index, first_to_check) __attribute__((format (printf, string_index, first_to_check)))
#else
# define MUNIT_PRINTF(string_index, first_to_check)
#endif
MUNIT_PRINTF(4, 5)
void munit_logf_ex(MunitLogLevel level, const char* filename, int line, const char* format, ...);
#define munit_logf(level, format, ...) \
munit_logf_ex(level, __FILE__, __LINE__, format, __VA_ARGS__)
#define munit_log(level, msg) \
munit_logf(level, "%s", msg)
MUNIT_NO_RETURN
MUNIT_PRINTF(3, 4)
void munit_errorf_ex(const char* filename, int line, const char* format, ...);
#define munit_errorf(format, ...) \
munit_errorf_ex(__FILE__, __LINE__, format, __VA_ARGS__)
#define munit_error(msg) \
munit_errorf("%s", msg)
#define munit_assert(expr) \
do { \
if (!MUNIT_LIKELY(expr)) { \
munit_error("assertion failed: " #expr); \
} \
MUNIT__PUSH_DISABLE_MSVC_C4127 \
} while (0) \
MUNIT__POP_DISABLE_MSVC_C4127
#define munit_assert_true(expr) \
do { \
if (!MUNIT_LIKELY(expr)) { \
munit_error("assertion failed: " #expr " is not true"); \
} \
MUNIT__PUSH_DISABLE_MSVC_C4127 \
} while (0) \
MUNIT__POP_DISABLE_MSVC_C4127
#define munit_assert_false(expr) \
do { \
if (!MUNIT_LIKELY(!(expr))) { \
munit_error("assertion failed: " #expr " is not false"); \
} \
MUNIT__PUSH_DISABLE_MSVC_C4127 \
} while (0) \
MUNIT__POP_DISABLE_MSVC_C4127
#define munit_assert_type_full(prefix, suffix, T, fmt, a, op, b) \
do { \
T munit_tmp_a_ = (a); \
T munit_tmp_b_ = (b); \
if (!(munit_tmp_a_ op munit_tmp_b_)) { \
munit_errorf("assertion failed: %s %s %s (" prefix "%" fmt suffix " %s " prefix "%" fmt suffix ")", \
#a, #op, #b, munit_tmp_a_, #op, munit_tmp_b_); \
} \
MUNIT__PUSH_DISABLE_MSVC_C4127 \
} while (0) \
MUNIT__POP_DISABLE_MSVC_C4127
#define munit_assert_type(T, fmt, a, op, b) \
munit_assert_type_full("", "", T, fmt, a, op, b)
#define munit_assert_char(a, op, b) \
munit_assert_type_full("'\\x", "'", char, "02" MUNIT_CHAR_MODIFIER "x", a, op, b)
#define munit_assert_uchar(a, op, b) \
munit_assert_type_full("'\\x", "'", unsigned char, "02" MUNIT_CHAR_MODIFIER "x", a, op, b)
#define munit_assert_short(a, op, b) \
munit_assert_type(short, MUNIT_SHORT_MODIFIER "d", a, op, b)
#define munit_assert_ushort(a, op, b) \
munit_assert_type(unsigned short, MUNIT_SHORT_MODIFIER "u", a, op, b)
#define munit_assert_int(a, op, b) \
munit_assert_type(int, "d", a, op, b)
#define munit_assert_uint(a, op, b) \
munit_assert_type(unsigned int, "u", a, op, b)
#define munit_assert_long(a, op, b) \
munit_assert_type(long int, "ld", a, op, b)
#define munit_assert_ulong(a, op, b) \
munit_assert_type(unsigned long int, "lu", a, op, b)
#define munit_assert_llong(a, op, b) \
munit_assert_type(long long int, "lld", a, op, b)
#define munit_assert_ullong(a, op, b) \
munit_assert_type(unsigned long long int, "llu", a, op, b)
#define munit_assert_size(a, op, b) \
munit_assert_type(size_t, MUNIT_SIZE_MODIFIER "u", a, op, b)
#define munit_assert_float(a, op, b) \
munit_assert_type(float, "f", a, op, b)
#define munit_assert_double(a, op, b) \
munit_assert_type(double, "g", a, op, b)
#define munit_assert_ptr(a, op, b) \
munit_assert_type(const void*, "p", a, op, b)
#define munit_assert_int8(a, op, b) \
munit_assert_type(munit_int8_t, PRIi8, a, op, b)
#define munit_assert_uint8(a, op, b) \
munit_assert_type(munit_uint8_t, PRIu8, a, op, b)
#define munit_assert_int16(a, op, b) \
munit_assert_type(munit_int16_t, PRIi16, a, op, b)
#define munit_assert_uint16(a, op, b) \
munit_assert_type(munit_uint16_t, PRIu16, a, op, b)
#define munit_assert_int32(a, op, b) \
munit_assert_type(munit_int32_t, PRIi32, a, op, b)
#define munit_assert_uint32(a, op, b) \
munit_assert_type(munit_uint32_t, PRIu32, a, op, b)
#define munit_assert_int64(a, op, b) \
munit_assert_type(munit_int64_t, PRIi64, a, op, b)
#define munit_assert_uint64(a, op, b) \
munit_assert_type(munit_uint64_t, PRIu64, a, op, b)
#define munit_assert_double_equal(a, b, precision) \
do { \
const double munit_tmp_a_ = (a); \
const double munit_tmp_b_ = (b); \
const double munit_tmp_diff_ = ((munit_tmp_a_ - munit_tmp_b_) < 0) ? \
-(munit_tmp_a_ - munit_tmp_b_) : \
(munit_tmp_a_ - munit_tmp_b_); \
if (MUNIT_UNLIKELY(munit_tmp_diff_ > 1e-##precision)) { \
munit_errorf("assertion failed: %s == %s (%0." #precision "g == %0." #precision "g)", \
#a, #b, munit_tmp_a_, munit_tmp_b_); \
} \
MUNIT__PUSH_DISABLE_MSVC_C4127 \
} while (0) \
MUNIT__POP_DISABLE_MSVC_C4127
#include <string.h> /* strcmp/memcmp used by the string/memory assertions below */
#define munit_assert_string_equal(a, b) \
do { \
const char* munit_tmp_a_ = a; \
const char* munit_tmp_b_ = b; \
if (MUNIT_UNLIKELY(strcmp(munit_tmp_a_, munit_tmp_b_) != 0)) { \
munit_errorf("assertion failed: string %s == %s (\"%s\" == \"%s\")", \
#a, #b, munit_tmp_a_, munit_tmp_b_); \
} \
MUNIT__PUSH_DISABLE_MSVC_C4127 \
} while (0) \
MUNIT__POP_DISABLE_MSVC_C4127
#define munit_assert_string_not_equal(a, b) \
do { \
const char* munit_tmp_a_ = a; \
const char* munit_tmp_b_ = b; \
if (MUNIT_UNLIKELY(strcmp(munit_tmp_a_, munit_tmp_b_) == 0)) { \
munit_errorf("assertion failed: string %s != %s (\"%s\" == \"%s\")", \
#a, #b, munit_tmp_a_, munit_tmp_b_); \
} \
MUNIT__PUSH_DISABLE_MSVC_C4127 \
} while (0) \
MUNIT__POP_DISABLE_MSVC_C4127
#define munit_assert_memory_equal(size, a, b) \
do { \
const unsigned char* munit_tmp_a_ = (const unsigned char*) (a); \
const unsigned char* munit_tmp_b_ = (const unsigned char*) (b); \
const size_t munit_tmp_size_ = (size); \
if (MUNIT_UNLIKELY(memcmp(munit_tmp_a_, munit_tmp_b_, munit_tmp_size_)) != 0) { \
size_t munit_tmp_pos_; \
for (munit_tmp_pos_ = 0 ; munit_tmp_pos_ < munit_tmp_size_ ; munit_tmp_pos_++) { \
if (munit_tmp_a_[munit_tmp_pos_] != munit_tmp_b_[munit_tmp_pos_]) { \
munit_errorf("assertion failed: memory %s == %s, at offset %" MUNIT_SIZE_MODIFIER "u", \
#a, #b, munit_tmp_pos_); \
break; \
} \
} \
} \
MUNIT__PUSH_DISABLE_MSVC_C4127 \
} while (0) \
MUNIT__POP_DISABLE_MSVC_C4127
#define munit_assert_memory_not_equal(size, a, b) \
do { \
const unsigned char* munit_tmp_a_ = (const unsigned char*) (a); \
const unsigned char* munit_tmp_b_ = (const unsigned char*) (b); \
const size_t munit_tmp_size_ = (size); \
if (MUNIT_UNLIKELY(memcmp(munit_tmp_a_, munit_tmp_b_, munit_tmp_size_)) == 0) { \
munit_errorf("assertion failed: memory %s != %s (%zu bytes)", \
#a, #b, munit_tmp_size_); \
} \
MUNIT__PUSH_DISABLE_MSVC_C4127 \
} while (0) \
MUNIT__POP_DISABLE_MSVC_C4127
#define munit_assert_ptr_equal(a, b) \
munit_assert_ptr(a, ==, b)
#define munit_assert_ptr_not_equal(a, b) \
munit_assert_ptr(a, !=, b)
#define munit_assert_null(ptr) \
munit_assert_ptr(ptr, ==, NULL)
#define munit_assert_not_null(ptr) \
munit_assert_ptr(ptr, !=, NULL)
#define munit_assert_ptr_null(ptr) \
munit_assert_ptr(ptr, ==, NULL)
#define munit_assert_ptr_not_null(ptr) \
munit_assert_ptr(ptr, !=, NULL)
/*** Memory allocation ***/
void* munit_malloc_ex(const char* filename, int line, size_t size);
#define munit_malloc(size) \
munit_malloc_ex(__FILE__, __LINE__, (size))
#define munit_new(type) \
((type*) munit_malloc(sizeof(type)))
#define munit_calloc(nmemb, size) \
munit_malloc((nmemb) * (size))
#define munit_newa(type, nmemb) \
((type*) munit_calloc((nmemb), sizeof(type)))
/*** Random number generation ***/
void munit_rand_seed(munit_uint32_t seed);
munit_uint32_t munit_rand_uint32(void);
int munit_rand_int_range(int min, int max);
double munit_rand_double(void);
void munit_rand_memory(size_t size, munit_uint8_t buffer[MUNIT_ARRAY_PARAM(size)]);
/*** Tests and Suites ***/
typedef enum {
  /* Test successful */
  MUNIT_OK,
  /* Test failed */
  MUNIT_FAIL,
  /* Test was skipped */
  MUNIT_SKIP,
  /* Test failed due to circumstances not intended to be tested
   * (things like network errors, invalid parameter value, failure to
   * allocate memory in the test harness, etc.). */
  MUNIT_ERROR
} MunitResult;
/* A named parameter together with the list of values it may take. */
typedef struct {
  char* name;
  /* NULL-terminated array of possible values; NULL means any value. */
  char** values;
} MunitParameterEnum;
/* A concrete name/value pair passed to a parameterized test. */
typedef struct {
  char* name;
  char* value;
} MunitParameter;
/* Look up the value associated with `key` in a NULL-terminated
 * parameter array. */
const char* munit_parameters_get(const MunitParameter params[], const char* key);
typedef enum {
  MUNIT_TEST_OPTION_NONE = 0,
  MUNIT_TEST_OPTION_SINGLE_ITERATION = 1 << 0,
  MUNIT_TEST_OPTION_TODO = 1 << 1
} MunitTestOptions;
/* Test body, optional fixture setup, optional fixture tear down. */
typedef MunitResult (* MunitTestFunc)(const MunitParameter params[], void* user_data_or_fixture);
typedef void* (* MunitTestSetup)(const MunitParameter params[], void* user_data);
typedef void (* MunitTestTearDown)(void* fixture);
/* A single test case. */
typedef struct {
  /* Test name, appended to the enclosing suite's prefix. */
  char* name;
  MunitTestFunc test;
  MunitTestSetup setup;
  MunitTestTearDown tear_down;
  MunitTestOptions options;
  /* NULL-terminated array of parameter enums, or NULL for none. */
  MunitParameterEnum* parameters;
} MunitTest;
typedef enum {
  MUNIT_SUITE_OPTION_NONE = 0
} MunitSuiteOptions;
/* A suite: a name prefix, a NULL-terminated array of tests and a
 * NULL-terminated array of child suites. */
typedef struct MunitSuite_ MunitSuite;
struct MunitSuite_ {
  char* prefix;
  MunitTest* tests;
  MunitSuite* suites;
  /* Number of iterations per test (cf. --iterations; 0 means the
   * default number). */
  unsigned int iterations;
  MunitSuiteOptions options;
};
int munit_suite_main(const MunitSuite* suite, void* user_data, int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)]);
/* Note: I'm not very happy with this API; it's likely to change if I
 * figure out something better. Suggestions welcome. */
typedef struct MunitArgument_ MunitArgument;
struct MunitArgument_ {
  char* name;
  /* Parse the argument at argv[*arg]; returning false aborts the run. */
  bool (* parse_argument)(const MunitSuite* suite, void* user_data, int* arg, int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)]);
  /* Print help text for this argument. */
  void (* write_help)(const MunitArgument* argument, void* user_data);
};
int munit_suite_main_custom(const MunitSuite* suite,
                            void* user_data,
                            int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)],
                            const MunitArgument arguments[]);
#if defined(MUNIT_ENABLE_ASSERT_ALIASES)
#define assert_true(expr) munit_assert_true(expr)
#define assert_false(expr) munit_assert_false(expr)
#define assert_char(a, op, b) munit_assert_char(a, op, b)
#define assert_uchar(a, op, b) munit_assert_uchar(a, op, b)
#define assert_short(a, op, b) munit_assert_short(a, op, b)
#define assert_ushort(a, op, b) munit_assert_ushort(a, op, b)
#define assert_int(a, op, b) munit_assert_int(a, op, b)
#define assert_uint(a, op, b) munit_assert_uint(a, op, b)
#define assert_long(a, op, b) munit_assert_long(a, op, b)
#define assert_ulong(a, op, b) munit_assert_ulong(a, op, b)
#define assert_llong(a, op, b) munit_assert_llong(a, op, b)
#define assert_ullong(a, op, b) munit_assert_ullong(a, op, b)
#define assert_size(a, op, b) munit_assert_size(a, op, b)
#define assert_float(a, op, b) munit_assert_float(a, op, b)
#define assert_double(a, op, b) munit_assert_double(a, op, b)
#define assert_ptr(a, op, b) munit_assert_ptr(a, op, b)
#define assert_int8(a, op, b) munit_assert_int8(a, op, b)
#define assert_uint8(a, op, b) munit_assert_uint8(a, op, b)
#define assert_int16(a, op, b) munit_assert_int16(a, op, b)
#define assert_uint16(a, op, b) munit_assert_uint16(a, op, b)
#define assert_int32(a, op, b) munit_assert_int32(a, op, b)
#define assert_uint32(a, op, b) munit_assert_uint32(a, op, b)
#define assert_int64(a, op, b) munit_assert_int64(a, op, b)
#define assert_uint64(a, op, b) munit_assert_uint64(a, op, b)
#define assert_double_equal(a, b, precision) munit_assert_double_equal(a, b, precision)
#define assert_string_equal(a, b) munit_assert_string_equal(a, b)
#define assert_string_not_equal(a, b) munit_assert_string_not_equal(a, b)
#define assert_memory_equal(size, a, b) munit_assert_memory_equal(size, a, b)
#define assert_memory_not_equal(size, a, b) munit_assert_memory_not_equal(size, a, b)
#define assert_ptr_equal(a, b) munit_assert_ptr_equal(a, b)
#define assert_ptr_not_equal(a, b) munit_assert_ptr_not_equal(a, b)
#define assert_ptr_null(ptr) munit_assert_null_equal(ptr)
#define assert_ptr_not_null(ptr) munit_assert_not_null(ptr)
#define assert_null(ptr) munit_assert_null(ptr)
#define assert_not_null(ptr) munit_assert_not_null(ptr)
#endif /* defined(MUNIT_ENABLE_ASSERT_ALIASES) */
#if defined(__cplusplus)
}
#endif
#endif /* !defined(MUNIT_H) */
#if defined(MUNIT_ENABLE_ASSERT_ALIASES)
# if defined(assert)
# undef assert
# endif
# define assert(expr) munit_assert(expr)
#endif
raft-0.22.1/test/lib/runner.h 0000664 0000000 0000000 00000013353 14601504142 0015741 0 ustar 00root root 0000000 0000000 /* Convenience macros to reduce munit boiler plate. */
#ifndef TEST_RUNNER_H_
#define TEST_RUNNER_H_
#include "munit.h"
/* Top-level suites array declaration.
*
* These top-level suites hold all module-level child suites and must be defined
* and then set as child suites of a root suite created at runtime by the test
* runner's main(). This can be done using the TEST_RUNNER macro. */
extern MunitSuite _main_suites[];
extern int _main_suites_n;
/* Maximum number of test cases for each suite */
#define SUITE__CAP 128
/* Define the top-level suites array and the main() function of the test. */
#define RUNNER(NAME) \
MunitSuite _main_suites[SUITE__CAP]; \
int _main_suites_n = 0; \
\
int main(int argc, char *argv[MUNIT_ARRAY_PARAM(argc)]) \
{ \
MunitSuite suite = {(char *)"", NULL, _main_suites, 1, 0}; \
return munit_suite_main(&suite, (void *)NAME, argc, argv); \
}
/* Declare and register a new test suite #S belonging to the file's test module.
*
* A test suite is a pair of static variables:
*
* static MunitTest _##S##_suites[SUITE__CAP]
* static MunitTest _##S##_tests[SUITE__CAP]
*
* The tests and suites attributes of the next available MunitSuite slot in the
* _module_suites array will be set to the suite's tests and suites arrays, and
* the prefix attribute of the slot will be set to /S. */
#define SUITE(S) \
SUITE__DECLARE(S) \
SUITE__ADD_CHILD(main, #S, S)
/* Declare and register a new test. */
#define TEST(S, C, SETUP, TEAR_DOWN, OPTIONS, PARAMS) \
static MunitResult test_##S##_##C(const MunitParameter params[], \
void *data); \
TEST__ADD_TO_SUITE(S, C, SETUP, TEAR_DOWN, OPTIONS, PARAMS) \
static MunitResult test_##S##_##C( \
MUNIT_UNUSED const MunitParameter params[], MUNIT_UNUSED void *data)
#define SKIP_IF_NO_FIXTURE \
if (f == NULL) { \
return MUNIT_SKIP; \
}
/* Declare the MunitSuite[] and the MunitTest[] arrays that compose the test
* suite identified by S. */
#define SUITE__DECLARE(S) \
static MunitSuite _##S##_suites[SUITE__CAP]; \
static MunitTest _##S##_tests[SUITE__CAP]; \
static MunitTestSetup _##S##_setup = NULL; \
static MunitTestTearDown _##S##_tear_down = NULL; \
static int _##S##_suites_n = 0; \
static int _##S##_tests_n = 0; \
__attribute__((constructor(101))) static void _##S##_init(void) \
{ \
memset(_##S##_suites, 0, sizeof(_##S##_suites)); \
memset(_##S##_tests, 0, sizeof(_##S##_tests)); \
(void)_##S##_suites_n; \
(void)_##S##_tests_n; \
(void)_##S##_setup; \
(void)_##S##_tear_down; \
}
/* Set the tests and suites attributes of the next available slot of the
* MunitSuite[] array of S1 to the MunitTest[] and MunitSuite[] arrays of S2,
* using the given PREFIX. */
#define SUITE__ADD_CHILD(S1, PREFIX, S2) \
__attribute__((constructor(102))) static void _##S1##_##S2##_init(void) \
{ \
int n = _##S1##_suites_n; \
_##S1##_suites[n].prefix = PREFIX; \
_##S1##_suites[n].tests = _##S2##_tests; \
_##S1##_suites[n].suites = _##S2##_suites; \
_##S1##_suites[n].iterations = 0; \
_##S1##_suites[n].options = 0; \
_##S1##_suites_n = n + 1; \
}
/* Add a test case to the MunitTest[] array of suite S. */
#define TEST__ADD_TO_SUITE(S, C, SETUP, TEAR_DOWN, OPTIONS, PARAMS) \
__attribute__((constructor(103))) static void _##S##_tests_##C##_init( \
void) \
{ \
MunitTest *tests = _##S##_tests; \
int n = _##S##_tests_n; \
TEST__SET_IN_ARRAY(tests, n, "/" #C, test_##S##_##C, SETUP, TEAR_DOWN, \
OPTIONS, PARAMS); \
_##S##_tests_n = n + 1; \
}
/* Set the values of the I'th test case slot in the given test array */
#define TEST__SET_IN_ARRAY(TESTS, I, NAME, FUNC, SETUP, TEAR_DOWN, OPTIONS, \
PARAMS) \
TESTS[I].name = NAME; \
TESTS[I].test = FUNC; \
TESTS[I].setup = SETUP; \
TESTS[I].tear_down = TEAR_DOWN; \
TESTS[I].options = OPTIONS; \
TESTS[I].parameters = PARAMS
#endif /* TEST_RUNNER_H_ */
raft-0.22.1/test/lib/snapshot.h 0000664 0000000 0000000 00000002043 14601504142 0016261 0 ustar 00root root 0000000 0000000 /**
* Raft snapshot test helpers.
*/
#ifndef TEST_SNAPSHOT_H
#define TEST_SNAPSHOT_H
#include "../../include/raft.h"
#include "../../src/configuration.h"
/**
* Allocate and create the given snapshot, using the given @LAST_INDEX,
* @LAST_TERM, the given @CONF, and generating an FSM snapshot using @X and @Y.
*/
#define CREATE_SNAPSHOT(SNAPSHOT, LAST_INDEX, LAST_TERM, CONF, CONF_INDEX, X, \
Y) \
SNAPSHOT = raft_malloc(sizeof *SNAPSHOT); \
munit_assert_ptr_not_null(SNAPSHOT); \
SNAPSHOT->index = LAST_INDEX; \
SNAPSHOT->term = LAST_TERM; \
SNAPSHOT->configuration = CONF; \
SNAPSHOT->configuration_index = CONF_INDEX; \
FsmEncodeSnapshot(X, Y, &SNAPSHOT->bufs, &SNAPSHOT->n_bufs)
#endif /* TEST_CONFIGURATION_H */
raft-0.22.1/test/lib/tcp.c 0000664 0000000 0000000 00000013020 14601504142 0015200 0 ustar 00root root 0000000 0000000 #include "tcp.h"
#include
#include
#include
#include
/* Create a TCP server socket bound to a random free port on 127.0.0.1, start
 * listening on it, and record the assigned port and "IP:port" address in @s.
 * Any system-call failure aborts the test via munit_errorf(). */
void TcpServerInit(struct TcpServer *s)
{
    struct sockaddr_in addr;
    socklen_t size = sizeof addr;
    int rv;
    /* Initialize the socket address structure. */
    memset(&addr, 0, size);
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = inet_addr("127.0.0.1");
    addr.sin_port = 0; /* Get a random free port */
    /* Create the server socket. */
    s->socket = socket(AF_INET, SOCK_STREAM, 0);
    if (s->socket == -1) {
        munit_errorf("tcp server: socket(): %s", strerror(errno));
    }
    /* Bind the socket. */
    rv = bind(s->socket, (struct sockaddr *)&addr, size);
    if (rv == -1) {
        munit_errorf("tcp server: bind(): %s", strerror(errno));
    }
    /* Start listening. */
    rv = listen(s->socket, 1);
    if (rv == -1) {
        munit_errorf("tcp server: listen(): %s", strerror(errno));
    }
    /* Get the actual address assigned by the kernel and save it back in the
     * relevant field. */
    rv = getsockname(s->socket, (struct sockaddr *)&addr, &size);
    if (rv != 0) {
        munit_errorf("tcp: getsockname(): %s", strerror(errno));
    }
    /* getsockname() returns the port in network byte order: convert it back
     * to host order with ntohs() (the original used htons(), which happens
     * to produce the same result but has the wrong semantics). */
    s->port = ntohs(addr.sin_port);
    /* Use snprintf() so a future change to the buffer size can't overflow. */
    snprintf(s->address, sizeof s->address, "127.0.0.1:%d", s->port);
}
/* Release the server's listening socket, if it is still open. A socket value
 * of -1 means the server was never started or was already stopped. */
void TcpServerClose(struct TcpServer *s)
{
    if (s->socket != -1) {
        if (close(s->socket) == -1) {
            munit_errorf("tcp server: close(): %s", strerror(errno));
        }
    }
}
/* Wait for an inbound client connection on the server's listening socket and
 * return the file descriptor of the accepted connection. */
int TcpServerAccept(struct TcpServer *s)
{
    struct sockaddr_in peer;
    socklen_t peer_size = sizeof peer;
    int fd;
    fd = accept(s->socket, (struct sockaddr *)&peer, &peer_size);
    if (fd < 0) {
        munit_errorf("tcp server: accept(): %s", strerror(errno));
    }
    return fd;
}
/* Close the listening socket and mark the server as stopped by setting the
 * socket field to -1 (so a later TcpServerClose() becomes a no-op). */
void TcpServerStop(struct TcpServer *s)
{
    if (close(s->socket) == -1) {
        munit_errorf("tcp server: close(): %s", strerror(errno));
    }
    s->socket = -1;
}
/* Initialize the test TCP helper: no parameters are consumed, and both the
 * server and the client sockets start out unopened (-1). */
void test_tcp_setup(const MunitParameter params[], struct test_tcp *t)
{
    (void)params;
    t->client.socket = -1;
    t->server.socket = -1;
}
/* Tear down the test TCP helper, closing whichever of the server and client
 * sockets is still open. */
void test_tcp_tear_down(struct test_tcp *t)
{
    if (t->server.socket != -1) {
        if (close(t->server.socket) == -1) {
            munit_errorf("tcp: close(): %s", strerror(errno));
        }
    }
    if (t->client.socket != -1) {
        if (close(t->client.socket) == -1) {
            munit_errorf("tcp: close(): %s", strerror(errno));
        }
    }
}
/* Bind the helper's server socket to a random free port on 127.0.0.1, start
 * listening, and store the resulting "IP:port" address in t->server.address.
 * Any system-call failure aborts the test via munit_errorf(). */
void test_tcp_listen(struct test_tcp *t)
{
    struct sockaddr_in addr;
    socklen_t size = sizeof addr;
    int rv;
    /* Initialize the socket address structure. */
    memset(&addr, 0, size);
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = inet_addr("127.0.0.1");
    addr.sin_port = 0; /* Get a random free port */
    /* Create the server socket. */
    t->server.socket = socket(AF_INET, SOCK_STREAM, 0);
    if (t->server.socket == -1) {
        munit_errorf("tcp: socket(): %s", strerror(errno));
    }
    /* Bind the socket. */
    rv = bind(t->server.socket, (struct sockaddr *)&addr, size);
    if (rv == -1) {
        munit_errorf("tcp: bind(): %s", strerror(errno));
    }
    /* Start listening. */
    rv = listen(t->server.socket, 1);
    if (rv == -1) {
        munit_errorf("tcp: listen(): %s", strerror(errno));
    }
    /* Get the actual address assigned by the kernel and save it back in
     * the relevant test_socket__server field (pointed to by address). */
    rv = getsockname(t->server.socket, (struct sockaddr *)&addr, &size);
    if (rv != 0) {
        munit_errorf("tcp: getsockname(): %s", strerror(errno));
    }
    /* getsockname() reports the port in network byte order, so convert it
     * with ntohs() (not htons()); use snprintf() to guard the fixed-size
     * address buffer. */
    snprintf(t->server.address, sizeof t->server.address, "127.0.0.1:%d",
             ntohs(addr.sin_port));
}
/* Return the "IP:port" address of the helper's server socket, as filled in by
 * test_tcp_listen(). The returned pointer refers to storage inside @t. */
const char *test_tcp_address(struct test_tcp *t)
{
    return t->server.address;
}
/* Open the helper's client socket and connect it to the given port on
 * localhost, aborting the test on any failure. */
void test_tcp_connect(struct test_tcp *t, int port)
{
    struct sockaddr_in addr;
    /* Open the client socket. */
    t->client.socket = socket(AF_INET, SOCK_STREAM, 0);
    if (t->client.socket == -1) {
        munit_errorf("tcp: socket(): %s", strerror(errno));
    }
    /* Fill in the target address: localhost at the given port, converted to
     * network byte order. */
    memset(&addr, 0, sizeof addr);
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = inet_addr("127.0.0.1");
    addr.sin_port = htons(port);
    /* Establish the connection. */
    if (connect(t->client.socket, (struct sockaddr *)&addr, sizeof addr) ==
        -1) {
        munit_errorf("tcp: connect(): %s", strerror(errno));
    }
}
/* Shut down the client side of the connection and mark the client socket as
 * closed so the tear-down helper skips it. */
void test_tcp_close(struct test_tcp *t)
{
    if (close(t->client.socket) == -1) {
        munit_errorf("tcp: close(): %s", strerror(errno));
    }
    t->client.socket = -1;
}
/* Shut down the server side of the helper and mark the server socket as
 * closed so the tear-down helper skips it. */
void test_tcp_stop(struct test_tcp *t)
{
    if (close(t->server.socket) == -1) {
        munit_errorf("tcp: close(): %s", strerror(errno));
    }
    t->server.socket = -1;
}
/* Send @len bytes from @buf over the helper's client socket, aborting the
 * test on any write error. Unlike the original version, a short write is not
 * treated as fatal: write() may legitimately transfer fewer bytes than
 * requested, so we loop until everything is sent, and retry on EINTR. */
void test_tcp_send(struct test_tcp *t, const void *buf, int len)
{
    const char *cursor = buf;
    int remaining = len;
    while (remaining > 0) {
        /* Keep the full ssize_t return value: narrowing it to int could
         * corrupt the count on very large writes. */
        ssize_t rv = write(t->client.socket, cursor, (size_t)remaining);
        if (rv == -1) {
            if (errno == EINTR) {
                continue; /* Interrupted by a signal: just retry. */
            }
            munit_errorf("tcp: write(): %s", strerror(errno));
        }
        cursor += rv;
        remaining -= (int)rv;
    }
}
/* Wait for an inbound client connection on the helper's server socket and
 * return the file descriptor of the accepted connection. */
int test_tcp_accept(struct test_tcp *t)
{
    struct sockaddr_in peer;
    socklen_t peer_size = sizeof peer;
    int fd;
    fd = accept(t->server.socket, (struct sockaddr *)&peer, &peer_size);
    if (fd < 0) {
        munit_errorf("tcp: accept(): %s", strerror(errno));
    }
    return fd;
}
raft-0.22.1/test/lib/tcp.h 0000664 0000000 0000000 00000005344 14601504142 0015217 0 ustar 00root root 0000000 0000000 /* Test TCP utilities.
*
* This module sports helpers to create server or client sockets, and
* send/receive data through them.
*/
#ifndef TEST_TCP_H
#define TEST_TCP_H
#include "munit.h"
/* Macro helpers. */
#define FIXTURE_TCP_SERVER struct TcpServer server
#define SETUP_TCP_SERVER TcpServerInit(&f->server)
#define TEAR_DOWN_TCP_SERVER TcpServerClose(&f->server)
#define TCP_SERVER_STOP TcpServerStop(&f->server)
#define TCP_SERVER_PORT f->server.port
#define TCP_SERVER_ADDRESS f->server.address
#define FIXTURE_TCP struct test_tcp tcp
#define SETUP_TCP test_tcp_setup(params, &f->tcp)
#define TEAR_DOWN_TCP test_tcp_tear_down(&f->tcp)
#define TCP_CLIENT_CONNECT(PORT) test_tcp_connect(&f->tcp, PORT)
#define TCP_CLIENT_SEND(BUF, N) test_tcp_send(&f->tcp, BUF, N)
#define TCP_CLIENT_CLOSE test_tcp_close(&f->tcp)
struct TcpServer
{
int socket; /* Socket listening to incoming connections */
int port;
char address[128]; /* IPv4 address of the server, with port */
};
void TcpServerInit(struct TcpServer *s);
void TcpServerClose(struct TcpServer *s);
/* Accept inbound client connection and return the relevant socket. */
int TcpServerAccept(struct TcpServer *s);
/* Close the server socket. */
void TcpServerStop(struct TcpServer *s);
struct TcpClient
{
int socket; /* Socket connected to a server. */
};
void TcpClientInit(struct TcpClient *s);
void TcpClientClose(struct TcpClient *s);
/* Object that can be used to setup and control a TCP server and/or client. */
struct test_tcp
{
struct
{
int socket; /* Socket listening to incoming connections */
char address[128]; /* IPv4 address of the server, with port */
} server;
struct
{
int socket; /* Socket connected to another host */
} client;
};
/**
* Bind the server socket of the given test TCP host to localhost and start
* listening to it.
*/
void test_tcp_setup(const MunitParameter params[], struct test_tcp *t);
void test_tcp_tear_down(struct test_tcp *t);
/**
* Start listening to a random free port on localhost.
*/
void test_tcp_listen(struct test_tcp *t);
/**
* Return the address of the server socket created with @test_tcp_listen.
*/
const char *test_tcp_address(struct test_tcp *t);
/**
* Connect the client socket to the given port on localhost.
*/
void test_tcp_connect(struct test_tcp *t, int port);
/**
* Close the client socket.
*/
void test_tcp_close(struct test_tcp *t);
/**
* Send data using the client socket.
*/
void test_tcp_send(struct test_tcp *t, const void *buf, int len);
/**
* Accept inbound client connection and return the relevant socket.
*/
int test_tcp_accept(struct test_tcp *t);
/**
* Close the server socket.
*/
void test_tcp_stop(struct test_tcp *t);
#endif /* TEST_TCP_H */
raft-0.22.1/test/lib/tracer.c 0000664 0000000 0000000 00000000670 14601504142 0015701 0 ustar 00root root 0000000 0000000 #include "tracer.h"
#include "munit.h"
/* Print a single diagnostic trace entry to stderr, formatted as
 * "<file>:<line> - <message>" with the file name right-aligned in a 20-column
 * field and the line number in a 3-column field. */
static void traceDiagnostic(const struct raft_tracer_info *info)
{
    fprintf(stderr, "%20s:%*d - %s\n", info->diagnostic.file, 3,
            info->diagnostic.line, info->diagnostic.message);
}
/* raft_tracer emit callback: forward diagnostic events to stderr via
 * traceDiagnostic() and silently ignore every other event type. */
void TracerEmit(struct raft_tracer *t, int type, const void *info)
{
    (void)t;
    if (type == RAFT_TRACER_DIAGNOSTIC) {
        traceDiagnostic(info);
    }
}
raft-0.22.1/test/lib/tracer.h 0000664 0000000 0000000 00000000727 14601504142 0015711 0 ustar 00root root 0000000 0000000 /* Raft tracer that emits messages to stderr. */
#ifndef TEST_TRACER_H
#define TEST_TRACER_H
#include "../../include/raft.h"
#define FIXTURE_TRACER struct raft_tracer tracer
#define SET_UP_TRACER \
do { \
f->tracer.emit = TracerEmit; \
f->tracer.version = 2; \
} while (0)
#define TEAR_DOWN_TRACER
void TracerEmit(struct raft_tracer *t, int type, const void *info);
#endif /* TEST_TRACER_H */
raft-0.22.1/test/lib/uv.h 0000664 0000000 0000000 00000004470 14601504142 0015062 0 ustar 00root root 0000000 0000000 /* Helpers around the libuv-based implementation of the raft_io interface. */
#ifndef TEST_UV_H
#define TEST_UV_H
#include "../../include/raft.h"
#include "../../include/raft/uv.h"
#include "dir.h"
#include "heap.h"
#include "loop.h"
#include "tracer.h"
#define FIXTURE_UV_TRANSPORT struct raft_uv_transport transport
#define SETUP_UV_TRANSPORT \
do { \
int rv_; \
f->transport.version = 1; \
rv_ = raft_uv_tcp_init(&f->transport, &f->loop); \
munit_assert_int(rv_, ==, 0); \
} while (0)
#define TEAR_DOWN_UV_TRANSPORT raft_uv_tcp_close(&f->transport)
#define FIXTURE_UV_DEPS \
FIXTURE_DIR; \
FIXTURE_HEAP; \
FIXTURE_LOOP; \
FIXTURE_TRACER; \
FIXTURE_UV_TRANSPORT
#define SETUP_UV_DEPS \
SET_UP_DIR; \
SET_UP_HEAP; \
SETUP_LOOP; \
SET_UP_TRACER; \
SETUP_UV_TRANSPORT
#define TEAR_DOWN_UV_DEPS \
TEAR_DOWN_UV_TRANSPORT; \
TEAR_DOWN_TRACER; \
TEAR_DOWN_LOOP; \
TEAR_DOWN_HEAP; \
TEAR_DOWN_DIR
#define FIXTURE_UV struct raft_io io
#define SETUP_UV \
do { \
int rv_; \
rv_ = raft_uv_init(&f->io, &f->loop, f->dir, &f->transport); \
munit_assert_int(rv_, ==, 0); \
raft_uv_set_tracer(&f->io, &f->tracer); \
raft_uv_set_auto_recovery(&f->io, false); \
rv_ = f->io.init(&f->io, 1, "127.0.0.1:9001"); \
munit_assert_int(rv_, ==, 0); \
} while (0)
MUNIT_UNUSED static void uvCloseCb(struct raft_io *io)
{
bool *closed = io->data;
*closed = true;
}
#define TEAR_DOWN_UV \
do { \
bool _closed = false; \
f->io.data = &_closed; \
f->io.close(&f->io, uvCloseCb); \
LOOP_RUN_UNTIL(&_closed); \
raft_uv_close(&f->io); \
} while (0)
#endif /* TEST_UV_H */
raft-0.22.1/test/unit/ 0000775 0000000 0000000 00000000000 14601504142 0014463 5 ustar 00root root 0000000 0000000 raft-0.22.1/test/unit/main_core.c 0000664 0000000 0000000 00000000053 14601504142 0016561 0 ustar 00root root 0000000 0000000 #include "../lib/runner.h"
RUNNER("core")
raft-0.22.1/test/unit/main_uv.c 0000664 0000000 0000000 00000000051 14601504142 0016261 0 ustar 00root root 0000000 0000000 #include "../lib/runner.h"
RUNNER("uv")
raft-0.22.1/test/unit/test_byte.c 0000664 0000000 0000000 00000012576 14601504142 0016644 0 ustar 00root root 0000000 0000000 #include
#include
#include "../../src/byte.h"
#include "../lib/runner.h"
/******************************************************************************
*
* Helper macros
*
*****************************************************************************/
#define CRC32(VALUE) byteCrc32(&(VALUE), sizeof VALUE, 0)
/******************************************************************************
*
* byteCrc32
*
*****************************************************************************/
SUITE(byteCrc32)
/* The same data produces the same sum. */
TEST(byteCrc32, valid, NULL, NULL, 0, NULL)
{
uint64_t value1 = 123456789;
uint64_t value2 = 123456789;
munit_assert_int(CRC32(value1), ==, CRC32(value2));
return MUNIT_OK;
}
/* Different data produces a different sum. */
TEST(byteCrc32, invalid, NULL, NULL, 0, NULL)
{
uint64_t value1 = 123456789;
uint64_t value2 = 123466789;
munit_assert_int(CRC32(value1), !=, CRC32(value2));
return MUNIT_OK;
}
/******************************************************************************
*
* Convert to little endian representation (least significant byte first).
*
*****************************************************************************/
SUITE(byteFlip)
/* Convert a 16-bit number. */
TEST(byteFlip, 16, NULL, NULL, 0, NULL)
{
uint16_t value;
unsigned i;
value = byteFlip16(0x0100);
for (i = 0; i < 2; i++) {
munit_assert_int(*((uint8_t *)&value + i), ==, i);
}
return MUNIT_OK;
}
/* Convert a 32-bit number. */
TEST(byteFlip, 32, NULL, NULL, 0, NULL)
{
uint32_t value;
unsigned i;
value = byteFlip32(0x03020100);
for (i = 0; i < 4; i++) {
munit_assert_int(*((uint8_t *)&value + i), ==, i);
}
return MUNIT_OK;
}
/* Convert a 64-bit number. */
TEST(byteFlip, 64, NULL, NULL, 0, NULL)
{
uint64_t value;
unsigned i;
value = byteFlip64(0x0706050403020100);
for (i = 0; i < 8; i++) {
munit_assert_int(*((uint8_t *)&value + i), ==, i);
}
return MUNIT_OK;
}
/******************************************************************************
*
* byteGetString
*
*****************************************************************************/
SUITE(byteGetString)
TEST(byteGetString, success, NULL, NULL, 0, NULL)
{
uint8_t buf[] = {'h', 'e', 'l', 'l', 'o', 0};
const uint8_t *cursor = buf;
munit_assert_string_equal(byteGetString(&cursor, sizeof buf), "hello");
munit_assert_ptr_equal(cursor, buf + sizeof buf);
return MUNIT_OK;
}
TEST(byteGetString, malformed, NULL, NULL, 0, NULL)
{
uint8_t buf[] = {'h', 'e', 'l', 'l', 'o', 'w'};
const uint8_t *cursor = buf;
munit_assert_ptr_equal(byteGetString(&cursor, sizeof buf), NULL);
munit_assert_ptr_equal(cursor, buf);
return MUNIT_OK;
}
/******************************************************************************
*
* byteGet64
*
*****************************************************************************/
SUITE(byteGet64)
TEST(byteGet64, success, NULL, NULL, 0, NULL)
{
uint8_t *buf = munit_malloc(sizeof(uint64_t) * 2);
uint8_t *cursor1 = buf + 1;
const uint8_t *cursor2 = buf + 1;
bytePut64(&cursor1, 1);
munit_assert_ullong(byteGet64(&cursor2), ==, 1);
free(buf);
return MUNIT_OK;
}
/******************************************************************************
*
* byteSha1
*
*****************************************************************************/
/* Assert that the 20 bytes contained in VALUE match the given DIGEST
* hexadecimal representation. */
#define ASSERT_SHA1(VALUE, DIGEST) \
do { \
char _digest[41]; \
unsigned _i; \
for (_i = 0; _i < 20; _i++) { \
unsigned _j = _i * 2; \
sprintf(&_digest[_j], "%.2x", value[_i]); \
_digest[_j] = toupper(_digest[_j]); \
_digest[_j + 1] = toupper(_digest[_j + 1]); \
} \
_digest[40] = '\0'; \
munit_assert_string_equal(_digest, DIGEST); \
} while (0)
SUITE(byteSha1)
TEST(byteSha1, abc, NULL, NULL, 0, NULL)
{
struct byteSha1 sha1;
uint8_t text[] = "abc";
uint8_t value[20];
byteSha1Init(&sha1);
byteSha1Update(&sha1, text, sizeof text - 1);
byteSha1Digest(&sha1, value);
ASSERT_SHA1(value, "A9993E364706816ABA3E25717850C26C9CD0D89D");
return MUNIT_OK;
}
TEST(byteSha1, abcWithZeroLen, NULL, NULL, 0, NULL)
{
struct byteSha1 sha1;
uint8_t text[] = "abc";
uint8_t garbage[] = "garbage";
uint8_t value[20];
byteSha1Init(&sha1);
byteSha1Update(&sha1, text, sizeof text - 1);
/* Update with 0 length buffer doesn't change digest */
byteSha1Update(&sha1, garbage, 0);
byteSha1Digest(&sha1, value);
ASSERT_SHA1(value, "A9993E364706816ABA3E25717850C26C9CD0D89D");
return MUNIT_OK;
}
TEST(byteSha1, abcbd, NULL, NULL, 0, NULL)
{
struct byteSha1 sha1;
uint8_t text[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq";
uint8_t value[20];
byteSha1Init(&sha1);
byteSha1Update(&sha1, text, sizeof text - 1);
byteSha1Digest(&sha1, value);
ASSERT_SHA1(value, "84983E441C3BD26EBAAE4AA1F95129E5E54670F1");
return MUNIT_OK;
}
raft-0.22.1/test/unit/test_compress.c 0000664 0000000 0000000 00000003707 14601504142 0017530 0 ustar 00root root 0000000 0000000 #include "../../src/byte.h"
#include "../../src/compress.h"
#include "../lib/munit.h"
#include "../lib/runner.h"
SUITE(Compress)
#ifdef LZ4_AVAILABLE
static unsigned char lz4_data[] = {
0x4, 0x22, 0x4d, 0x18, 0x4c, 0x40, 0xd, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x39, 0xd, 0x0, 0x0, 0x80, 0x68,
0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64,
0xa, 0x0, 0x0, 0x0, 0x0, 0x0, 0xf0, 0xb4, 0x59, 0x85};
TEST(Compress, decompress, NULL, NULL, 0, NULL)
{
char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0};
struct raft_buffer compressed;
struct raft_buffer decompressed;
int rv;
compressed.base = lz4_data;
compressed.len = sizeof lz4_data;
rv = Decompress(compressed, &decompressed, errmsg);
munit_assert_int(rv, ==, 0);
munit_assert_string_equal(decompressed.base, "hello world\n");
raft_free(decompressed.base);
return MUNIT_OK;
}
#else
TEST(Compress, lz4Disabled, NULL, NULL, 0, NULL)
{
struct raft_buffer buf = {0};
struct raft_buffer decompressed;
char errmsg[RAFT_ERRMSG_BUF_SIZE];
munit_assert_int(Decompress(buf, &decompressed, errmsg), ==, RAFT_INVALID);
munit_assert_string_equal(errmsg, "LZ4 not available");
return MUNIT_OK;
}
#endif /* LZ4_AVAILABLE */
static const char LZ4_MAGIC[4] = {0x04, 0x22, 0x4d, 0x18};
TEST(Compress, isCompressedTooSmall, NULL, NULL, 0, NULL)
{
munit_assert_false(IsCompressed(&LZ4_MAGIC[1], sizeof(LZ4_MAGIC) - 1));
return MUNIT_OK;
}
TEST(Compress, isCompressedNull, NULL, NULL, 0, NULL)
{
munit_assert_false(IsCompressed(NULL, sizeof(LZ4_MAGIC)));
return MUNIT_OK;
}
TEST(Compress, isCompressed, NULL, NULL, 0, NULL)
{
munit_assert_true(IsCompressed(LZ4_MAGIC, sizeof(LZ4_MAGIC)));
return MUNIT_OK;
}
TEST(Compress, notCompressed, NULL, NULL, 0, NULL)
{
char not_compressed[4] = {0x18, 0x4d, 0x22, 0x04};
munit_assert_false(IsCompressed(not_compressed, sizeof(not_compressed)));
return MUNIT_OK;
}
raft-0.22.1/test/unit/test_configuration.c 0000664 0000000 0000000 00000051007 14601504142 0020540 0 ustar 00root root 0000000 0000000 #include "../../src/byte.h"
#include "../../src/configuration.h"
#include "../lib/heap.h"
#include "../lib/runner.h"
/******************************************************************************
*
* Fixture
*
*****************************************************************************/
struct fixture
{
FIXTURE_HEAP;
struct raft_configuration configuration;
};
/* Allocate the test fixture, set up the instrumented heap (which reads the
 * fault-injection parameters from params) and initialize an empty raft
 * configuration. Returned pointer is freed by the tear-down helpers. */
static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SET_UP_HEAP;
    configurationInit(&f->configuration);
    return f;
}
/* Release the heap and the fixture itself WITHOUT closing the configuration:
 * used by tests that already released it (or never populated it). */
static void tearDownNoClose(void *data)
{
    struct fixture *f = data;
    TEAR_DOWN_HEAP;
    free(f);
}
/* Standard tear-down: close the fixture's configuration first, then release
 * the heap and the fixture via tearDownNoClose(). */
static void tearDown(void *data)
{
    struct fixture *f = data;
    configurationClose(&f->configuration);
    tearDownNoClose(data);
}
/******************************************************************************
*
* Helper macros
*
*****************************************************************************/
/* Accessors */
#define VOTER_COUNT configurationVoterCount(&f->configuration)
#define INDEX_OF(ID) configurationIndexOf(&f->configuration, ID)
#define INDEX_OF_VOTER(ID) configurationIndexOfVoter(&f->configuration, ID)
#define GET(ID) configurationGet(&f->configuration, ID)
/* Add a server to the fixture's configuration. */
#define ADD_RV(ID, ADDRESS, ROLE) \
configurationAdd(&f->configuration, ID, ADDRESS, ROLE)
#define ADD(...) munit_assert_int(ADD_RV(__VA_ARGS__), ==, 0)
#define ADD_ERROR(RV, ...) munit_assert_int(ADD_RV(__VA_ARGS__), ==, RV)
/* Remove a server from the fixture's configuration */
#define REMOVE_RV(ID) configurationRemove(&f->configuration, ID)
#define REMOVE(...) munit_assert_int(REMOVE_RV(__VA_ARGS__), ==, 0)
#define REMOVE_ERROR(RV, ...) munit_assert_int(REMOVE_RV(__VA_ARGS__), ==, RV)
/* Copy the fixture's configuration into the given one. */
#define COPY_RV(CONF) configurationCopy(&f->configuration, CONF)
#define COPY(...) munit_assert_int(COPY_RV(__VA_ARGS__), ==, 0)
#define COPY_ERROR(RV, ...) munit_assert_int(COPY_RV(__VA_ARGS__), ==, RV)
/* Encode the fixture's configuration into the given buffer. */
#define ENCODE_RV(BUF) configurationEncode(&f->configuration, BUF)
#define ENCODE(...) munit_assert_int(ENCODE_RV(__VA_ARGS__), ==, 0)
#define ENCODE_ERROR(RV, ...) munit_assert_int(ENCODE_RV(__VA_ARGS__), ==, RV)
/* Decode the given buffer into the fixture's configuration. */
#define DECODE_RV(BUF) configurationDecode(BUF, &f->configuration)
#define DECODE(...) munit_assert_int(DECODE_RV(__VA_ARGS__), ==, 0)
#define DECODE_ERROR(RV, ...) munit_assert_int(DECODE_RV(__VA_ARGS__), ==, RV)
/******************************************************************************
*
* Assertions
*
*****************************************************************************/
/* Assert that the fixture's configuration has n servers. */
#define ASSERT_N(N) \
{ \
munit_assert_int(f->configuration.n, ==, N); \
if (N == 0) { \
munit_assert_ptr_null(f->configuration.servers); \
} else { \
munit_assert_ptr_not_null(f->configuration.servers); \
} \
}
/* Assert that the attributes of the I'th server in the fixture's configuration
* match the given values. */
#define ASSERT_SERVER(I, ID, ADDRESS, ROLE) \
{ \
struct raft_server *server; \
munit_assert_int(I, <, f->configuration.n); \
server = &f->configuration.servers[I]; \
munit_assert_int(server->id, ==, ID); \
munit_assert_string_equal(server->address, ADDRESS); \
munit_assert_int(server->role, ==, ROLE); \
}
/******************************************************************************
*
* configurationVoterCount
*
*****************************************************************************/
SUITE(configurationVoterCount)
/* All servers are voting. */
TEST(configurationVoterCount, all_voters, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
ADD(1, "192.168.1.1:666", RAFT_VOTER);
ADD(2, "192.168.1.2:666", RAFT_VOTER);
munit_assert_int(VOTER_COUNT, ==, 2);
return MUNIT_OK;
}
/* Return only voting servers. */
TEST(configurationVoterCount, filter, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
ADD(1, "192.168.1.1:666", RAFT_VOTER);
ADD(2, "192.168.1.2:666", RAFT_STANDBY);
munit_assert_int(VOTER_COUNT, ==, 1);
return MUNIT_OK;
}
/******************************************************************************
*
* configurationIndexOf
*
*****************************************************************************/
SUITE(configurationIndexOf)
/* If a matching server is found, it's index is returned. */
TEST(configurationIndexOf, match, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
ADD(1, "192.168.1.1:666", RAFT_VOTER);
ADD(2, "192.168.1.2:666", RAFT_STANDBY);
munit_assert_int(INDEX_OF(2), ==, 1);
return MUNIT_OK;
}
/* If no matching server is found, the length of the configuration is
* returned. */
TEST(configurationIndexOf, no_match, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
ADD(1, "127.0.0.1:666", RAFT_VOTER);
munit_assert_int(INDEX_OF(3), ==, f->configuration.n);
return MUNIT_OK;
}
/******************************************************************************
*
* configurationIndexOfVoter
*
*****************************************************************************/
SUITE(configurationIndexOfVoter)
/* The index of the matching voting server (relative to the number of voting
servers) is returned. */
TEST(configurationIndexOfVoter, match, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
ADD(1, "192.168.1.1:666", RAFT_STANDBY);
ADD(2, "192.168.1.2:666", RAFT_VOTER);
ADD(3, "192.168.1.3:666", RAFT_VOTER);
munit_assert_int(INDEX_OF_VOTER(3), ==, 1);
return MUNIT_OK;
}
/* If no matching server is found, the length of the configuration is
* returned. */
TEST(configurationIndexOfVoter, no_match, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
ADD(1, "192.168.1.1:666", RAFT_VOTER);
munit_assert_int(INDEX_OF_VOTER(3), ==, 1);
return MUNIT_OK;
}
/* If the server exists but is non-voting, the length of the configuration is
* returned. */
TEST(configurationIndexOfVoter, non_voting, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
ADD(1, "192.168.1.1:666", RAFT_STANDBY);
munit_assert_int(INDEX_OF_VOTER(1), ==, 1);
return MUNIT_OK;
}
/******************************************************************************
*
* configurationGet
*
*****************************************************************************/
SUITE(configurationGet)
/* If a matching server is found, it's returned. */
TEST(configurationGet, match, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
const struct raft_server *server;
ADD(1, "192.168.1.1:666", RAFT_VOTER);
ADD(2, "192.168.1.2:666", RAFT_STANDBY);
server = GET(2);
munit_assert_ptr_not_null(server);
munit_assert_int(server->id, ==, 2);
munit_assert_string_equal(server->address, "192.168.1.2:666");
return MUNIT_OK;
}
/* If no matching server is found, NULL is returned. */
TEST(configurationGet, no_match, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
ADD(1, "127.0.0.1:666", RAFT_VOTER);
munit_assert_ptr_null(GET(3));
return MUNIT_OK;
}
/******************************************************************************
*
* configurationCopy
*
*****************************************************************************/
SUITE(configurationCopy)
/* Copy a configuration containing two servers */
TEST(configurationCopy, two, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
struct raft_configuration configuration;
ADD(1, "192.168.1.1:666", RAFT_STANDBY);
ADD(2, "192.168.1.2:666", RAFT_VOTER);
COPY(&configuration);
munit_assert_int(configuration.n, ==, 2);
munit_assert_int(configuration.servers[0].id, ==, 1);
munit_assert_int(configuration.servers[1].id, ==, 2);
configurationClose(&configuration);
return MUNIT_OK;
}
static char *copy_oom_heap_fault_delay[] = {"0", "1", "2", NULL};
static char *copy_oom_heap_fault_repeat[] = {"1", NULL};
static MunitParameterEnum copy_oom_params[] = {
{TEST_HEAP_FAULT_DELAY, copy_oom_heap_fault_delay},
{TEST_HEAP_FAULT_REPEAT, copy_oom_heap_fault_repeat},
{NULL, NULL},
};
/* Out of memory */
TEST(configurationCopy, oom, setUp, tearDown, 0, copy_oom_params)
{
struct fixture *f = data;
struct raft_configuration configuration;
ADD(1, "192.168.1.1:666", RAFT_STANDBY);
ADD(2, "192.168.1.2:666", RAFT_VOTER);
HEAP_FAULT_ENABLE;
COPY_ERROR(RAFT_NOMEM, &configuration);
return MUNIT_OK;
}
/******************************************************************************
*
* raft_configuration_add
*
*****************************************************************************/
SUITE(configurationAdd)
/* Add a server to the configuration. */
TEST(configurationAdd, one, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
ADD(1, "127.0.0.1:666", RAFT_VOTER);
ASSERT_N(1);
ASSERT_SERVER(0, 1, "127.0.0.1:666", RAFT_VOTER);
return MUNIT_OK;
}
/* Add two servers to the configuration. */
TEST(configurationAdd, two, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
ADD(1, "127.0.0.1:666", RAFT_VOTER);
ADD(2, "192.168.1.1:666", RAFT_STANDBY);
ASSERT_N(2);
ASSERT_SERVER(0, 1, "127.0.0.1:666", RAFT_VOTER);
ASSERT_SERVER(1, 2, "192.168.1.1:666", RAFT_STANDBY);
return MUNIT_OK;
}
/* Add a server with an ID which is already in use. */
TEST(configurationAdd, duplicateId, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
ADD(1, "127.0.0.1:666", RAFT_VOTER);
ADD_ERROR(RAFT_DUPLICATEID, 1, "192.168.1.1:666", RAFT_STANDBY);
return MUNIT_OK;
}
/* Add a server with an address which is already in use. */
TEST(configurationAdd, duplicateAddress, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
ADD(1, "127.0.0.1:666", RAFT_VOTER);
ADD_ERROR(RAFT_DUPLICATEADDRESS, 2, "127.0.0.1:666", RAFT_STANDBY);
return MUNIT_OK;
}
/* Add a server with an invalid role. */
TEST(configurationAdd, invalidRole, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
ADD_ERROR(RAFT_BADROLE, 2, "127.0.0.1:666", 666);
return MUNIT_OK;
}
static char *add_oom_heap_fault_delay[] = {"0", "1", NULL};
static char *add_oom_heap_fault_repeat[] = {"1", NULL};
static MunitParameterEnum add_oom_params[] = {
{TEST_HEAP_FAULT_DELAY, add_oom_heap_fault_delay},
{TEST_HEAP_FAULT_REPEAT, add_oom_heap_fault_repeat},
{NULL, NULL},
};
/* Out of memory. */
TEST(configurationAdd, oom, setUp, tearDown, 0, add_oom_params)
{
struct fixture *f = data;
HeapFaultEnable(&f->heap);
ADD_ERROR(RAFT_NOMEM, 1, "127.0.0.1:666", RAFT_VOTER);
munit_assert_null(f->configuration.servers);
return MUNIT_OK;
}
/******************************************************************************
 *
 * configurationRemove
 *
 *****************************************************************************/
SUITE(configurationRemove)
/* Remove the last and only server. */
TEST(configurationRemove, last, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    ADD(1, "127.0.0.1:666", RAFT_VOTER);
    REMOVE(1);
    ASSERT_N(0);
    return MUNIT_OK;
}
/* Remove the first server. */
TEST(configurationRemove, first, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    ADD(1, "127.0.0.1:666", RAFT_VOTER);
    ADD(2, "192.168.1.1:666", RAFT_STANDBY);
    REMOVE(1);
    ASSERT_N(1);
    /* The remaining server shifts down to slot 0. */
    ASSERT_SERVER(0, 2, "192.168.1.1:666", RAFT_STANDBY);
    return MUNIT_OK;
}
/* Remove a server in the middle. */
TEST(configurationRemove, middle, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    ADD(1, "127.0.0.1:666", RAFT_VOTER);
    ADD(2, "192.168.1.1:666", RAFT_STANDBY);
    ADD(3, "10.0.1.1:666", RAFT_VOTER);
    REMOVE(2);
    ASSERT_N(2);
    /* Servers before and after the removed one are preserved in order. */
    ASSERT_SERVER(0, 1, "127.0.0.1:666", RAFT_VOTER);
    ASSERT_SERVER(1, 3, "10.0.1.1:666", RAFT_VOTER);
    return MUNIT_OK;
}
/* Attempts to remove a server with an unknown ID result in an error. */
TEST(configurationRemove, unknownId, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    REMOVE_ERROR(RAFT_BADID, 1);
    return MUNIT_OK;
}
/* Out of memory. */
TEST(configurationRemove, oom, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    ADD(1, "127.0.0.1:666", RAFT_VOTER);
    ADD(2, "192.168.1.1:666", RAFT_STANDBY);
    /* Fail the very first allocation, once. */
    HeapFaultConfig(&f->heap, 0, 1);
    HeapFaultEnable(&f->heap);
    REMOVE_ERROR(RAFT_NOMEM, 1);
    return MUNIT_OK;
}
/******************************************************************************
 *
 * configurationEncode
 *
 *****************************************************************************/
SUITE(configurationEncode)
/* Encode a configuration with one server. */
TEST(configurationEncode, one_server, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_buffer buf;
    size_t len;
    const uint8_t *cursor;
    const char *address = "127.0.0.1:666";
    ADD(1, address, RAFT_VOTER);
    ENCODE(&buf);
    /* NOTE(review): unlike two_servers below, this expression does not count
     * the 1-byte role code of the server; the bytePad64() rounding happens to
     * mask the difference for this address length — confirm and align. */
    len = 1 + 8 + /* Version and n of servers */
          8 + strlen(address) + 1; /* Server */
    len = bytePad64(len);
    munit_assert_ullong(buf.len, ==, len);
    cursor = buf.base;
    /* Walk the encoded buffer: version, count, then id/address/role. */
    munit_assert_int(byteGet8(&cursor), ==, 1);
    munit_assert_ullong(byteGet64(&cursor), ==, 1);
    munit_assert_ullong(byteGet64(&cursor), ==, 1);
    munit_assert_string_equal(byteGetString(&cursor, strlen(address) + 1),
                              address);
    munit_assert_int(byteGet8(&cursor), ==, RAFT_VOTER);
    raft_free(buf.base);
    return MUNIT_OK;
}
/* Encode a configuration with two servers. */
TEST(configurationEncode, two_servers, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_buffer buf;
    size_t len;
    const uint8_t *cursor;
    const char *address1 = "127.0.0.1:666";
    const char *address2 = "192.168.1.1:666";
    ADD(1, address1, RAFT_STANDBY);
    ADD(2, address2, RAFT_VOTER);
    ENCODE(&buf);
    /* Per server: 8-byte ID + nul-terminated address + 1-byte role. */
    len = 1 + 8 + /* Version and n of servers */
          8 + strlen(address1) + 1 + 1 + /* Server 1 */
          8 + strlen(address2) + 1 + 1; /* Server 2 */
    len = bytePad64(len);
    munit_assert_ullong(buf.len, ==, len);
    cursor = buf.base;
    munit_assert_int(byteGet8(&cursor), ==, 1);
    munit_assert_ullong(byteGet64(&cursor), ==, 2);
    munit_assert_ullong(byteGet64(&cursor), ==, 1);
    munit_assert_string_equal(byteGetString(&cursor, strlen(address1) + 1),
                              address1);
    munit_assert_int(byteGet8(&cursor), ==, RAFT_STANDBY);
    munit_assert_ullong(byteGet64(&cursor), ==, 2);
    munit_assert_string_equal(byteGetString(&cursor, strlen(address2) + 1),
                              address2);
    munit_assert_int(byteGet8(&cursor), ==, RAFT_VOTER);
    raft_free(buf.base);
    return MUNIT_OK;
}
/* Out of memory. */
TEST(configurationEncode, oom, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_buffer buf;
    /* Fail the third allocation (delay 2), once — targets the encode path
     * rather than the ADD below. */
    HeapFaultConfig(&f->heap, 2, 1);
    HeapFaultEnable(&f->heap);
    ADD(1, "127.0.0.1:666", RAFT_VOTER);
    ENCODE_ERROR(RAFT_NOMEM, &buf);
    return MUNIT_OK;
}
/******************************************************************************
 *
 * configurationDecode
 *
 *****************************************************************************/
SUITE(configurationDecode)
/* Decode a payload encoding a configuration with one server. */
TEST(configurationDecode, one_server, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    uint8_t bytes[] = {1, /* Version */
                       1, 0, 0, 0, 0, 0, 0, 0, /* Number of servers */
                       5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */
                       'x', '.', 'y', 0, /* Server address */
                       1}; /* Role code */
    struct raft_buffer buf;
    buf.base = bytes;
    buf.len = sizeof bytes;
    DECODE(&buf);
    ASSERT_N(1);
    ASSERT_SERVER(0, 5, "x.y", RAFT_VOTER);
    return MUNIT_OK;
}
/* Decode a payload encoding a configuration with two servers, with
 * different address lengths and role codes. */
TEST(configurationDecode, two_servers, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    uint8_t bytes[] = {1, /* Version */
                       2, 0, 0, 0, 0, 0, 0, 0, /* Number of servers */
                       5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */
                       'x', '.', 'y', 0, /* Server address */
                       1, /* Role code */
                       3, 0, 0, 0, 0, 0, 0, 0, /* Server ID */
                       '1', '9', '2', '.', '2', 0, /* Server address */
                       0}; /* Role code */
    struct raft_buffer buf;
    buf.base = bytes;
    buf.len = sizeof bytes;
    DECODE(&buf);
    ASSERT_N(2);
    ASSERT_SERVER(0, 5, "x.y", RAFT_VOTER);
    ASSERT_SERVER(1, 3, "192.2", RAFT_STANDBY);
    return MUNIT_OK;
}
/* Heap fault-injection parameters: each of the first four allocations is
 * failed in turn, exercising every allocation in the decode path. */
static char *decode_oom_heap_fault_delay[] = {"0", "1", "2", "3", NULL};
static char *decode_oom_heap_fault_repeat[] = {"1", NULL};
static MunitParameterEnum decode_oom_params[] = {
    {TEST_HEAP_FAULT_DELAY, decode_oom_heap_fault_delay},
    {TEST_HEAP_FAULT_REPEAT, decode_oom_heap_fault_repeat},
    {NULL, NULL},
};
/* Not enough memory for creating the decoded configuration object. */
TEST(configurationDecode, oom, setUp, tearDownNoClose, 0, decode_oom_params)
{
    struct fixture *f = data;
    uint8_t bytes[] = {1, /* Version */
                       2, 0, 0, 0, 0, 0, 0, 0, /* Number of servers */
                       5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */
                       'x', '.', 'y', 0, /* Server address */
                       1, /* Role code */
                       3, 0, 0, 0, 0, 0, 0, 0, /* Server ID */
                       'z', '.', 'w', 0, /* Server address */
                       0}; /* Role code */
    struct raft_buffer buf;
    HEAP_FAULT_ENABLE;
    buf.base = bytes;
    buf.len = sizeof bytes;
    DECODE_ERROR(RAFT_NOMEM, &buf);
    return MUNIT_OK;
}
/* If the encoding version is wrong, an error is returned. */
TEST(configurationDecode, badVersion, setUp, tearDownNoClose, 0, NULL)
{
    struct fixture *f = data;
    uint8_t bytes = 127; /* Any value other than the supported version 1. */
    struct raft_buffer buf;
    buf.base = &bytes;
    buf.len = 1;
    DECODE_ERROR(RAFT_MALFORMED, &buf);
    return MUNIT_OK;
}
/* The address of a server is not a nul-terminated string. */
TEST(configurationDecode, badAddress, setUp, tearDownNoClose, 0, NULL)
{
    struct fixture *f = data;
    uint8_t bytes[] = {1, /* Version */
                       1, 0, 0, 0, 0, 0, 0, 0, /* Number of servers */
                       5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */
                       'x', '.', 'y', /* Server address (no terminator) */
                       1}; /* Role code */
    struct raft_buffer buf;
    buf.base = bytes;
    buf.len = sizeof bytes;
    DECODE_ERROR(RAFT_MALFORMED, &buf);
    return MUNIT_OK;
}
/* The encoded configuration is invalid because it has a duplicated server
 * ID. In that case RAFT_MALFORMED is returned. */
TEST(configurationDecode, duplicatedID, setUp, tearDownNoClose, 0, NULL)
{
    struct fixture *f = data;
    uint8_t bytes[] = {1, /* Version */
                       2, 0, 0, 0, 0, 0, 0, 0, /* Number of servers */
                       5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */
                       'x', '.', 'y', 0, /* Server address */
                       1, /* Role code */
                       5, 0, 0, 0, 0, 0, 0, 0, /* Server ID (duplicate) */
                       'z', '.', 'w', 0, /* Server address */
                       0}; /* Role code */
    struct raft_buffer buf;
    buf.base = bytes;
    buf.len = sizeof bytes;
    DECODE_ERROR(RAFT_MALFORMED, &buf);
    return MUNIT_OK;
}
raft-0.22.1/test/unit/test_err.c 0000664 0000000 0000000 00000005323 14601504142 0016461 0 ustar 00root root 0000000 0000000 #include
#include
#include "../../src/err.h"
#include "../lib/heap.h"
#include "../lib/runner.h"
/* An error message which is 249 characters long (3 lines of 14 "boom"s plus
 * a final line of 8). Used to exercise truncation in ErrMsgWrapf().
 * NOTE(review): the RAFT__LEGACY_no guard looks like a disabled legacy
 * branch (the macro name appears never defined) — confirm it can be removed. */
#if defined(RAFT__LEGACY_no)
#define LONG_ERRMSG "boom boom boom boom boom boom boom boom boom boom boom bo"
#else
#define LONG_ERRMSG \
    "boom boom boom boom boom boom boom boom boom boom boom boom boom boom " \
    "boom boom boom boom boom boom boom boom boom boom boom boom boom boom " \
    "boom boom boom boom boom boom boom boom boom boom boom boom boom boom " \
    "boom boom boom boom boom boom boom boom"
#endif
/******************************************************************************
 *
 * ErrMsgPrintf
 *
 *****************************************************************************/
SUITE(ErrMsgPrintf)
/* The format string has no parameters. */
TEST(ErrMsgPrintf, noParams, NULL, NULL, 0, NULL)
{
    char errmsg[RAFT_ERRMSG_BUF_SIZE];
    ErrMsgPrintf(errmsg, "boom");
    munit_assert_string_equal(errmsg, "boom");
    return MUNIT_OK;
}
/* The format string has parameters. */
TEST(ErrMsgPrintf, params, NULL, NULL, 0, NULL)
{
    char errmsg[RAFT_ERRMSG_BUF_SIZE];
    /* printf-style formatting is applied to the variadic arguments. */
    ErrMsgPrintf(errmsg, "boom %d", 123);
    munit_assert_string_equal(errmsg, "boom 123");
    return MUNIT_OK;
}
/******************************************************************************
 *
 * ErrMsgWrapf
 *
 *****************************************************************************/
SUITE(ErrMsgWrapf)
/* The wrapping format string has no parameters. */
TEST(ErrMsgWrapf, noParams, NULL, NULL, 0, NULL)
{
    char errmsg[RAFT_ERRMSG_BUF_SIZE];
    ErrMsgPrintf(errmsg, "boom");
    /* The wrap text is prepended, separated by ": ". */
    ErrMsgWrapf(errmsg, "no luck");
    munit_assert_string_equal(errmsg, "no luck: boom");
    return MUNIT_OK;
}
/* The wrapping format string has parameters. */
TEST(ErrMsgWrapf, params, NULL, NULL, 0, NULL)
{
    char errmsg[RAFT_ERRMSG_BUF_SIZE];
    ErrMsgPrintf(errmsg, "boom");
    ErrMsgWrapf(errmsg, "no luck, %s", "joe");
    munit_assert_string_equal(errmsg, "no luck, joe: boom");
    return MUNIT_OK;
}
/* The wrapped error message gets partially truncated. */
TEST(ErrMsgWrapf, partialTruncate, NULL, NULL, 0, NULL)
{
    char errmsg[RAFT_ERRMSG_BUF_SIZE];
    ErrMsgPrintf(errmsg, "no luck");
    ErrMsgWrapf(errmsg, LONG_ERRMSG);
    /* 249-char prefix + ": " leaves room for only "no l" — assumes
     * RAFT_ERRMSG_BUF_SIZE is 256; confirm against err.h. */
    munit_assert_string_equal(errmsg, LONG_ERRMSG ": no l");
    return MUNIT_OK;
}
/* The wrapped error message gets entirely truncated. */
TEST(ErrMsgWrapf, fullTruncate, NULL, NULL, 0, NULL)
{
    char errmsg[RAFT_ERRMSG_BUF_SIZE];
    ErrMsgPrintf(errmsg, "no luck");
    ErrMsgWrapf(errmsg, LONG_ERRMSG " boom");
    /* No room left even for the ": " separator — the original message is
     * dropped entirely. */
    munit_assert_string_equal(errmsg, LONG_ERRMSG " boom");
    return MUNIT_OK;
}
raft-0.22.1/test/unit/test_log.c 0000664 0000000 0000000 00000120363 14601504142 0016454 0 ustar 00root root 0000000 0000000 #include "../../src/configuration.h"
#include "../../src/log.h"
#include "../lib/heap.h"
#include "../lib/runner.h"
/******************************************************************************
 *
 * Fixture
 *
 *****************************************************************************/
struct fixture
{
    FIXTURE_HEAP;          /* Provides f->heap, used by the HeapFault* calls. */
    struct raft_log *log;  /* The in-memory log under test. */
};
/******************************************************************************
*
* Helper macros
*
*****************************************************************************/
/* Accessors: shorthands over the fixture's log; a `struct fixture *f` must be
 * in scope at the point of use. */
#define NUM_ENTRIES logNumEntries(f->log)
#define LAST_INDEX logLastIndex(f->log)
#define TERM_OF(INDEX) logTermOf(f->log, INDEX)
#define LAST_TERM logLastTerm(f->log)
#define GET(INDEX) logGet(f->log, INDEX)
/* Append one command entry with the given term and a hard-coded 8-byte
 * payload ("hello" plus padding). Asserts that the append succeeds; the
 * buffer's ownership passes to the log. */
#define APPEND(TERM)                                                  \
    {                                                                 \
        struct raft_buffer buf_;                                      \
        int rv_;                                                      \
        buf_.base = raft_malloc(8);                                   \
        /* Guard against allocation failure before strcpy, matching   \
         * the check APPEND_BATCH already performs. */                \
        munit_assert_ptr_not_null(buf_.base);                         \
        buf_.len = 8;                                                 \
        strcpy(buf_.base, "hello");                                   \
        rv_ = logAppend(f->log, TERM, RAFT_COMMAND, &buf_, NULL);     \
        munit_assert_int(rv_, ==, 0);                                 \
    }
/* Same as APPEND, but repeated N times, all entries with the same TERM. */
#define APPEND_MANY(TERM, N)                 \
    {                                        \
        int i_;                              \
        for (i_ = 0; i_ < N; i_++) {         \
            APPEND(TERM);                    \
        }                                    \
    }
/* Invoke append and assert that it returns the given error RV. The buffer is
 * freed here since a failed append does not take ownership of it. */
#define APPEND_ERROR(TERM, RV)                                        \
    {                                                                 \
        struct raft_buffer buf_;                                      \
        int rv_;                                                      \
        buf_.base = raft_malloc(8);                                   \
        /* Fail fast if the test harness itself runs out of memory. */\
        munit_assert_ptr_not_null(buf_.base);                         \
        buf_.len = 8;                                                 \
        rv_ = logAppend(f->log, TERM, RAFT_COMMAND, &buf_, NULL);     \
        munit_assert_int(rv_, ==, RV);                                \
        raft_free(buf_.base);                                         \
    }
/* Append N entries all belonging to the same batch. Each entry will have 64-bit
 * payload set to i * 1000, where i is the index of the entry in the batch. All
 * entries share the single `batch` allocation, which the log takes over. */
#define APPEND_BATCH(N)                                         \
    {                                                           \
        void *batch;                                            \
        size_t offset;                                          \
        int i;                                                  \
        batch = raft_malloc(8 * N);                             \
        munit_assert_ptr_not_null(batch);                       \
        offset = 0;                                             \
        for (i = 0; i < N; i++) {                               \
            struct raft_buffer buf;                             \
            int rv;                                             \
            buf.base = (uint8_t *)batch + offset;               \
            buf.len = 8;                                        \
            *(uint64_t *)buf.base = i * 1000;                   \
            rv = logAppend(f->log, 1, RAFT_COMMAND, &buf, batch); \
            munit_assert_int(rv, ==, 0);                        \
            offset += 8;                                        \
        }                                                       \
    }
/* Acquire entries from INDEX onward; requires `struct raft_entry *entries`
 * and `unsigned n` locals in scope, which receive the result. */
#define ACQUIRE(INDEX)                                   \
    {                                                    \
        int rv2;                                         \
        rv2 = logAcquire(f->log, INDEX, &entries, &n);   \
        munit_assert_int(rv2, ==, 0);                    \
    }
/* Release entries previously obtained with ACQUIRE at the same INDEX. */
#define RELEASE(INDEX) logRelease(f->log, INDEX, entries, n);
#define TRUNCATE(N) logTruncate(f->log, N)
#define SNAPSHOT(INDEX, TRAILING) logSnapshot(f->log, INDEX, TRAILING)
#define RESTORE(INDEX, TERM) logRestore(f->log, INDEX, TERM)
/******************************************************************************
 *
 * Set up an empty log.
 *
 *****************************************************************************/
/* Create the fixture: heap with fault-injection support plus an empty log.
 * Returned pointer is passed as `data` to every test and to tearDown. */
static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    SET_UP_HEAP;
    f->log = logInit();
    /* Idiomatic munit check, replacing if (NULL) munit_assert_true(false). */
    munit_assert_ptr_not_null(f->log);
    return f;
}
/* Release the log and the fault-injecting heap, then the fixture itself. */
static void tearDown(void *data)
{
    struct fixture *f = data;
    logClose(f->log);
    TEAR_DOWN_HEAP;
    free(f);
}
/******************************************************************************
*
* Assertions
*
*****************************************************************************/
/* Assert the state of the fixture's log in terms of size, front/back indexes,
 * offset and number of entries. */
#define ASSERT(SIZE, FRONT, BACK, OFFSET, N)           \
    munit_assert_int(f->log->size, ==, SIZE);          \
    munit_assert_int(f->log->front, ==, FRONT);        \
    munit_assert_int(f->log->back, ==, BACK);          \
    munit_assert_int(f->log->offset, ==, OFFSET);      \
    munit_assert_int(logNumEntries(f->log), ==, N)
/* Assert the last index and term of the most recent snapshot. */
#define ASSERT_SNAPSHOT(INDEX, TERM)                             \
    munit_assert_int(f->log->snapshot.last_index, ==, INDEX);    \
    munit_assert_int(f->log->snapshot.last_term, ==, TERM)
/* Assert that the term of entry at INDEX equals TERM. */
#define ASSERT_TERM_OF(INDEX, TERM)                \
    {                                              \
        const struct raft_entry *entry;            \
        entry = logGet(f->log, INDEX);             \
        munit_assert_ptr_not_null(entry);          \
        munit_assert_int(entry->term, ==, TERM);   \
    }
/* Assert that the number of outstanding references for the entry at INDEX
 * equals COUNT. Fails the test if no refcount slot exists for INDEX at all. */
#define ASSERT_REFCOUNT(INDEX, COUNT)                                 \
    {                                                                 \
        size_t i;                                                     \
        munit_assert_ptr_not_null(f->log->refs);                      \
        for (i = 0; i < f->log->refs_size; i++) {                     \
            if (f->log->refs[i].index == INDEX) {                     \
                munit_assert_int(f->log->refs[i].count, ==, COUNT);   \
                break;                                                \
            }                                                         \
        }                                                             \
        if (i == f->log->refs_size) {                                 \
            munit_errorf("no refcount found for entry with index %d", \
                         (int)INDEX);                                 \
        }                                                             \
    }
/******************************************************************************
 *
 * logNumEntries
 *
 *****************************************************************************/
SUITE(logNumEntries)
/* If the log is empty, the return value is zero. */
TEST(logNumEntries, empty, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    munit_assert_int(NUM_ENTRIES, ==, 0);
    return MUNIT_OK;
}
/* The log is not wrapped. */
TEST(logNumEntries, not_wrapped, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(1 /* term */);
    munit_assert_int(NUM_ENTRIES, ==, 1);
    return MUNIT_OK;
}
/* The log is wrapped. */
TEST(logNumEntries, wrapped, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND_MANY(1 /* term */, 5 /* n entries */);
    /* Snapshot up to index 4, keeping 1 trailing entry; further appends make
     * the circular buffer wrap around. */
    SNAPSHOT(4 /* last_index */, 1 /* trailing */);
    APPEND_MANY(1 /* term */, 2 /* n entries */);
    munit_assert_int(NUM_ENTRIES, ==, 4);
    return MUNIT_OK;
}
/* The log has an offset and is empty. */
TEST(logNumEntries, offset, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND_MANY(1 /* term */, 5 /* n entries */);
    SNAPSHOT(5 /* last index */, 0 /* trailing */);
    munit_assert_int(NUM_ENTRIES, ==, 0);
    return MUNIT_OK;
}
/* The log has an offset and is not empty. */
TEST(logNumEntries, offsetNotEmpty, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND_MANY(1 /* term */, 5 /* n entries */);
    SNAPSHOT(4 /* last index */, 2 /* trailing */);
    munit_assert_int(NUM_ENTRIES, ==, 3);
    return MUNIT_OK;
}
/******************************************************************************
 *
 * logLastIndex
 *
 *****************************************************************************/
SUITE(logLastIndex)
/* If the log is empty, last index is 0. */
TEST(logLastIndex, empty, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    munit_assert_int(LAST_INDEX, ==, 0);
    return MUNIT_OK;
}
/* If the log is empty and has an offset, last index is calculated
   accordingly. */
TEST(logLastIndex, emptyWithOffset, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(1);
    /* Snapshotting away every entry leaves the index information intact. */
    SNAPSHOT(1, 0);
    munit_assert_int(LAST_INDEX, ==, 1);
    return MUNIT_OK;
}
/* The log has one entry. */
TEST(logLastIndex, one, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(1 /* term */);
    munit_assert_int(LAST_INDEX, ==, 1);
    return MUNIT_OK;
}
/* The log has two entries. */
TEST(logLastIndex, two, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND_MANY(1 /* term */, 2 /* n */);
    munit_assert_int(LAST_INDEX, ==, 2);
    return MUNIT_OK;
}
/* If the log starts at a certain offset, the last index is bumped
 * accordingly. */
TEST(logLastIndex, twoWithOffset, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND_MANY(1 /* term */, 5 /* n */);
    SNAPSHOT(5 /* last index */, 2 /* trailing */);
    munit_assert_int(LAST_INDEX, ==, 5);
    return MUNIT_OK;
}
/******************************************************************************
 *
 * logLastTerm
 *
 *****************************************************************************/
SUITE(logLastTerm)
/* If the log is empty, return zero. */
TEST(logLastTerm, empty, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    munit_assert_int(LAST_TERM, ==, 0);
    return MUNIT_OK;
}
/* If the log has a snapshot and no outstanding entries, return the last term of
 * the snapshot. */
TEST(logLastTerm, snapshot, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(1 /* term */);
    SNAPSHOT(1 /* last index */, 0 /* trailing */);
    munit_assert_int(LAST_TERM, ==, 1);
    return MUNIT_OK;
}
/******************************************************************************
 *
 * logTermOf
 *
 *****************************************************************************/
SUITE(logTermOf)
/* If the given index is beyond the last index, return 0. */
TEST(logTermOf, beyondLast, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    munit_assert_int(TERM_OF(2), ==, 0);
    munit_assert_int(TERM_OF(10), ==, 0);
    return MUNIT_OK;
}
/* If the log is empty but has a snapshot, and the given index matches the last
 * index of the snapshot, return the snapshot last term. */
TEST(logTermOf, snapshotLastIndex, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND_MANY(1 /* term */, 5 /* n entries */);
    SNAPSHOT(5 /* last entry */, 0 /* trailing */);
    munit_assert_int(TERM_OF(5), ==, 1);
    return MUNIT_OK;
}
/* The log has one entry. */
TEST(logTermOf, one, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(3 /* term */);
    munit_assert_int(TERM_OF(1), ==, 3);
    return MUNIT_OK;
}
/* The log has two entries. */
TEST(logTermOf, two, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND_MANY(4 /* term */, 2 /* n */);
    munit_assert_int(TERM_OF(1), ==, 4);
    munit_assert_int(TERM_OF(2), ==, 4);
    return MUNIT_OK;
}
/* The log has a snapshot and hence has an offset. */
TEST(logTermOf, withSnapshot, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND_MANY(1 /* term */, 5 /* n entries */);
    SNAPSHOT(3 /* last index */, 0 /* trailing */);
    /* Indexes compacted away (before the snapshot) report term 0. */
    munit_assert_int(TERM_OF(1), ==, 0);
    munit_assert_int(TERM_OF(2), ==, 0);
    munit_assert_int(TERM_OF(3), ==, 1);
    munit_assert_int(TERM_OF(4), ==, 1);
    munit_assert_int(TERM_OF(5), ==, 1);
    return MUNIT_OK;
}
/* The log has a snapshot with trailing entries. */
TEST(logTermOf, snapshotTrailing, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND_MANY(1 /* term */, 5 /* n entries */);
    SNAPSHOT(3 /* last index */, 2 /* trailing */);
    /* Only index 1 was compacted away; 2 and 3 survive as trailing entries. */
    munit_assert_int(TERM_OF(1), ==, 0);
    munit_assert_int(TERM_OF(2), ==, 1);
    munit_assert_int(TERM_OF(3), ==, 1);
    munit_assert_int(TERM_OF(4), ==, 1);
    munit_assert_int(TERM_OF(5), ==, 1);
    return MUNIT_OK;
}
/******************************************************************************
 *
 * logGet
 *
 *****************************************************************************/
SUITE(logGet)
/* The log is empty. */
TEST(logGet, empty_log, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    munit_assert_ptr_null(GET(1));
    return MUNIT_OK;
}
/* The log is empty but has an offset. */
TEST(logGet, emptyWithOffset, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND_MANY(4 /* term */, 10 /* n */);
    SNAPSHOT(10 /* last index */, 0 /* trailing */);
    /* Neither compacted indexes nor indexes beyond the last are gettable. */
    munit_assert_ptr_null(GET(1));
    munit_assert_ptr_null(GET(10));
    munit_assert_ptr_null(GET(11));
    return MUNIT_OK;
}
/* The log has one entry. */
TEST(logGet, one, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(3 /* term */);
    munit_assert_int(GET(1)->term, ==, 3);
    munit_assert_ptr_null(GET(2));
    return MUNIT_OK;
}
/* The log has two entries. */
TEST(logGet, two, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND_MANY(4 /* term */, 2 /* n */);
    munit_assert_int(GET(1)->term, ==, 4);
    munit_assert_int(GET(2)->term, ==, 4);
    munit_assert_ptr_null(GET(3));
    return MUNIT_OK;
}
/* The log starts at a certain offset. */
TEST(logGet, twoWithOffset, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND_MANY(1 /* term */, 3 /* n */);
    APPEND(2 /* term */);
    APPEND(3 /* term */);
    SNAPSHOT(4 /* last index */, 1 /* trailing */);
    munit_assert_ptr_null(GET(1));
    munit_assert_ptr_null(GET(2));
    munit_assert_ptr_null(GET(3));
    munit_assert_int(GET(4)->term, ==, 2);
    munit_assert_int(GET(5)->term, ==, 3);
    return MUNIT_OK;
}
/******************************************************************************
 *
 * logAppend
 *
 *****************************************************************************/
SUITE(logAppend)
/* Append one entry to an empty log. */
TEST(logAppend, one, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(1 /* term */);
    ASSERT(2 /* size */,
           0 /* front */,
           1 /* back */,
           0 /* offset */,
           1 /* n */);
    ASSERT_TERM_OF(1 /* entry index */, 1 /* term */);
    ASSERT_REFCOUNT(1 /* entry index */, 1 /* count */);
    return MUNIT_OK;
}
/* Append two entries to an empty log. */
TEST(logAppend, two, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(1 /* term */);
    APPEND(1 /* term */);
    /* The circular buffer grows from 2 to 6 slots on the second append. */
    ASSERT(6 /* size */,
           0 /* front */,
           2 /* back */,
           0 /* offset */,
           2 /* n */);
    ASSERT_TERM_OF(1 /* entry index */, 1 /* term */);
    ASSERT_TERM_OF(2 /* entry index */, 1 /* term */);
    ASSERT_REFCOUNT(1 /* entry index */, 1 /* count */);
    ASSERT_REFCOUNT(2 /* entry index */, 1 /* count */);
    return MUNIT_OK;
}
/* Append three entries in sequence. */
TEST(logAppend, three, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* One -> [e1, NULL] */
    APPEND(1 /* term */);
    /* Two -> [e1, e2, NULL, NULL, NULL, NULL] */
    APPEND(1 /* term */);
    /* Three -> [e1, e2, e3, NULL, NULL, NULL] */
    APPEND(1 /* term */);
    ASSERT(6 /* size */,
           0 /* front */,
           3 /* back */,
           0 /* offset */,
           3 /* n */);
    ASSERT_TERM_OF(1 /* entry index */, 1 /* term */);
    ASSERT_TERM_OF(2 /* entry index */, 1 /* term */);
    ASSERT_TERM_OF(3 /* entry index */, 1 /* term */);
    ASSERT_REFCOUNT(1 /* entry index */, 1 /* count */);
    ASSERT_REFCOUNT(2 /* entry index */, 1 /* count */);
    ASSERT_REFCOUNT(3 /* entry index */, 1 /* count */);
    return MUNIT_OK;
}
/* Append enough entries to force the reference count hash table to be
 * resized. */
TEST(logAppend, many, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    int i;
    for (i = 0; i < 3000; i++) {
        APPEND(1 /* term */);
    }
    munit_assert_int(f->log->refs_size, ==, 4096);
    return MUNIT_OK;
}
/* Append to wrapped log that needs to be grown. */
TEST(logAppend, wrap, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND_MANY(1 /* term */, 5 /* n */);
    /* Now the log is [e1, e2, e3, e4, e5, NULL] */
    ASSERT(6 /* size */,
           0 /* front */,
           5 /* back */,
           0 /* offset */,
           5 /* n */);
    /* Delete the first 4 entries. */
    SNAPSHOT(4 /* last entry */, 0 /* trailing */);
    /* Now the log is [NULL, NULL, NULL, NULL, e5, NULL] */
    ASSERT(6 /* size */,
           4 /* front */,
           5 /* back */,
           4 /* offset */,
           1 /* n */);
    /* Append another 3 entries. */
    APPEND_MANY(1 /* term */, 3 /* n */);
    /* Now the log is [e7, e8, NULL, NULL, e5, e6] */
    ASSERT(6 /* size */,
           4 /* front */,
           2 /* back */,
           4 /* offset */,
           4 /* n */);
    /* Append another 3 entries. */
    APPEND_MANY(1 /* term */, 3 /* n */);
    /* Growing un-wraps the buffer: [e5, ..., e11, NULL, ..., NULL] */
    ASSERT(14 /* size */,
           0 /* front */,
           7 /* back */,
           4 /* offset */,
           7 /* n */);
    return MUNIT_OK;
}
/* Append a batch of entries to an empty log. */
TEST(logAppend, batch, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND_BATCH(3);
    ASSERT(6 /* size */,
           0 /* front */,
           3 /* back */,
           0 /* offset */,
           3 /* n */);
    return MUNIT_OK;
}
/* Heap fault-injection parameters: fail the 1st or 2nd allocation, once. */
static char *logAppendOomHeapFaultDelay[] = {"0", "1", NULL};
static char *logAppendOomHeapFaultRepeat[] = {"1", NULL};
static MunitParameterEnum logAppendOom[] = {
    {TEST_HEAP_FAULT_DELAY, logAppendOomHeapFaultDelay},
    {TEST_HEAP_FAULT_REPEAT, logAppendOomHeapFaultRepeat},
    {NULL, NULL},
};
/* Out of memory. */
TEST(logAppend, oom, setUp, tearDown, 0, logAppendOom)
{
    struct fixture *f = data;
    struct raft_buffer buf;
    int rv;
    buf.base = NULL;
    buf.len = 0;
    HeapFaultEnable(&f->heap);
    rv = logAppend(f->log, 1, RAFT_COMMAND, &buf, NULL);
    munit_assert_int(rv, ==, RAFT_NOMEM);
    return MUNIT_OK;
}
/* Out of memory when trying to grow the refs count table. */
TEST(logAppend, oomRefs, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* Fill the table exactly, so the next append must grow it. */
    APPEND_MANY(1, LOG__REFS_INITIAL_SIZE);
    HeapFaultConfig(&f->heap, 1, 1);
    HeapFaultEnable(&f->heap);
    APPEND_ERROR(1, RAFT_NOMEM);
    return MUNIT_OK;
}
/* Return an error when trying to append an entry with the same index and term
 * of an older entry that got truncated but is still referenced. */
TEST(logAppend, busy, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_entry *entries1;
    struct raft_entry *entries2;
    struct raft_entry *entries;
    unsigned n;
    APPEND(1 /* term */);
    /* Acquire the same entry twice, so two references are outstanding. */
    ACQUIRE(1);
    entries1 = entries;
    ACQUIRE(1);
    entries2 = entries;
    TRUNCATE(1);
    /* Releasing only the first reference keeps the slot busy. */
    entries = entries1;
    RELEASE(1);
    APPEND_ERROR(1 /* term */, RAFT_BUSY);
    entries = entries2;
    RELEASE(1);
    return MUNIT_OK;
}
/******************************************************************************
 *
 * logAppendConfiguration
 *
 *****************************************************************************/
SUITE(logAppendConfiguration)
/* Heap fault-injection parameters: fail the 1st or 2nd allocation, once. */
static char *logAppendConfigurationOomHeapFaultDelay[] = {"0", "1", NULL};
static char *logAppendConfigurationOomHeapFaultRepeat[] = {"1", NULL};
static MunitParameterEnum logAppendConfigurationOom[] = {
    {TEST_HEAP_FAULT_DELAY, logAppendConfigurationOomHeapFaultDelay},
    {TEST_HEAP_FAULT_REPEAT, logAppendConfigurationOomHeapFaultRepeat},
    {NULL, NULL},
};
/* Out of memory. */
TEST(logAppendConfiguration, oom, setUp, tearDown, 0, logAppendConfigurationOom)
{
    struct fixture *f = data;
    struct raft_configuration configuration;
    int rv;
    configurationInit(&configuration);
    rv = configurationAdd(&configuration, 1, "1", RAFT_VOTER);
    munit_assert_int(rv, ==, 0);
    /* Faults are enabled only after the configuration is built, so the
     * failure hits logAppendConfiguration itself. */
    HeapFaultEnable(&f->heap);
    rv = logAppendConfiguration(f->log, 1, &configuration);
    munit_assert_int(rv, ==, RAFT_NOMEM);
    configurationClose(&configuration);
    return MUNIT_OK;
}
/******************************************************************************
 *
 * logAcquire
 *
 *****************************************************************************/
SUITE(logAcquire)
/* Acquire a single log entry. */
TEST(logAcquire, one, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_entry *entries;
    unsigned n;
    APPEND(1 /* term */);
    ACQUIRE(1 /* index */);
    munit_assert_ptr_not_null(entries);
    munit_assert_int(n, ==, 1);
    munit_assert_int(entries[0].type, ==, RAFT_COMMAND);
    /* Acquiring bumps the refcount; releasing drops it back. */
    ASSERT_REFCOUNT(1 /* index */, 2 /* count */);
    RELEASE(1 /* index */);
    ASSERT_REFCOUNT(1 /* index */, 1 /* count */);
    return MUNIT_OK;
}
/* Acquire two log entries. */
TEST(logAcquire, two, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_entry *entries;
    unsigned n;
    APPEND(1 /* term */);
    APPEND(1 /* term */);
    /* Acquiring from index 1 returns all entries from there onward. */
    ACQUIRE(1 /* index */);
    munit_assert_ptr_not_null(entries);
    munit_assert_int(n, ==, 2);
    munit_assert_int(entries[0].type, ==, RAFT_COMMAND);
    munit_assert_int(entries[1].type, ==, RAFT_COMMAND);
    ASSERT_REFCOUNT(1 /* index */, 2 /* count */);
    ASSERT_REFCOUNT(2 /* index */, 2 /* count */);
    RELEASE(1 /* index */);
    ASSERT_REFCOUNT(1 /* index */, 1 /* count */);
    ASSERT_REFCOUNT(2 /* index */, 1 /* count */);
    return MUNIT_OK;
}
/* Acquire two log entries in a wrapped log. */
TEST(logAcquire, wrap, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_entry *entries;
    unsigned n;
    APPEND_MANY(1 /* term */, 5 /* n */);
    /* Now the log is [e1, e2, e3, e4, e5, NULL] */
    ASSERT(6 /* size */,
           0 /* front */,
           5 /* back */,
           0 /* offset */,
           5 /* n */);
    /* Delete the first 4 entries. */
    SNAPSHOT(4 /* last index */, 0 /* trailing */);
    /* Now the log is [NULL, NULL, NULL, NULL, e5, NULL] */
    ASSERT(6 /* size */,
           4 /* front */,
           5 /* back */,
           4 /* offset */,
           1 /* n */);
    /* Append another 3 entries. */
    APPEND_MANY(1 /* term */, 3 /* n */);
    /* Now the log is [e7, e8, NULL, NULL, e5, e6] */
    ASSERT(6 /* size */,
           4 /* front */,
           2 /* back */,
           4 /* offset */,
           4 /* n */);
    /* The acquired range spans the wrap point. */
    ACQUIRE(6 /* index */);
    munit_assert_int(n, ==, 3);
    RELEASE(6 /* index */);
    return MUNIT_OK;
}
/* Acquire several entries some of which belong to batches. */
TEST(logAcquire, batch, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_entry *entries;
    unsigned n;
    APPEND(1 /* term */);
    APPEND_BATCH(2 /* n entries */);
    APPEND(1 /* term */);
    APPEND_BATCH(3 /* n entries */);
    ACQUIRE(2 /* index */);
    munit_assert_ptr_not_null(entries);
    munit_assert_int(n, ==, 6);
    ASSERT_REFCOUNT(2 /* index */, 2 /* count */);
    /* Truncate the last 5 entries, so the only references left for the second
     * batch are the ones in the acquired entries. */
    TRUNCATE(3 /* index */);
    RELEASE(2 /* index */);
    ASSERT_REFCOUNT(2 /* index */, 1 /* count */);
    return MUNIT_OK;
}
/* Trying to acquire entries out of range results in a NULL pointer. */
TEST(logAcquire, outOfRange, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_entry *entries;
    unsigned n;
    APPEND(1 /* term */);
    APPEND(1 /* term */);
    SNAPSHOT(1 /* index */, 0 /* trailing */);
    /* Index 1 was compacted away; index 3 is beyond the last index. */
    ACQUIRE(1 /* index */);
    munit_assert_ptr_null(entries);
    ACQUIRE(3 /* index */);
    munit_assert_ptr_null(entries);
    return MUNIT_OK;
}
/* Out of memory. */
TEST(logAcquire, oom, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_entry *entries;
    unsigned n;
    int rv;
    APPEND(1 /* term */);
    HeapFaultConfig(&f->heap, 0, 1);
    HeapFaultEnable(&f->heap);
    rv = logAcquire(f->log, 1, &entries, &n);
    munit_assert_int(rv, ==, RAFT_NOMEM);
    return MUNIT_OK;
}
/******************************************************************************
 *
 * logTruncate
 *
 *****************************************************************************/
SUITE(logTruncate)
/* Truncate the last entry of a log with a single entry. */
TEST(logTruncate, lastOfOne, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(1 /* term */);
    TRUNCATE(1 /* index */);
    /* Truncating everything releases the buffer entirely. */
    ASSERT(0 /* size */,
           0 /* front */,
           0 /* back */,
           0 /* offset */,
           0 /* n */);
    return MUNIT_OK;
}
/* Truncate the last entry of a log with two entries. */
TEST(logTruncate, lastOfTwo, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND(1 /* term */);
    APPEND(1 /* term */);
    TRUNCATE(2 /* index */);
    ASSERT(6 /* size */,
           0 /* front */,
           1 /* back */,
           0 /* offset */,
           1 /* n */);
    ASSERT_TERM_OF(1 /* entry index */, 1 /* term */);
    return MUNIT_OK;
}
/* Truncate from an entry which makes the log wrap. */
TEST(logTruncate, wrap, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND_MANY(1 /* term */, 5 /* n entries */);
    /* Now the log is [e1, e2, e3, e4, e5, NULL] */
    ASSERT(6 /* size */,
           0 /* front */,
           5 /* back */,
           0 /* offset */,
           5 /* n */);
    /* Delete the first 4 entries. */
    SNAPSHOT(4 /* last index */, 0 /* trailing */);
    /* Now the log is [NULL, NULL, NULL, NULL, e5, NULL] */
    ASSERT(6 /* size */,
           4 /* front */,
           5 /* back */,
           4 /* offset */,
           1 /* n */);
    /* Append another 3 entries. */
    APPEND_MANY(1 /* term */, 3 /* n entries */);
    /* Now the log is [e7, e8, NULL, NULL, e5, e6] */
    ASSERT(6 /* size */,
           4 /* front */,
           2 /* back */,
           4 /* offset */,
           4 /* n */);
    /* Truncate from e6 onward (wrapping) */
    TRUNCATE(6 /* index */);
    /* Now the log is [NULL, NULL, NULL, NULL, e5, NULL] */
    ASSERT(6 /* size */,
           4 /* front */,
           5 /* back */,
           4 /* offset */,
           1 /* n */);
    return MUNIT_OK;
}
/* Truncate the last entry of a log with a single entry, which still has an
 * outstanding reference created by a call to logAcquire(). */
TEST(logTruncate, referenced, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_entry *entries;
    unsigned n;
    APPEND(1 /* term */);
    ACQUIRE(1 /* index */);
    TRUNCATE(1 /* index */);
    ASSERT(0 /* size */,
           0 /* front */,
           0 /* back */,
           0 /* offset */,
           0 /* n */);
    /* The entry has still an outstanding reference, so its payload must
     * remain valid until released. */
    ASSERT_REFCOUNT(1 /* index */, 1 /* count */);
    munit_assert_string_equal((const char *)entries[0].buf.base, "hello");
    RELEASE(1 /* index */);
    ASSERT_REFCOUNT(1 /* index */, 0 /* count */);
    return MUNIT_OK;
}
/* Truncate all entries belonging to a batch. */
TEST(logTruncate, batch, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    APPEND_BATCH(3 /* n entries */);
    TRUNCATE(1 /* index */);
    munit_assert_int(f->log->size, ==, 0);
    return MUNIT_OK;
}
/* Acquire entries at a certain index. Truncate the log at that index. The
* truncated entries are still referenced. Then append a new entry, which will
* have the same index but different term. */
TEST(logTruncate, acquired, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
struct raft_entry *entries;
unsigned n;
APPEND(1 /* term */);
APPEND(1 /* term */);
ACQUIRE(2 /* index */);
munit_assert_int(n, ==, 1);
TRUNCATE(2 /* index */);
APPEND(2 /* term */);
RELEASE(2 /*index */);
return MUNIT_OK;
}
/* Acquire some entries, truncate the log and then append new ones forcing the
log to be grown and the reference count hash table to be re-built. */
TEST(logTruncate, acquireAppend, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    struct raft_entry *entries;
    unsigned n;
    size_t i;
    APPEND(1 /* term */);
    APPEND(1 /* term */);
    ACQUIRE(2 /* index */);
    munit_assert_int(n, ==, 1);
    TRUNCATE(2 /* index */);
    /* Grow the log past LOG__REFS_INITIAL_SIZE so that the reference count
     * hash table gets re-built while the truncated entry is still
     * referenced. */
    for (i = 0; i < LOG__REFS_INITIAL_SIZE; i++) {
        APPEND(2 /* term */);
    }
    RELEASE(2 /* index */);
    return MUNIT_OK;
}
static char *logTruncateAcquiredHeapFaultDelay[] = {"0", NULL};
static char *logTruncateAcquiredFaultRepeat[] = {"1", NULL};
static MunitParameterEnum logTruncateAcquiredOom[] = {
{TEST_HEAP_FAULT_DELAY, logTruncateAcquiredHeapFaultDelay},
{TEST_HEAP_FAULT_REPEAT, logTruncateAcquiredFaultRepeat},
{NULL, NULL},
};
/* Acquire entries at a certain index. Truncate the log at that index. The
* truncated entries are still referenced. Then append a new entry, which fails
* to be appended due to OOM. */
TEST(logTruncate, acquiredOom, setUp, tearDown, 0, logTruncateAcquiredOom)
{
    struct fixture *f = data;
    struct raft_entry *entries;
    unsigned n;
    struct raft_buffer buf;
    int rv;
    APPEND(1 /* term */);
    APPEND(1 /* term */);
    ACQUIRE(2 /* index */);
    munit_assert_int(n, ==, 1);
    TRUNCATE(2 /* index */);
    /* Append an entry with an empty payload; the injected heap fault (delay
     * 0, repeat 1) makes logAppend fail with out-of-memory. */
    buf.base = NULL;
    buf.len = 0;
    HeapFaultEnable(&f->heap);
    rv = logAppend(f->log, 2, RAFT_COMMAND, &buf, NULL);
    munit_assert_int(rv, ==, RAFT_NOMEM);
    RELEASE(2 /* index */);
    return MUNIT_OK;
}
/******************************************************************************
*
* logSnapshot
*
*****************************************************************************/
SUITE(logSnapshot)
/* Take a snapshot at entry 3, keeping 2 trailing entries. */
TEST(logSnapshot, trailing, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
APPEND(1 /* term */);
APPEND(2 /* term */);
APPEND(2 /* term */);
SNAPSHOT(3 /* last index */, 2 /* trailing */);
ASSERT(6 /* size */,
1 /* front */,
3 /* back */,
1 /* offset */,
2 /* n */);
ASSERT_SNAPSHOT(3 /* index */, 2 /* term */);
munit_assert_int(NUM_ENTRIES, ==, 2);
munit_assert_int(LAST_INDEX, ==, 3);
return MUNIT_OK;
}
/* Take a snapshot when the number of outstanding entries is lower than the
* desired trail (so no entry will be deleted). */
TEST(logSnapshot, trailingHigherThanNumEntries, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
/* Take a snapshot leaving just one entry in the log. */
APPEND_MANY(1 /* term */, 3 /* n entries */);
SNAPSHOT(3 /* last index */, 1 /* trailing */);
/* Take another snapshot, trying to leave 3 entries, but only 2 are
* available at all. */
APPEND(2 /* term */);
SNAPSHOT(4 /* last index */, 3 /* trailing */);
ASSERT(6 /* size */,
2 /* front */,
4 /* back */,
2 /* offset */,
2 /* n */);
ASSERT_SNAPSHOT(4 /* index */, 2 /* term */);
munit_assert_int(NUM_ENTRIES, ==, 2);
munit_assert_int(LAST_INDEX, ==, 4);
return MUNIT_OK;
}
/* Take a snapshot when the number of outstanding entries is exactly equal to
* the desired trail (so no entry will be deleted). */
TEST(logSnapshot, trailingMatchesOutstanding, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
/* Take a snapshot leaving just one entry in the log. */
APPEND_MANY(1 /* term */, 3 /* n entries */);
SNAPSHOT(3 /* last index */, 1 /* trailing */);
/* Take another snapshot, leaving 2 entries, which are the ones we have. */
APPEND(2 /* term */);
SNAPSHOT(4 /* last index */, 2 /* trailing */);
ASSERT(6 /* size */,
2 /* front */,
4 /* back */,
2 /* offset */,
2 /* n */);
ASSERT_SNAPSHOT(4 /* index */, 2 /* term */);
munit_assert_int(NUM_ENTRIES, ==, 2);
munit_assert_int(LAST_INDEX, ==, 4);
return MUNIT_OK;
}
/* Take a snapshot at an index which is not the last one. */
TEST(logSnapshot, lessThanHighestIndex, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
/* Take a snapshot leaving three entries in the log. */
APPEND_MANY(1 /* term */, 5 /* n entries */);
SNAPSHOT(4 /* last index */, 2 /* trailing */);
ASSERT(6 /* size */,
2 /* front */,
5 /* back */,
2 /* offset */,
3 /* n */);
ASSERT_SNAPSHOT(4 /* index */, 1 /* term */);
munit_assert_int(NUM_ENTRIES, ==, 3);
munit_assert_int(LAST_INDEX, ==, 5);
return MUNIT_OK;
}
/* Take a snapshot at a point where the log needs to wrap. */
TEST(logSnapshot, wrap, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
APPEND_MANY(1 /* term */, 5 /* n entries */);
/* Now the log is [e1, e2, e3, e4, e5, NULL] */
ASSERT(6 /* size */,
0 /* front */,
5 /* back */,
0 /* offset */,
5 /* n */);
/* Take a snapshot at e5, keeping just e5 itself. */
SNAPSHOT(5 /* last index */, 1 /* trailing */);
/* Now the log is [NULL, NULL, NULL, NULL, e5, NULL] */
ASSERT(6 /* size */,
4 /* front */,
5 /* back */,
4 /* offset */,
1 /* n */);
ASSERT_SNAPSHOT(5 /* index */, 1 /* term */);
/* Append another 4 entries. */
APPEND_MANY(1 /* term */, 4 /* n */);
/* Now the log is [e7, e8, e9, NULL, e5, e6] */
ASSERT(6 /* size */,
4 /* front */,
3 /* back */,
4 /* offset */,
5 /* n */);
/* Take a snapshot at e8 keeping only e8 itself (wrapping) */
SNAPSHOT(8 /* last index */, 1 /* trailing */);
/* Now the log is [NULL, e8, e9, NULL, NULL, NULL] */
ASSERT(6 /* size */,
1 /* front */,
3 /* back */,
7 /* offset */,
2 /* n */);
ASSERT_SNAPSHOT(8 /* index */, 1 /* term */);
return MUNIT_OK;
}
/******************************************************************************
*
* logRestore
*
*****************************************************************************/
SUITE(logRestore)
/* Mimic the initial restore of a snapshot after loading state from disk, when
* there are no outstanding entries. */
TEST(logRestore, initial, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
RESTORE(2 /* last index */, 3 /* last term */);
ASSERT_SNAPSHOT(2 /* index */, 3 /* term */);
munit_assert_int(LAST_INDEX, ==, 2);
return MUNIT_OK;
}
/* If there are existing entries they are wiped out. */
TEST(logRestore, wipe, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
APPEND_MANY(1 /* term */, 5 /* n entries */);
RESTORE(2 /* last index */, 3 /* last term */);
ASSERT_SNAPSHOT(2 /* index */, 3 /* term */);
munit_assert_int(LAST_INDEX, ==, 2);
return MUNIT_OK;
}
raft-0.22.1/test/unit/test_queue.c 0000664 0000000 0000000 00000014517 14601504142 0017022 0 ustar 00root root 0000000 0000000 #include "../../src/queue.h"
#include "../lib/runner.h"
/******************************************************************************
*
* Fixture with a single queue and a few test items that can be added to it.
*
*****************************************************************************/
struct item
{
int value;
queue queue;
};
struct fixture
{
queue queue;
struct item items[3];
};
static void *setUp(MUNIT_UNUSED const MunitParameter params[],
MUNIT_UNUSED void *user_data)
{
struct fixture *f = munit_malloc(sizeof *f);
QUEUE_INIT(&f->queue);
return f;
}
static void tearDown(void *data)
{
struct fixture *f = data;
free(f);
}
/******************************************************************************
*
* Helper macros
*
*****************************************************************************/
/* Initialize and push the given number of fixture items to the fixture's
* queue. Each item will have a value equal to its index plus one. */
#define PUSH(N) \
{ \
int i_; \
for (i_ = 0; i_ < N; i_++) { \
struct item *item_ = &f->items[i_]; \
item_->value = i_ + 1; \
QUEUE_PUSH(&f->queue, &item_->queue); \
} \
}
/* Remove the i'th fixture item from the fixture queue. */
#define REMOVE(I) QUEUE_REMOVE(&f->items[I].queue)
/******************************************************************************
*
* Assertions
*
*****************************************************************************/
/* Assert that the item at the head of the fixture's queue has the given
* value. */
#define ASSERT_HEAD(VALUE) \
{ \
queue *head_ = QUEUE_HEAD(&f->queue); \
struct item *item_; \
item_ = QUEUE_DATA(head_, struct item, queue); \
munit_assert_int(item_->value, ==, VALUE); \
}
/* Assert that the item at the tail of the queue has the given value. */
#define ASSERT_TAIL(VALUE) \
{ \
queue *tail_ = QUEUE_TAIL(&f->queue); \
struct item *item_; \
item_ = QUEUE_DATA(tail_, struct item, queue); \
munit_assert_int(item_->value, ==, VALUE); \
}
/* Assert that the fixture's queue is empty. */
#define ASSERT_EMPTY munit_assert_true(QUEUE_IS_EMPTY(&f->queue))
/* Assert that the fixture's queue is not empty. */
#define ASSERT_NOT_EMPTY munit_assert_false(QUEUE_IS_EMPTY(&f->queue))
/******************************************************************************
*
* QUEUE_IS_EMPTY
*
*****************************************************************************/
SUITE(QUEUE_IS_EMPTY)
TEST(QUEUE_IS_EMPTY, yes, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
ASSERT_EMPTY;
return MUNIT_OK;
}
TEST(QUEUE_IS_EMPTY, no, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
PUSH(1);
ASSERT_NOT_EMPTY;
return MUNIT_OK;
}
/******************************************************************************
*
* QUEUE_PUSH
*
*****************************************************************************/
SUITE(QUEUE_PUSH)
TEST(QUEUE_PUSH, one, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
PUSH(1);
ASSERT_HEAD(1);
return MUNIT_OK;
}
TEST(QUEUE_PUSH, two, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
int i;
PUSH(2);
for (i = 0; i < 2; i++) {
ASSERT_HEAD(i + 1);
REMOVE(i);
}
ASSERT_EMPTY;
return MUNIT_OK;
}
/******************************************************************************
*
* QUEUE_REMOVE
*
*****************************************************************************/
SUITE(QUEUE_REMOVE)
TEST(QUEUE_REMOVE, first, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
PUSH(3);
REMOVE(0);
ASSERT_HEAD(2);
return MUNIT_OK;
}
TEST(QUEUE_REMOVE, second, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
PUSH(3);
REMOVE(1);
ASSERT_HEAD(1);
return MUNIT_OK;
}
TEST(QUEUE_REMOVE, success, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
PUSH(3);
REMOVE(2);
ASSERT_HEAD(1);
return MUNIT_OK;
}
/******************************************************************************
*
* QUEUE_TAIL
*
*****************************************************************************/
SUITE(QUEUE_TAIL)
TEST(QUEUE_TAIL, one, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
PUSH(1);
ASSERT_TAIL(1);
return MUNIT_OK;
}
TEST(QUEUE_TAIL, two, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
PUSH(2);
ASSERT_TAIL(2);
return MUNIT_OK;
}
TEST(QUEUE_TAIL, three, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
PUSH(3);
ASSERT_TAIL(3);
return MUNIT_OK;
}
/******************************************************************************
*
* QUEUE_FOREACH
*
*****************************************************************************/
SUITE(QUEUE_FOREACH)
/* Loop through a queue of zero items. */
TEST(QUEUE_FOREACH, zero, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
queue *head;
int count = 0;
QUEUE_FOREACH (head, &f->queue) {
count++;
}
munit_assert_int(count, ==, 0);
return MUNIT_OK;
}
/* Loop through a queue of one item. */
TEST(QUEUE_FOREACH, one, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
queue *head;
int count = 0;
PUSH(1);
QUEUE_FOREACH (head, &f->queue) {
count++;
}
munit_assert_int(count, ==, 1);
return MUNIT_OK;
}
/* Loop through a queue of two items. The order of the loop is from the head to
 * the tail. */
TEST(QUEUE_FOREACH, two, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    queue *cursor;
    int seen[2] = {0, 0};
    int next = 0;
    PUSH(2);
    /* Collect the item values in iteration order. */
    QUEUE_FOREACH (cursor, &f->queue) {
        struct item *entry;
        entry = QUEUE_DATA(cursor, struct item, queue);
        seen[next] = entry->value;
        next++;
    }
    /* The head item comes first, the tail item last. */
    munit_assert_int(seen[0], ==, 1);
    munit_assert_int(seen[1], ==, 2);
    return MUNIT_OK;
}
raft-0.22.1/test/unit/test_random.c 0000664 0000000 0000000 00000002446 14601504142 0017154 0 ustar 00root root 0000000 0000000 #include "../../src/random.h"
#include "../lib/runner.h"
SUITE(RandomWithinRange)
/* First generated number with the default seed. */
TEST(RandomWithinRange, first, NULL, NULL, 0, NULL)
{
    /* Seeding the generator with 42 makes the very first draw in the
     * [1000, 2000] range come out as 1650. */
    unsigned state = 42;
    unsigned value = RandomWithinRange(&state, 1000, 2000);
    munit_assert_int(value, ==, 1650);
    return MUNIT_OK;
}
/* Sequence of 10 numbers with the default seed. */
TEST(RandomWithinRange, sequence, NULL, NULL, 0, NULL)
{
    unsigned state = 42;
    unsigned drawn[10];
    int a;
    int b;
    /* Draw ten numbers from the generator. */
    for (a = 0; a < 10; a++) {
        drawn[a] = RandomWithinRange(&state, 1000, 2000);
    }
    /* Each number must fall within the requested range and no two draws may
     * collide. */
    for (a = 0; a < 9; a++) {
        munit_assert_int(drawn[a], >=, 1000);
        munit_assert_int(drawn[a], <=, 2000);
        for (b = a + 1; b < 10; b++) {
            munit_assert_int(drawn[a], !=, drawn[b]);
        }
    }
    return MUNIT_OK;
}
/* Change the seed */
TEST(RandomWithinRange, seed, NULL, NULL, 0, NULL)
{
    unsigned state = 0;
    int k;
    /* With a zero seed the generator is expected to produce exactly this
     * sequence of twenty values. */
    int expected[20] = {1571, 1410, 1735, 1743, 1995, 1353, 1589,
                        1478, 1753, 1367, 1112, 1216, 1727, 1057,
                        1061, 1669, 1773, 1425, 1864, 1035};
    for (k = 0; k < 20; k++) {
        munit_assert_int(RandomWithinRange(&state, 1000, 2000), ==,
                         expected[k]);
    }
    return MUNIT_OK;
}
raft-0.22.1/test/unit/test_trail.c 0000664 0000000 0000000 00000021006 14601504142 0017000 0 ustar 00root root 0000000 0000000 #include "../../src/trail.h"
#include "../lib/heap.h"
#include "../lib/runner.h"
/******************************************************************************
*
* Fixture
*
*****************************************************************************/
struct fixture
{
FIXTURE_HEAP;
struct raft_trail trail;
};
static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
{
struct fixture *f = munit_malloc(sizeof *f);
SET_UP_HEAP;
TrailInit(&f->trail);
return f;
}
static void tearDown(void *data)
{
struct fixture *f = data;
TrailClose(&f->trail);
TEAR_DOWN_HEAP;
free(f);
}
SUITE(trail)
/* An empty trail has not recorded any entry yet. */
TEST(trail, Empty, setUp, tearDown, 0, NULL)
{
    struct fixture *f = data;
    /* A freshly initialized trail has no entries, reports zero as both last
     * index and last term, and holds no entry at index 1. */
    munit_assert_uint(TrailNumEntries(&f->trail), ==, 0);
    munit_assert_ullong(TrailLastIndex(&f->trail), ==, 0);
    munit_assert_ullong(TrailLastTerm(&f->trail), ==, 0);
    munit_assert_false(TrailHasEntry(&f->trail, 1));
    return MUNIT_OK;
}
/* Anchor the trail at a certain index and snapshot. */
TEST(trail, Start, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
TrailStart(&f->trail, 3 /* snapshot index */, 4 /* snapshot term */,
2 /* start index */);
munit_assert_uint(TrailNumEntries(&f->trail), ==, 0);
munit_assert_ullong(TrailLastIndex(&f->trail), ==, 3);
munit_assert_ullong(TrailLastTerm(&f->trail), ==, 4);
TrailAppend(&f->trail, 3); /* index 2, term 3 */
TrailAppend(&f->trail, 4); /* index 3, term 4 */
TrailAppend(&f->trail, 4); /* index 4, term 4 */
munit_assert_uint(TrailNumEntries(&f->trail), ==, 3);
munit_assert_ullong(TrailLastIndex(&f->trail), ==, 4);
munit_assert_ullong(TrailLastTerm(&f->trail), ==, 4);
munit_assert_ullong(TrailTermOf(&f->trail, 1), ==, 0); /* non existing */
munit_assert_ullong(TrailTermOf(&f->trail, 2), ==, 3);
munit_assert_ullong(TrailTermOf(&f->trail, 3), ==, 4);
munit_assert_ullong(TrailTermOf(&f->trail, 4), ==, 4);
munit_assert_ullong(TrailTermOf(&f->trail, 5), ==, 0); /* non existing */
return MUNIT_OK;
}
/* Append to an empty trail information about a new entry. */
TEST(trail, AppendEmpty, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
TrailAppend(&f->trail, 1); /* index 1, term 1 */
munit_assert_uint(TrailNumEntries(&f->trail), ==, 1);
munit_assert_ullong(TrailLastIndex(&f->trail), ==, 1);
munit_assert_ullong(TrailLastTerm(&f->trail), ==, 1);
munit_assert_ullong(TrailTermOf(&f->trail, 1), ==, 1);
return MUNIT_OK;
}
/* Append information about a new entry with the same term as the last one. */
TEST(trail, AppendSameTerm, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
TrailAppend(&f->trail, 1); /* index 1, term 1 */
TrailAppend(&f->trail, 1); /* index 2, term 1 */
munit_assert_uint(TrailNumEntries(&f->trail), ==, 2);
munit_assert_ullong(TrailLastIndex(&f->trail), ==, 2);
munit_assert_ullong(TrailLastTerm(&f->trail), ==, 1);
munit_assert_ullong(TrailTermOf(&f->trail, 1), ==, 1);
munit_assert_ullong(TrailTermOf(&f->trail, 2), ==, 1);
return MUNIT_OK;
}
/* Append information about a new entry with a newer term than the last one. */
TEST(trail, AppendNewerTerm, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
TrailAppend(&f->trail, 1); /* index 1, term 1 */
TrailAppend(&f->trail, 2); /* index 2, term 2 */
munit_assert_uint(TrailNumEntries(&f->trail), ==, 2);
munit_assert_ullong(TrailLastIndex(&f->trail), ==, 2);
munit_assert_ullong(TrailLastTerm(&f->trail), ==, 2);
munit_assert_ullong(TrailTermOf(&f->trail, 1), ==, 1);
munit_assert_ullong(TrailTermOf(&f->trail, 2), ==, 2);
return MUNIT_OK;
}
/* Get the term of an entry. */
TEST(trail, TermOf, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
TrailAppend(&f->trail, 1); /* index 1, term 1 */
TrailAppend(&f->trail, 1); /* index 2, term 1 */
TrailAppend(&f->trail, 2); /* index 3, term 2 */
TrailAppend(&f->trail, 3); /* index 4, term 3 */
munit_assert_uint(TrailNumEntries(&f->trail), ==, 4);
munit_assert_ullong(TrailLastIndex(&f->trail), ==, 4);
munit_assert_ullong(TrailLastTerm(&f->trail), ==, 3);
munit_assert_ullong(TrailTermOf(&f->trail, 1), ==, 1);
munit_assert_ullong(TrailTermOf(&f->trail, 2), ==, 1);
munit_assert_ullong(TrailTermOf(&f->trail, 3), ==, 2);
munit_assert_ullong(TrailTermOf(&f->trail, 4), ==, 3);
return MUNIT_OK;
}
/* Truncate the trail removing information about all entries past the given
* index (included). */
TEST(trail, Truncate, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
TrailAppend(&f->trail, 1); /* index 1, term 1 */
TrailAppend(&f->trail, 1); /* index 2, term 1 */
TrailAppend(&f->trail, 2); /* index 3, term 2 */
TrailAppend(&f->trail, 3); /* index 4, term 3 */
TrailAppend(&f->trail, 3); /* index 5, term 3 */
TrailTruncate(&f->trail, 3);
munit_assert_true(TrailHasEntry(&f->trail, 2));
munit_assert_false(TrailHasEntry(&f->trail, 3));
munit_assert_uint(TrailNumEntries(&f->trail), ==, 2);
munit_assert_ullong(TrailLastIndex(&f->trail), ==, 2);
munit_assert_ullong(TrailLastTerm(&f->trail), ==, 1);
TrailTruncate(&f->trail, 1);
munit_assert_uint(TrailNumEntries(&f->trail), ==, 0);
return MUNIT_OK;
}
/* Truncate a trail that was wrapped. */
TEST(trail, TruncateWrapped, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
TrailAppend(&f->trail, 1); /* index 1, term 1 */
TrailAppend(&f->trail, 1); /* index 2, term 1 */
TrailAppend(&f->trail, 2); /* index 3, term 2 */
TrailAppend(&f->trail, 3); /* index 4, term 3 */
TrailAppend(&f->trail, 3); /* index 5, term 3 */
TrailAppend(&f->trail, 4); /* index 6, term 4 */
TrailAppend(&f->trail, 4); /* index 7, term 4 */
TrailAppend(&f->trail, 5); /* index 8, term 5 */
TrailAppend(&f->trail, 5); /* index 9, term 5 */
munit_assert_uint(TrailNumEntries(&f->trail), ==, 9);
TrailSnapshot(&f->trail, 7 /* snapshot index */, 0 /* trailing */);
munit_assert_uint(TrailNumEntries(&f->trail), ==, 2);
TrailAppend(&f->trail, 6); /* index 10, term 6 */
TrailAppend(&f->trail, 6); /* index 11, term 6 */
TrailAppend(&f->trail, 7); /* index 12, term 7 */
TrailAppend(&f->trail, 8); /* index 13, term 8 */
TrailAppend(&f->trail, 8); /* index 14, term 8 */
TrailAppend(&f->trail, 8); /* index 15, term 8 */
TrailAppend(&f->trail, 9); /* index 16, term 9 */
munit_assert_uint(TrailNumEntries(&f->trail), ==, 9);
TrailTruncate(&f->trail, 8);
munit_assert_uint(TrailNumEntries(&f->trail), ==, 0);
return MUNIT_OK;
}
/* Remove a prefix of the log. */
TEST(trail, Snapshot, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
TrailAppend(&f->trail, 1); /* index 1, term 1 */
TrailAppend(&f->trail, 1); /* index 2, term 1 */
TrailAppend(&f->trail, 2); /* index 3, term 2 */
TrailAppend(&f->trail, 3); /* index 4, term 3 */
TrailAppend(&f->trail, 3); /* index 5, term 3 */
TrailSnapshot(&f->trail, 3 /* snapshot index */, 2 /* trailing */);
munit_assert_uint(TrailNumEntries(&f->trail), ==, 4);
munit_assert_ullong(TrailTermOf(&f->trail, 1), ==, 0);
munit_assert_ullong(TrailTermOf(&f->trail, 2), ==, 1);
munit_assert_false(TrailHasEntry(&f->trail, 1));
munit_assert_true(TrailHasEntry(&f->trail, 2));
TrailAppend(&f->trail, 4); /* index 6, term 4 */
TrailAppend(&f->trail, 4); /* index 7, term 4 */
TrailSnapshot(&f->trail, 6 /* snapshot index */, 1 /* trailing */);
munit_assert_uint(TrailNumEntries(&f->trail), ==, 2);
munit_assert_ullong(TrailTermOf(&f->trail, 5), ==, 0);
munit_assert_ullong(TrailTermOf(&f->trail, 6), ==, 4);
return MUNIT_OK;
}
/* Restore a snapshot erasing the whole log. */
TEST(trail, Restore, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;
TrailAppend(&f->trail, 1); /* index 1, term 1 */
TrailAppend(&f->trail, 1); /* index 2, term 1 */
TrailAppend(&f->trail, 2); /* index 3, term 2 */
TrailAppend(&f->trail, 3); /* index 4, term 3 */
TrailRestore(&f->trail, 3 /* snapshot index */, 2 /* snapshot term */);
munit_assert_uint(TrailNumEntries(&f->trail), ==, 0);
munit_assert_ullong(TrailLastIndex(&f->trail), ==, 3);
munit_assert_ullong(TrailLastTerm(&f->trail), ==, 2);
TrailAppend(&f->trail, 3); /* index 4, term 3 */
munit_assert_uint(TrailNumEntries(&f->trail), ==, 1);
munit_assert_ullong(TrailTermOf(&f->trail, 4), ==, 3);
return MUNIT_OK;
}
raft-0.22.1/test/unit/test_uv_fs.c 0000664 0000000 0000000 00000036131 14601504142 0017014 0 ustar 00root root 0000000 0000000 #include
#include "../../src/uv_fs.h"
#include "../../src/uv_os.h"
#include "../lib/aio.h"
#include "../lib/dir.h"
#include "../lib/runner.h"
/******************************************************************************
*
* UvFsCheckDir
*
*****************************************************************************/
/* Invoke UvFsCheckDir passing it the given dir. */
#define CHECK_DIR(DIR) \
{ \
int _rv; \
char _errmsg[RAFT_ERRMSG_BUF_SIZE]; \
_rv = UvFsCheckDir(DIR, _errmsg); \
munit_assert_int(_rv, ==, 0); \
}
/* Invoke UvFsCheckDir passing it the given dir and check that the given error
* occurs. */
#define CHECK_DIR_ERROR(DIR, RV, ERRMSG) \
{ \
int _rv; \
char _errmsg[RAFT_ERRMSG_BUF_SIZE]; \
_rv = UvFsCheckDir(DIR, _errmsg); \
munit_assert_int(_rv, ==, RV); \
munit_assert_string_equal(_errmsg, ERRMSG); \
}
SUITE(UvFsCheckDir)
/* If the directory exists, the function succeeds. */
TEST(UvFsCheckDir, exists, DirSetUp, DirTearDown, 0, NULL)
{
const char *dir = data;
CHECK_DIR(dir);
return MUNIT_OK;
}
/* If the directory doesn't exist, an error is returned. */
TEST(UvFsCheckDir, doesNotExist, DirSetUp, DirTearDown, 0, NULL)
{
    const char *parent = data;
    char errmsg[RAFT_ERRMSG_BUF_SIZE];
    char dir[128];
    /* Build the path of a sub-directory that was never created, then the
     * error message UvFsCheckDir is expected to produce for it. The original
     * code wrote the path into errmsg instead of dir, leaving dir
     * uninitialized when it was formatted below and passed to
     * CHECK_DIR_ERROR. */
    snprintf(dir, sizeof dir, "%s/sub", parent);
    snprintf(errmsg, sizeof errmsg, "directory '%s' does not exist", dir);
    CHECK_DIR_ERROR(dir, RAFT_NOTFOUND, errmsg);
    return MUNIT_OK;
}
/* If the process can't access the directory, an error is returned. */
TEST(UvFsCheckDir, permissionDenied, NULL, NULL, 0, NULL)
{
bool has_access = DirHasFile("/proc/1", "root");
/* Skip the test is the process actually has access to /proc/1/root. */
if (has_access) {
return MUNIT_SKIP;
}
CHECK_DIR_ERROR("/proc/1/root", RAFT_UNAUTHORIZED,
"can't access directory '/proc/1/root'");
return MUNIT_OK;
}
/* If the given path contains a non-directory prefix, an error is returned. */
TEST(UvFsCheckDir, notDirPrefix, NULL, NULL, 0, NULL)
{
CHECK_DIR_ERROR("/dev/null/foo", RAFT_INVALID,
"path '/dev/null/foo' is not a directory");
return MUNIT_OK;
}
/* If the given path is not a directory, an error is returned. */
TEST(UvFsCheckDir, notDir, NULL, NULL, 0, NULL)
{
CHECK_DIR_ERROR("/dev/null", RAFT_INVALID,
"path '/dev/null' is not a directory");
return MUNIT_OK;
}
/* If the given directory is not writable, an error is returned. */
TEST(UvFsCheckDir, notWritable, DirSetUp, DirTearDown, 0, NULL)
{
const char *dir = data;
char errmsg[RAFT_ERRMSG_BUF_SIZE];
sprintf(errmsg, "directory '%s' is not writable", dir);
DirMakeUnwritable(dir);
CHECK_DIR_ERROR(dir, RAFT_INVALID, errmsg);
return MUNIT_OK;
}
/******************************************************************************
*
* UvFsSyncDir
*
*****************************************************************************/
/* Invoke UvFsSyncDir passing it the given dir. */
#define SYNC_DIR_ERROR(DIR, RV, ERRMSG) \
{ \
char _errmsg[RAFT_ERRMSG_BUF_SIZE]; \
munit_assert_int(UvFsSyncDir(DIR, _errmsg), ==, RV); \
munit_assert_string_equal(_errmsg, ERRMSG); \
}
SUITE(UvFsSyncDir)
/* If the directory doesn't exist, an error is returned. */
TEST(UvFsSyncDir, noExists, NULL, NULL, 0, NULL)
{
SYNC_DIR_ERROR("/abcdef", RAFT_IOERR,
"open directory: no such file or directory");
return MUNIT_OK;
}
/******************************************************************************
*
* UvFsOpenFileForReading
*
*****************************************************************************/
/* Open a file in the given dir. */
#define OPEN_FILE_FOR_READING_ERROR(DIR, FILENAME, RV, ERRMSG) \
{ \
uv_file fd_; \
char errmsg_[RAFT_ERRMSG_BUF_SIZE]; \
int rv_ = UvFsOpenFileForReading(DIR, FILENAME, &fd_, errmsg_); \
munit_assert_int(rv_, ==, RV); \
munit_assert_string_equal(errmsg_, ERRMSG); \
}
SUITE(UvFsOpenFileForReading)
/* If the directory doesn't exist, an error is returned. */
TEST(UvFsOpenFileForReading, noExists, DirSetUp, DirTearDown, 0, NULL)
{
const char *dir = data;
OPEN_FILE_FOR_READING_ERROR(dir, "foo", RAFT_IOERR,
"open: no such file or directory");
return MUNIT_OK;
}
/******************************************************************************
*
* UvFsAllocateFile
*
*****************************************************************************/
/* Allocate a file with the given parameters and assert that no error occurred.
 *
 * Note: errmsg_ must be a full RAFT_ERRMSG_BUF_SIZE buffer (as in the
 * ALLOCATE_FILE_ERROR macro below); a single char would overflow as soon as
 * UvFsAllocateFile writes an error message into it. */
#define ALLOCATE_FILE(DIR, FILENAME, SIZE)                          \
    {                                                               \
        uv_file fd_;                                                \
        char errmsg_[RAFT_ERRMSG_BUF_SIZE];                         \
        int rv_;                                                    \
        rv_ = UvFsAllocateFile(DIR, FILENAME, SIZE, &fd_, errmsg_); \
        munit_assert_int(rv_, ==, 0);                               \
        munit_assert_int(UvOsClose(fd_), ==, 0);                    \
    }
/* Assert that creating a file with the given parameters fails with the given
* code and error message. */
#define ALLOCATE_FILE_ERROR(DIR, FILENAME, SIZE, RV, ERRMSG) \
{ \
uv_file fd_; \
char errmsg_[RAFT_ERRMSG_BUF_SIZE]; \
int rv_; \
rv_ = UvFsAllocateFile(DIR, FILENAME, SIZE, &fd_, errmsg_); \
munit_assert_int(rv_, ==, RV); \
munit_assert_string_equal(errmsg_, ERRMSG); \
}
SUITE(UvFsAllocateFile)
/* If the given path is valid, the file gets created. */
TEST(UvFsAllocateFile, success, DirSetUp, DirTearDown, 0, NULL)
{
const char *dir = data;
ALLOCATE_FILE(dir, /* dir */
"foo", /* filename */
4096 /* size */);
munit_assert_true(DirHasFile(dir, "foo"));
return MUNIT_OK;
}
/* The directory of given path does not exist, an error is returned. */
TEST(UvFsAllocateFile, dirNoExists, NULL, NULL, 0, NULL)
{
ALLOCATE_FILE_ERROR("/non/existing/dir", /* dir */
"foo", /* filename */
64, /* size */
RAFT_IOERR, /* status */
"open: no such file or directory");
return MUNIT_OK;
}
/* If the given path already exists, an error is returned. */
TEST(UvFsAllocateFile, fileAlreadyExists, DirSetUp, DirTearDown, 0, NULL)
{
const char *dir = data;
char buf[8] = {0};
DirWriteFile(dir, "foo", buf, sizeof buf);
ALLOCATE_FILE_ERROR(dir, /* dir */
"foo", /* filename */
64, /* size */
RAFT_IOERR, /* status */
"open: file already exists");
return MUNIT_OK;
}
/* The file system has run out of space. */
TEST(UvFsAllocateFile, noSpace, DirSetUp, DirTearDown, 0, DirTmpfsParams)
{
const char *dir = data;
if (dir == NULL) {
return MUNIT_SKIP;
}
ALLOCATE_FILE_ERROR(dir, /* dir */
"foo", /* filename */
4096 * 32768, /* size */
RAFT_NOSPACE, /* status */
"not enough space to allocate 134217728 bytes");
munit_assert_false(DirHasFile(dir, "foo"));
return MUNIT_OK;
}
/******************************************************************************
*
* UvFsProbeCapabilities
*
*****************************************************************************/
/* Invoke UvFsProbeCapabilities against the given dir and assert that it returns
 * the given values for direct I/O and async I/O.
 *
 * Note: errmsg_ must be a full RAFT_ERRMSG_BUF_SIZE buffer (as in the
 * PROBE_CAPABILITIES_ERROR macro below); a single char would overflow as soon
 * as UvFsProbeCapabilities writes an error message into it. */
#define PROBE_CAPABILITIES(DIR, DIRECT_IO, ASYNC_IO)                          \
    {                                                                         \
        size_t direct_io_;                                                    \
        bool async_io_;                                                       \
        char errmsg_[RAFT_ERRMSG_BUF_SIZE];                                   \
        int rv_;                                                              \
        rv_ = UvFsProbeCapabilities(DIR, &direct_io_, &async_io_, errmsg_);   \
        munit_assert_int(rv_, ==, 0);                                         \
        munit_assert_int(direct_io_, ==, DIRECT_IO);                          \
        if (ASYNC_IO) {                                                       \
            munit_assert_true(async_io_);                                     \
        } else {                                                              \
            munit_assert_false(async_io_);                                    \
        }                                                                     \
    }
/* Invoke UvFsProbeCapabilities and check that the given error occurs. */
#define PROBE_CAPABILITIES_ERROR(DIR, RV, ERRMSG) \
{ \
size_t direct_io_; \
bool async_io_; \
char errmsg_[RAFT_ERRMSG_BUF_SIZE]; \
int rv_; \
rv_ = UvFsProbeCapabilities(DIR, &direct_io_, &async_io_, errmsg_); \
munit_assert_int(rv_, ==, RV); \
munit_assert_string_equal(errmsg_, ERRMSG); \
}
SUITE(UvFsProbeCapabilities)
TEST(UvFsProbeCapabilities, tmpfs, DirTmpfsSetUp, DirTearDown, 0, NULL)
{
const char *dir = data;
if (dir == NULL) {
return MUNIT_SKIP;
}
PROBE_CAPABILITIES(dir, 0, false);
return MUNIT_OK;
}
/* ZFS 0.8 reports that it supports direct I/O, but does not fully support
 * asynchronous kernel AIO. */
TEST(UvFsProbeCapabilities, zfsDirectIO, DirZfsSetUp, DirTearDown, 0, NULL)
{
const char *dir = data;
size_t direct_io = 0;
#if defined(RAFT_HAVE_ZFS_WITH_DIRECT_IO)
direct_io = 4096;
#endif
if (dir == NULL) {
return MUNIT_SKIP;
}
PROBE_CAPABILITIES(dir, direct_io, false);
return MUNIT_OK;
}
/* File systems that fully support both direct I/O and async I/O. */
TEST(UvFsProbeCapabilities, aio, DirSetUp, DirTearDown, 0, DirAioParams)
{
const char *dir = data;
if (dir == NULL) {
return MUNIT_SKIP;
}
/* FIXME: btrfs doesn't like that we perform a first write to the probe file
* to detect the direct I/O buffer size. */
if (strcmp(munit_parameters_get(params, DIR_FS_PARAM), "btrfs") == 0) {
return MUNIT_SKIP;
}
PROBE_CAPABILITIES(dir, 4096, true);
return MUNIT_OK;
}
/* If the given path is not executable, the block size of the underlying file
* system can't be determined and an error is returned. */
TEST(UvFsProbeCapabilities, noAccess, DirSetUp, DirTearDown, 0, NULL)
{
const char *dir = data;
/* Skip the test when running as root, since EACCES would not be triggered
* in that case. */
if (getuid() == 0) {
return MUNIT_SKIP;
}
DirMakeUnexecutable(dir);
PROBE_CAPABILITIES_ERROR(
dir, RAFT_IOERR,
"create I/O capabilities probe file: open: permission denied");
return MUNIT_OK;
}
/* No space is left on the target device. */
TEST(UvFsProbeCapabilities, noSpace, DirTmpfsSetUp, DirTearDown, 0, NULL)
{
const char *dir = data;
if (dir == NULL) {
return MUNIT_SKIP;
}
DirFill(dir, 0);
PROBE_CAPABILITIES_ERROR(dir, RAFT_NOSPACE,
"create I/O capabilities probe file: not enough "
"space to allocate 4096 bytes");
return MUNIT_OK;
}
/* The uvIoSetup() call fails with EAGAIN. */
TEST(UvFsProbeCapabilities, noResources, DirBtrfsSetUp, DirTearDown, 0, NULL)
{
const char *dir = data;
aio_context_t ctx = 0;
int rv;
if (dir == NULL) {
return MUNIT_SKIP;
}
rv = AioFill(&ctx, 0);
if (rv != 0) {
return MUNIT_SKIP;
}
PROBE_CAPABILITIES_ERROR(
dir, RAFT_IOERR,
"probe Async I/O: io_setup: resource temporarily unavailable");
AioDestroy(ctx);
return MUNIT_OK;
}
/******************************************************************************
*
* UvFsMakeFile
*
*****************************************************************************/
SUITE(UvFsMakeFile)
/* If the file does not exist, the function succeeds. */
TEST(UvFsMakeFile, notExists, DirSetUp, DirTearDown, 0, NULL)
{
const char *dir = data;
int rv;
char errmsg[RAFT_ERRMSG_BUF_SIZE];
struct raft_buffer bufs[2] = {{0}, {0}};
rv = UvFsMakeFile(dir, "foo", bufs, 2, errmsg);
munit_assert_int(rv, ==, 0);
return MUNIT_OK;
}
/* If the file exists, the function does not succeed. */
TEST(UvFsMakeFile, exists, DirSetUp, DirTearDown, 0, NULL)
{
const char *dir = data;
int rv;
char errmsg[RAFT_ERRMSG_BUF_SIZE];
struct raft_buffer bufs[2] = {{0}, {0}};
rv = UvFsMakeFile(dir, "foo", bufs, 2, errmsg);
munit_assert_int(rv, ==, 0);
rv = UvFsMakeFile(dir, "foo", bufs, 2, errmsg);
munit_assert_int(rv, !=, 0);
return MUNIT_OK;
}
/******************************************************************************
*
* UvFsRenameFile
*
*****************************************************************************/
SUITE(UvFsRenameFile)
TEST(UvFsRenameFile, rename, DirSetUp, DirTearDown, 0, NULL)
{
const char *dir = data;
int rv;
char errmsg[RAFT_ERRMSG_BUF_SIZE];
struct raft_buffer bufs[2] = {{0}, {0}};
rv = UvFsMakeFile(dir, "foo", bufs, 2, errmsg);
munit_assert_int(rv, ==, 0);
rv = UvFsRenameFile(dir, "foo", "bar", errmsg);
munit_assert_int(rv, ==, 0);
munit_assert_false(DirHasFile(dir, "foo"));
munit_assert_true(DirHasFile(dir, "bar"));
return MUNIT_OK;
}
/* rename to same name */
TEST(UvFsRenameFile, same, DirSetUp, DirTearDown, 0, NULL)
{
const char *dir = data;
int rv;
char errmsg[RAFT_ERRMSG_BUF_SIZE];
struct raft_buffer bufs[2] = {{0}, {0}};
rv = UvFsMakeFile(dir, "foo", bufs, 2, errmsg);
munit_assert_int(rv, ==, 0);
rv = UvFsRenameFile(dir, "foo", "foo", errmsg);
munit_assert_int(rv, ==, 0);
munit_assert_true(DirHasFile(dir, "foo"));
return MUNIT_OK;
}
raft-0.22.1/test/unit/test_uv_os.c 0000664 0000000 0000000 00000005154 14601504142 0017026 0 ustar 00root root 0000000 0000000 #include "../../src/uv_os.h"
#include "../lib/dir.h"
#include "../lib/runner.h"
SUITE(UvOsJoin)
/* dir and filename have sensible lengths */
TEST(UvOsJoin, basic, NULL, NULL, 0, NULL)
{
int rv;
const char *dir = "/home";
const char *filename = "testfile";
char path[UV__PATH_SZ];
rv = UvOsJoin(dir, filename, path);
munit_assert_int(rv, ==, 0);
munit_assert_string_equal(path, "/home/testfile");
return MUNIT_OK;
}
/* If the directory name exceeds UV__DIR_LEN, joining fails. */
TEST(UvOsJoin, dirTooLong, NULL, NULL, 0, NULL)
{
    int rv;
    char path[UV__PATH_SZ];
    char dir[UV__DIR_LEN + 2]; /* Room for '\0' and then 1 char over limit. */
    memset((char *)dir, '/', sizeof(dir));
    dir[sizeof(dir) - 1] = '\0';
    const char *filename = "testfile";
    rv = UvOsJoin(dir, filename, path);
    munit_assert_int(rv, !=, 0);
    return MUNIT_OK;
}
/* If the file name exceeds UV__FILENAME_LEN, joining fails. */
TEST(UvOsJoin, filenameTooLong, NULL, NULL, 0, NULL)
{
    int rv;
    char path[UV__PATH_SZ];
    const char *dir = "testdir";
    char filename[UV__FILENAME_LEN + 2]; /* '\0' plus 1 char over limit. */
    memset((char *)filename, 'a', sizeof(filename));
    filename[sizeof(filename) - 1] = '\0';
    rv = UvOsJoin(dir, filename, path);
    munit_assert_int(rv, !=, 0);
    return MUNIT_OK;
}
/* If both the directory and the file name exceed their limits, joining
 * fails. */
TEST(UvOsJoin, dirAndFilenameTooLong, NULL, NULL, 0, NULL)
{
    int rv;
    char path[UV__PATH_SZ];
    char dir[UV__DIR_LEN + 2]; /* '\0' plus 1 char over limit. */
    memset((char *)dir, '/', sizeof(dir));
    dir[sizeof(dir) - 1] = '\0';
    char filename[UV__FILENAME_LEN + 2]; /* '\0' plus 1 char over limit. */
    memset((char *)filename, 'a', sizeof(filename));
    filename[sizeof(filename) - 1] = '\0';
    rv = UvOsJoin(dir, filename, path);
    munit_assert_int(rv, !=, 0);
    return MUNIT_OK;
}
/* Directory and file name both exactly at their maximum length: joining
 * succeeds and produces "<dir>/<filename>". */
TEST(UvOsJoin, dirAndFilenameMax, NULL, NULL, 0, NULL)
{
    int rv;
    char path[UV__PATH_SZ];
    char dir[UV__DIR_LEN + 1]; /* Maximum length plus '\0'. */
    memset((char *)dir, '/', sizeof(dir));
    dir[sizeof(dir) - 1] = '\0';
    char filename[UV__FILENAME_LEN + 1]; /* Maximum length plus '\0'. */
    memset((char *)filename, 'a', sizeof(filename));
    filename[sizeof(filename) - 1] = '\0';
    rv = UvOsJoin(dir, filename, path);
    munit_assert_int(rv, ==, 0);
    /* Build the expected result independently and compare. */
    char cmp_path[UV__DIR_LEN + UV__FILENAME_LEN + 1 + 1];
    snprintf(cmp_path, UV__DIR_LEN + UV__FILENAME_LEN + 1 + 1, "%s/%s", dir,
             filename);
    munit_assert_string_equal(path, cmp_path);
    return MUNIT_OK;
}
SUITE(UvOsOpen)
/* Open an unnamed temporary file (O_TMPFILE) in the test directory, then
 * close it. */
TEST(UvOsOpen, Tmpfile, DirSetUp, DirTearDown, 0, DirAllParams)
{
    const char *dir = data;
    uv_file fd;
    int rv;
    if (dir == NULL) {
        return MUNIT_SKIP; /* This file system is not available. */
    }
    rv = UvOsOpen(dir, O_TMPFILE | O_WRONLY, S_IRUSR | S_IWUSR, &fd);
    munit_assert_int(rv, ==, 0);
    rv = UvOsClose(fd);
    munit_assert_int(rv, ==, 0);
    return MUNIT_OK;
}
raft-0.22.1/test/unit/test_uv_writer.c 0000664 0000000 0000000 00000032163 14601504142 0017721 0 ustar 00root root 0000000 0000000 #include "../../src/uv_fs.h"
#include "../../src/uv_writer.h"
#include "../lib/aio.h"
#include "../lib/dir.h"
#include "../lib/loop.h"
#include "../lib/runner.h"
/******************************************************************************
*
* Fixture with a UvWriter and an open file ready for writing.
*
*****************************************************************************/
/* Test fixture: a UvWriter backed by an open file inside a temporary
 * directory, together with the I/O capabilities probed for that directory. */
struct fixture
{
    FIXTURE_DIR;            /* Temporary test directory. */
    FIXTURE_LOOP;           /* libuv loop driving the writer. */
    int fd;                 /* Descriptor of the file being written. */
    size_t block_size;      /* Direct I/O buffer size, or 4096 fallback. */
    size_t direct_io;       /* Probed direct I/O size, 0 if unsupported. */
    bool async_io;          /* Probed async I/O support. */
    char errmsg[256];       /* Scratch buffer for writer error messages. */
    struct UvWriter writer; /* Object under test. */
    bool closed;            /* Set by closeCb once the writer is closed. */
};
/******************************************************************************
*
* Helper macros.
*
*****************************************************************************/
/* Expected and observed outcome of a single write request, checked by
 * submitCbAssertResult. */
struct result
{
    int status; /* Expected completion status. */
    bool done;  /* Set once the completion callback has run. */
};
/* Writer close callback: record on the fixture that the close completed. */
static void closeCb(struct UvWriter *writer)
{
    struct fixture *f = writer->data;
    f->closed = true;
}
/* Write completion callback: assert that the request finished with the
 * expected status, then mark the result as done so loops can stop waiting. */
static void submitCbAssertResult(struct UvWriterReq *req, int status)
{
    struct result *result = req->data;
    munit_assert_int(status, ==, result->status);
    result->done = true;
}
/* Initialize the fixture's writer. */
#define INIT(MAX_WRITES) \
do { \
int _rv; \
_rv = UvWriterInit(&f->writer, &f->loop, f->fd, f->direct_io != 0, \
f->async_io, MAX_WRITES, f->errmsg); \
munit_assert_int(_rv, ==, 0); \
f->writer.data = f; \
f->closed = false; \
} while (0)
/* Try to initialize the fixture's writer and check that the given error is
* returned. */
#define INIT_ERROR(RV, ERRMSG) \
do { \
int _rv; \
_rv = UvWriterInit(&f->writer, &f->loop, f->fd, f->direct_io != 0, \
f->async_io, 1, f->errmsg); \
munit_assert_int(_rv, ==, RV); \
munit_assert_string_equal(f->errmsg, ERRMSG); \
} while (0)
/* Close helper. */
#define CLOSE_SUBMIT \
munit_assert_false(f->closed); \
UvWriterClose(&f->writer, closeCb); \
munit_assert_false(f->closed)
#define CLOSE_WAIT LOOP_RUN_UNTIL(&f->closed)
#define CLOSE \
CLOSE_SUBMIT; \
CLOSE_WAIT
/* Allocate an array of N_BUFS buffers of f->block_size bytes each, aligned
 * to the block size. The first buffer is filled with byte value CONTENT,
 * the second with CONTENT + 1, and so on. Pair with DESTROY_BUFS. */
#define MAKE_BUFS(BUFS, N_BUFS, CONTENT) \
    { \
        int __i; \
        BUFS = munit_malloc(sizeof *BUFS * N_BUFS); \
        for (__i = 0; __i < N_BUFS; __i++) { \
            uv_buf_t *__buf = &BUFS[__i]; \
            __buf->len = f->block_size; \
            __buf->base = aligned_alloc(f->block_size, f->block_size); \
            munit_assert_ptr_not_null(__buf->base); \
            memset(__buf->base, CONTENT + __i, __buf->len); \
        } \
    }
/* Free each buffer, and the array itself, allocated by MAKE_BUFS. */
#define DESTROY_BUFS(BUFS, N_BUFS) \
    { \
        int __i; \
        for (__i = 0; __i < N_BUFS; __i++) { \
            free(BUFS[__i].base); \
        } \
        free(BUFS); \
    }
/* Declare and submit a write request of N_BUFS block-sized buffers at
 * OFFSET, asserting that UvWriterSubmit returns RV; the completion callback
 * will later assert STATUS. Deliberately not wrapped in do/while: it leaves
 * _bufs, _req and _result visible so the enclosing macro can use them. */
#define WRITE_REQ(N_BUFS, CONTENT, OFFSET, RV, STATUS) \
    struct uv_buf_t *_bufs; \
    struct UvWriterReq _req; \
    struct result _result = {STATUS, false}; \
    int _rv; \
    MAKE_BUFS(_bufs, N_BUFS, CONTENT); \
    _req.data = &_result; \
    _rv = UvWriterSubmit(&f->writer, &_req, _bufs, N_BUFS, OFFSET, \
                         submitCbAssertResult); \
    munit_assert_int(_rv, ==, RV);
/* Submit a write request with the given parameters and wait for the operation
* to successfully complete. Deallocate BUFS when done.
*
* N_BUFS is the number of buffers to allocate and write, each of them will have
* f->block_size bytes.
*
* CONTENT must be an unsigned byte value: all bytes of the first buffer will be
* filled with that value, all bytes of the second buffer will be filled will
* that value plus one, etc.
*
* OFFSET is the offset at which to write the buffers. */
#define WRITE(N_BUFS, CONTENT, OFFSET) \
do { \
WRITE_REQ(N_BUFS, CONTENT, OFFSET, 0 /* rv */, 0 /* status */); \
LOOP_RUN_UNTIL(&_result.done); \
DESTROY_BUFS(_bufs, N_BUFS); \
} while (0)
/* Submit a write request with the given parameters and wait for the operation
* to fail with the given code and message. */
#define WRITE_FAILURE(N_BUFS, CONTENT, OFFSET, STATUS, ERRMSG) \
do { \
WRITE_REQ(N_BUFS, CONTENT, OFFSET, 0 /* rv */, STATUS); \
LOOP_RUN_UNTIL(&_result.done); \
munit_assert_string_equal(f->writer.errmsg, ERRMSG); \
DESTROY_BUFS(_bufs, N_BUFS); \
} while (0)
/* Submit a write request with the given parameters, close the writer right
* after and assert that the request got canceled. */
#define WRITE_CLOSE(N_BUFS, CONTENT, OFFSET, STATUS) \
do { \
WRITE_REQ(N_BUFS, CONTENT, OFFSET, 0 /* rv */, STATUS); \
CLOSE_SUBMIT; \
munit_assert_false(_result.done); \
LOOP_RUN_UNTIL(&_result.done); \
DESTROY_BUFS(_bufs, N_BUFS); \
CLOSE_WAIT; \
} while (0)
/* Assert that the content of the test file has the given number of blocks, each
* filled with progressive numbers. */
#define ASSERT_CONTENT(N) \
do { \
size_t _size = N * f->block_size; \
void *_buf = munit_malloc(_size); \
unsigned _i; \
unsigned _j; \
\
DirReadFile(f->dir, "foo", _buf, _size); \
\
for (_i = 0; _i < N; _i++) { \
char *cursor = (char *)_buf + _i * f->block_size; \
for (_j = 0; _j < f->block_size; _j++) { \
munit_assert_int(cursor[_j], ==, _i + 1); \
} \
} \
\
free(_buf); \
} while (0)
#define N_BLOCKS 5
/******************************************************************************
*
* Set up and tear down.
*
*****************************************************************************/
/* Set up the fixture dependencies: temporary directory, libuv loop, probed
 * I/O capabilities, and an open file "foo" pre-allocated to N_BLOCKS
 * blocks. */
static void *setUpDeps(const MunitParameter params[], void *user_data)
{
    struct fixture *f = munit_malloc(sizeof *f);
    char path[UV__PATH_SZ];
    char errmsg[256];
    int rv;
    SET_UP_DIR;
    SETUP_LOOP;
    rv = UvFsProbeCapabilities(f->dir, &f->direct_io, &f->async_io, errmsg);
    munit_assert_int(rv, ==, 0);
    /* Use the probed direct I/O size when available, else a 4K default. */
    f->block_size = f->direct_io != 0 ? f->direct_io : 4096;
    rv = UvOsJoin(f->dir, "foo", path);
    munit_assert_int(rv, ==, 0);
    rv = UvOsOpen(path, O_WRONLY | O_CREAT, S_IRUSR | S_IWUSR, &f->fd);
    munit_assert_int(rv, ==, 0);
    /* Pre-allocate the blocks that the tests will write. */
    rv = UvOsFallocate(f->fd, 0, f->block_size * N_BLOCKS);
    munit_assert_int(rv, ==, 0);
    return f;
}
/* Tear down what setUpDeps() created: close the file and the loop, remove
 * the temporary directory and free the fixture. */
static void tearDownDeps(void *data)
{
    struct fixture *f = data;
    if (f == NULL) {
        return; /* Was skipped. */
    }
    UvOsClose(f->fd);
    TEAR_DOWN_LOOP;
    TEAR_DOWN_DIR;
    free(f);
}
/* Full setup: dependencies plus a writer initialized to accept a single
 * concurrent write. */
static void *setUp(const MunitParameter params[], void *user_data)
{
    struct fixture *f = setUpDeps(params, user_data);
    if (f == NULL) {
        return NULL; /* Was skipped. */
    }
    INIT(1);
    return f;
}
/* Full teardown: close the writer, then tear down the dependencies. */
static void tearDown(void *data)
{
    struct fixture *f = data;
    if (f == NULL) {
        return; /* Was skipped. */
    }
    CLOSE;
    tearDownDeps(f);
}
/******************************************************************************
*
* UvWriterInit
*
*****************************************************************************/
SUITE(UvWriterInit)
/* The kernel has run out of available AIO events. */
TEST(UvWriterInit, noResources, setUpDeps, tearDownDeps, 0, NULL)
{
struct fixture *f = data;
aio_context_t ctx = 0;
int rv;
rv = AioFill(&ctx, 0);
if (rv != 0) {
return MUNIT_SKIP;
}
INIT_ERROR(RAFT_TOOMANY, "AIO events user limit exceeded");
AioDestroy(ctx);
return MUNIT_OK;
}
/******************************************************************************
*
* UvWriterSubmit
*
*****************************************************************************/
SUITE(UvWriterSubmit)
/* Write a single buffer at the start of the file. */
TEST(UvWriterSubmit, one, setUp, tearDown, 0, DirAllParams)
{
    struct fixture *f = data;
    SKIP_IF_NO_FIXTURE;
    WRITE(1 /* n bufs */, 1 /* content */, 0 /* offset */);
    ASSERT_CONTENT(1);
    return MUNIT_OK;
}
/* Write two buffers, one after the other. */
TEST(UvWriterSubmit, two, setUp, tearDown, 0, DirAllParams)
{
struct fixture *f = data;
SKIP_IF_NO_FIXTURE;
WRITE(1 /* n bufs */, 1 /* content */, 0 /* offset */);
WRITE(1 /* n bufs */, 2 /* content */, f->block_size /* offset */);
ASSERT_CONTENT(2);
return MUNIT_OK;
}
/* Write the same block twice. */
TEST(UvWriterSubmit, twice, setUp, tearDown, 0, DirAllParams)
{
struct fixture *f = data;
SKIP_IF_NO_FIXTURE;
WRITE(1 /* n bufs */, 0 /* content */, 0 /* offset */);
WRITE(1 /* n bufs */, 1 /* content */, 0 /* offset */);
ASSERT_CONTENT(1);
return MUNIT_OK;
}
/* Write a vector of buffers. */
TEST(UvWriterSubmit, vec, setUp, tearDown, 0, DirAllParams)
{
struct fixture *f = data;
SKIP_IF_NO_FIXTURE;
WRITE(2 /* n bufs */, 1 /* content */, 0 /* offset */);
ASSERT_CONTENT(1);
return MUNIT_OK;
}
/* Write a vector of buffers twice. */
TEST(UvWriterSubmit, vecTwice, setUp, tearDown, 0, DirAllParams)
{
struct fixture *f = data;
SKIP_IF_NO_FIXTURE;
WRITE(2 /* n bufs */, 1 /* content */, 0 /* offset */);
WRITE(2 /* n bufs */, 1 /* content */, 0 /* offset */);
ASSERT_CONTENT(2);
return MUNIT_OK;
}
/* Write past the allocated space. */
TEST(UvWriterSubmit, beyondEOF, setUp, tearDown, 0, DirAllParams)
{
struct fixture *f = data;
int i;
SKIP_IF_NO_FIXTURE;
for (i = 0; i < N_BLOCKS + 1; i++) {
WRITE(1 /* n bufs */, i + 1 /* content */,
i * f->block_size /* offset */);
}
ASSERT_CONTENT((N_BLOCKS + 1));
return MUNIT_OK;
}
/* Write two different blocks concurrently. */
TEST(UvWriterSubmit, concurrent, NULL, NULL, 0, DirAllParams)
{
return MUNIT_SKIP; /* TODO: tests stop responding */
}
/* Write the same block concurrently. */
TEST(UvWriterSubmit, concurrentSame, NULL, NULL, 0, DirAllParams)
{
return MUNIT_SKIP; /* TODO: tests stop responding */
}
/* There are not enough resources to create an AIO context to perform the
* write. */
TEST(UvWriterSubmit, noResources, setUpDeps, tearDown, 0, DirNoAioParams)
{
struct fixture *f = data;
aio_context_t ctx = 0;
int rv;
SKIP_IF_NO_FIXTURE;
INIT(2);
rv = AioFill(&ctx, 0);
if (rv != 0) {
return MUNIT_SKIP;
}
WRITE_FAILURE(1, 0, 0, RAFT_TOOMANY, "AIO events user limit exceeded");
AioDestroy(ctx);
return MUNIT_OK;
}
/******************************************************************************
*
 * UvWriterClose
*
*****************************************************************************/
SUITE(UvWriterClose)
/* Close with an inflight write running in the threadpool. */
TEST(UvWriterClose, threadpool, setUp, tearDownDeps, 0, DirNoAioParams)
{
struct fixture *f = data;
SKIP_IF_NO_FIXTURE;
WRITE_CLOSE(1, 0, 0, 0);
return MUNIT_OK;
}
/* Close with an in-flight AIO write. */
TEST(UvWriterClose, aio, setUp, tearDownDeps, 0, DirAioParams)
{
struct fixture *f = data;
SKIP_IF_NO_FIXTURE;
WRITE_CLOSE(1, 0, 0, RAFT_CANCELED);
return MUNIT_OK;
}
raft-0.22.1/tools/ 0000775 0000000 0000000 00000000000 14601504142 0013665 5 ustar 00root root 0000000 0000000 raft-0.22.1/tools/benchmark/ 0000775 0000000 0000000 00000000000 14601504142 0015617 5 ustar 00root root 0000000 0000000 raft-0.22.1/tools/benchmark/disk.c 0000664 0000000 0000000 00000014423 14601504142 0016721 0 ustar 00root root 0000000 0000000 #include
#include
#include
#include
#include
#include
#include
#include
#include
#include "disk.h"
#include "disk_options.h"
#include "disk_parse.h"
#include "disk_uring.h"
#include "fs.h"
#include "timer.h"
/* Allocate a buffer of the given size, aligned to its own size.
 *
 * Note: aligned_alloc() requires the size to be a multiple of the alignment,
 * which trivially holds here since they are equal; for the alignment to be
 * meaningful callers should pass a power-of-two size. The buffer is filled
 * with a fixed, deterministic byte pattern (i % 128). */
static void allocBuffer(struct iovec *iov, size_t size)
{
    size_t i; /* size_t, matching SIZE, to avoid truncation for huge sizes */
    iov->iov_len = size;
    iov->iov_base = aligned_alloc(iov->iov_len, iov->iov_len);
    assert(iov->iov_base != NULL);
    /* Populate the buffer with some fixed data. */
    for (i = 0; i < size; i++) {
        *(((uint8_t *)iov->iov_base) + i) = (uint8_t)(i % 128);
    }
}
/* Prepare the file or device to write to.
 *
 * If opts->dir refers to a block device it is opened directly; otherwise a
 * temporary file of n * opts->buf bytes (i.e. opts->size rounded down to a
 * multiple of the buffer size) is created inside that directory, and *info
 * is probed again against the new file. On success *fd is set, and *path is
 * set in the temporary-file case only.
 *
 * Returns 0 on success, -1 on failure. */
static int openFile(struct diskOptions *opts,
                    struct FsFileInfo *info,
                    int *fd,
                    char **path)
{
    unsigned n = opts->size / (unsigned)opts->buf;
    int rv;
    rv = FsFileInfo(opts->dir, info);
    if (rv != 0) {
        printf("file info '%s': %s\n", opts->dir, strerror(errno));
        return -1;
    }
    if (info->type == FS_TYPE_DEVICE) {
        rv = FsOpenBlockDevice(opts->dir, fd);
    } else {
        rv = FsCreateTempFile(opts->dir, n * opts->buf, path, fd);
        if (rv == 0) {
            /* Re-probe against the actual file we will write to. */
            rv = FsFileInfo(*path, info);
        }
    }
    if (rv != 0) {
        return -1;
    }
    return 0;
}
/* Release the target opened by openFile(): close a block device directly,
 * or remove (and close) the temporary file otherwise.
 *
 * Returns 0 on success, -1 on failure. */
static int closeFile(struct FsFileInfo *info, int fd, char *path)
{
    int rv = (info->type == FS_TYPE_DEVICE) ? close(fd)
                                            : FsRemoveTempFile(path, fd);
    return rv == 0 ? 0 : -1;
}
/* Append a latency metric to BENCHMARK, filled from HISTOGRAM. */
static void reportLatency(struct benchmark *benchmark,
                          struct histogram *histogram)
{
    struct metric *m;
    m = BenchmarkGrow(benchmark, METRIC_KIND_LATENCY);
    MetricFillHistogram(m, histogram);
}
/* Append a throughput metric to BENCHMARK: SIZE bytes written over DURATION.
 * SIZE is converted to whole megabytes (integer division, remainder
 * dropped). */
static void reportThroughput(struct benchmark *benchmark,
                             unsigned long duration,
                             unsigned size)
{
    struct metric *m;
    unsigned megabytes = size / (1024 * 1024); /* N megabytes written */
    m = BenchmarkGrow(benchmark, METRIC_KIND_THROUGHPUT);
    MetricFillThroughput(m, megabytes, duration);
}
/* Run the "disk" benchmark: sequentially write opts.size bytes to a file or
 * block device in opts.buf-sized chunks using io_uring, then add latency and
 * throughput metrics (and, with --perf, kernel sub-system metrics) to the
 * report.
 *
 * Returns 0 on success, -1 on failure. */
int DiskRun(int argc, char *argv[], struct report *report)
{
    struct diskOptions opts;
    struct Profiler profiler;
    struct FsFileInfo info;
    struct benchmark *benchmark;
    struct timer timer;
    struct histogram histogram;
    struct iovec iov;
    char *name; /* asprintf-allocated; presumably retained by the report and
                   never freed here -- acceptable for a short-lived tool. */
    char *path;
    int fd;
    unsigned long duration;
    unsigned i;
    unsigned n;
    int rv;
    DiskParse(argc, argv, &opts);
    rv = openFile(&opts, &info, &fd, &path);
    if (rv != 0) {
        return -1;
    }
    ProfilerInit(&profiler, &info);
    if (opts.perf) {
        rv = ProfilerPerf(&profiler);
        if (rv != 0) {
            return -1;
        }
    }
    for (i = 0; i < opts.n_traces; i++) {
        ProfilerTrace(&profiler, opts.traces[i]);
    }
    allocBuffer(&iov, opts.buf);
    HistogramInit(&histogram, info.buckets, info.resolution);
    TimerStart(&timer);
    /* Number of buffer-sized writes needed to cover the requested size. */
    n = opts.size / (unsigned)opts.buf;
    rv = DiskWriteUsingUring(fd, &iov, n, &profiler, &histogram);
    duration = TimerStop(&timer);
    free(iov.iov_base);
    if (rv != 0) {
        return -1;
    }
    rv = closeFile(&info, fd, path);
    if (rv != 0) {
        return -1;
    }
    /* 262144 is the maximum buffer size where no context switches happen,
     * presumably because io_uring inlines smaller requests and uses the
     * threadpool for larger ones. */
    if (opts.perf && profiler.switches != 0 &&
        info.driver != FS_DRIVER_GENERIC) {
        printf("Error: unexpected context switches: %u\n", profiler.switches);
        return -1;
    }
    /* Only report disk benchmarks if kernel sub-systems performance measurement
     * is disabled.
     *
     * In CI we run the "raft-benchmark disk" command twice: once with kernel
     * sub-systems performance measurement enabled, to report raw block/nvme
     * metrics, and once with kernel sub-systems performance measurement
     * disabled, to report the actual end-to-end metrics. That's because
     * enabling kernel sub-systems performance measurement has a noticeable
     * (albeit low) overhead.
     */
    if (!opts.perf) {
        rv = asprintf(&name, "disk:%zu", opts.buf);
        assert(rv > 0);
        assert(name != NULL);
        benchmark = ReportGrow(report, name);
        reportLatency(benchmark, &histogram);
        reportThroughput(benchmark, duration, opts.size);
    }
    HistogramClose(&histogram);
    if (opts.perf && info.driver != FS_DRIVER_GENERIC) {
        struct ProfilerDataSource *data;
        const char *system;
        system = "block";
        data = &profiler.block;
        rv = asprintf(&name, "disk:%s:%zu", system, opts.buf);
        assert(rv > 0);
        assert(name != NULL);
        HistogramInit(&histogram, info.buckets, info.resolution);
        /* NOTE(review): the driver check here is redundant, the enclosing if
         * already guarantees it. */
        if (data->n_commands != n && info.driver != FS_DRIVER_GENERIC) {
            printf("Error: unexpected commands: %u\n", data->n_commands);
            return -1;
        }
        for (i = 0; i < data->n_commands; i++) {
            assert(data->commands[i].duration > 0);
            HistogramCount(&histogram, data->commands[i].duration);
        }
        benchmark = ReportGrow(report, name);
        reportLatency(benchmark, &histogram);
        HistogramClose(&histogram);
        if (info.driver == FS_DRIVER_NVME) {
            system = "nvme";
            data = &profiler.block; /* NOTE(review): this re-reads the
                                       block-layer data; it looks like it
                                       should be profiler.nvme -- confirm
                                       against struct Profiler. */
            rv = asprintf(&name, "disk:%s:%zu", system, opts.buf);
            assert(rv > 0);
            assert(name != NULL);
            HistogramInit(&histogram, info.buckets, info.resolution);
            if (data->n_commands != n) {
                printf("Error: unexpected commands: %u\n", data->n_commands);
                return -1;
            }
            for (i = 0; i < data->n_commands; i++) {
                assert(data->commands[i].duration > 0);
                HistogramCount(&histogram, data->commands[i].duration);
            }
            benchmark = ReportGrow(report, name);
            reportLatency(benchmark, &histogram);
            HistogramClose(&histogram);
        }
    }
    ProfilerClose(&profiler);
    return 0;
}
raft-0.22.1/tools/benchmark/disk.h 0000664 0000000 0000000 00000000306 14601504142 0016721 0 ustar 00root root 0000000 0000000 /* Run the disk benchmark. */
#ifndef DISK_H_
#define DISK_H_
#include "report.h"
/* Run the disk subcommand. */
int DiskRun(int argc, char *argv[], struct report *report);
#endif /* DISK_H_ */
raft-0.22.1/tools/benchmark/disk_options.h 0000664 0000000 0000000 00000001065 14601504142 0020477 0 ustar 00root root 0000000 0000000 /* Options for the disk benchmark. */
#ifndef DISK_OPTIONS_H_
#define DISK_OPTIONS_H_
#include
#include "profiler.h"
/* Options for the disk benchmark */
struct diskOptions
{
char *dir; /* Directory to use for creating temporary files */
size_t buf; /* Write buffer size */
unsigned size; /* Size of the file to write, must be a multiple of buf */
bool perf; /* Turn on or off kernel performance measuring */
const char *traces[8]; /* Kernel sub-systems to trace */
unsigned n_traces;
};
#endif /* DISK_ARGS_H_ */
raft-0.22.1/tools/benchmark/disk_parse.c 0000664 0000000 0000000 00000005773 14601504142 0020123 0 ustar 00root root 0000000 0000000 #include
#include
#include
#include
#include
#include
#include "disk.h"
#include "disk_parse.h"
#define MEGABYTE (1024 * 1024)
static char doc[] = "Benchmark sequential write performance\n";
/* Order of fields: {NAME, KEY, ARG, FLAGS, DOC, GROUP}.*/
static struct argp_option options[] = {
{"dir", 'd', "DIR", 0, "Directory to use for temp files (default '.')", 0},
{"buf", 'b', "BUF", 0, "Write buffer size (default 4096)", 0},
{"size", 's', "S", 0, "Size of the file to write (default 8M)", 0},
{"perf", 'p', NULL, 0, "Report kernel subsystems performance metrics", 0},
{"trace", 't', "TRACE", 0, "Comma-separated kernel subsystems to trace", 0},
{0}};
static error_t argpParser(int key, char *arg, struct argp_state *state);
static struct argp argp = {
.options = options,
.parser = argpParser,
.doc = doc,
};
/* Parse a comma-separated list of kernel subsystem names into opts->traces.
 *
 * strtok() modifies ARG in place, so the stored pointers alias into the
 * original argument string (which outlives parsing, being argv memory).
 * Tokens beyond the capacity of opts->traces are silently dropped instead of
 * overflowing the fixed-size array. */
static void parseTracing(struct diskOptions *opts, char *arg)
{
    const unsigned cap = sizeof(opts->traces) / sizeof(opts->traces[0]);
    char *token;
    unsigned n_tokens = 1;
    unsigned i;
    /* Count the number of comma-separated tokens in this argument. */
    for (i = 0; i < strlen(arg); i++) {
        if (arg[i] == ',')
            n_tokens++;
    }
    for (i = 0; i < n_tokens; i++) {
        if (i == 0) {
            token = strtok(arg, ",");
        } else {
            token = strtok(NULL, ",");
        }
        assert(token != NULL);
        if (opts->n_traces == cap) {
            break; /* Array full: ignore the remaining tokens. */
        }
        opts->traces[opts->n_traces] = token;
        opts->n_traces++;
    }
}
/* argp callback: store one parsed command line option into the diskOptions
 * struct passed via state->input. */
static error_t argpParser(int key, char *arg, struct argp_state *state)
{
    struct diskOptions *opts = state->input;
    switch (key) {
        case 'd': /* --dir */
            opts->dir = arg;
            break;
        case 'b': /* --buf */
            opts->buf = (size_t)atoi(arg);
            break;
        case 's': /* --size */
            opts->size = (unsigned)atoi(arg);
            break;
        case 'p': /* --perf */
            opts->perf = true;
            break;
        case 't': /* --trace */
            parseTracing(opts, arg);
            break;
        default:
            return ARGP_ERR_UNKNOWN;
    }
    return 0;
}
/* Fill OPTS with the default option values. */
static void optionsInit(struct diskOptions *opts)
{
    opts->dir = ".";
    opts->buf = 4096;
    opts->size = 8 * MEGABYTE;
    opts->perf = false;
    opts->n_traces = 0;
}
/* Validate the parsed options, printing a message and exiting with status 1
 * on invalid input. Performance measurement and tracing both require root
 * privileges. */
static void optionsCheck(struct diskOptions *opts)
{
    if (opts->buf == 0 || opts->buf > MEGABYTE) {
        printf("Invalid buffer size %zu\n", opts->buf);
        exit(1);
    }
    if (opts->size == 0 || opts->size % 4096 != 0) {
        printf("Invalid file size %u\n", opts->size);
        exit(1);
    }
    if (opts->perf && getuid() != 0) {
        /* Fixed typo in the user-facing message ("measurment"). */
        printf("Performance measurement requires root\n");
        exit(1);
    }
    if (opts->n_traces > 0 && getuid() != 0) {
        printf("Tracing requires root\n");
        exit(1);
    }
}
/* Parse and validate the command line arguments for the disk benchmark,
 * filling OPTS. Exits on invalid input. */
void DiskParse(int argc, char *argv[], struct diskOptions *opts)
{
    optionsInit(opts);
    /* Override the program name so argp's help output shows the
     * subcommand. */
    argv[0] = "benchmark/run disk";
    argp_parse(&argp, argc, argv, 0, 0, opts);
    optionsCheck(opts);
}
raft-0.22.1/tools/benchmark/disk_parse.h 0000664 0000000 0000000 00000000415 14601504142 0020114 0 ustar 00root root 0000000 0000000 /* Parse command line arguments for the disk benchmark. */
#ifndef DISK_ARGS_H_
#define DISK_ARGS_H_
#include "disk_options.h"
/* Parse the given command line arguments. */
void DiskParse(int argc, char *argv[], struct diskOptions *opts);
#endif /* DISK_ARGS_H_ */
raft-0.22.1/tools/benchmark/disk_uring.c 0000664 0000000 0000000 00000017614 14601504142 0020132 0 ustar 00root root 0000000 0000000 #include
#include "disk_uring.h"
#if defined(HAVE_LINUX_IO_URING_H)
#include
#include
#include
#include
#include
#include
#include