Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions .config/nextest.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# nextest configuration. Run with: cargo nextest run --all-features
#
# Why nextest over `cargo test`:
# - Each test runs in its own process → no in-process state contention.
#   Integration tests that spawn 3-node clusters used to hang under
#   `cargo test`'s default within-binary parallelism because multiple
#   clusters in the same process exhausted ports / file descriptors.
# - Per-test timeouts make hangs fail fast instead of stalling CI.
# - Better failure output, retry support, and JUnit XML for CI.

[profile.default]
# Hard ceiling per test. Anything above this is a bug, not a slow test.
slow-timeout = { period = "30s", terminate-after = 4 }

# Use every available core for cheap unit tests. Heavy cluster tests
# are kept from starving by `threads-required` overrides below — they
# claim ALL slots so nothing else runs alongside them, regardless of
# whether you're on a 24-core dev box or a 2-core CI runner.
test-threads = "num-cpus"

# Heavy cluster tests: each one brings up 3 servers + per-node Tokio
# runtimes. Two things keep them stable across machine sizes:
#
#   1. `test-group = "cluster"` with `max-threads = 1` ensures at
#      most ONE cluster test runs at a time (no two clusters share
#      ports / file descriptors / thread pools).
#   2. `threads-required = "num-test-threads"` makes the running
#      cluster test claim every available test slot, which evicts
#      every other test from the run-queue while it's executing.
#      That's what prevents a 24-core dev box from scheduling 23
#      unit tests alongside the cluster and starving its Raft
#      heartbeats.
#
# The combined effect: cluster tests run strictly serially AND
# strictly alone, and the rest of the suite gets full parallelism
# the moment the cluster test finishes.
[[profile.default.overrides]]
filter = '''
binary(/cluster/)
| binary(/cross_node/)
| binary(/_lease_/)
| binary(descriptor_lease_drain)
| binary(descriptor_lease_forwarding_and_renewal)
| binary(descriptor_lease_planner_integration)
| binary(descriptor_versioning_cross_node)
| binary(prepared_cache_invalidation)
| binary(sql_cluster_cross_node_dml)
'''
test-group = 'cluster'
threads-required = 'num-test-threads'
# Cluster tests bring up real Raft nodes and racy multi-node
# convergence checks. They're flaky enough that a couple of retries
# (count = 2 → up to three attempts) catch legitimate startup jitter
# without hiding real regressions — a genuinely broken test fails
# three times in a row.
retries = { backoff = "fixed", count = 2, delay = "1s" }

[test-groups]
cluster = { max-threads = 1 }

[profile.ci]
# CI inherits the default profile (cluster group, threads-required,
# slow-timeout) and adds:
#   - more retries: CI runners are ~2× slower per-core than dev
#     workstations, so the cluster tests' in-test `wait_for`
#     budgets are proportionally tighter. Three retries (four total
#     attempts) buys headroom for jitter without papering over real
#     regressions — a genuinely broken test fails four times in a row.
#   - JUnit XML: picked up by the workflow's artifact upload.
#
# NOTE: we deliberately do NOT bump `slow-timeout` here. The
# slow-timeout only controls when nextest gives up on a stuck
# *process*; it does NOT extend the test's internal `wait_for`
# budgets. Once a `wait_for` panics, the test has already failed —
# making nextest wait longer just wastes CI minutes on cleanup.
retries = { backoff = "fixed", count = 3, delay = "2s" }
fail-fast = false

# nextest resolves per-test settings in this order: current-profile
# overrides → default-profile overrides → current-profile settings →
# default-profile settings. Without this block, the default profile's
# cluster override (`count = 2`) would silently take precedence over
# the `retries = 3` above for exactly the tests the bump is meant for.
# Only `retries` is restated here; `test-group` and `threads-required`
# still resolve from the default-profile override.
[[profile.ci.overrides]]
filter = '''
binary(/cluster/)
| binary(/cross_node/)
| binary(/_lease_/)
| binary(descriptor_lease_drain)
| binary(descriptor_lease_forwarding_and_renewal)
| binary(descriptor_lease_planner_integration)
| binary(descriptor_versioning_cross_node)
| binary(prepared_cache_invalidation)
| binary(sql_cluster_cross_node_dml)
'''
retries = { backoff = "fixed", count = 3, delay = "2s" }

[profile.ci.junit]
path = "junit.xml"
18 changes: 17 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,5 +55,21 @@ jobs:
sudo apt-get install -y --no-install-recommends \
cmake clang libclang-dev pkg-config protobuf-compiler perl \
libcurl4-openssl-dev libsasl2-dev
# nextest is required — `.config/nextest.toml` defines the
# `cluster` test-group that serializes 3-node integration tests
# and the `ci` profile that retries flaky cluster tests and
# writes a JUnit report. Plain `cargo test` ignores all of that
# and will hang/fail on the cluster suite.
- name: Install cargo-nextest
uses: taiki-e/install-action@v2
with:
tool: nextest
- name: Run tests
run: cargo test --all-features --profile ci
run: cargo nextest run --all-features --cargo-profile ci --profile ci
- name: Upload JUnit report
if: always()
uses: actions/upload-artifact@v4
with:
name: junit-report
path: target/nextest/ci/junit.xml
if-no-files-found: ignore
13 changes: 11 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,11 @@ FROM debian:bookworm-slim AS runtime

# ca-certificates: needed for JWKS fetch, OTLP export, S3 archival
# curl: needed for HEALTHCHECK
# gosu: drop privileges from root after fixing data-dir ownership in entrypoint
RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates \
curl \
gosu \
&& rm -rf /var/lib/apt/lists/*

# Non-root user
Expand All @@ -51,12 +53,18 @@ RUN mkdir -p /var/lib/nodedb /etc/nodedb \

COPY --from=builder /build/target/release/nodedb /usr/local/bin/nodedb

# Entrypoint: when started as root, fix data-dir ownership and drop to the
# nodedb user. When already started as a non-root user (e.g. `--user 10001`),
# exec directly. This makes `-v <named-volume>:/var/lib/nodedb` work even
# when Docker initialises the volume as root-owned (common on Linux hosts).
COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
RUN chmod +x /usr/local/bin/docker-entrypoint.sh

# Bind to all interfaces (required for Docker port mapping)
# Point data dir at the declared volume
ENV NODEDB_HOST=0.0.0.0 \
NODEDB_DATA_DIR=/var/lib/nodedb

USER nodedb
WORKDIR /var/lib/nodedb

# pgwire | native protocol | HTTP API | WebSocket sync | OTLP gRPC | OTLP HTTP
Expand All @@ -67,4 +75,5 @@ VOLUME ["/var/lib/nodedb"]
HEALTHCHECK --interval=10s --timeout=3s --start-period=5s \
CMD curl -f http://localhost:6480/health || exit 1

ENTRYPOINT ["/usr/local/bin/nodedb"]
ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
CMD ["/usr/local/bin/nodedb"]
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,8 @@ For development or contributing:
git clone https://github.com/NodeDB-Lab/nodedb.git
cd nodedb
cargo build --release
cargo test --all-features
cargo install cargo-nextest --locked # one-time
cargo nextest run --all-features
```

## Status
Expand Down
47 changes: 47 additions & 0 deletions docker-entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/bin/sh
# NodeDB container entrypoint.
#
# When invoked as root (the default for `docker run` with no --user), fix
# ownership of NODEDB_DATA_DIR and drop privileges to the unprivileged
# `nodedb` user before exec'ing the server. When invoked as any other UID
# (e.g. `--user 10001` or via Kubernetes runAsUser), exec directly and
# leave the data directory's ownership alone.
#
# This makes `-v <named-volume>:/var/lib/nodedb` work even when Docker
# initialises the named volume as root-owned (common on Linux hosts where
# the volume is created out-of-band before the container's first run).

set -e

DATA_DIR="${NODEDB_DATA_DIR:-/var/lib/nodedb}"

if [ "$(id -u)" = "0" ]; then
    # Running as root: ensure the data dir exists and is owned by nodedb,
    # then drop privileges. mkdir is a no-op for the declared VOLUME but
    # protects against custom NODEDB_DATA_DIR overrides.
    mkdir -p "$DATA_DIR"
    chown -R nodedb:nodedb "$DATA_DIR"
    exec gosu nodedb "$@"
fi

# Already non-root: first try to create the data dir in case a custom
# NODEDB_DATA_DIR points somewhere that doesn't exist yet (the root path
# above gets this via its own mkdir). Failure is tolerated here — if the
# parent isn't writable either, the -w check below reports it with the
# right message instead of `mkdir` aborting the script via `set -e`.
mkdir -p "$DATA_DIR" 2>/dev/null || true

# Ensure we can actually write to the data dir, otherwise fail fast with
# a clear message instead of the cryptic WAL "Permission denied
# (os error 13)" the user sees on a misconfigured volume mount.
if [ ! -w "$DATA_DIR" ]; then
    cat >&2 <<EOF
nodedb: data directory $DATA_DIR is not writable by uid=$(id -u) gid=$(id -g).

This usually means a host volume was mounted with root ownership while
NodeDB is configured to run as a non-root user. Fixes:

  1. Let the entrypoint fix it: drop the explicit --user flag so the
     container starts as root and chowns the volume on first boot.
  2. Pre-create the volume with the right ownership on the host, e.g.
       chown -R 10001:10001 /path/to/host/dir
  3. Run as root explicitly: docker run --user 0:0 ...

EOF
    exit 1
fi

exec "$@"
Loading
Loading