From cc66f5507f5e1267e498eca141fd91484d51eaf0 Mon Sep 17 00:00:00 2001 From: Peter Clark Date: Wed, 4 Mar 2026 23:02:12 +0900 Subject: [PATCH 1/3] Added CLI support for riak_client:repair_node() --- docs/OperationsAndTroubleshootingGuide.md | 2 +- src/riak_kv_cli_registry.erl | 3 +- src/riak_kv_node_cli.erl | 105 ++++++++++++++++++++++ 3 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 src/riak_kv_node_cli.erl diff --git a/docs/OperationsAndTroubleshootingGuide.md b/docs/OperationsAndTroubleshootingGuide.md index 8c9123c2..3f3942cf 100644 --- a/docs/OperationsAndTroubleshootingGuide.md +++ b/docs/OperationsAndTroubleshootingGuide.md @@ -104,7 +104,7 @@ riak eval "riak_client:remove_node_from_coverage()." #### Completing a Repair -The data can then be recovered from the other nodes in the cluster issuing the `riak_client:repair_node()` command from the `remote_console` of the replacement node. This will prompt all vnodes which partially overlap the data held in the vnodes on the replacement node to race to play a role in repairing the node. Each vnode will only repair the data which overlaps, filtering out any data that another vnode has already repaired (or is in the process of repairing). +The data can then be recovered from the other nodes in the cluster issuing the `riak admin node repair -n NODE` command. This will prompt all vnodes which partially overlap the data held in the vnodes on the replacement node to race to play a role in repairing the node. Each vnode will only repair the data which overlaps, filtering out any data that another vnode has already repaired (or is in the process of repairing). Available from Riak 3.4.0{: .label .label-purple }To improve the performance of repair, the `repair_span` configuration in the [riak_core schema section of riak.conf](https://github.com/OpenRiak/riak_core/blob/openriak-3.4/priv/riak_core.schema) can be changed to `double_pair`, and this has been proven to be more effective when used with the leveled backend together with the enablement of the `repair_deferred` option in the [riak_kv schema section of riak.conf](https://github.com/OpenRiak/riak_kv/blob/openriak-3.4/priv/riak_kv.schema). diff --git a/src/riak_kv_cli_registry.erl b/src/riak_kv_cli_registry.erl index deea38bf..e3ad7438 100644 --- a/src/riak_kv_cli_registry.erl +++ b/src/riak_kv_cli_registry.erl @@ -21,7 +21,8 @@ -module(riak_kv_cli_registry). -define(CLI_MODULES, [riak_kv_tictacaae_cli, - riak_kv_vnode_status_cli + riak_kv_vnode_status_cli, + riak_kv_node_cli ]). -export([register_cli/0 diff --git a/src/riak_kv_node_cli.erl b/src/riak_kv_node_cli.erl new file mode 100644 index 00000000..b63157a0 --- /dev/null +++ b/src/riak_kv_node_cli.erl @@ -0,0 +1,105 @@ +%% ------------------------------------------------------------------- +%% +%% Copyright (c) 2026 TI Tokyo. All Rights Reserved. +%% +%% This file is provided to you under the Apache License, +%% Version 2.0 (the "License"); you may not use this file +%% except in compliance with the License. You may obtain +%% a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either express or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. +%% +%% ------------------------------------------------------------------- + +-module(riak_kv_node_cli). + +-behaviour(clique_handler). + +-include_lib("kernel/include/logger.hrl"). + +-export([register_cli/0]). + +register_cli() -> + register_all_usage(), + register_all_commands(). + +register_all_usage() -> + clique:register_usage(["riak-admin", "node"], node_usage()), + clique:register_usage(["riak-admin", "node", "repair"], node_repair_usage()). + +register_all_commands() -> + lists:foreach( + fun(Args) -> apply(clique, register_command, Args) end, + [node_repair_specs() + ]). + +node_usage() -> + ["riak admin node { repair }\n", + "See individual subcommand usage for options and arguments.\n", + "\n", + "Unless given specifically with -n NODE, commands are executed on the current node.\n", + "NODE can be \"all\".\n" + ]. + +node_repair_usage() -> + ["riak admin node repair\n", + "Triggers a partition repair on all partitions on node(s).\n", + "\n", + "Unless given specifically with -n NODE, commands are executed on the current node.\n", + "NODE can be \"all\".\n" + ]. + +main(Fun, A, B, C) -> + try + Fun(A, B, C) + catch + Class:Reason:Stack -> + logger:error("node repair: handler failed: ~p:~p stack=~p", + [Class, Reason, Stack]), + clique_status:text(io_lib:format("ERROR: ~p:~p", [Class, Reason])) + end. + +-define(NODEOPT, {node, [{shortname, "n"}, + {longname, "node"}, + {typecast, fun to_node/1}]}). + +target_nodes(Opts) -> + NN = [N || {node, N} <- Opts], + case lists:member(all, NN) of + true -> + Ns = [node() | nodes()], + Ns; + false when NN /= [] -> + NN; + _ -> + Ns = [node()], + Ns + end. + +node_repair_specs() -> + [["riak-admin", "node", "repair"], + [], [?NODEOPT], + fun(A, B, C) -> main(fun node_repair_cmd/3, A, B, C) end + ]. + +node_repair_cmd(_Cmd, _Args, Opts) -> + Nodes = target_nodes(Opts), + + [{text, + lists:flatten( + io_lib:format("~p -> ~p", + [N, rpc:call(N, riak_client, repair_node, [])]))} + || N <- Nodes]. + +to_node("all") -> + all; +to_node(A) -> + clique_typecast:to_node(A). + From 4bba66acaf9bd48b886e63df0f746927738c6d91 Mon Sep 17 00:00:00 2001 From: Andriy Zavada Date: Thu, 18 Jun 2026 00:51:52 +0100 Subject: [PATCH 2/3] node_cli: repair start/status/stop --- src/riak_kv_node_cli.erl | 218 +++++++++++++++++++++++++++------- src/riak_kv_tictacaae_cli.erl | 8 +- 2 files changed, 180 insertions(+), 46 deletions(-) diff --git a/src/riak_kv_node_cli.erl b/src/riak_kv_node_cli.erl index b63157a0..98689262 100644 --- a/src/riak_kv_node_cli.erl +++ b/src/riak_kv_node_cli.erl @@ -32,30 +32,65 @@ register_cli() -> register_all_usage() -> clique:register_usage(["riak-admin", "node"], node_usage()), - clique:register_usage(["riak-admin", "node", "repair"], node_repair_usage()). + clique:register_usage(["riak-admin", "node", "repair"], node_repair_usage()), + clique:register_usage(["riak-admin", "node", "repair", "start"], node_repair_start_usage()), + clique:register_usage(["riak-admin", "node", "repair", "status"], node_repair_status_usage()), + clique:register_usage(["riak-admin", "node", "repair", "stop", '*'], node_repair_stop_usage()). register_all_commands() -> lists:foreach( fun(Args) -> apply(clique, register_command, Args) end, - [node_repair_specs() + [node_repair_status_specs(), + node_repair_start_specs(), + node_repair_stop_specs() ]). node_usage() -> - ["riak admin node { repair }\n", - "See individual subcommand usage for options and arguments.\n", - "\n", - "Unless given specifically with -n NODE, commands are executed on the current node.\n", - "NODE can be \"all\".\n" + ["riak admin node repair { start | status | stop REASON } [OPTIONS]\n", + "See individual subcommand usage for options and arguments.\n" ]. - node_repair_usage() -> - ["riak admin node repair\n", - "Triggers a partition repair on all partitions on node(s).\n", - "\n", - "Unless given specifically with -n NODE, commands are executed on the current node.\n", - "NODE can be \"all\".\n" + node_usage(). + +node_repair_status_usage() -> + ["riak admin node repair status [-n|--node NODE|all] [-f|--format table|json]\n", + "Print status of any ongoing partition repairs on NODE.\n" + ]. + +node_repair_start_usage() -> + ["riak admin node repair start [-n|--node NODE]\n", + "Start partition repair on NODE.\n" ]. +node_repair_stop_usage() -> + ["riak admin node repair stop [-n|--node NODE|all] REASON\n", + "Kill all an ongoing partition repair on NODE, with REASON.\n" + ]. + +-define(NODEOPT, {node, [{shortname, "n"}, + {longname, "node"}, + {typecast, fun to_node/1}]}). +-define(FMTOPTION, {format, [{shortname, "f"}, + {longname, "format"}, + {typecast, fun to_fmt/1}]}). + +node_repair_status_specs() -> + [["riak-admin", "node", "repair", "status"], + [], [?NODEOPT, ?FMTOPTION], + fun(A, B, C) -> main(fun node_repair_status_cmd/3, A, B, C) end + ]. +node_repair_start_specs() -> + [["riak-admin", "node", "repair", "start"], + [], [?NODEOPT], + fun(A, B, C) -> main(fun node_repair_start_cmd/3, A, B, C) end + ]. +node_repair_stop_specs() -> + [["riak-admin", "node", "repair", "stop", '*'], + [], [?NODEOPT], + fun(A, B, C) -> main(fun node_repair_stop_cmd/3, A, B, C) end + ]. + + main(Fun, A, B, C) -> try Fun(A, B, C) @@ -63,43 +98,142 @@ main(Fun, A, B, C) -> Class:Reason:Stack -> logger:error("node repair: handler failed: ~p:~p stack=~p", [Class, Reason, Stack]), - clique_status:text(io_lib:format("ERROR: ~p:~p", [Class, Reason])) + [alert("Error: ~p:~p", [Class, Reason])] end. --define(NODEOPT, {node, [{shortname, "n"}, - {longname, "node"}, - {typecast, fun to_node/1}]}). - -target_nodes(Opts) -> - NN = [N || {node, N} <- Opts], - case lists:member(all, NN) of - true -> - Ns = [node() | nodes()], - Ns; - false when NN /= [] -> - NN; - _ -> - Ns = [node()], - Ns +node_repair_status_cmd(_Cmd, _Args, Opts) -> + Nodes = + case [A || {node, A} <- Opts] of + [all] -> + [node() | nodes()]; + [] -> + [node()]; + NN -> + NN + end, + Fmt = extract_fmt_option(Opts), + + Res = get_node_repair_status(Nodes), + + case Fmt of + table -> + Table = + [begin + Rows = + [[{mod, Mod}, {idx, Idx}, {pid, list_to_binary(pid_to_list(Pid))}] + || {Mod, Idx, Pid} <- NRes], + case Rows of + [] -> + text("No active node repairs on ~s\n", [Node]); + _ -> + [text("Node repairs on ~s", [Node]), table(Rows)] + end + end || {Node, NRes} <- Res], + lists:flatten(Table); + json -> + [text("~s", [riak_kv_wm_json:encode( + [#{node => Node, + status => [jsonify_status(S) || S <- Statuses]} + || {Node, Statuses} <- Res])])] end. -node_repair_specs() -> - [["riak-admin", "node", "repair"], - [], [?NODEOPT], - fun(A, B, C) -> main(fun node_repair_cmd/3, A, B, C) end - ]. - -node_repair_cmd(_Cmd, _Args, Opts) -> - Nodes = target_nodes(Opts), +get_node_repair_status(Nodes) -> + [begin + Vnodes = erpc:call(Node, riak_core_vnode_manager, all_vnodes, []), + Statuses = [{Mod, Idx, Pid, + erpc:call(Node, riak_core_vnode_manager, repair_status, [{Mod, Idx}])} + || {Mod, Idx, Pid} <- Vnodes], + {Node, [{Mod, Idx, Pid} || {Mod, Idx, Pid, Status} <- Statuses, Status /= not_found]} + end || Node <- Nodes]. + +nodes_running_repair() -> + AllSS = get_node_repair_status([node() | nodes()]), + [N || {N, SS} <- AllSS, SS /= []]. + +jsonify_status({Mod, Idx, Pid}) -> + #{mod => Mod, + idx => Idx, + pid => list_to_binary(pid_to_list(Pid))}. + +node_repair_start_cmd(_Cmd, _Args, Opts) -> + Node = + case [A || {node, A} <- Opts] of + [all] -> invalid; + [] -> node(); + [N] -> N; + _ -> invalid + end, + case Node of + invalid -> + [alert("Error: node repair can be started on one node at a time")]; + Node -> + case nodes_running_repair() of + [] -> + case erpc:call(Node, riak_client, repair_node, []) of + ok -> + [text("Node repair started on ~s.", [Node])]; + {error, BadRpcReason} -> + [alert("Error: failed to start node repair on ~s: ~p", [Node, BadRpcReason])] + end; + NwAA -> + [alert("There are repairs currently ongoing on node~s ~s.\n" + "Wait until these are completed before starting a new node repair.", + [ending(NwAA), string:join([atom_to_list(N) || N <- NwAA], ",")])] + end + end. - [{text, - lists:flatten( - io_lib:format("~p -> ~p", - [N, rpc:call(N, riak_client, repair_node, [])]))} - || N <- Nodes]. +node_repair_stop_cmd([_, _, _, _, Reason], _, Opts) -> + Nodes = + case [A || {node, A} <- Opts] of + [all] -> + [node() | nodes()]; + [] -> + [node()]; + NN -> + NN + end, + AllNodesWithRepairs = nodes_running_repair(), + Items = + [begin + case lists:member(Node, AllNodesWithRepairs) of + false -> + io_lib:format("\n* ~s: No active repairs", [Node]); + true -> + case erpc:call(Node, riak_core_vnode_manager, kill_repairs, [Reason]) of + ok -> + io_lib:format("\n* ~s: Node repair stopped", [Node]); + {error, BadRpcReason} -> + io_lib:format("\n* ~s: Failed to stop node repair on: ~p", [Node, BadRpcReason]) + end + end + end || Node <- Nodes], + [clique_status:list("Stopping repairs", Items)]. + + +extract_fmt_option(Opts) -> + case [A || {format, A} <- Opts] of + [] -> table; + ["table"] -> table; + ["json"] -> json; + _ -> invalid + end. to_node("all") -> all; to_node(A) -> clique_typecast:to_node(A). +to_fmt(A) -> + A. + +text(F, A) -> + clique_status:text(lists:flatten(io_lib:format(F, A))). +alert(S) -> + alert(S, []). +alert(F, A) -> + clique_status:alert([text(F, A)]). +table(A) -> + clique_status:table(A). + +ending([_]) -> ""; +ending(_) -> "s". diff --git a/src/riak_kv_tictacaae_cli.erl b/src/riak_kv_tictacaae_cli.erl index d9cb2420..73358a63 100644 --- a/src/riak_kv_tictacaae_cli.erl +++ b/src/riak_kv_tictacaae_cli.erl @@ -679,7 +679,7 @@ fold_cmd([_, _, _ | Items], Keys, Options) -> }, {ok, BB} = riak_client:aae_fold(Query), Printable = [printable_bin(B) || B <- BB], - io:format(FD, "~s\n", [mochijson2:encode(Printable)]) + io:format(FD, "~s\n", [riak_kv_wm_json:encode(Printable)]) end); ["find-keys"] -> @@ -697,7 +697,7 @@ fold_cmd([_, _, _ | Items], Keys, Options) -> Printable = [#{<<"key">> => printable_bin(K), <<"sibling_count">> => SibCnt } || {_B, K, SibCnt} <- KK], - io:format(FD, "~s\n", [mochijson2:encode(Printable)]) + io:format(FD, "~s\n", [riak_kv_wm_json:encode(Printable)]) end); ["count-keys"] -> @@ -730,7 +730,7 @@ fold_cmd([_, _, _ | Items], Keys, Options) -> Printable = [#{bucket => printable_bin(B), key => printable_bin(K), vclock => printable_vclock(VC)} || {B, K, VC} <- TT], - io:format(FD, "~s\n", [mochijson2:encode(Printable)]) + io:format(FD, "~s\n", [riak_kv_wm_json:encode(Printable)]) end); ["count-tombstones"] -> @@ -779,7 +779,7 @@ fold_cmd([_, _, _ | Items], Keys, Options) -> TS = proplists:get_value(total_size, SS), Sizes = proplists:get_value(sizes, SS), Siblings = proplists:get_value(siblings, SS), - io:format(FD, "~s\n", [mochijson2:encode( + io:format(FD, "~s\n", [riak_kv_wm_json:encode( #{total_count => TC, total_size => TS, sizes => [#{min => Min, From 47ed9eba96624dfe30f27e79f4af3142f4d41988 Mon Sep 17 00:00:00 2001 From: Andriy Zavada Date: Thu, 18 Jun 2026 14:06:49 +0100 Subject: [PATCH 3/3] node_cli: update docs --- docs/OperationsAndTroubleshootingGuide.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/OperationsAndTroubleshootingGuide.md b/docs/OperationsAndTroubleshootingGuide.md index 3f3942cf..f78a6b74 100644 --- a/docs/OperationsAndTroubleshootingGuide.md +++ b/docs/OperationsAndTroubleshootingGuide.md @@ -104,7 +104,7 @@ riak eval "riak_client:remove_node_from_coverage()." #### Completing a Repair -The data can then be recovered from the other nodes in the cluster issuing the `riak admin node repair -n NODE` command. This will prompt all vnodes which partially overlap the data held in the vnodes on the replacement node to race to play a role in repairing the node. Each vnode will only repair the data which overlaps, filtering out any data that another vnode has already repaired (or is in the process of repairing). +The data can then be recovered from the other nodes in the cluster issuing the `riak admin node repair start [-n NODE]` command. This will prompt all vnodes which partially overlap the data held in the vnodes on the replacement node to race to play a role in repairing the node. Each vnode will only repair the data which overlaps, filtering out any data that another vnode has already repaired (or is in the process of repairing). Available from Riak 3.4.0{: .label .label-purple }To improve the performance of repair, the `repair_span` configuration in the [riak_core schema section of riak.conf](https://github.com/OpenRiak/riak_core/blob/openriak-3.4/priv/riak_core.schema) can be changed to `double_pair`, and this has been proven to be more effective when used with the leveled backend together with the enablement of the `repair_deferred` option in the [riak_kv schema section of riak.conf](https://github.com/OpenRiak/riak_kv/blob/openriak-3.4/priv/riak_kv.schema). @@ -112,6 +112,8 @@ The combination of `repair_span = double_pair, repair_deferred = enabled` is sig Repair uses handoffs, and so can be tracked as with other cluster change operations. Once handoffs are complete, Tictac AAE should be re-enabled, e.g. by using `riak_client:tictacaae_resume_node().`. Once Tictac AAE confirms all vnodes are in-sync - then [`participate_in_coverage` can be re-enabled](#riak_client-remote_console-commands). +The progress of repairs can be inspected with `riak admin node repair status`, and stopped with `riak admin node repair stop`. + ### Rolling Replacement A rolling replacement is an extension of the [proactive replacement](#proactive-replacement) process. In a rolling replacement, a group of new nodes are installed. There is then a rolling process where some nodes are proactively replaced by the new nodes; and once those replaced nodes are free - they are use to proactively replace other nodes in the cluster.