Nick Marino df438b9406 Fix occasional failures in ensemble_sync
This fixes some rare race conditions in ensemble_sync where we could
sometimes run wait_for_stable prior to an ensemble actually becoming
unstable, and then it would pass the wait but the ensemble could become
unavailable during the next step in the test.

By waiting for the ensemble leader tick counts to increment, we can
guarantee that any failures will have been "noticed" prior to our
calling wait_for_stable, because the leader_tick function ensures a
quorum is present when it executes, and steps down if it fails to get one.
2015-04-15 18:35:01 -04:00

215 lines
8.5 KiB

-define(INTERCEPT_TAB, intercept_leader_tick_counts).
%% @doc riak_test entry point.
%%
%% Builds an 8-node cluster with the fast ensemble config, installs the
%% leader-tick intercepts on every node, creates the consistent `strong'
%% bucket type with n_val = 5, and then runs each scenario in turn.
%%
%% Each scenario tuple is
%% `{Corrupted, Suspended, Valid, Empty, BucketName, ExpectedResults}';
%% see run_scenario/3 for how the counts are applied.
%%
%% Returns `pass' once every scenario has returned `ok'; any scenario
%% returning something else crashes the test with a badmatch.
confirm() ->
    NVal = 5,
    Config = ensemble_util:fast_config(NVal),
    Nodes = ensemble_util:build_cluster(8, Config, NVal),
    lists:foreach(fun init_intercepts/1, Nodes),
    Node = hd(Nodes),

    lager:info("Creating/activating 'strong' bucket type"),
    rt:create_and_activate_bucket_type(Node, <<"strong">>,
                                       [{consistent, true}, {n_val, NVal}]),
    ensemble_util:wait_until_stable(Node, NVal),

    %% The expectation lists are cumulative: a scenario that may time out
    %% may also succeed, and one that may fail may also time out or succeed.
    ExpectOkay = [ok],
    ExpectTimeout = [{error, timeout}, {error, <<"timeout">>},
                     {error, <<"failed">>} | ExpectOkay],
    ExpectFail = [{error, notfound} | ExpectTimeout],

    Scenarios = [%% corrupted, suspended, valid, empty, bucket, expect
                 {1, 1, 1, 2, <<"test1">>, ExpectOkay},
                 {1, 2, 0, 2, <<"test2">>, ExpectTimeout},
                 {2, 1, 0, 2, <<"test3">>, ExpectTimeout},
                 {3, 0, 0, 2, <<"test4">>, ExpectFail}
                ],
    [ok = run_scenario(Nodes, NVal, Scenario) || Scenario <- Scenarios],
    pass.
%% @doc Decide which nodes to partition away and which vnodes remain valid.
%%
%% Vnodes owned by `ContactNode' are never candidates for partitioning.
%% Among the remaining vnodes, up to `Minority' owner nodes are chosen by
%% minority_nodes/2; every vnode on those nodes is treated as partitioned.
%%
%% Returns `{ValidVnodes, PartitionedNodes}', where `ValidVnodes' is the
%% preflist's vnodes minus the partitioned ones.
-spec partition(non_neg_integer(), node(), list()) -> {[{non_neg_integer(), node()}], [node()]}.
partition(Minority, ContactNode, PL) ->
    Vnodes = [Vnode || {Vnode, _} <- PL],
    Remote = [Vnode || {_, Owner} = Vnode <- Vnodes, Owner =/= ContactNode],
    Isolated = minority_nodes(num_partitions_per_node(Remote), Minority),
    Reachable = Vnodes -- minority_vnodes(Remote, Isolated),
    {Reachable, Isolated}.
%% @doc Count how many of the given `{Index, Node}' vnodes each node owns.
%% Returns an orddict mapping node to count.
num_partitions_per_node(Other) ->
    Bump = fun({_Idx, Owner}, Counts) ->
                   orddict:update_counter(Owner, 1, Counts)
           end,
    lists:foldl(Bump, orddict:new(), Other).
%% @doc Pick up to `MinoritySize' nodes that own exactly one of the counted
%% partitions.
%%
%% `NodeCounts' is a list of `{Node, Count}' pairs (as produced by
%% num_partitions_per_node/1). Nodes are taken in list order until the
%% limit is reached; the result is in reverse order of selection.
minority_nodes(NodeCounts, MinoritySize) ->
    lists:foldl(fun({Node, Count}, Acc) ->
                        case Count =:= 1 andalso length(Acc) < MinoritySize of
                            true ->
                                [Node | Acc];
                            false ->
                                %% Not eligible, or the minority is already
                                %% full: leave the selection unchanged.
                                Acc
                        end
                end, [], NodeCounts).
%% @doc Keep only the `{Index, Node}' vnodes whose node is listed in
%% `PartitionedNodes'. Entries that are not 2-tuples are dropped.
minority_vnodes(Vnodes, PartitionedNodes) ->
    OnPartitionedNode = fun({_Idx, Owner}) ->
                                lists:member(Owner, PartitionedNodes);
                           (_NotAVnode) ->
                                false
                        end,
    lists:filter(OnPartitionedNode, Vnodes).
%% @doc Run one sync scenario against bucket `{<<"strong">>, Name}'.
%%
%% The scenario tuple is `{NumKill, NumSuspend, NumValid, _, Name, Expect}':
%%   NumKill    - valid vnodes to kill and rebuild (i.e. corrupt),
%%   NumSuspend - valid vnodes to suspend before the kill step,
%%   NumValid   - remaining valid vnodes to suspend afterwards, so that
%%                reads must be served by the repaired vnodes,
%%   Expect     - list of acceptable results for the read check.
%%
%% Steps: partition a minority of the preflist's nodes, write and read
%% 1000 keys, heal, suspend/kill the requested vnodes, let AAE run, then
%% check every key against `Expect'. Finally all vnodes are resumed and,
%% unless `{error, notfound}' was an acceptable outcome, the keys are read
%% once more.
%%
%% Returns `ok'; failures surface as crashes.
run_scenario(Nodes, NVal, {NumKill, NumSuspend, NumValid, _, Name, Expect}) ->
    Node = hd(Nodes),
    Quorum = NVal div 2 + 1,
    Minority = NVal - Quorum,
    Bucket = {<<"strong">>, Name},
    Keys = [<<N:64/integer>> || N <- lists:seq(1,1000)],

    %% Only the first key's preflist is used to decide which vnodes are
    %% partitioned, killed and suspended.
    Key1 = hd(Keys),
    DocIdx = rpc:call(Node, riak_core_util, chash_std_keyfun, [{Bucket, Key1}]),
    PL = rpc:call(Node, riak_core_apl, get_primary_apl, [DocIdx, NVal, riak_kv]),
    {Valid, Partitioned} = partition(Minority, Node, PL),

    {KillVN,    Valid2} = lists:split(NumKill,    Valid),
    {SuspendVN, Valid3} = lists:split(NumSuspend, Valid2),
    {AfterVN,   _}      = lists:split(NumValid,   Valid3),

    io:format("PL: ~p~n", [PL]),
    PBC = rt:pbc(Node),
    Options = [{timeout, 2000}],

    rpc:multicall(Nodes, riak_kv_entropy_manager, set_mode, [manual]),
    Part = rt:partition(Nodes -- Partitioned, Partitioned),
    %% Let every leader tick at least once so the partition has been
    %% noticed before we ask whether the ensembles are stable.
    wait_for_leader_tick_changes(Nodes),
    ensemble_util:wait_until_stable(Node, Quorum),

    %% Write data while minority is partitioned
    lager:info("Writing ~p consistent keys", [1000]),
    [ok = rt:pbc_write(PBC, Bucket, Key, Key) || Key <- Keys],

    lager:info("Read keys to verify they exist"),
    [rt:pbc_read(PBC, Bucket, Key, Options) || Key <- Keys],

    %% Heal the partition before disturbing the valid vnodes; otherwise
    %% the partitioned minority plus the killed/suspended vnodes would
    %% leave no quorum, and the final wait for all NVal peers could never
    %% succeed.
    rt:heal(Part),

    %% Suspend desired number of valid vnodes
    S1 = [vnode_util:suspend_vnode(VNode, VIdx) || {VIdx, VNode} <- SuspendVN],

    %% Kill/corrupt desired number of valid vnodes
    [vnode_util:kill_vnode(VN) || VN <- KillVN],
    [vnode_util:rebuild_vnode(VN) || VN <- KillVN],
    rpc:multicall(Nodes, riak_kv_entropy_manager, set_mode, [automatic]),
    wait_for_leader_tick_changes(Nodes),
    ensemble_util:wait_until_stable(Node, Quorum),

    lager:info("Disabling AAE"),
    rpc:multicall(Nodes, riak_kv_entropy_manager, disable, []),
    ensemble_util:wait_until_stable(Node, Quorum),

    %% Suspend remaining valid vnodes to ensure data comes from repaired vnodes
    S2 = [vnode_util:suspend_vnode(VNode, VIdx) || {VIdx, VNode} <- AfterVN],
    wait_for_leader_tick_changes(Nodes),
    ensemble_util:wait_until_stable(Node, Quorum),

    lager:info("Checking that key results match scenario"),
    [rt:pbc_read_check(PBC, Bucket, Key, Expect, Options) || Key <- Keys],

    lager:info("Re-enabling AAE"),
    rpc:multicall(Nodes, riak_kv_entropy_manager, enable, []),

    lager:info("Resuming all vnodes"),
    [vnode_util:resume_vnode(Pid) || Pid <- S1 ++ S2],
    ensemble_util:wait_until_stable(Node, NVal),

    %% Check that for other than the "all bets are off" failure case,
    %% we can successfully read all keys after all vnodes are available.
    case lists:member({error, notfound}, Expect) of
        true ->
            ok;
        false ->
            lager:info("Re-reading keys to verify they exist"),
            [rt:pbc_read(PBC, Bucket, Key, Options) || Key <- Keys]
    end,

    lager:info("Scenario passed"),
    ok.
%% The following code is used so that we can wait for ensemble leader ticks to fire.
%% This allows us to fix a kind of race condition that we were dealing with in the
%% previous version of this test, where we were relying on ensemble_util:wait_until_stable
%% after making certain changes to the cluster.
%% @doc Prepare `Node' for leader-tick counting.
%%
%% Makes sure the ?INTERCEPT_TAB ETS table exists on the node and then
%% installs the `count_leader_ticks' intercept on
%% riak_ensemble_peer:leader_tick/1. The table is created only when it is
%% missing, because creating a named table twice fails.
%%
%% Returns the result of rt_intercept:add/2.
init_intercepts(Node) ->
    case rpc:call(Node, ets, info, [?INTERCEPT_TAB, name]) of
        undefined ->
            make_intercepts_tab(Node);
        ?INTERCEPT_TAB ->
            ok
    end,
    rt_intercept:add(Node, {riak_ensemble_peer, [{{leader_tick, 1}, count_leader_ticks}]}).
%% @doc Create the public, named ?INTERCEPT_TAB set table on `Node'.
%%
%% The table's heir is the node's `sasl_safe_sup' process, presumably so
%% the table outlives the short-lived process that serves the rpc call.
%% Crashes with a badmatch if the table cannot be created.
make_intercepts_tab(Node) ->
    Heir = rpc:call(Node, erlang, whereis, [sasl_safe_sup]),
    TabOpts = [named_table, public, set, {heir, Heir, {}}],
    ?INTERCEPT_TAB = rpc:call(Node, ets, new, [?INTERCEPT_TAB, TabOpts]).
%% @doc Collect the current leader tick counts from all `Nodes'.
%%
%% Returns one flat list of `{LeaderPid, Count}' entries: the per-node
%% lists are appended so callers can iterate over the entries directly.
get_leader_tick_counts(Nodes) ->
    AllCounts = [get_leader_tick_counts_for_node(N) || N <- Nodes],
    lists:append(AllCounts).
%% @doc Read the tick-count entries for the ensemble leaders running on
%% `Node'.
%%
%% Looks up the leader pid of every local ensemble, keeps the pids that
%% live on `Node' itself, and returns each one's ?INTERCEPT_TAB entry.
%% Crashes with a badmatch if a local leader does not have exactly one
%% entry in the table.
get_leader_tick_counts_for_node(Node) ->
    Ensembles = rpc:call(Node, riak_kv_ensembles, local_ensembles, []),
    Leaders = rpc:call(Node, lists, map, [fun riak_ensemble_manager:get_leader_pid/1, Ensembles]),
    LocalLeaders = [P || P <- Leaders, node(P) =:= Node],
    LookupFun = fun(P) ->
                        [Res] = rpc:call(Node, ets, lookup, [?INTERCEPT_TAB, P]),
                        Res
                end,
    lists:map(LookupFun, LocalLeaders).
%% @doc Snapshot the leader tick counts on `Nodes' and wait, one entry at
%% a time, until each has moved on (see wait_for_leader_tick_change/1).
%% Returns `ok'.
wait_for_leader_tick_changes(Nodes) ->
    lists:foreach(fun(Snapshot) ->
                          wait_for_leader_tick_change(Snapshot)
                  end,
                  get_leader_tick_counts(Nodes)).
%% @doc Wait until leader_tick_count_exceeds/2 reports that the entry for
%% `Pid' has moved past `Count'. Polling is delegated to rt:wait_until/1;
%% any result other than `ok' fails the assertion.
wait_for_leader_tick_change({Pid, Count}) ->
    F = fun() ->
                leader_tick_count_exceeds(Pid, Count)
        end,
    ?assertEqual(ok, rt:wait_until(F)).
leader_tick_count_exceeds(Pid, Count) ->
Node = node(Pid),
case rpc:call(Node, ets, lookup, [?INTERCEPT_TAB, Pid]) of
[{Pid, NewCount}] when NewCount > Count ->
Res ->
%% If the count hasn't incremented, it may be because the leader
%% already stepped down, so check for that scenario as well:
case rpc:call(Node, sys, get_state, [Pid]) of
{leading, _} ->
Res2 = {badrpc, _} ->
{Res, Res2};
{_, _} ->
%% Would be nice if there was a more explicit way to match
%% this, but if it's not a badrpc and we're not leading, we
%% must be in some other state