mirror of
https://github.com/valitydev/riak_test.git
synced 2024-11-06 16:45:29 +00:00
df438b9406
This fixes some rare race conditions in ensemble_sync where we could sometimes run wait_for_stable prior to an ensemble actually becoming unstable, and then it would pass the wait but the ensemble could become unavailable during the next step in the test. By waiting for the ensemble leader tick counts to increment, we can guarantee that any failures will have been "noticed" prior to our calling wait_for_stable, because the leader_tick function ensures a quorum is present when it executes, and steps down if it fails to get one.
215 lines
8.5 KiB
Erlang
215 lines
8.5 KiB
Erlang
%% -------------------------------------------------------------------
|
|
%%
|
|
%% Copyright (c) 2013-2014 Basho Technologies, Inc.
|
|
%%
|
|
%% This file is provided to you under the Apache License,
|
|
%% Version 2.0 (the "License"); you may not use this file
|
|
%% except in compliance with the License. You may obtain
|
|
%% a copy of the License at
|
|
%%
|
|
%% http://www.apache.org/licenses/LICENSE-2.0
|
|
%%
|
|
%% Unless required by applicable law or agreed to in writing,
|
|
%% software distributed under the License is distributed on an
|
|
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
%% KIND, either express or implied. See the License for the
|
|
%% specific language governing permissions and limitations
|
|
%% under the License.
|
|
%%
|
|
%% -------------------------------------------------------------------
|
|
|
|
-module(ensemble_sync).
|
|
-export([confirm/0]).
|
|
-include_lib("eunit/include/eunit.hrl").
|
|
|
|
-define(INTERCEPT_TAB, intercept_leader_tick_counts).
|
|
|
|
%% @doc riak_test entry point.
%%
%% Sets up a cluster via ensemble_util:build_cluster(8, Config, NVal) with
%% NVal = 5, installs the leader-tick intercepts on every node, creates and
%% activates the consistent <<"strong">> bucket type, and then runs each
%% scenario in order. Returns `pass' once every scenario has returned `ok';
%% any other scenario result crashes with a badmatch.
confirm() ->
    NVal = 5,
    Cluster = ensemble_util:build_cluster(8, ensemble_util:fast_config(NVal), NVal),
    lists:foreach(fun(N) -> init_intercepts(N) end, Cluster),
    Node = hd(Cluster),
    vnode_util:load(Cluster),

    lager:info("Creating/activating 'strong' bucket type"),
    BucketProps = [{consistent, true}, {n_val, NVal}],
    rt:create_and_activate_bucket_type(Node, <<"strong">>, BucketProps),
    ensemble_util:wait_until_stable(Node, NVal),

    %% Each expectation list is a superset of the previous one: a scenario
    %% that may time out may also succeed, and one that may fail outright
    %% may also time out or succeed.
    ExpectOkay = [ok],
    ExpectTimeout = [{error, timeout},
                     {error, <<"timeout">>},
                     {error, <<"failed">>}] ++ ExpectOkay,
    ExpectFail = [{error, notfound}] ++ ExpectTimeout,

    %% {Corrupted, Suspended, Valid, Empty, BucketName, Expect}
    Scenarios = [{1, 1, 1, 2, <<"test1">>, ExpectOkay},
                 {1, 2, 0, 2, <<"test2">>, ExpectTimeout},
                 {2, 1, 0, 2, <<"test3">>, ExpectTimeout},
                 {3, 0, 0, 2, <<"test4">>, ExpectFail}],
    lists:foreach(fun(Scenario) ->
                          ok = run_scenario(Cluster, NVal, Scenario)
                  end, Scenarios),
    pass.
|
|
|
|
%% @doc Splits a preference list into the vnodes that stay reachable and the
%% nodes that should be partitioned away.
%%
%% Vnodes owned by `ContactNode' are never candidates for partitioning. Among
%% the remaining owners, up to `Minority' nodes that own exactly one of the
%% candidate vnodes are selected (see minority_nodes/2). Returns
%% `{ValidVnodes, PartitionedNodes}', where `ValidVnodes' is every vnode in
%% `PL' not owned by a partitioned node.
-spec partition(non_neg_integer(), node(), list()) -> {[{non_neg_integer(), node()}], [node()]}.
partition(Minority, ContactNode, PL) ->
    %% Drop the second element of each preference-list entry.
    Vnodes = [Vnode || {Vnode, _} <- PL],
    Candidates = [Vnode || {_, Owner} = Vnode <- Vnodes, Owner =/= ContactNode],
    Isolated = minority_nodes(num_partitions_per_node(Candidates), Minority),
    Reachable = Vnodes -- minority_vnodes(Candidates, Isolated),
    {Reachable, Isolated}.
|
|
|
|
%% @doc Counts how many of the given `{Index, Node}' vnodes each node owns.
%% Returns an orddict mapping Node to its count.
num_partitions_per_node(Other) ->
    count_owners(Other, orddict:new()).

%% Walks the vnode list, bumping the owner's counter for each entry.
count_owners([], Counts) ->
    Counts;
count_owners([{_Index, Owner} | Rest], Counts) ->
    count_owners(Rest, orddict:update_counter(Owner, 1, Counts)).
|
|
|
|
%% @doc Selects nodes to partition away: walks `NodeCounts' in order and picks
%% nodes whose count is exactly 1 until `MinoritySize' nodes have been chosen.
%% The result is in reverse order of selection (most recently picked first).
minority_nodes(NodeCounts, MinoritySize) ->
    pick_minority(NodeCounts, MinoritySize, []).

%% Accumulator loop behind minority_nodes/2.
pick_minority([], _MinoritySize, Picked) ->
    Picked;
pick_minority([{Node, 1} | Rest], MinoritySize, Picked)
  when length(Picked) < MinoritySize ->
    pick_minority(Rest, MinoritySize, [Node | Picked]);
pick_minority([{_Node, _Count} | Rest], MinoritySize, Picked) ->
    %% Either the node owns more than one partition or the quota is full.
    pick_minority(Rest, MinoritySize, Picked).
|
|
|
|
%% @doc Returns, in their original order, the `{Index, Node}' vnodes whose
%% owning node is a member of `PartitionedNodes'.
minority_vnodes(Vnodes, PartitionedNodes) ->
    OnPartitionedNode =
        fun({_Index, Owner}) -> lists:member(Owner, PartitionedNodes);
           (_NotAVnode) -> false
        end,
    lists:filter(OnPartitionedNode, Vnodes).
|
|
|
|
%% @doc Runs a single scenario and returns `ok'.
%%
%% The scenario tuple is `{NumKill, NumSuspend, NumValid, _, Name, Expect}':
%% how many reachable vnodes to kill and rebuild, how many to suspend before
%% the rebuild, how many further vnodes to suspend afterwards, an ignored
%% field, the bucket name, and the list of acceptable read results.
%%
%% Steps, in order:
%%   1. Partition a minority of nodes and write 1000 keys to the majority.
%%   2. Heal, suspend `NumSuspend' vnodes, kill and rebuild `NumKill' vnodes,
%%      and switch the entropy manager back to automatic mode.
%%   3. Disable AAE, suspend `NumValid' more vnodes, and check every key
%%      against `Expect'.
%%   4. Re-enable AAE, resume all suspended vnodes and, unless
%%      `{error, notfound}' is an acceptable result, re-read every key.
run_scenario(Nodes, NVal, {NumKill, NumSuspend, NumValid, _, Name, Expect}) ->
    Node = hd(Nodes),
    Quorum = NVal div 2 + 1,
    Minority = NVal - Quorum,
    Bucket = {<<"strong">>, Name},
    Keys = lists:map(fun(N) -> <<N:64/integer>> end, lists:seq(1, 1000)),

    %% The preference list of the first key decides which vnodes are touched.
    DocIdx = rpc:call(Node, riak_core_util, chash_std_keyfun, [{Bucket, hd(Keys)}]),
    PL = rpc:call(Node, riak_core_apl, get_primary_apl, [DocIdx, NVal, riak_kv]),
    {Valid, Partitioned} = partition(Minority, Node, PL),

    %% Carve the reachable vnodes into kill / suspend-early / suspend-late.
    {KillVN, AfterKill} = lists:split(NumKill, Valid),
    {SuspendVN, AfterSuspend} = lists:split(NumSuspend, AfterKill),
    {AfterVN, _} = lists:split(NumValid, AfterSuspend),

    io:format("PL: ~p~n", [PL]),
    PBC = rt:pbc(Node),
    Options = [{timeout, 2000}],

    %% Local shorthands for the sequences repeated below.
    EntropyManager = fun(Fun, Args) ->
                             rpc:multicall(Nodes, riak_kv_entropy_manager, Fun, Args)
                     end,
    %% Wait for leader ticks first so that any loss of quorum has been
    %% noticed before stability is checked.
    Settle = fun(Size) ->
                     wait_for_leader_tick_changes(Nodes),
                     ensemble_util:wait_until_stable(Node, Size)
             end,
    SuspendAll = fun(VNs) ->
                         [vnode_util:suspend_vnode(Owner, Idx) || {Idx, Owner} <- VNs]
                 end,
    ReadAll = fun() ->
                      lists:foreach(fun(Key) ->
                                            rt:pbc_read(PBC, Bucket, Key, Options)
                                    end, Keys)
              end,

    EntropyManager(set_mode, [manual]),
    Part = rt:partition(Nodes -- Partitioned, Partitioned),
    Settle(Quorum),

    %% Write data while minority is partitioned
    lager:info("Writing ~p consistent keys", [1000]),
    lists:foreach(fun(Key) -> ok = rt:pbc_write(PBC, Bucket, Key, Key) end, Keys),

    lager:info("Read keys to verify they exist"),
    ReadAll(),
    rt:heal(Part),

    %% Suspend desired number of valid vnodes
    S1 = SuspendAll(SuspendVN),

    %% Kill/corrupt desired number of valid vnodes: every kill happens
    %% before the first rebuild.
    lists:foreach(fun(VN) -> vnode_util:kill_vnode(VN) end, KillVN),
    lists:foreach(fun(VN) -> vnode_util:rebuild_vnode(VN) end, KillVN),
    EntropyManager(set_mode, [automatic]),
    Settle(Quorum),

    lager:info("Disabling AAE"),
    EntropyManager(disable, []),
    ensemble_util:wait_until_stable(Node, Quorum),

    %% Suspend remaining valid vnodes to ensure data comes from repaired vnodes
    S2 = SuspendAll(AfterVN),
    Settle(Quorum),

    lager:info("Checking that key results match scenario"),
    lists:foreach(fun(Key) ->
                          rt:pbc_read_check(PBC, Bucket, Key, Expect, Options)
                  end, Keys),

    lager:info("Re-enabling AAE"),
    EntropyManager(enable, []),

    lager:info("Resuming all vnodes"),
    lists:foreach(fun(Pid) -> vnode_util:resume_vnode(Pid) end, S1 ++ S2),
    Settle(NVal),

    %% Check that for other than the "all bets are off" failure case,
    %% we can successfully read all keys after all vnodes are available.
    case lists:member({error, notfound}, Expect) of
        false ->
            lager:info("Re-reading keys to verify they exist"),
            ReadAll();
        true ->
            ok
    end,

    lager:info("Scenario passed"),
    lager:info("-----------------------------------------------------"),
    ok.
|
|
|
|
%% Leader-tick bookkeeping.
%%
%% The functions below let the test wait until ensemble leader ticks have
%% fired. Earlier versions of this test relied solely on
%% ensemble_util:wait_until_stable after changing the cluster, which was racy;
%% waiting for a tick first closes that race.

%% @doc Prepares `Node' for tick counting: creates the counter table and then
%% intercepts riak_ensemble_peer:leader_tick/1 with `count_leader_ticks'.
%% Returns whatever rt_intercept:add/2 returns.
init_intercepts(Node) ->
    make_intercepts_tab(Node),
    TickIntercept = {{leader_tick, 1}, count_leader_ticks},
    rt_intercept:add(Node, {riak_ensemble_peer, [TickIntercept]}).
|
|
|
|
%% @doc Creates the public, named ETS set ?INTERCEPT_TAB on `Node' via rpc.
%% The process registered as `sasl_safe_sup' on that node is set as the
%% table's heir -- presumably so the table outlives the process that serves
%% the rpc call. Crashes with a badmatch if the table cannot be created.
make_intercepts_tab(Node) ->
    Heir = rpc:call(Node, erlang, whereis, [sasl_safe_sup]),
    TabOpts = [named_table, public, set, {heir, Heir, {}}],
    ?INTERCEPT_TAB = rpc:call(Node, ets, new, [?INTERCEPT_TAB, TabOpts]).
|
|
|
|
%% @doc Collects the tick-count entries of the ensemble leaders running on
%% each of `Nodes' and returns them as one flat list, in node order.
get_leader_tick_counts(Nodes) ->
    lists:flatmap(fun(N) -> get_leader_tick_counts_for_node(N) end, Nodes).
|
|
|
|
%% @doc Returns the `{LeaderPid, TickCount}' entries for every ensemble leader
%% that runs on `Node'.
%%
%% The ensembles local to `Node' are listed, their leader pids are resolved on
%% that node, and only leaders whose pid lives on `Node' are kept. Each
%% remaining leader's row is read from ?INTERCEPT_TAB on `Node'.
%%
%% A leader with no row in the table yet is reported as `{Pid, 0}' instead of
%% crashing with a badmatch, so callers can still wait for its first recorded
%% tick. Any other lookup result (for example `{badrpc, _}') still crashes.
get_leader_tick_counts_for_node(Node) ->
    Ensembles = rpc:call(Node, riak_kv_ensembles, local_ensembles, []),
    Leaders = rpc:call(Node, lists, map,
                       [fun riak_ensemble_manager:get_leader_pid/1, Ensembles]),
    LocalLeaders = [P || P <- Leaders, node(P) =:= Node],
    LookupFun = fun(P) ->
                        case rpc:call(Node, ets, lookup, [?INTERCEPT_TAB, P]) of
                            [Res] ->
                                Res;
                            [] ->
                                %% No tick recorded for this leader so far.
                                {P, 0}
                        end
                end,
    lists:map(LookupFun, LocalLeaders).
|
|
|
|
%% @doc Takes a snapshot of the current leader tick counts on `Nodes' and then
%% blocks, one leader at a time, until each has moved past its snapshot value
%% (see wait_for_leader_tick_change/1). Returns `ok'.
wait_for_leader_tick_changes(Nodes) ->
    Snapshot = get_leader_tick_counts(Nodes),
    lists:foreach(fun(Entry) -> wait_for_leader_tick_change(Entry) end, Snapshot).
|
|
|
|
%% @doc Polls with rt:wait_until/1 until leader_tick_count_exceeds/2 returns
%% `true' for the given `{Pid, Count}' snapshot entry, and asserts that the
%% wait finished with `ok'.
wait_for_leader_tick_change({Pid, Count}) ->
    HasTicked = fun() -> leader_tick_count_exceeds(Pid, Count) end,
    ?assertEqual(ok, rt:wait_until(HasTicked)).
|
|
|
|
%% @doc Checks whether the leader `Pid' has ticked since its count was `Count'.
%%
%% Returns `true' when the count stored in ?INTERCEPT_TAB on the leader's node
%% is greater than `Count', or when the process is no longer in the `leading'
%% state. Otherwise returns a non-`true' term describing what was observed:
%% the raw lookup result, or `{Lookup, {badrpc, Reason}}' when the state query
%% itself failed.
leader_tick_count_exceeds(Pid, Count) ->
    PeerNode = node(Pid),
    case rpc:call(PeerNode, ets, lookup, [?INTERCEPT_TAB, Pid]) of
        [{Pid, Ticks}] when Ticks > Count ->
            true;
        Stale ->
            %% If the count hasn't incremented, it may be because the leader
            %% already stepped down, so check for that scenario as well.
            check_leader_state(PeerNode, Pid, Stale)
    end.

%% Queries the peer's current state with sys:get_state/1. `Stale' is the
%% lookup result that failed the tick check and is echoed back for diagnostics.
check_leader_state(PeerNode, Pid, Stale) ->
    case rpc:call(PeerNode, sys, get_state, [Pid]) of
        {badrpc, _} = Failure ->
            {Stale, Failure};
        {leading, _} ->
            Stale;
        {_, _} ->
            %% Not a badrpc and not leading, so the peer must be in some
            %% other state.
            true
    end.
|