riak_test/tests/ensemble_byzantine.erl
Nick Marino 8e2b99d6e8 Use rt:waitfor on reads in ensemble-byzantine test
There were some race conditions we found in this test where reads
to an ensemble may fail for a brief period of time after recovery. By
using the waitfor function, we ensure that the tests will still pass as
long as things eventually recover and start working properly again.
2015-03-23 13:12:46 -04:00

%% -------------------------------------------------------------------
%%
%% Copyright (c) 2013-2014 Basho Technologies, Inc.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(ensemble_byzantine).
-export([confirm/0]).
-include_lib("eunit/include/eunit.hrl").
-define(HARNESS, (rt_config:get(rt_harness))).
-define(NUM_NODES, 8).
-define(NVAL, 5).
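
%% Exercises riak_ensemble's recovery behavior under byzantine failure
%% scenarios: losing synctrees, backend data, or both, on anywhere from a
%% single node up to every member of a key's preference list.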
confirm() ->
    NumNodes = ?NUM_NODES,
    NVal = ?NVAL,
    _Quorum = NVal div 2 + 1,
    Config = config(),
    lager:info("Building cluster and waiting for ensemble to stabilize"),
    Nodes = ensemble_util:build_cluster(NumNodes, Config, NVal),
    vnode_util:load(Nodes),
    Node = hd(Nodes),
    ensemble_util:wait_until_stable(Node, NVal),
    create_strong_bucket_type(Node, NVal),
    Bucket = {<<"strong">>, <<"test">>},
    Key = <<"test-key">>,
    Val = <<"test-val">>,
    {ok, PL} = get_preflist(Node, Bucket, Key, NVal),
    ?assertEqual(NVal, length(PL)),
    lager:info("PREFERENCE LIST: ~n ~p", [PL]),
    PBC = rt:pbc(Node),
    normal_write_and_read(PBC, Bucket, Key, Val),
    test_lose_one_node_one_partition(PBC, Bucket, Key, Val, PL),
    test_lose_all_but_one_partition(PBC, Bucket, Key, Val, PL),
    test_lose_minority_synctrees(PBC, Bucket, Key, Val, PL),
    test_lose_majority_synctrees(PBC, Bucket, Key, Val, PL),
    test_lose_minority_synctrees_one_node_partitioned(PBC, Bucket, Key, Val,
                                                      PL, Nodes),
    test_lose_all_data_and_trees_except_one_node(PBC, Bucket, Key, Val, PL),
    {ok, _NewVal} = test_backup_restore_data_not_trees(Bucket, Key, Val, PL),
    test_lose_all_data(PBC, Bucket, Key, PL),
    pass.
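
%% enable_consensus turns on riak_ensemble. target_n_val is set to the node
%% count (8), presumably so that the 5 primaries of each preference list
%% land on distinct nodes.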
config() ->
    [{riak_core, [{default_bucket_props, [{n_val, 5}]},
                  {vnode_management_timer, 1000},
                  {ring_creation_size, 16},
                  {enable_consensus, true},
                  {target_n_val, 8}]}].
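
%% Losing a minority (2 of 5) of the synctrees should leave a trusted
%% majority, so quorum should be regained without intervention.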
test_lose_minority_synctrees(PBC, Bucket, Key, Val, PL) ->
    Minority = minority_vnodes(PL),
    assert_lose_synctrees_and_recover(PBC, Bucket, Key, Val, PL, Minority).
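
%% Losing a majority (3 of 5) of the synctrees should still be recoverable,
%% since the underlying data remains intact on all peers.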
test_lose_majority_synctrees(PBC, Bucket, Key, Val, PL) ->
    Majority = majority_vnodes(PL),
    assert_lose_synctrees_and_recover(PBC, Bucket, Key, Val, PL, Majority).
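
%% Wipe a minority of the synctrees while one node is partitioned away.
%% Quorum should be unreachable until the partition heals.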
test_lose_minority_synctrees_one_node_partitioned(PBC, Bucket, Key, Val, PL,
                                                  Nodes) ->
    Minority = minority_vnodes(PL),
    {{Idx0, Node0}, primary} = hd(PL),
    Ensemble = {kv, Idx0, 5},
    Rest = [Node || {{_, Node}, _} <- lists:sublist(PL, ?NVAL-1)],
    %% Partition off the last node in the preference list
    {{_, PartitionedNode}, _} = lists:nth(?NVAL, PL),
    PartInfo = rt:partition([PartitionedNode], Nodes -- [PartitionedNode]),
    %% Wipe the synctrees on a minority of the nodes
    [wipe_tree(Ensemble, Idx, Node) || {{Idx, Node}, _} <- Minority],
    kill_peers(Ensemble, Rest),
    %% We shouldn't be able to reach quorum now: the wiped peers come back
    %% with untrusted synctrees and the partitioned peer is unreachable, so
    %% there is no majority of trusted, online peers.
    timer:sleep(10000),
    {error, <<"timeout">>} = riakc_pb_socket:get(PBC, Bucket, Key, []),
    %% Heal the partition so that we can get quorum
    rt:heal(PartInfo),
    ensemble_util:wait_until_quorum(Node0, Ensemble),
    assert_valid_read(PBC, Bucket, Key, Val).
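
%% Wipe the backend data on every vnode in the preference list except the
%% head primary; the single surviving copy should be enough to recover.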
test_lose_all_but_one_partition(PBC, Bucket, Key, Val, PL) ->
    Wiped = tl(PL),
    {{Idx0, Node0}, primary} = hd(PL),
    Ensemble = {kv, Idx0, 5},
    lager:info("Wiping Data on Following Vnodes: ~p", [Wiped]),
    wipe_partitions(Wiped),
    ensemble_util:wait_until_quorum(Node0, Ensemble),
    assert_valid_read(PBC, Bucket, Key, Val).
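
%% Wipe the current ensemble leader's copy of the data and verify the
%% ensemble recovers.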
test_lose_one_node_one_partition(PBC, Bucket, Key, Val, PL) ->
    {{Idx0, Node0}, primary} = hd(PL),
    Ensemble = {kv, Idx0, 5},
    Leader = ensemble_util:get_leader_pid(Node0, Ensemble),
    LeaderNode = node(Leader),
    LeaderIdx = get_leader_idx(PL, LeaderNode),
    lager:info("Wiping Idx ~p data on LeaderNode ~p", [LeaderIdx, LeaderNode]),
    wipe_partition(LeaderIdx, LeaderNode),
    ensemble_util:wait_until_quorum(LeaderNode, Ensemble),
    assert_valid_read(PBC, Bucket, Key, Val).
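
%% Wipe both the backend data and the synctrees on every vnode except the
%% head primary.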
test_lose_all_data_and_trees_except_one_node(PBC, Bucket, Key, Val, PL) ->
    Wiped = tl(PL),
    {{Idx0, Node0}, primary} = hd(PL),
    Ensemble = {kv, Idx0, 5},
    wipe_partitions(Wiped),
    wipe_trees(Ensemble, Wiped),
    ensemble_util:wait_until_quorum(Node0, Ensemble),
    assert_valid_read(PBC, Bucket, Key, Val).
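
%% Back up the data directories (but not the synctrees), write a newer
%% value, then restore the old backup: reads should fail while the trees
%% reflect newer data than the restored backend, and succeed again once
%% matching data is restored.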
test_backup_restore_data_not_trees(Bucket, Key, _Val, PL) ->
    {{Idx, Node}, primary} = hd(PL),
    Ensemble = {kv, Idx, 5},
    stop_nodes(PL),
    backup_data(1, PL),
    start_nodes(PL),
    PBC = rt:pbc(Node),
    ensemble_util:wait_until_quorum(Node, Ensemble),
    timer:sleep(10000),
    Obj0 = rt:pbc_read(PBC, Bucket, Key),
    NewVal = <<"test-val2">>,
    Obj = riakc_obj:update_value(Obj0, NewVal),
    riakc_pb_socket:put(PBC, Obj),
    assert_valid_read(PBC, Bucket, Key, NewVal),
    stop_nodes(PL),
    %% Back up the new data
    backup_data(2, PL),
    %% Restore the old data
    restore_data(1, PL),
    start_nodes(PL),
    PBC1 = rt:pbc(Node),
    ensemble_util:wait_until_quorum(Node, Ensemble),
    %% The read should fail: the trees match newer data than what was
    %% restored
    assert_failed_read(PBC1, Bucket, Key),
    stop_nodes(PL),
    %% Restore the new data, which matches the trees
    restore_data(2, PL),
    start_nodes(PL),
    PBC2 = rt:pbc(Node),
    ensemble_util:wait_until_quorum(Node, Ensemble),
    assert_valid_read(PBC2, Bucket, Key, NewVal),
    {ok, NewVal}.
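
%% With every copy of the data wiped, a consistent get should return an
%% error rather than a stale value.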
test_lose_all_data(PBC, Bucket, Key, PL) ->
    wipe_partitions(PL),
    {error, _} = E = riakc_pb_socket:get(PBC, Bucket, Key, []),
    lager:info("All data loss error = ~p", [E]).
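
%% Reads may fail transiently for a brief period after recovery, so retry
%% via rt:wait_until until the expected value is visible.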
assert_valid_read(PBC, Bucket, Key, Val) ->
    ReadFun = fun() ->
                      Obj = rt:pbc_read(PBC, Bucket, Key),
                      Val =:= riakc_obj:get_value(Obj)
              end,
    ?assertEqual(ok, rt:wait_until(ReadFun)).

assert_failed_read(PBC, Bucket, Key) ->
    ?assertMatch({error, _}, riakc_pb_socket:get(PBC, Bucket, Key, [])).

normal_write_and_read(PBC, Bucket, Key, Val) ->
    lager:info("Writing a consistent key"),
    ok = rt:pbc_write(PBC, Bucket, Key, Val),
    lager:info("Read key to verify it exists"),
    assert_valid_read(PBC, Bucket, Key, Val).

stop_nodes(PL) ->
    [rt:stop_and_wait(Node) || {{_, Node}, _} <- PL].

start_nodes(PL) ->
    [rt:start_and_wait(Node) || {{_, Node}, _} <- PL].

data_path(Node) ->
    ?HARNESS:node_path(Node) ++ "/data/" ++ backend_dir().

backup_path(Node, N) ->
    data_path(Node) ++ integer_to_list(N) ++ ".bak".

backup_data(N, PL) ->
    [backup_node(Node, N) || {{_, Node}, _} <- PL].

backup_node(Node, N) ->
    Path = data_path(Node),
    BackupPath = backup_path(Node, N),
    Cmd = "cp -R " ++ Path ++ " " ++ BackupPath,
    lager:info("~p", [os:cmd(Cmd)]).

restore_data(N, PL) ->
    [restore_node(Node, N) || {{_, Node}, _} <- PL].

restore_node(Node, N) ->
    Path = data_path(Node),
    BackupPath = backup_path(Node, N),
    rm_backend_dir(Node),
    Cmd = "mv " ++ BackupPath ++ " " ++ Path,
    ?assertEqual([], os:cmd(Cmd)).

assert_lose_synctrees_and_recover(PBC, Bucket, Key, Val, PL, ToLose) ->
    {{Idx0, Node0}, primary} = hd(PL),
    Ensemble = {kv, Idx0, 5},
    [wipe_tree(Ensemble, Idx, Node) || {{Idx, Node}, _} <- ToLose],
    ensemble_util:wait_until_quorum(Node0, Ensemble),
    assert_valid_read(PBC, Bucket, Key, Val).

majority_vnodes(PL) ->
    Num = ?NVAL div 2 + 1,
    {Majority, _} = lists:split(Num, PL),
    Majority.

minority_vnodes(PL) ->
    Num = ?NVAL div 2,
    {Minority, _} = lists:split(Num, PL),
    Minority.

get_leader_idx(PL, LeaderNode) ->
    [{LeaderIdx, _}] = [{Idx, N} || {{Idx, N}, _} <- PL, N =:= LeaderNode],
    LeaderIdx.
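
%% Kill the ensemble peer processes running on the given nodes; the peers
%% are looked up from the ensemble's current view.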
kill_peers(Ensemble, Nodes) ->
    Node = hd(Nodes),
    {_, [View | _]} = rpc:call(Node, riak_ensemble_manager, get_views, [Ensemble]),
    Peers = [P || P = {_Id, N} <- View, lists:member(N, Nodes)],
    lager:info("Killing Peers: ~p", [Peers]),
    Pids = [rpc:call(Node, riak_ensemble_manager, get_peer_pid,
                     [Ensemble, Peer]) || Peer <- Peers],
    [exit(Pid, kill) || Pid <- Pids, Pid =/= undefined].

wipe_partitions(PL) ->
    [wipe_partition(Idx, Node) || {{Idx, Node}, _} <- PL].

wipe_trees(Ensemble, PL) ->
    [wipe_tree(Ensemble, Idx, Node) || {{Idx, Node}, _} <- PL].
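
%% Delete a peer's synctree from disk, then kill the peer process so it
%% restarts with an untrusted tree.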
wipe_tree(Ensemble, Idx, Node) ->
    rt:clean_data_dir([Node], "ensembles/trees/kv_" ++ integer_to_list(Idx)),
    {_, [View | _]} = rpc:call(Node, riak_ensemble_manager, get_views, [Ensemble]),
    [Peer] = [P || P = {_Id, N} <- View, Node =:= N],
    Pid = rpc:call(Node, riak_ensemble_manager, get_peer_pid, [Ensemble, Peer]),
    lager:info("Peer = ~p, Pid = ~p", [Peer, Pid]),
    exit(Pid, kill).

wipe_partition(Idx, Node) ->
    rm_partition_dir(Idx, Node),
    vnode_util:kill_vnode({Idx, Node}).

rm_backend_dir(Node) ->
    rt:clean_data_dir([Node], backend_dir()).

rm_partition_dir(Idx, Node) ->
    RelativePath = backend_dir() ++ "/" ++ integer_to_list(Idx),
    rt:clean_data_dir([Node], RelativePath).

backend_dir() ->
    TestMetaData = riak_test_runner:metadata(),
    KVBackend = proplists:get_value(backend, TestMetaData),
    backend_dir(KVBackend).

backend_dir(undefined) ->
    %% riak_test defaults to bitcask when the backend is unspecified
    backend_dir(bitcask);
backend_dir(bitcask) ->
    "bitcask";
backend_dir(eleveldb) ->
    "leveldb".

get_preflist(Node, Bucket, Key, NVal) ->
    DocIdx = rpc:call(Node, riak_core_util, chash_std_keyfun, [{Bucket, Key}]),
    PL = rpc:call(Node, riak_core_apl, get_primary_apl, [DocIdx, NVal, riak_kv]),
    {ok, PL}.

create_strong_bucket_type(Node, NVal) ->
    lager:info("Creating/activating 'strong' bucket type"),
    rt:create_and_activate_bucket_type(Node, <<"strong">>,
                                       [{consistent, true}, {n_val, NVal}]),
    ensemble_util:wait_until_stable(Node, NVal).