Nick Marino 8e2b99d6e8 Use rt:waitfor on reads in ensemble-byzantine test
There were some race conditions we found in this test where reads
to an ensemble may fail for a brief period of time after recovery. By
using the waitfor function, we ensure that the tests will still pass as
long as things eventually recover and start working properly again.
2015-03-23 13:12:46 -04:00

305 lines
10 KiB

%% Harness module looked up from the rt_harness setting; used below to
%% resolve a node's on-disk path.
-define(HARNESS, (rt_config:get(rt_harness))).
%% Number of nodes in the cluster built by confirm/0.
-define(NUM_NODES, 8).
%% n_val used for the 'strong' bucket type and the preference list size.
-define(NVAL, 5).
%% @doc Test entry point. Builds a ?NUM_NODES node cluster, creates the
%% 'strong' bucket type with n_val ?NVAL, writes one key and then runs each
%% failure scenario against that key's preference list.
%% Returns the atom `pass'; any failed match or assertion crashes the test.
confirm() ->
    NumNodes = ?NUM_NODES,
    NVal = ?NVAL,
    Config = config(),
    lager:info("Building cluster and waiting for ensemble to stablize"),
    Nodes = ensemble_util:build_cluster(NumNodes, Config, NVal),
    Node = hd(Nodes),
    ensemble_util:wait_until_stable(Node, NVal),

    create_strong_bucket_type(Node, NVal),

    Bucket = {<<"strong">>, <<"test">>},
    Key = <<"test-key">>,
    Val = <<"test-val">>,

    {ok, PL} = get_preflist(Node, Bucket, Key, NVal),
    ?assertEqual(NVal, length(PL)),
    lager:info("PREFERENCE LIST: ~n ~p", [PL]),

    PBC = rt:pbc(Node),

    normal_write_and_read(PBC, Bucket, Key, Val),
    test_lose_one_node_one_partition(PBC, Bucket, Key, Val, PL),
    test_lose_all_but_one_partition(PBC, Bucket, Key, Val, PL),
    test_lose_minority_synctrees(PBC, Bucket, Key, Val, PL),
    test_lose_majority_synctrees(PBC, Bucket, Key, Val, PL),
    test_lose_minority_synctrees_one_node_partitioned(PBC, Bucket, Key, Val,
                                                      PL, Nodes),
    test_lose_all_data_and_trees_except_one_node(PBC, Bucket, Key, Val, PL),
    {ok, _NewVal} = test_backup_restore_data_not_trees(Bucket, Key, Val, PL),
    test_lose_all_data(PBC, Bucket, Key, PL),

    %% The body previously ended on a trailing comma; riak_test expects
    %% confirm/0 to return `pass' when the test succeeds.
    pass.
%% @doc Node configuration handed to ensemble_util:build_cluster/3.
%% Returns a proplist containing a single riak_core section.
config() ->
    DefaultBucketProps = {default_bucket_props, [{n_val, 5}]},
    CoreSettings = [DefaultBucketProps,
                    {vnode_management_timer, 1000},
                    {ring_creation_size, 16},
                    {enable_consensus, true},
                    {target_n_val, 8}],
    [{riak_core, CoreSettings}].
%% @doc Wipes the synctrees of the vnodes chosen by minority_vnodes/1 and
%% checks that Key still reads back as Val afterwards.
test_lose_minority_synctrees(PBC, Bucket, Key, Val, PL) ->
    assert_lose_synctrees_and_recover(PBC, Bucket, Key, Val, PL,
                                      minority_vnodes(PL)).
%% @doc Wipes the synctrees of the vnodes chosen by majority_vnodes/1 and
%% checks that Key still reads back as Val afterwards.
test_lose_majority_synctrees(PBC, Bucket, Key, Val, PL) ->
    assert_lose_synctrees_and_recover(PBC, Bucket, Key, Val, PL,
                                      majority_vnodes(PL)).
%% @doc Partitions off the last node of the preference list, wipes the
%% synctrees of a minority of vnodes and kills the peers on the remaining
%% nodes. A read is expected to time out while the partition is in place;
%% after healing, the ensemble must regain quorum and return Val.
%%
%% Nodes is the full cluster node list, needed to build the partition.
test_lose_minority_synctrees_one_node_partitioned(PBC, Bucket, Key, Val, PL,
                                                  Nodes) ->
    Minority = minority_vnodes(PL),
    {{Idx0, Node0}, primary} = hd(PL),
    Ensemble = {kv, Idx0, 5},
    Rest = [Node || {{_, Node}, _} <- lists:sublist(PL, ?NVAL-1)],

    %% Partition off the last node
    {{_, PartitionedNode},_} = lists:nth(?NVAL, PL),
    PartInfo = rt:partition([PartitionedNode], Nodes -- [PartitionedNode]),

    %% Wipe a minority of nodes
    wipe_trees(Ensemble, Minority),
    kill_peers(Ensemble, Rest),

    %% With a majority of nodes down (minority reboot, 1 partitioned) we
    %% shouldn't be able to reach quorum. This is because we now have a majority
    %% of untrusted synctrees, and all nodes are not online.
    {error, <<"timeout">>} = riakc_pb_socket:get(PBC, Bucket, Key, []),

    %% Heal the partition so that we can get quorum. Without this call the
    %% partition stays in place and the quorum wait below cannot succeed.
    rt:heal(PartInfo),
    ensemble_util:wait_until_quorum(Node0, Ensemble),
    assert_valid_read(PBC, Bucket, Key, Val).
%% @doc Wipes the partition data of every vnode in the preference list except
%% the first, waits for the ensemble to regain quorum and checks that Key
%% still reads back as Val.
test_lose_all_but_one_partition(PBC, Bucket, Key, Val, PL) ->
    Wiped = tl(PL),
    {{Idx0, Node0}, primary} = hd(PL),
    Ensemble = {kv, Idx0, 5},
    lager:info("Wiping Data on Following Vnodes: ~p", [Wiped]),
    %% The wipe was announced in the log but never performed, so the
    %% scenario exercised nothing.
    wipe_partitions(Wiped),
    ensemble_util:wait_until_quorum(Node0, Ensemble),
    assert_valid_read(PBC, Bucket, Key, Val).
%% @doc Wipes the partition data held by the current ensemble leader, waits
%% for quorum on the leader's node and checks that Key still reads back as
%% Val.
test_lose_one_node_one_partition(PBC, Bucket, Key, Val, PL) ->
    FirstEntry = hd(PL),
    {{FirstIdx, FirstNode}, primary} = FirstEntry,
    Ensemble = {kv, FirstIdx, 5},
    %% The leader is looked up through the first preference-list node.
    LeaderNode = node(ensemble_util:get_leader_pid(FirstNode, Ensemble)),
    LeaderIdx = get_leader_idx(PL, LeaderNode),
    lager:info("Wiping Idx ~p data on LeaderNode ~p", [LeaderIdx, LeaderNode]),
    wipe_partition(LeaderIdx, LeaderNode),
    ensemble_util:wait_until_quorum(LeaderNode, Ensemble),
    assert_valid_read(PBC, Bucket, Key, Val).
%% @doc Wipes both the partition data and the synctrees of every vnode in the
%% preference list except the first, waits for quorum and checks that Key
%% still reads back as Val.
test_lose_all_data_and_trees_except_one_node(PBC, Bucket, Key, Val, PL) ->
    Wiped = tl(PL),
    {{Idx0, Node0}, primary} = hd(PL),
    Ensemble = {kv, Idx0, 5},
    %% Only the trees were being wiped; the scenario also requires the
    %% partition data to be lost.
    wipe_partitions(Wiped),
    wipe_trees(Ensemble, Wiped),
    ensemble_util:wait_until_quorum(Node0, Ensemble),
    assert_valid_read(PBC, Bucket, Key, Val).
%% @doc Backs up the backend data, overwrites Key with a new value, then
%% restores the older backup and expects reads to fail (the comment below
%% explains why). Restoring the newer backup must make reads succeed again.
%%
%% Returns `{ok, NewVal}' where NewVal is the value written during the test.
%% The _Val argument is unused.
%%
%% Nodes are stopped around every backup/restore step so that the backend
%% directory is not copied or replaced underneath a running node; a fresh
%% client connection is opened after each restart.
test_backup_restore_data_not_trees(Bucket, Key, _Val, PL) ->
    {{Idx, Node}, primary} = hd(PL),
    Ensemble = {kv, Idx, 5},
    stop_nodes(PL),
    backup_data(1, PL),
    start_nodes(PL),
    PBC = rt:pbc(Node),
    ensemble_util:wait_until_quorum(Node, Ensemble),

    Obj0 = rt:pbc_read(PBC, Bucket, Key),
    NewVal = <<"test-val2">>,
    Obj = riakc_obj:update_value(Obj0, NewVal),
    riakc_pb_socket:put(PBC, Obj),
    assert_valid_read(PBC, Bucket, Key, NewVal),

    stop_nodes(PL),
    %% Backup the new data.
    backup_data(2, PL),
    %% Restore old data
    restore_data(1, PL),
    start_nodes(PL),
    PBC1 = rt:pbc(Node),
    ensemble_util:wait_until_quorum(Node, Ensemble),
    %% Fail to read the restored data. Trees match newer data than what was
    %% restored
    assert_failed_read(PBC1, Bucket, Key),

    stop_nodes(PL),
    %% Restore New Data that matches trees
    restore_data(2, PL),
    start_nodes(PL),
    PBC2 = rt:pbc(Node),
    ensemble_util:wait_until_quorum(Node, Ensemble),
    assert_valid_read(PBC2, Bucket, Key, NewVal),
    {ok, NewVal}.
%% @doc Removes the backend directory on every node of the preference list
%% and expects the subsequent read of Key to return an error, which is
%% logged.
test_lose_all_data(PBC, Bucket, Key, PL) ->
    %% PL was previously unused and no data was removed, so the read below
    %% was not exercising data loss at all.
    stop_nodes(PL),
    lists:foreach(fun rm_backend_dir/1, [Node || {{_, Node}, _} <- PL]),
    start_nodes(PL),
    {error, _}=E = riakc_pb_socket:get(PBC, Bucket, Key, []),
    lager:info("All data loss error = ~p", [E]).
%% @doc Asserts that Key in Bucket eventually reads back as Val.
%% The read is retried through rt:wait_until/1, which must return `ok'.
assert_valid_read(PBC, Bucket, Key, Val) ->
    ReadFun = fun() ->
                      Obj = rt:pbc_read(PBC, Bucket, Key),
                      Val =:= riakc_obj:get_value(Obj)
              end,
    ?assertEqual(ok, rt:wait_until(ReadFun)).
%% @doc Asserts that a single read of Key in Bucket returns `{error, _}'.
%% Unlike assert_valid_read/4 this performs one attempt and does not retry.
assert_failed_read(PBC, Bucket, Key) ->
?assertMatch({error, _}, riakc_pb_socket:get(PBC, Bucket, Key, [])).
%% @doc Baseline check run before any failure is injected: writes Val under
%% Key in Bucket and verifies it can be read back.
normal_write_and_read(PBC, Bucket, Key, Val) ->
lager:info("Writing a consistent key"),
%% The write must return ok; anything else crashes the test here.
ok = rt:pbc_write(PBC, Bucket, Key, Val),
lager:info("Read key to verify it exists"),
assert_valid_read(PBC, Bucket, Key, Val).
%% @doc Calls rt:stop_and_wait/1 on the node of every preference-list entry,
%% in list order. Returns the list of results.
stop_nodes(PL) ->
    Nodes = [Node || {{_, Node}, _} <- PL],
    lists:map(fun rt:stop_and_wait/1, Nodes).
%% @doc Calls rt:start_and_wait/1 on the node of every preference-list entry,
%% in list order. Returns the list of results.
start_nodes(PL) ->
    Nodes = [Node || {{_, Node}, _} <- PL],
    lists:map(fun rt:start_and_wait/1, Nodes).
%% @doc Path of the backend data directory for Node: the harness-provided
%% node path followed by "/data/" and the backend directory name.
data_path(Node) ->
    Harness = ?HARNESS,
    NodeRoot = Harness:node_path(Node),
    NodeRoot ++ "/data/" ++ backend_dir().
%% @doc Path of backup number N for Node: the data path with the integer N
%% and ".bak" appended directly (no separator).
backup_path(Node, N) ->
    Base = data_path(Node),
    Suffix = integer_to_list(N) ++ ".bak",
    Base ++ Suffix.
%% @doc Creates backup number N on the node of every preference-list entry.
%% Returns the list of backup_node/2 results.
backup_data(N, PL) ->
    Nodes = [Node || {{_, Node}, _} <- PL],
    lists:map(fun(Node) -> backup_node(Node, N) end, Nodes).
%% @doc Copies Node's backend data directory to its backup path for N using
%% `cp -R' and logs whatever the command printed.
backup_node(Node, N) ->
    Source = data_path(Node),
    Target = backup_path(Node, N),
    CopyCmd = lists:append(["cp -R ", Source, " ", Target]),
    Output = os:cmd(CopyCmd),
    lager:info("~p", [Output]).
%% @doc Restores backup number N on the node of every preference-list entry.
%% Returns the list of restore_node/2 results.
restore_data(N, PL) ->
    Nodes = [Node || {{_, Node}, _} <- PL],
    lists:map(fun(Node) -> restore_node(Node, N) end, Nodes).
%% @doc Replaces Node's backend data directory with backup number N.
%% Asserts that the `mv' command prints nothing, i.e. reported no error.
restore_node(Node, N) ->
    Path = data_path(Node),
    BackupPath = backup_path(Node, N),
    %% Remove the current backend directory first. If it still exists, mv
    %% moves the backup *inside* it instead of replacing it, and the node
    %% keeps running on the data that was supposed to be rolled back.
    %% NOTE(review): assumes rt:clean_data_dir/2 resolves to the same
    %% directory as data_path/1 - confirm against the harness.
    rm_backend_dir(Node),
    Cmd = "mv "++BackupPath++" "++Path,
    ?assertEqual([], os:cmd(Cmd)).
%% @doc Wipes the synctree of every vnode in ToLose, waits until the ensemble
%% of the first preference-list entry has quorum again and checks that Key
%% reads back as Val.
assert_lose_synctrees_and_recover(PBC, Bucket, Key, Val, PL, ToLose) ->
    FirstEntry = hd(PL),
    {{FirstIdx, FirstNode}, primary} = FirstEntry,
    Ensemble = {kv, FirstIdx, 5},
    wipe_trees(Ensemble, ToLose),
    ensemble_util:wait_until_quorum(FirstNode, Ensemble),
    assert_valid_read(PBC, Bucket, Key, Val).
%% @doc Returns the first `?NVAL div 2 + 1' entries of the preference list.
%% Crashes (via lists:split/2) if PL is shorter than that.
majority_vnodes(PL) ->
    Num = ?NVAL div 2 + 1,
    {Majority, _} = lists:split(Num, PL),
    %% The body previously ended on a trailing comma and returned nothing.
    Majority.
%% @doc Returns the first `?NVAL div 2' entries of the preference list.
%% Crashes (via lists:split/2) if PL is shorter than that.
minority_vnodes(PL) ->
    Num = ?NVAL div 2,
    {Minority, _} = lists:split(Num, PL),
    %% The body previously ended on a trailing comma and returned nothing.
    Minority.
%% @doc Returns the partition index of the preference-list entry hosted on
%% LeaderNode. Crashes with badmatch unless exactly one entry is on that
%% node.
get_leader_idx(PL, LeaderNode) ->
    [LeaderIdx] = [Idx || {{Idx, N}, _} <- PL, N =:= LeaderNode],
    %% The body previously ended on a trailing comma and returned nothing.
    LeaderIdx.
%% @doc Kills the peer processes of Ensemble that live on any of Nodes.
%% The first element of Nodes is used as the RPC target for looking up the
%% ensemble's views and peer pids. Peers whose pid lookup returns
%% `undefined' are skipped.
kill_peers(Ensemble, Nodes) ->
    RpcNode = hd(Nodes),
    {_, [FirstView | _]} =
        rpc:call(RpcNode, riak_ensemble_manager, get_views, [Ensemble]),
    OnTargetNode = fun({_Id, PeerNode}) -> lists:member(PeerNode, Nodes);
                      (_Other) -> false
                   end,
    Peers = lists:filter(OnTargetNode, FirstView),
    lager:info("Killing Peers: ~p", [Peers]),
    LookupPid = fun(Peer) ->
                        rpc:call(RpcNode, riak_ensemble_manager, get_peer_pid,
                                 [Ensemble, Peer])
                end,
    %% Resolve every pid before killing any of them.
    Pids = lists:map(LookupPid, Peers),
    Live = [Pid || Pid <- Pids, Pid =/= undefined],
    lists:map(fun(Pid) -> exit(Pid, kill) end, Live).
%% @doc Wipes the partition of every `{Idx, Node}' in the preference list.
%% Returns the list of wipe_partition/2 results.
wipe_partitions(PL) ->
    VNodes = [VNode || {{_, _} = VNode, _} <- PL],
    lists:map(fun({Idx, Node}) -> wipe_partition(Idx, Node) end, VNodes).
%% @doc Wipes the synctree of Ensemble on every `{Idx, Node}' in the
%% preference list. Returns the list of wipe_tree/3 results.
wipe_trees(Ensemble, PL) ->
    VNodes = [VNode || {{_, _} = VNode, _} <- PL],
    lists:map(fun({Idx, Node}) -> wipe_tree(Ensemble, Idx, Node) end, VNodes).
%% @doc Removes the on-disk tree directory for partition Idx on Node and
%% then kills Ensemble's peer process on that node.
%% Crashes with badmatch unless exactly one peer of the first view is on
%% Node.
wipe_tree(Ensemble, Idx, Node) ->
    TreeDir = "ensembles/trees/kv_" ++ integer_to_list(Idx),
    rt:clean_data_dir([Node], TreeDir),
    {_, [FirstView | _]} =
        rpc:call(Node, riak_ensemble_manager, get_views, [Ensemble]),
    [Peer] = [P || P = {_Id, PeerNode} <- FirstView, Node =:= PeerNode],
    PeerPid = rpc:call(Node, riak_ensemble_manager, get_peer_pid,
                       [Ensemble, Peer]),
    lager:info("Peer= ~p, Pid = ~p", [Peer, PeerPid]),
    exit(PeerPid, kill).
%% @doc Deletes the backend directory of partition Idx on Node, then kills
%% the corresponding vnode through vnode_util:kill_vnode/1.
wipe_partition(Idx, Node) ->
    VNode = {Idx, Node},
    rm_partition_dir(Idx, Node),
    vnode_util:kill_vnode(VNode).
%% @doc Cleans the whole backend data directory on Node.
rm_backend_dir(Node) ->
    Targets = [Node],
    BackendDir = backend_dir(),
    rt:clean_data_dir(Targets, BackendDir).
%% @doc Cleans the directory of partition Idx inside the backend data
%% directory on Node.
rm_partition_dir(Idx, Node) ->
    PartitionDir = lists:append([backend_dir(), "/", integer_to_list(Idx)]),
    rt:clean_data_dir([Node], PartitionDir).
%% @doc Name of the backend data directory for the backend this test run is
%% configured with, read from the `backend' entry of the test metadata.
backend_dir() ->
    TestMetaData = riak_test_runner:metadata(),
    KVBackend = proplists:get_value(backend, TestMetaData),
    %% The lookup result was previously discarded: the function ended on a
    %% trailing comma without dispatching to backend_dir/1.
    backend_dir(KVBackend).

%% @doc Maps a backend name to its data directory name. Only `undefined',
%% `bitcask' and `eleveldb' are handled; anything else crashes with
%% function_clause.
%% NOTE(review): the directory names follow Riak's default data layout
%% ("bitcask" and "leveldb") - confirm against the harness configuration.
backend_dir(undefined) ->
    %% riak_test defaults to bitcask when undefined
    backend_dir(bitcask);
backend_dir(bitcask) ->
    "bitcask";
backend_dir(eleveldb) ->
    "leveldb".
%% @doc Asks Node for the primary preference list of `{Bucket, Key}' with
%% NVal entries for the riak_kv service. Returns `{ok, PL}' with whatever the
%% second RPC returned; RPC failures are not checked here.
get_preflist(Node, Bucket, Key, NVal) ->
    BKey = {Bucket, Key},
    DocIdx = rpc:call(Node, riak_core_util, chash_std_keyfun, [BKey]),
    AplArgs = [DocIdx, NVal, riak_kv],
    {ok, rpc:call(Node, riak_core_apl, get_primary_apl, AplArgs)}.
%% @doc Creates and activates the `strong' bucket type on Node with
%% `consistent' set to true and the given n_val, then waits for the
%% ensembles to become stable again.
create_strong_bucket_type(Node, NVal) ->
    lager:info("Creating/activating 'strong' bucket type"),
    TypeProps = [{consistent, true}, {n_val, NVal}],
    rt:create_and_activate_bucket_type(Node, <<"strong">>, TypeProps),
    ensemble_util:wait_until_stable(Node, NVal).