riak_test/tests/sweeper_long_test.erl
Mikael Lixenstrand 8dc5a13427 Update long test
2017-01-12 15:46:27 -05:00

348 lines
11 KiB
Erlang

-module(sweeper_long_test).
%% Copyright (c) 2007-2015 Basho Technologies, Inc. All Rights Reserved.
%% @doc The large scale test is to test:
%%
%% Properties
%%
%% large scale
%% real time balance working
%% fullsync not halted by nodes up/down/add/remove
%% realtime not halted by nodes up/down/add/remove
%% Sweeper AAE tree rebuild and Sweeper reaper
-behavior(riak_test).
-include_lib("eunit/include/eunit.hrl").
-compile(export_all).
-export([confirm/0]).
-record(state, {a_up = [], a_down = [], a_left = [], b_up= [], b_down= [], b_left =[]}).
-define(Conf,
[
{riak_repl,
[
{fullsync_strategy, aae},
{fullsync_on_connect, false},
{fullsync_interval, disabled},
{max_fssource_soft_retries, 10},
{max_fssource_retries, 5}
]},
{riak_kv, [{delete_mode, keep},
{tombstone_grace_period, 3600}, %% 1h in s
{reap_sweep_interval, 60}, %% 60 min
{anti_entropy_expire, 1000 * 60}, %% 60 min sweep intercal
{obj_ttl_sweep_interval, 60}, %% 60 min
{storage_backend, riak_kv_eleveldb_backend},
%%{sweep_window, {22, 7}},
{sweep_tick, 10000},
{anti_entropy, {on, []}}
]}
]).
-define(DEFAULT_BENCH_DURATION, 120).
-define(SizeA, 3).
-define(SizeB, 3).
-define(HARNESS, (rt_config:get(rt_harness))).
confirm() ->
BenchDuration =
rt_config:get(basho_bench_duration, ?DEFAULT_BENCH_DURATION),
lager:info("Setup test that will run for ~p min", [BenchDuration]),
{ANodes, BNodes} =
deploy_clusters_with_rt([{?SizeA, ?Conf}, {?SizeB,?Conf}], '<->'),
State0 = #state{ a_up = ANodes, b_up = BNodes},
lager:info("Start bench"),
start_basho_bench(ANodes, "putttl", [{put, 1}, {put_ttl, 1}, {delete, 1}]),
put(test_start, now()),
timer:sleep(timer:minutes(1)),
timer:sleep(timer:minutes(1)),
run_full_sync(State0),
timer:sleep(timer:minutes(1)),
run_full_sync(State0),
timer:sleep(timer:minutes(1)),
run_full_sync(State0),
timer:sleep(timer:minutes(1)),
_State5 = up_and_down_nodes(State0),
timer:sleep(trunc(BenchDuration* 60 * 1000 * 1.2)),
get_status(hd(ANodes)),
pass.
leave_join(State) ->
State2 = node_a_leave(State),
rt:wait_until_no_pending_changes(all_active_nodes(State2)),
timer:sleep(timer:minutes(1)),
State3 = node_a_join(State2),
rt:wait_until_no_pending_changes(all_active_nodes(State3)),
State3.
up_and_down_nodes(State) ->
State2 = node_a_down(State),
rt:wait_until_no_pending_changes(all_active_nodes(State2)),
State3 = node_b_down(State2),
rt:wait_until_no_pending_changes(all_active_nodes(State3)),
State4 = node_a_up(State3),
rt:wait_until_no_pending_changes(all_active_nodes(State4)),
State5 = node_b_up(State4),
rt:wait_until_no_pending_changes(all_active_nodes(State5)),
lager:info("No pending changes"),
State5.
get_status(Node) ->
rpc:call(Node, riak_kv_console, sweep_status, [[]]).
run_full_sync(State) ->
time_stamp_action(run_full_sync, "A->B"),
LeaderA = prepare_cluster(State#state.a_up, State#state.b_up),
{FullsyncTime, _} = timer:tc(repl_util,
start_and_wait_until_fullsync_complete,
[LeaderA]),
time_stamp_action(full_done, FullsyncTime div 1000000).
start_basho_bench(Nodes, Name, Operations) ->
PbIps = lists:map(fun(Node) ->
{ok, [{PB_IP, PB_Port}]} = rt:get_pb_conn_info(Node),
{PB_IP, PB_Port}
end, Nodes),
LoadConfig = bacho_bench_config(PbIps, Operations),
spawn_link(fun() -> rt_bench:bench(LoadConfig, Nodes, Name, 1, false) end).
bacho_bench_config(HostList, Operations) ->
BenchRate =
rt_config:get(basho_bench_rate, 10),
BenchDuration =
rt_config:get(basho_bench_duration, ?DEFAULT_BENCH_DURATION),
KeyGen =
rt_config:get(basho_bench_keygen, {int_to_bin_bigendian, {pareto_int, 1000000000}}),
ValGen =
rt_config:get(basho_bench_valgen, {exponential_bin, 100, 500}),
%% {get, 1},{put, 1},{delete, 2}
Operations =
rt_config:get(basho_bench_operations, Operations),
Bucket =
rt_config:get(basho_bench_bucket, <<"mybucket">>),
Driver =
rt_config:get(basho_bench_driver, riakc_pb),
ObjTTL =
rt_config:get(obj_ttl, 5),
rt_bench:config(BenchRate,
BenchDuration,
HostList,
KeyGen,
ValGen,
Operations,
Bucket,
Driver,
[{obj_ttl, ObjTTL}]).
random_action(State) ->
[_|ValidAUp] = State#state.a_up,
[_|ValidBUp] = State#state.b_up,
NodeActionList =
lists:flatten(
[add_actions(ValidAUp, fun node_a_down/2),
add_actions(ValidBUp, fun node_b_down/2),
add_actions(State#state.a_down, fun node_a_up/2),
add_actions(State#state.b_down, fun node_b_up/2)]),
{Node, Action} = lists:nth(random:uniform(length(NodeActionList)), NodeActionList),
Action(State, Node).
add_actions(Nodes, Action) ->
[{Node, Action} || Node <- Nodes].
%%%%%%%% Start / Stop
node_a_down(State) ->
node_a_down(State, lists:last(State#state.a_up)).
node_b_down(State) ->
node_b_down(State, lists:last(State#state.b_up)).
node_a_up(State) ->
node_a_up(State, lists:last(State#state.a_down)).
node_b_up(State) ->
node_b_up(State, lists:last(State#state.b_down)).
node_a_down(State, Node) ->
stop(Node),
new_state(State, node_a_down, [Node]).
node_b_down(State, Node) ->
stop(Node),
new_state(State, node_b_down, [Node]).
node_a_up(State, Node) ->
start(Node),
new_state(State, node_a_up, [Node]).
node_b_up(State, Node) ->
start(Node),
new_state(State, node_b_up, [Node]).
stop(Node) ->
rt:stop(Node),
rt:wait_until_unpingable(Node),
time_stamp_action(stop, Node),
timer:sleep(5000),
true.
start(Node) ->
rt:start(Node),
rt:wait_until_ready(Node),
timer:sleep(5000),
time_stamp_action(start, Node),
true.
%%%%%%%% Leave / Join
node_a_leave(State) ->
node_a_leave(State, lists:last(State#state.a_up)).
node_b_leave(State) ->
node_b_leave(State, lists:last(State#state.b_up)).
node_a_join(State) ->
node_a_join(State, lists:last(State#state.a_left)).
node_b_join(State) ->
node_b_join(State, lists:last(State#state.b_left)).
node_a_leave(State, Node) ->
leave(Node),
rt:wait_until_unpingable(Node),
new_state(State, node_a_leave, [Node]).
node_b_leave(State, Node) ->
leave(Node),
rt:wait_until_unpingable(Node),
new_state(State, node_b_leave, [Node]).
node_a_join(State, Node) ->
join(Node, hd(State#state.a_up)),
new_state(State, node_a_join, [Node]).
node_b_join(State, Node) ->
join(Node, hd(State#state.b_up)),
new_state(State, node_b_join, [Node]).
leave(Node) ->
time_stamp_action(leave, Node),
rt:leave(Node).
join(Node, Node1) ->
start(Node),
rt:wait_until_pingable(Node),
rt:staged_join(Node, Node1),
rt:plan_and_commit(Node1),
time_stamp_action(join, Node),
rt:try_nodes_ready([Node], 3, 500).
%%%%%%%% Update state after action
new_state(S, node_a_down, Node) ->
S#state{a_up = S#state.a_up -- Node,
a_down = S#state.a_down ++ Node};
new_state(S, node_b_down, Node) ->
S#state{b_up = S#state.b_up -- Node,
b_down = S#state.b_down ++ Node};
new_state(S, node_a_up, Node) ->
S#state{a_down = S#state.a_down -- Node,
a_up = S#state.a_up ++ Node};
new_state(S, node_b_up, Node) ->
S#state{b_down = S#state.b_down -- Node,
b_up = S#state.b_up ++ Node};
new_state(S, node_a_leave, Node) ->
S#state{a_up = S#state.a_up -- Node,
a_left = S#state.a_left ++ Node};
new_state(S, node_b_leave, Node) ->
S#state{b_up = S#state.b_up -- Node,
b_left = S#state.b_left ++ Node};
new_state(S, node_a_join, Node) ->
S#state{a_left = S#state.a_left -- Node,
a_up = S#state.a_up ++ Node};
new_state(S, node_b_join, Node) ->
S#state{b_left = S#state.b_left -- Node,
b_up = S#state.b_up ++ Node}.
all_active_nodes(State) ->
State#state.a_up ++ State#state.b_up.
prepare_cluster([AFirst|_] = ANodes, [BFirst|_]) ->
lager:info("Prepare cluster for fullsync"),
LeaderA = rpc:call(AFirst,
riak_core_cluster_mgr, get_leader, []),
{ok, {IP, Port}} = rpc:call(BFirst,
application, get_env, [riak_core, cluster_mgr]),
repl_util:connect_cluster(LeaderA, IP, Port),
?assertEqual(ok, repl_util:wait_for_connection(LeaderA, "B")),
repl_util:enable_fullsync(LeaderA, "B"),
rt:wait_until_ring_converged(ANodes), %% Only works when all nodes in ANodes are up.
?assertEqual(ok, repl_util:wait_for_connection(LeaderA, "B")),
lager:info("Prepare cluster for fullsync done"),
LeaderA.
time_stamp_action(Action, MetaData) ->
lager:info("repl_test ~p ~p ~p", [time_since_test_start(), Action, MetaData]).
time_since_test_start() ->
timer:now_diff(now(), get(test_start)) div 1000000.
random_up_down(State, N) ->
_ = random:seed(now()),
lists:foldl(fun(_N, StateIn) ->
NewState = random_action(StateIn),
run_full_sync(NewState)
end, State, lists:seq(1,N)).
deploy_clusters_with_rt(ClusterSetup, Direction) ->
[ANodes, BNodes] = rt:build_clusters(ClusterSetup),
setup_cluster_rt([ANodes, BNodes], Direction).
setup_cluster_rt([ANodes, BNodes], Direction) ->
?assertEqual(ok, repl_util:wait_until_leader_converge(ANodes)),
AFirst = hd(ANodes),
?assertEqual(ok, repl_util:wait_until_leader_converge(BNodes)),
BFirst = hd(BNodes),
repl_util:name_cluster(AFirst, "A"),
repl_util:name_cluster(BFirst, "B"),
?assertEqual(ok, rt:wait_until_ring_converged(ANodes)),
?assertEqual(ok, rt:wait_until_ring_converged(BNodes)),
case Direction of
'<->' ->
setup_rt(ANodes, '->', BNodes), setup_rt(ANodes, '<-', BNodes);
_ ->
setup_rt(ANodes, Direction, BNodes)
end,
{ANodes, BNodes}.
setup_rt(ANodes, '->', BNodes) ->
AFirst = hd(ANodes),
BFirst = hd(BNodes),
%% A -> B
connect_clusters(AFirst, BFirst),
repl_util:enable_realtime(AFirst, "B"),
?assertEqual(ok, rt:wait_until_ring_converged(ANodes)),
repl_util:start_realtime(AFirst, "B"),
?assertEqual(ok, rt:wait_until_ring_converged(ANodes));
setup_rt(ANodes, '<-', BNodes) ->
AFirst = hd(ANodes),
BFirst = hd(BNodes),
%% B -> A
connect_clusters(BFirst, AFirst),
repl_util:enable_realtime(BFirst, "A"),
?assertEqual(ok, rt:wait_until_ring_converged(BNodes)),
repl_util:start_realtime(BFirst, "A"),
?assertEqual(ok, rt:wait_until_ring_converged(BNodes)).
%% @doc Connect two clusters for replication using their respective
%% leader nodes.
connect_clusters(LeaderA, LeaderB) ->
{ok, {IP, Port}} = rpc:call(LeaderB, application, get_env,
[riak_core, cluster_mgr]),
repl_util:connect_cluster(LeaderA, IP, Port).