%%
%% QuickCheck test to verify that the vnode proxy overloads when the
%% mailbox estimate is in overload and that it recovers from overload
%% state once the vnode has processed the message queue. In the real
%% system, messages are sent to the vnode outside of the proxy, so this
%% is also modelled.
%%
%% Properties -
%% 1) If the vnode message queue has cleared, the vnode proxy should not
%%    be in overload.
%% 2) If the proxy has been sent more messages than the overload threshold
%%    while the vnode is suspended, it should be in overload.
%% 3) If the proxy has been sent no more messages than the overload
%%    threshold, it should not be in overload.
%%
%% Whenever the vnode is resumed, the test checks that the message queues
%% are cleared and the model is reset. The main goal is to advance the
%% proxy into interesting new states.
%%
%% This test can be run outside of riak_test while working on it.
%% Symlink the source into a release build and run
%%   c(proxy_overload_recovery).
%%   proxy_overload_recovery:run(300). % Run for 5 mins
%%
%% On failure you can re-run counterexamples *and* print out the internal
%% state with the run.
%%   proxy_overload_recovery:check().
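%% To re-run the stored counterexample after changing the code:
%%   proxy_overload_recovery:recheck().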
%%
%% TODO/Questions:
%% 1) Is there a better way to do the initialization step?
%% 2) Remove the riak_kv dependency so it's a pure riak_core test.

-module(proxy_overload_recovery).
-behaviour(riak_test).

-ifdef(EQC).
-include_lib("eqc/include/eqc.hrl").
-include_lib("eqc/include/eqc_statem.hrl").
-include_lib("eunit/include/eunit.hrl").

-export([confirm/0]).
-compile(export_all).

confirm() ->
    %% disable AAE to prevent vnode start delay opening AAE leveldb
    Opts = [{riak_kv, [{anti_entropy, {off, []}}]}],
    Nodes = [Node1] = rt:deploy_nodes(1, Opts),

    case setup_eqc(Nodes) of
        ok ->
            %% Distribute this code to the remote node and add EQC
            rt:load_modules_on_nodes([?MODULE], Nodes),

            TestingTime = rt_config:get(eqc_testing_time, 120),
            lager:info("Running vnode proxy overload property for ~p seconds\n",
                       [TestingTime]),
            ?assertEqual(true, rpc:call(Node1, ?MODULE, rtrun, [TestingTime])),
            pass;
        _ ->
            lager:warning("EQC was unavailable on this machine - PASSing "
                          "the test, but it did not run. Wish we could skip.\n", []),
            pass
    end.

-record(tstate, {
          rt,                    % Runtime information - either symbolic state or #rt{} record
          vnode_running = true,  % true if running, false if suspended
          proxy_msgs = 0,        % messages proxied while vnode suspended
          direct_msgs = 0,       % messages sent directly while vnode suspended
          threshold}).           % threshold for overload

-record(rt, {id,    % Partition id being used for the test
             pl,    % Preflist to send messages via the proxy
             ppid,  % Proxy pid
             vpid}).% Vnode pid

%% Riak test run - to be executed on the node
rtrun(Seconds) ->
    case run(Seconds) of
        true ->
            true;
        Res ->
            %% Hopefully rpc has set the group leader to riak_test so we
            %% can see the output
            io:format("=== FAILED - running check to see if it was "
                      "transient and print model states ===\n", []),
            check(),
            Res
    end.

%%
%% Runners - on failure it may be helpful to connect to the node and run
%% check / recheck to verify a fix or try reshrinking.
%%
run() ->
    run(60).

run(Seconds) ->
    eqc:quickcheck(eqc:testing_time(Seconds, ?MODULE:prop_proxy_recovery())).

trap_run(Seconds) ->
    eqc:quickcheck(eqc:testing_time(Seconds, ?TRAPEXIT(?MODULE:prop_proxy_recovery()))).

check() ->
    eqc:check(eqc_statem:show_states(?MODULE:prop_proxy_recovery())).

recheck() ->
    eqc:recheck(?MODULE:prop_proxy_recovery()).

cover(Seconds) ->
    cover:start(),
    cover:compile(?MODULE),
    eqc:quickcheck(eqc:testing_time(Seconds, ?MODULE:prop_proxy_recovery())),
    cover:analyze_to_file(?MODULE),
    cover:stop().

%%
%% QuickCheck Property - set up and run the statem
%%
prop_proxy_recovery() ->
    ?FORALL(Cmds, commands(?MODULE),
            begin
                Start = os:timestamp(),
                {H, S, Res} = run_commands(?MODULE, Cmds),
                End = os:timestamp(),
                Msecs = timer:now_diff(End, Start) div 1000,
                ?WHENFAIL(
                   begin
                       RT = S#tstate.rt,
                       case RT of
                           undefined ->
                               ok;
                           #rt{ppid = PPid, vpid = VPid} ->
                               eqc:format("\n\nPPid state = ~w\n",
                                          [catch sys:get_status(PPid)]),
                               eqc:format("Vnode MsgQLen = ~p\n",
                                          [catch msgq_len(VPid)])
                       end
                   end,
                   measure(duration, Msecs,
                           aggregate(with_title("Commands"), command_names(Cmds),
                                     pretty_commands(?MODULE, Cmds, {H, S, Res},
                                                     Res == ok))))
            end).

%%
%% Generators
%%
pos_int() ->
    ?LET(X, int(), 1 + abs(X)).

%% Generate the threshold seed - see get_params/1
threshold_seed() ->
    frequency([%% test proxy disabled - 1%
               {1, undefined},
               %% test with all parameters given
               {4, {choose(1, 25), choose(1, 25), choose(1, 50)}},
               %% test with the interval and threshold
               {5, {choose(1, 50), choose(1, 50)}},
               %% just provide a threshold, defaults for others
               {90, choose(4, 100)}]).

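%% For illustration: a three-element seed of {3, 5, 10} asks for a request
%% interval of 3, a check interval of 3 + 5 = 8 and an overload threshold
%% of 8 + 10 = 18; the deltas guarantee that
%% request interval =< check interval =< threshold (see get_params/1 below).
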
%%
%% initial_state
%%
initial_state() ->
    #tstate{}.

%%
%% Run prepare once before any other calls and then never again.
%%
precondition_common(#tstate{rt = undefined}, {call, _M, F, _A}) ->
    F == prepare;
precondition_common(_, {call, _M, F, _A}) ->
    F /= prepare.

%% %% Make sure we're still running what we think we're running - uncomment
%% %% if having process death issues
%% invariant(#tstate{rt = undefined}) ->
%%     true;
%% invariant(#tstate{rt = #rt{id = Index, ppid = PPid, vpid = VPid}}) ->
%%     RegName = riak_core_vnode_proxy:reg_name(riak_kv_vnode, Index),
%%     PPid = whereis(RegName), % Check it is the process we think it is.
%%     true = is_process_alive(PPid),
%%     true = is_process_alive(VPid),
%%     true.

%%
%% Prepare runtime state, pick a vnode and find pids - precondition_common
%% ensures this runs first.
%%
prepare_args(_S) ->
    [threshold_seed()].

prepare(ThresholdSeed) ->
    {RequestInterval, Interval, Threshold} = get_params(ThresholdSeed),
    Index = 0, % Mess with the same vnode proxy each time - could rotate if we wanted

    %% Set (or unset) app envs for the test
    prep_env(vnode_check_request_interval, RequestInterval),
    prep_env(vnode_check_interval, Interval),
    prep_env(vnode_overload_threshold, Threshold),
    prep_env(inactivity_timeout, infinity),

    %% Reset the vnode and the proxy to get back into a known state.
    %% Ask the supervisor to do it for us to avoid messy messages in the logs.
    Id = 0,
    {ok, VPid0} = riak_core_vnode_manager:get_vnode_pid(Id, riak_kv_vnode),
    sys:resume(VPid0),
    ok = supervisor:terminate_child(riak_core_vnode_sup, VPid0),

    %% Restart the proxy to make sure it resets state and picks up the new
    %% environment variables
    ok = supervisor:terminate_child(riak_core_vnode_proxy_sup, {riak_kv_vnode, Id}),
    RegName = riak_core_vnode_proxy:reg_name(riak_kv_vnode, Index),
    undefined = whereis(RegName),
    VPid1 = wait_for_vnode_change(VPid0, Index),

    {ok, PPid} = supervisor:restart_child(riak_core_vnode_proxy_sup, {riak_kv_vnode, Id}),

    %% Find the proxy pid and check it's alive and matches the supervisor
    RegName = riak_core_vnode_proxy:reg_name(riak_kv_vnode, Index),
    PPid = whereis(RegName),

    %% Send a message through the vnode proxy (the timeout message is ignored)
    %% and return the Pid so we know we have the same Pid.
    {ok, VPid} = riak_core_vnode_proxy:command_return_vnode(
                   {riak_kv_vnode, Index, node()}, timeout),
    ?assertEqual(VPid, VPid1),

    true = is_process_alive(PPid),
    true = is_process_alive(VPid),
    #rt{id = Id, pl = {Id, node()}, ppid = PPid, vpid = VPid}.

prepare_next(S, RT, [ThresholdSeed]) ->
    {_RequestInterval, _Interval, Threshold} = get_params(ThresholdSeed),
    S#tstate{rt = RT, threshold = Threshold}.

%%
%% Suspend the vnode so it cannot process messages and the vnode
%% message queue builds up
%%
suspend_pre(#tstate{vnode_running = Running}) ->
    Running.

suspend_args(#tstate{rt = RT}) ->
    [RT].

suspend(#rt{vpid = VPid}) ->
    sys:suspend(VPid).

suspend_next(S, _V, _A) ->
    S#tstate{vnode_running = false}.

%%
%% Resume the vnode and wait until it has completed
%% processing all queued messages
%%
resume_pre(#tstate{vnode_running = Running}) ->
    not Running.

resume_args(#tstate{rt = RT}) ->
    [RT].

resume(#rt{ppid = PPid, vpid = VPid}) ->
    sys:resume(VPid),
    %% Use the sys:get_status call to force a synchronous call
    %% against the vnode & the proxy to ensure all messages sent by
    %% this process have been serviced and there are no pending
    %% 'ping's in the vnode before we continue.
    %% Then drain the vnode to make sure any pending pongs have
    %% been sent, and ensure the proxy has received them.
    _ = sys:get_status(PPid),
    _ = sys:get_status(VPid),
    ok = drain([VPid, PPid]).

resume_next(S, _V, _A) ->
    S#tstate{vnode_running = true, proxy_msgs = 0, direct_msgs = 0}.

%%
%% Send messages through the proxy process; do not
%% wait for responses. Use 'ignore' so riak_core_vnode:reply
%% will not send any responses.
%%
proxy_msg_args(#tstate{rt = RT}) ->
    [RT, pos_int()].

proxy_msg(#rt{pl = PL}, NumMsgs) ->
    [riak_kv_vnode:get(PL, {<<"b">>, <<"k">>}, ignore) ||
        _X <- lists:seq(1, NumMsgs)],
    ok.

proxy_msg_next(#tstate{vnode_running = false, proxy_msgs = ProxyMsgs} = S, _V,
               [_RT, NumMsgs]) ->
    S#tstate{proxy_msgs = ProxyMsgs + NumMsgs};
proxy_msg_next(S, _V, _A) ->
    S.

%%
%% Send messages directly to the core vnode process, simulating
%% the vnode manager or other processes sending additional messages
%% not accounted for by the proxy.
%%
direct_msg_args(#tstate{rt = RT}) ->
    [RT, pos_int()].

direct_msg(#rt{vpid = VPid}, NumMsgs) ->
    [catch gen_fsm:sync_send_event(VPid, {ohai, X}, 0) ||
        X <- lists:seq(1, NumMsgs)],
    ok.

direct_msg_next(#tstate{vnode_running = false, direct_msgs = DirectMsgs} = S,
                _V, [_RT, NumMsgs]) ->
    S#tstate{direct_msgs = DirectMsgs + NumMsgs};
direct_msg_next(S, _V, _A) ->
    S.

%%
%% Check the current state of the vnode so we can test properties
%%
overloaded_args(#tstate{vnode_running = Running, rt = RT}) ->
    [Running, RT].

overloaded(Running, #rt{ppid = PPid, vpid = VPid}) ->
    case Running of
        true ->
            ok = drain([PPid, VPid]);
        _ ->
            ok
    end,
    {messages, PMsgs} = process_info(PPid, messages),
    {messages, VMsgs} = process_info(VPid, messages),
    Overloaded = riak_core_vnode_proxy:overloaded(PPid),
    {Overloaded, {VMsgs, PMsgs}, sys:get_status(PPid)}.

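%% overloaded/2 returns {Overloaded :: boolean(), {VnodeMsgs, ProxyMsgs},
%% ProxyStatus}. The postconditions below only check the boolean; the
%% message lists and proxy status are captured to aid debugging failures.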
overloaded_post(#tstate{threshold = undefined}, _A,
                {R, _Messages, _ProxyStatus}) ->
    %% If there are no thresholds there should never be an overload
    eq(R, false);
overloaded_post(#tstate{vnode_running = true}, _A,
                {R, _Messages, _ProxyStatus}) ->
    %% If the vnode is running, we have cleared the queues so
    %% we should not be in overload.
    eq(R, false);
overloaded_post(#tstate{vnode_running = false,
                        proxy_msgs = ProxyMsgs,
                        threshold = Threshold}, _A,
                {ResultOverload, _Messages, _ProxyStatus}) ->
    %% Either the mailbox size is purely an estimate based on proxied
    %% messages, or it is a direct mailbox check plus an estimate. The
    %% test cannot tell which, as prior state may have left the counter
    %% at any value between 0 and Interval.
    if
        ProxyMsgs > Threshold -> % definitely overloaded
            eq(ResultOverload, true);
        true ->
            true % otherwise not possible to check - vnode resume gets us back
    end.

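%% For illustration (not part of the model): with Threshold = 10, sending
%% 11 or more messages through the proxy while the vnode is suspended must
%% report overload. Sending exactly 10 may or may not, depending on where
%% the proxy's check counter started, so the postcondition only asserts
%% the unambiguous case above.
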
%%
%% Test helpers
%%

%% Convert the test seed to request interval, check interval and threshold.
%% If unset, remove the value from the app env so the proxy init
%% auto-configures it.
get_params(undefined) ->
    {unset, unset, undefined};
get_params({Interval, ThresholdDelta}) ->
    {unset, % let the proxy allocate it
     Interval,
     Interval + ThresholdDelta};
get_params({RequestInterval, IntervalDelta, ThresholdDelta}) ->
    {RequestInterval,
     RequestInterval + IntervalDelta,
     RequestInterval + IntervalDelta + ThresholdDelta};
get_params(Threshold) when Threshold > 5000 ->
    {unset, unset, Threshold};
get_params(Threshold) when Threshold =< 5000 ->
    %% Avoid annoying message about safety
    {unset, Threshold div 2, Threshold}.

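%% For illustration, the remaining seed shapes expand as:
%%   get_params(undefined) -> {unset, unset, undefined} % proxy defaults, overload disabled
%%   get_params({7, 10})   -> {unset, 7, 17}
%%   get_params(42)        -> {unset, 21, 42}           % interval halved to avoid the safety warning
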
%% Set (or unset) params
prep_env(Var, unset) ->
    application:unset_env(riak_core, Var, infinity);
prep_env(Var, Val) ->
    application:set_env(riak_core, Var, Val).

%%
%% Wait until all messages are drained by the Pids. No guarantees
%% about future messages being sent, or that responses for the
%% last message consumed have been transmitted.
%% NOTE: The "drain 3 times in a row" was determined empirically,
%% and may not be sufficient (2 was not). Given time constraints,
%% living with it for now. If this fails, we should really add some
%% tracing code around the send of messages to the vnode and proxy to
%% determine where extra messages are coming from, rather than just
%% make this "try 4 times".
%%
drain(Pid) when is_pid(Pid) ->
    drain([Pid], {-1, -1});
drain(Pids) when is_list(Pids) ->
    drain(Pids, {-1, -1}).

drain(Pids, {PrevPrev, Prev}) ->
    _ = [sys:suspend(Pid) || Pid <- Pids],
    Len = lists:foldl(fun(Pid, Acc0) ->
                              {message_queue_len, L} =
                                  erlang:process_info(Pid, message_queue_len),
                              Acc0 + L
                      end, 0, Pids),
    _ = [sys:resume(Pid) || Pid <- Pids],
    case {PrevPrev, Prev, Len} of
        {0, 0, 0} ->
            ok;
        _ ->
            %% Attempt to ensure something else is scheduled before we try to drain again
            erlang:yield(),
            timer:sleep(1),
            drain(Pids, {Prev, Len})
    end.
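
%% For illustration: drain/1 accepts a single pid or a list, e.g.
%%   ok = drain(VPid),
%%   ok = drain([VPid, PPid]).
%% It only returns once three consecutive sweeps have observed empty
%% message queues across all of the given processes.
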
%% Return the length of the message queue (or crash if the process is dead)
msgq_len(Pid) ->
    {message_queue_len, L} = process_info(Pid, message_queue_len),
    L.

%% Kill the process and wait for it to exit
kill_and_wait(Pid) ->
    Mref = erlang:monitor(process, Pid),
    exit(Pid, kill),
    receive
        {'DOWN', Mref, _, _, _} ->
            ok
    end.

%%
%% Check if EQC is available on the local node, then add it to the remotes
%% if needed. This could be refactored out if many tests need it.
%%
setup_eqc(Nodes) ->
    try
        _ = eqc:reserved_until(), % will throw an exception if there are problems
        add_eqc_apps(Nodes)
    catch
        _:Err ->
            lager:info("EQC unavailable: ~p\n", [Err]),
            {error, unavailable}
    end.

%% Add the QuickCheck apps installed on the riak_test runner to the code
%% path on each of the Nodes
add_eqc_apps(Nodes) ->
    Apps = [eqc, pulse, pulse_otp, eqc_mcerlang],
    [case code:lib_dir(App, ebin) of
         {error, bad_name} -> % EQC component not installed locally
             ok;
         Path when is_list(Path) ->
             case rpc:call(Node, code, priv_dir, [App]) of
                 {error, bad_name} ->
                     true = rpc:call(Node, code, add_pathz, [Path], 60000);
                 _ ->
                     ok
             end
     end || App <- Apps, Node <- Nodes],
    ok.

wait_for_vnode_change(VPid0, Index) ->
    {ok, VPid1} = riak_core_vnode_manager:get_vnode_pid(Index, riak_kv_vnode),
    case VPid1 of
        VPid0 ->
            timer:sleep(1),
            wait_for_vnode_change(VPid0, Index);
        _ ->
            VPid1
    end.

-else. %% no EQC

-export([confirm/0]).

confirm() ->
    lager:info("EQC not enabled, skipping test"),
    pass.

-endif.