Update jmx_verify to actually verify supervisor behavior & retry settings. Previously, the test would fail only if the retry + delay settings for riak_test were longer than the overall test_timeout; otherwise it would pass even though it shouldn't.

This commit is contained in:
Doug Rohrer 2016-06-01 12:28:46 -04:00
parent 628c97f222
commit 5c321fbf12
2 changed files with 23 additions and 27 deletions

View File

@ -70,6 +70,7 @@
get_ip/1,
get_node_logs/0,
get_replica/5,
get_retry_settings/0,
get_ring/1,
get_version/0,
get_version/1,
@ -658,10 +659,14 @@ is_ring_ready(Node) ->
%% provided `rt_max_wait_time' and `rt_retry_delay' parameters in
%% specified `riak_test' config file.
wait_until(Fun) when is_function(Fun) ->
{Delay, Retry} = get_retry_settings(),
wait_until(Fun, Retry, Delay).
get_retry_settings() ->
MaxTime = rt_config:get(rt_max_wait_time),
Delay = rt_config:get(rt_retry_delay),
Retry = MaxTime div Delay,
wait_until(Fun, Retry, Delay).
{Delay, Retry}.
%% @doc Convenience wrapper for wait_until for the myriad functions that
%% take a node as single argument.
@ -1953,8 +1958,11 @@ setup_log_capture(Nodes) when is_list(Nodes) ->
setup_log_capture(Node) when not is_list(Node) ->
setup_log_capture([Node]).
expect_in_log(Node, Pattern) ->
{Delay, Retry} = get_retry_settings(),
expect_in_log(Node, Pattern, Retry, Delay).
expect_in_log(Node, Pattern, Retry, Delay) ->
CheckLogFun = fun() ->
Logs = rpc:call(Node, riak_test_lager_backend, get_logs, []),
lager:info("looking for pattern ~s in logs for ~p",
@ -1968,7 +1976,7 @@ expect_in_log(Node, Pattern) ->
false
end
end,
case rt:wait_until(CheckLogFun) of
case rt:wait_until(CheckLogFun, Retry, Delay) of
ok ->
true;
_ ->

View File

@ -118,7 +118,7 @@ confirm() ->
pass.
test_supervision() ->
JMXPort = 41111,
JMXPort = 22,
Config = [{riak_jmx, [{enabled, true}, {port, JMXPort}]}],
[Node|[]] = rt:deploy_nodes(1, Config),
timer:sleep(20000),
@ -144,30 +144,18 @@ test_supervision() ->
rpc:call(Node, riak_jmx, start, []),
lager:info("It can fail, it can fail 10 times"),
rt:wait_until(retry_check_fun(Node)),
%% NOTE: 10 times comes from riak_jmx_monitor.erl's MAX_RETRY macro (10).
%% Error logging is 0-based, so look for Retry #9
{Delay, _Retry} = rt:get_retry_settings(),
TwoMinutes = 2*60*1000,
TwoMinutsOfRetry = TwoMinutes div Delay,
?assertEqual(true, rt:expect_in_log(Node, "JMX server monitor .* exited with code .*\. Retry #9",
TwoMinutsOfRetry, Delay)),
?assertEqual(true, rt:expect_in_log(Node, "JMX server monitor .* exited with code .*\. Reached maximum retries of 10",
TwoMinutsOfRetry, Delay)),
rt:stop(Node),
ok_ok.
retry_check_fun(Node) ->
fun() ->
Logs = rpc:call(Node, riak_test_lager_backend, get_logs, []),
10 =:= lists:foldl(log_fold_fun(), 0, Logs)
end.
log_fold_fun() ->
fun(Log, Sum) ->
try case re:run(Log, "JMX server monitor .* exited with code .*\. Retry #.*", []) of
{match, _} -> 1 + Sum;
_ -> Sum
end
catch
Err:Reason ->
lager:error("jmx supervision re:run failed w/ ~p: ~p", [Err, Reason]),
Sum
end
end.
test_application_stop() ->
lager:info("Testing application:stop()"),
JMXPort = 41111,
@ -178,7 +166,7 @@ test_application_stop() ->
%% Let's make sure the java process is alive!
lager:info("checking for riak_jmx.jar running."),
rt:wait_until(Node, fun(_N) ->
?assertEqual(ok, rt:wait_until(Node, fun(_N) ->
try case re:run(rpc:call(Node, os, cmd, ["ps -Af"]), "riak_jmx.jar", []) of
nomatch -> false;
_ -> true
@ -188,7 +176,7 @@ test_application_stop() ->
lager:error("jmx stop re:run failed w/ ~p: ~p", [Err, Reason]),
false
end
end),
end)),
rpc:call(Node, riak_jmx, stop, ["Stopping riak_jmx"]),
timer:sleep(20000),