riak_test/tests/verify_busy_dist_port.erl
Kelly McLaughlin 1f538d7ee0 Update tests that return something other than pass for success
As of commit 3044839456 tests that
return something other than the prescribed success atom 'pass' to
indicate success result in test failure. Change tests that return the
atom 'ok' or some other value to instead return 'pass' to indicate
success.
2014-05-22 15:54:23 -06:00

110 lines
4.4 KiB
Erlang

%% -------------------------------------------------------------------
%%
%% Copyright (c) 2012 Basho Technologies, Inc.
%%
%% Test for regression in riak_sysmon where busy_port/busy_dist_port were not set to
%% true in app.config by default. Originally reported in az1018 (AgileZen 1018).
%%
%% This test starts two riak nodes and pauses the OS process of one node's VM
%% using "kill -STOP". The other node (not paused) is then directed to send thousands
%% of messages to the paused node, which should trigger busy_dist_port events. We then
%% check for busy_dist_port messages in the logs.
%%
%% see: https://issues.basho.com/show_bug.cgi?id=1305
%% see: https://github.com/basho/basho_expect/blob/master/basho_expect/regression_az1018.py
%%
%% -- ORIGINAL TICKET TEXT FROM AGILE ZEN (AZ1018) --
%% As we discovered in a customer's production network, riak_sysmon has been
%% mis-configured and buggy and therefore was not logging 'busy_dist_port' events
%% when they were happening. While triaging the customer's cluster, we made
%% several mistakes while assuming that those events weren't happening.
%%
%% Two fixes are required:
%%
%% Fix the riak_sysmon_filter:init() code.
%% Tune the app.config settings to correct values.
%%
%% -- END ORIGINAL TICKET --
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(verify_busy_dist_port).
-behavior(riak_test).
-export([confirm/0]).
-include_lib("eunit/include/eunit.hrl").
%% riak_test entry point.
%%
%% Builds a two-node cluster, installs the busy_dist_port event handler and the
%% in-memory lager backend on Node1, suspends Node2's OS process, floods it
%% with messages from Node1 and then verifies that a busy_dist_port message
%% reached Node1's log.
%%
%% Returns 'pass' on success; the ?assert/1 raises if the log message was
%% never found.
confirm() ->
    [Node1, Node2] = rt:build_cluster(2),
    lager:info("deployed 2 nodes"),

    rt:load_modules_on_nodes([cause_bdp, verify_bdp_event_handler,
                              riak_test_lager_backend], [Node1]),
    Res = rpc:call(Node1, verify_bdp_event_handler, add_handler, [self()]),
    ok = rpc:call(Node1, gen_event, add_handler,
                  [lager_event, riak_test_lager_backend, [info, false]]),
    ok = rpc:call(Node1, lager, set_loglevel, [riak_test_lager_backend, info]),
    lager:info("RES: ~p", [Res]),

    OsPid = rpc:call(Node2, os, getpid, []),
    lager:info("pausing node 2 (~p) pid ~s", [Node2, OsPid]),
    %% must use cast here, call will never return
    rpc:cast(Node2, os, cmd, [signal_cmd("STOP", OsPid)]),

    %% Everything that runs while Node2 is suspended lives inside the 'try' so
    %% that the 'after' section resumes Node2 even when a step crashes. A VM
    %% left in the stopped state would break future test runs.
    Success =
        try
            flood_and_check_log(Node1, Node2)
        after
            lager:info("continuing node 2 (~p) pid ~s", [Node2, OsPid]),
            %% NOTE: this command must be executed on the OS running Node2 in
            %% order to unpause it. It cannot be executed via
            %% rpc:cast(Node2, os, cmd, ...) because Node2 is paused and will
            %% never process the message!
            rt:cmd(signal_cmd("CONT", OsPid))
        end,
    ?assert(Success),
    pass.

%% Builds the shell command that sends Signal (e.g. "STOP" or "CONT") to the
%% OS process OsPid. OsPid is the string returned by os:getpid/0, so it is
%% formatted with ~s; ~p would wrap it in literal double quotes.
signal_cmd(Signal, OsPid) ->
    lists:flatten(io_lib:format("kill -~s ~s", [Signal, OsPid])).

%% Spams the paused Node2 from Node1, waits for the 'go' notification and then
%% polls Node1's captured log. Returns true when the busy_dist_port message
%% was found, false otherwise.
flood_and_check_log(Node1, Node2) ->
    lager:info("flooding node 2 (paused) with messages from node 1"),
    rpc:call(Node1, cause_bdp, spam_nodes, [[Node2]]),
    %% 'go' is presumably sent by verify_bdp_event_handler, which was given
    %% self() when it was installed -- confirm against that module. A timeout
    %% is only logged here; the log check below decides the test outcome.
    receive
        go ->
            lager:info("busy_dist_port event fired on node 1 (~p), checking logs", [Node1])
    after rt_config:get(rt_max_wait_time) ->
            lager:error("no busy_dist_port event fired on node 1. test is borked",
                        [])
    end,
    lager:info("Verifying busy_dist_port message ended up in the log"),
    case rt:wait_until(Node1, fun bdp_logged/1) of
        ok ->
            lager:info("found busy_dist_port message in log", []),
            true;
        _ ->
            lager:error("busy_dist_port message not found in log", []),
            false
    end.

%% Polling predicate for rt:wait_until/2: true when the log captured by
%% riak_test_lager_backend on Node contains a busy_dist_port monitor message.
%% A failing re:run/3 (for example when the rpc result is not iodata) is
%% logged and reported as false so that polling continues.
bdp_logged(Node) ->
    Logs = rpc:call(Node, riak_test_lager_backend, get_logs, []),
    try re:run(Logs, "monitor busy_dist_port .*#Port", []) of
        {match, _} -> true;
        nomatch -> false
    catch
        Err:Reason ->
            lager:error("busy_dist_port re:run failed w/ ~p: ~p", [Err, Reason]),
            false
    end.