diff --git a/intercepts/init_intercepts.erl b/intercepts/init_intercepts.erl
new file mode 100644
index 00000000..ea423e08
--- /dev/null
+++ b/intercepts/init_intercepts.erl
@@ -0,0 +1,7 @@
+-module(init_intercepts).
+-compile(export_all).
+-include("intercept.hrl").
+-define(M, init_orig).
+
+get_status() ->
+    {starting, starting}.
diff --git a/src/riak_test_escript.erl b/src/riak_test_escript.erl
index a34da59f..b883364c 100644
--- a/src/riak_test_escript.erl
+++ b/src/riak_test_escript.erl
@@ -110,6 +110,7 @@ main(Args) ->
                      notice
              end,
 
+    application:set_env(lager, error_logger_hwm, 250), %% helpful for debugging
     application:set_env(lager, handlers, [{lager_console_backend, ConsoleLagerLevel},
                                           {lager_file_backend, [{file, "log/test.log"}, {level, ConsoleLagerLevel}]}]),
 
diff --git a/src/rt.erl b/src/rt.erl
index 3f686fad..22560a3a 100644
--- a/src/rt.erl
+++ b/src/rt.erl
@@ -409,18 +409,42 @@ slow_upgrade(Node, NewVersion, Nodes) ->
     ?assertEqual(ok, wait_until_no_pending_changes(Nodes)),
     ok.
 
+%% Ideally we'd use `wait_until' for join retries, but it isn't
+%% currently flexible enough to only retry on a specific error
+%% tuple. Rather than rework all of the various arities, steal the
+%% concept and worry about this later if the need arises.
+join_with_retry(Fun) ->
+    MaxTime = rt_config:get(rt_max_wait_time),
+    Delay = rt_config:get(rt_retry_delay),
+    Retry = MaxTime div Delay,
+    join_retry(Fun(), Fun, Retry, Delay).
+
+join_retry(ok, _Fun, _Retry, _Delay) ->
+    ok;
+join_retry({error, node_still_starting}, _Fun, 0, _Delay) ->
+    lager:warning("Too many retries, join failed"),
+    {error, too_many_retries};
+join_retry({error, node_still_starting}, Fun, RetryCount, Delay) ->
+    lager:warning("Join error because node is not yet ready, retrying after ~Bms", [Delay]),
+    timer:sleep(Delay),
+    join_retry(Fun(), Fun, RetryCount - 1, Delay);
+join_retry(Error, _Fun, _Retry, _Delay) ->
+    Error.
+
 %% @doc Have `Node' send a join request to `PNode'
 join(Node, PNode) ->
-    R = rpc:call(Node, riak_core, join, [PNode]),
-    lager:info("[join] ~p to (~p): ~p", [Node, PNode, R]),
-    ?assertEqual(ok, R),
+    Fun = fun() -> rpc:call(Node, riak_core, join, [PNode]) end,
+    lager:info("[join] ~p to (~p)", [Node, PNode]),
+    ?assertEqual(ok, join_with_retry(Fun)),
     ok.
 
 %% @doc Have `Node' send a join request to `PNode'
 staged_join(Node, PNode) ->
-    R = rpc:call(Node, riak_core, staged_join, [PNode]),
-    lager:info("[join] ~p to (~p): ~p", [Node, PNode, R]),
-    ?assertEqual(ok, R),
+    %% `riak_core:staged_join/1' can now return an `{error,
+    %% node_still_starting}' tuple which indicates retry.
+    Fun = fun() -> rpc:call(Node, riak_core, staged_join, [PNode]) end,
+    lager:info("[join] ~p to (~p)", [Node, PNode]),
+    ?assertEqual(ok, join_with_retry(Fun)),
     ok.
 
 plan_and_commit(Node) ->
diff --git a/tests/riak667_mixed.erl b/tests/riak667_mixed.erl
index ca6e5798..5b8dbf6b 100644
--- a/tests/riak667_mixed.erl
+++ b/tests/riak667_mixed.erl
@@ -76,8 +76,10 @@ confirm() ->
                             ?KEY,
                             riakc_map:to_op(Map2)),
 
-    %% Upgrade one node.
+    %% Upgrade one node and wait until the ring has converged so the correct
+    %% capabilities will be negotiated
     upgrade(Node2, "2.0.4"),
+    rt:wait_until_ring_converged([Node1, Node2]),
 
     lager:notice("running mixed 2.0.2 and 2.0.4"),
 
diff --git a/tests/riak_control.erl b/tests/riak_control.erl
index 41b1608e..a0967e6e 100644
--- a/tests/riak_control.erl
+++ b/tests/riak_control.erl
@@ -112,7 +112,7 @@ verify_control({Vsn, Node}, VersionedNodes) ->
                            [{<<"partitions">>, NodePartitions}]} = verify_resource(Node, "/admin/partitions"),
                        NodePartitions
                end,
-    validate_partitions({previous, Node}, Partitions, VersionedNodes),
+    validate_partitions({Vsn, Node}, Partitions, VersionedNodes),
     ok.
 
 
diff --git a/tests/riak_control_authentication.erl b/tests/riak_control_authentication.erl
index 34c6b283..7ef7bb98 100644
--- a/tests/riak_control_authentication.erl
+++ b/tests/riak_control_authentication.erl
@@ -137,6 +137,8 @@ verify_authentication(Vsn, ?RC_AUTH_NONE_CONFIG) ->
                       [rt:http_url(Node), "/admin"]),
     ?assertEqual("200", os:cmd(Command)),
+    rt:stop_and_wait(Node),
+
     pass;
 
 %% @doc Verify the disabled authentication method works with force SSL.
 verify_authentication(Vsn, ?RC_AUTH_NONE_CONFIG_FORCE_SSL) ->
@@ -158,6 +160,8 @@
     % [rt:https_url(Node), "/admin"]),
     % ?assertEqual("200", os:cmd(AccessCommand)),
+    rt:stop_and_wait(Node),
+
     pass;
 
 %% @doc Verify the userlist authentication method works.
 verify_authentication(Vsn, ?RC_AUTH_USERLIST_CONFIG) ->
@@ -183,6 +187,8 @@
                       [rt:https_url(Node), "/admin"]),
     ?assertEqual("200", os:cmd(AuthCommand)),
+    rt:stop_and_wait(Node),
+
     pass;
 
 %% @doc Verify the userlist authentication method works.
 verify_authentication(Vsn, ?RC_AUTH_USERLIST_CONFIG_FORCE_SSL) ->
@@ -210,6 +216,8 @@
     % [rt:https_url(Node), "/admin"]),
     % ?assertEqual("200", os:cmd(AuthCommand)),
+    rt:stop_and_wait(Node),
+
     pass;
 
 %% @doc Verify the userlist authentication method works.
 verify_authentication(Vsn, ?RC_AUTH_USERLIST_CONFIG_NO_FORCE_SSL) ->
@@ -229,6 +237,8 @@
                       [rt:http_url(Node), "/admin"]),
     ?assertEqual("200", os:cmd(AuthCommand)),
 
+    rt:stop_and_wait(Node),
+
     pass.
 
 %% @doc Build a one node cluster.
@@ -241,7 +251,6 @@ build_singleton_cluster(Vsn, Config) ->
     %% the supervisor starts, we need to restart to ensure settings
     %% take effect.
     Node = lists:nth(1, Nodes),
-    rt:stop_and_wait(Node),
     rt:start_and_wait(Node),
     rt:wait_for_service(Node, riak_kv),
 
diff --git a/tests/verify_build_cluster_caps_race.erl b/tests/verify_build_cluster_caps_race.erl
new file mode 100644
index 00000000..d50b6c11
--- /dev/null
+++ b/tests/verify_build_cluster_caps_race.erl
@@ -0,0 +1,45 @@
+%% -------------------------------------------------------------------
+%%
+%% Copyright (c) 2012 Basho Technologies, Inc.
+%%
+%% This file is provided to you under the Apache License,
+%% Version 2.0 (the "License"); you may not use this file
+%% except in compliance with the License. You may obtain
+%% a copy of the License at
+%%
+%%   http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing,
+%% software distributed under the License is distributed on an
+%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+%% KIND, either express or implied. See the License for the
+%% specific language governing permissions and limitations
+%% under the License.
+%%
+%% -------------------------------------------------------------------
+-module(verify_build_cluster_caps_race).
+-behavior(riak_test).
+-export([confirm/0]).
+-include_lib("eunit/include/eunit.hrl").
+
+staged_join(InitiatingNode, DestinationNode) ->
+    rpc:call(InitiatingNode, riak_core, staged_join,
+             [DestinationNode]).
+
+confirm() ->
+    %% Deploy a set of new nodes
+    lager:info("Deploying nodes"),
+
+    [Node1, Node2] = rt:deploy_nodes(2),
+
+    configure_intercept(Node2),
+
+    lager:info("joining Node 2 to the cluster..."),
+    ?assertMatch({error, _}, staged_join(Node2, Node1)),
+    pass.
+
+%% init must return `starting' status for join to fail
+configure_intercept(Node) ->
+    lager:info("Doing unspeakably evil things to the VM"),
+    rt_intercept:add(Node, {init,
+                            [{{get_status,0}, get_status}]}).