riak_test/tests/replication2_connections.erl
Engel A. Sanchez 556cb7210c Ensure riak_repl service is up across the board
Trying to use the repl features before newly started nodes have
riak_repl completely initialized leads to all sorts of nasty crashes and
noise. Frequently it leaves fullsync stuck forever, which causes many of
the tests to fail.

This also tweaks the AAE fullsync tests to remove assumptions about
failure stats when AAE transient errors occur. The behavior in the
handling of those errors has changed recently with the introduction of
soft exits.
2014-12-18 16:07:00 -05:00
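
The guard this commit spreads across the tests is rt:wait_for_cluster_service/2,
which blocks until every node reports the riak_repl service as up. As a rough
sketch of what such a wait amounts to (wait_for_repl is a hypothetical helper,
not the code from riak_test's rt.erl), it polls riak_core_node_watcher on each
node until riak_repl appears in its service list:

    %% Hypothetical illustration only: block until every node in Nodes
    %% reports the riak_repl service via riak_core_node_watcher.
    wait_for_repl(Nodes) ->
        [rt:wait_until(N, fun(Node) ->
             Services = rpc:call(Node, riak_core_node_watcher, services, [Node]),
             is_list(Services) andalso lists:member(riak_repl, Services)
         end) || N <- Nodes],
        ok.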

%% -------------------------------------------------------------------
%%
%% Copyright (c) 2013 Basho Technologies, Inc.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
%%
-module(replication2_connections).
-behaviour(riak_test).

-export([confirm/0]).

-include_lib("eunit/include/eunit.hrl").

-define(HB_TIMEOUT, 2000).
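
%% @doc Run the three connection scenarios in order; each must return
%%      pass for the whole test to pass.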
confirm() ->
    ?assertEqual(pass, simple_test()),
    ?assertEqual(pass, disconnect_test()),
    ?assertEqual(pass, error_cleanup_test()),
    pass.
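
%% @doc Build two three-node clusters, wire up bidirectional realtime
%%      replication, and check that every node in each cluster reports a
%%      connection to the remote cluster, even after the connection
%%      manager is restarted.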
simple_test() ->
    Conf = [{riak_repl,
             [
              %% turn off fullsync
              {fullsync_on_connect, false},
              {fullsync_interval, disabled},
              %% override defaults for RT heartbeat so that we
              %% can see faults sooner and have a quicker test.
              {rt_heartbeat_interval, ?HB_TIMEOUT},
              {rt_heartbeat_timeout, ?HB_TIMEOUT}
             ]}],

    rt:set_advanced_conf(all, Conf),
    [ANodes, BNodes] = rt:build_clusters([3, 3]),

    rt:wait_for_cluster_service(ANodes, riak_repl),
    rt:wait_for_cluster_service(BNodes, riak_repl),

    lager:info("ANodes: ~p", [ANodes]),
    lager:info("BNodes: ~p", [BNodes]),

    lager:info("Waiting for leader to converge on cluster A"),
    ?assertEqual(ok, repl_util:wait_until_leader_converge(ANodes)),
    AFirst = hd(ANodes),

    lager:info("Waiting for leader to converge on cluster B"),
    ?assertEqual(ok, repl_util:wait_until_leader_converge(BNodes)),
    BFirst = hd(BNodes),

    lager:info("Naming A"),
    repl_util:name_cluster(AFirst, "A"),
    ?assertEqual(ok, rt:wait_until_ring_converged(ANodes)),

    lager:info("Naming B"),
    repl_util:name_cluster(BFirst, "B"),
    ?assertEqual(ok, rt:wait_until_ring_converged(BNodes)),

    lager:info("Connecting A to B"),
    connect_clusters(AFirst, BFirst),

    lager:info("Enabling realtime replication from A to B."),
    repl_util:enable_realtime(AFirst, "B"),
    ?assertEqual(ok, rt:wait_until_ring_converged(ANodes)),
    repl_util:start_realtime(AFirst, "B"),
    ?assertEqual(ok, rt:wait_until_ring_converged(ANodes)),

    lager:info("Connecting B to A"),
    connect_clusters(BFirst, AFirst),

    lager:info("Enabling realtime replication from B to A."),
    repl_util:enable_realtime(BFirst, "A"),
    ?assertEqual(ok, rt:wait_until_ring_converged(BNodes)),
    repl_util:start_realtime(BFirst, "A"),
    ?assertEqual(ok, rt:wait_until_ring_converged(BNodes)),

    lager:info("Verifying connectivity between clusters."),
    [verify_connectivity(Node, "B") || Node <- ANodes],
    [verify_connectivity(Node, "A") || Node <- BNodes],

    rt:clean_cluster(ANodes),
    rt:clean_cluster(BNodes),

    pass.
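
%% @doc Same setup as simple_test/0, but verify that disconnecting the
%%      clusters is reflected on every node, in both directions.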
disconnect_test() ->
    Conf = [{riak_repl,
             [
              %% turn off fullsync
              {fullsync_on_connect, false},
              {fullsync_interval, disabled},
              %% override defaults for RT heartbeat so that we
              %% can see faults sooner and have a quicker test.
              {rt_heartbeat_interval, ?HB_TIMEOUT},
              {rt_heartbeat_timeout, ?HB_TIMEOUT}
             ]}],

    rt:set_advanced_conf(all, Conf),
    [ANodes, BNodes] = rt:build_clusters([3, 3]),

    rt:wait_for_cluster_service(ANodes, riak_repl),
    rt:wait_for_cluster_service(BNodes, riak_repl),

    lager:info("ANodes: ~p", [ANodes]),
    lager:info("BNodes: ~p", [BNodes]),

    lager:info("Waiting for leader to converge on cluster A"),
    ?assertEqual(ok, repl_util:wait_until_leader_converge(ANodes)),
    AFirst = hd(ANodes),

    lager:info("Waiting for leader to converge on cluster B"),
    ?assertEqual(ok, repl_util:wait_until_leader_converge(BNodes)),
    BFirst = hd(BNodes),

    lager:info("Naming A"),
    repl_util:name_cluster(AFirst, "A"),
    ?assertEqual(ok, rt:wait_until_ring_converged(ANodes)),

    lager:info("Naming B"),
    repl_util:name_cluster(BFirst, "B"),
    ?assertEqual(ok, rt:wait_until_ring_converged(BNodes)),

    lager:info("Connecting A to B"),
    connect_clusters(AFirst, BFirst),

    lager:info("Connecting B to A"),
    connect_clusters(BFirst, AFirst),

    lager:info("Verifying connectivity between clusters."),
    [verify_connectivity(Node, "B") || Node <- ANodes],
    [verify_connectivity(Node, "A") || Node <- BNodes],

    lager:info("Disconnecting A from B"),
    repl_util:disconnect_cluster(AFirst, "B"),

    lager:info("Verifying disconnect from A to B."),
    [verify_disconnect(Node, "B") || Node <- ANodes],

    lager:info("Disconnecting B from A"),
    repl_util:disconnect_cluster(BFirst, "A"),

    lager:info("Verifying disconnect from B to A."),
    [verify_disconnect(Node, "A") || Node <- BNodes],

    rt:clean_cluster(ANodes),
    rt:clean_cluster(BNodes),

    pass.
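
%% @doc Use an intercept to make every outbound connection attempt fail
%%      with econnrefused, confirm the connection manager accumulates
%%      errors and can be fully disconnected, then remove the fault and
%%      confirm connections recover.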
error_cleanup_test() ->
    NumNodes = rt_config:get(num_nodes, 6),
    lager:info("Deploy ~p nodes", [NumNodes]),

    Conf = [{riak_repl,
             [
              %% turn off fullsync
              {fullsync_on_connect, false},
              {fullsync_interval, disabled},
              %% override defaults for RT heartbeat so that we
              %% can see faults sooner and have a quicker test.
              {rt_heartbeat_interval, ?HB_TIMEOUT},
              {rt_heartbeat_timeout, ?HB_TIMEOUT},
              %% use a quicker cancellation interval of 5 seconds
              {cm_cancellation_interval, 5 * 1000}
             ]}],

    rt:set_advanced_conf(all, Conf),
    [ANodes, BNodes] = rt:build_clusters([3, 3]),

    rt:wait_for_cluster_service(ANodes, riak_repl),
    rt:wait_for_cluster_service(BNodes, riak_repl),

    lager:info("ANodes: ~p", [ANodes]),
    lager:info("BNodes: ~p", [BNodes]),

    lager:info("Waiting for leader to converge on cluster A"),
    ?assertEqual(ok, repl_util:wait_until_leader_converge(ANodes)),
    AFirst = hd(ANodes),

    lager:info("Waiting for leader to converge on cluster B"),
    ?assertEqual(ok, repl_util:wait_until_leader_converge(BNodes)),
    BFirst = hd(BNodes),

    lager:info("Naming A"),
    repl_util:name_cluster(AFirst, "A"),
    ?assertEqual(ok, rt:wait_until_ring_converged(ANodes)),

    lager:info("Naming B"),
    repl_util:name_cluster(BFirst, "B"),
    ?assertEqual(ok, rt:wait_until_ring_converged(BNodes)),

    %% Insert an intercept so that connection attempts fail with
    %% econnrefused.
    lager:info("Adding intercept to cause econnrefused errors"),
    Intercept = {riak_core_connection,
                 [{{sync_connect, 2}, return_econnrefused}]},
    [ok = rt_intercept:add(Target, Intercept) || Target <- ANodes],

    lager:info("Connecting A to B"),
    connect_clusters(AFirst, BFirst),

    lager:info("Wait until errors in connection_manager status"),
    ?assertEqual(ok, repl_util:wait_until_connection_errors(
                         repl_util:get_leader(AFirst), BFirst)),

    lager:info("Disconnect A from B via IP/PORT"),
    ?assertEqual(ok, rpc:call(AFirst, riak_repl_console, disconnect,
                              [["127.0.0.1", "10046"]])),

    lager:info("Wait until connections clear"),
    ?assertEqual(ok, repl_util:wait_until_connections_clear(
                         repl_util:get_leader(AFirst))),

    lager:info("Verify disconnect from A to B"),
    [verify_full_disconnect(Node) || Node <- ANodes],

    %% Replace the intercept with one that lets connections through again.
    lager:info("Adding intercept to allow connections"),
    Intercept2 = {riak_core_connection,
                  [{{sync_connect, 2}, sync_connect}]},
    [ok = rt_intercept:add(Target, Intercept2) || Target <- ANodes],

    lager:info("Connecting A to B"),
    connect_clusters(AFirst, BFirst),

    lager:info("Verifying connection from A to B"),
    [verify_connectivity(Node, "B") || Node <- ANodes],

    rt:clean_cluster(ANodes),
    rt:clean_cluster(BNodes),

    pass.

%% @doc Verify connectivity between sources and sink.
verify_connectivity(Node, Cluster) ->
    print_repl_ring(Node),
    ?assertEqual(ok, repl_util:wait_for_connection(Node, Cluster)),
    print_repl_ring(Node),

    %% Bounce the connection manager and make sure the connection
    %% re-establishes itself afterwards.
    restart_process(Node, riak_core_connection_manager),
    ?assertEqual(ok, repl_util:wait_for_connection(Node, Cluster)).

%% @doc Verify disconnect between Node and sink Cluster.
verify_disconnect(Node, Cluster) ->
    print_repl_ring(Node),
    ?assertEqual(ok, repl_util:wait_for_disconnect(Node, Cluster)),
    print_repl_ring(Node),
    restart_process(Node, riak_core_connection_manager),
    ?assertEqual(ok, repl_util:wait_for_disconnect(Node, Cluster)).

%% @doc Verify no connections of any type on Node.
verify_full_disconnect(Node) ->
    print_repl_ring(Node),
    ?assertEqual(ok, repl_util:wait_for_full_disconnect(Node)),
    print_repl_ring(Node),
    restart_process(Node, riak_core_connection_manager),
    ?assertEqual(ok, repl_util:wait_for_full_disconnect(Node)).

%% @doc Print the clusters recorded in the replication ring.
print_repl_ring(Node) ->
    {ok, Ring} = rpc:call(Node,
                          riak_core_ring_manager,
                          get_my_ring,
                          []),
    Clusters = rpc:call(Node,
                        riak_repl_ring,
                        get_clusters,
                        [Ring]),
    lager:info("REPL ring shows clusters as: ~p", [Clusters]).

%% @doc Restart a given process by name.
restart_process(Node, Name) ->
    lager:info("Restarting ~p on ~p.", [Name, Node]),

    %% Find the process.
    Pid0 = rpc:call(Node, erlang, whereis, [Name]),
    lager:info("Found ~p on node ~p at ~p, killing.",
               [Name, Node, Pid0]),

    %% Kill it.
    true = rpc:call(Node, erlang, exit, [Pid0, brutal_kill]),

    %% Verify it restarts.
    rt:wait_until(Node, fun(_) ->
                            lager:info("Waiting for ~p to restart...", [Name]),
                            Pid = rpc:call(Node, erlang, whereis, [Name]),
                            Pid =/= Pid0 andalso Pid =/= undefined
                        end),
    lager:info("Process restarted.").

%% @doc Connect two clusters for replication using their respective
%%      leader nodes.
connect_clusters(LeaderA, LeaderB) ->
    {ok, {_IP, Port}} = rpc:call(LeaderB, application, get_env,
                                 [riak_core, cluster_mgr]),
    repl_util:connect_cluster(LeaderA, "127.0.0.1", Port).