Make tests pass regardless of run order by always writing default values for configuration settings, rather than trying to only change certain values.

- Always set values for keys that have no defaults in the schema, as setting a value to `undefined` does not return the "default" value when application:get_env/3 is called
- Provide an easier way to get the configuration by creating the config record, with appropriate defaults
- Code cleanup on BKV1/2/3 - make them easier to read/understand and for future work in refactoring the tests and removing some of the complexity.
- Now that test is running more consistently, tighten tests to actually test something in test_vnode_protection.
- Bump vnode protection to ?THRESHOLD+1 as 2.0 seems to have one extra message.
- Convert lager:info to lager:debug in successful get cases to reduce noise.
- Remove use of ConsistentType (use macros & pass BKV down to validate).
This commit is contained in:
Doug Rohrer 2015-10-22 11:34:38 -04:00
parent b863efcd76
commit 818645ec83

View File

@ -31,53 +31,42 @@
-define(GET_RETRIES, 1000).
-define(BUCKET, <<"test">>).
-define(KEY, <<"hotkey">>).
-define(NORMAL_TYPE, <<"normal_type">>).
-define(CONSISTENT_TYPE, <<"consistent_type">>).
-define(WRITE_ONCE_TYPE, <<"write_once_type">>).
-define(NORMAL_BKV, {{?NORMAL_TYPE, ?BUCKET}, ?KEY, <<"test">>}).
-define(CONSISTENT_BKV, {{?CONSISTENT_TYPE, ?BUCKET}, ?KEY, <<"test">>}).
-define(WRITE_ONCE_BKV, {{?WRITE_ONCE_TYPE, ?BUCKET}, ?KEY, <<"test">>}).
confirm() ->
Nodes = setup(),
NormalType = <<"normal_type">>,
ConsistentType = <<"consistent_type">>,
WriteOnceType = <<"write_once_type">>,
ok = create_bucket_type(Nodes, NormalType, [{n_val, 3}]),
ok = create_bucket_type(Nodes, ConsistentType, [{consistent, true}, {n_val, 5}]),
ok = create_bucket_type(Nodes, WriteOnceType, [{write_once, true}, {n_val, 1}]),
rt:wait_until(ring_manager_check_fun(hd(Nodes))),
BKV1 = {{NormalType, ?BUCKET}, ?KEY, <<"test">>},
BKV2 = {{ConsistentType, ?BUCKET}, ?KEY, <<"test">>},
BKV3 = {{WriteOnceType, ?BUCKET}, ?KEY, <<"test">>},
Node1 = hd(Nodes),
write_once(Node1, BKV1),
write_once(Node1, BKV2),
write_once(Node1, BKV3),
Tests = [test_no_overload_protection,
test_vnode_protection,
test_fsm_protection,
test_cover_queries_overload],
[begin
lager:info("Starting Test ~p for ~p~n", [Test, BKV]),
ok = erlang:apply(?MODULE, Test, [Nodes, BKV, IsConsistent])
end || Test <- Tests,
{BKV, IsConsistent} <- [{BKV1, false},
{BKV2, true},
{BKV3, false}]],
pass.
setup() ->
ensemble_util:build_cluster(5, default_config(), 5).
%% This record contains the default values for config settings if they were not set
%% in the advanced.config file - because setting something to `undefined` is not the same
%% as not setting it at all, we need to make sure to overwrite with defaults for each test,
%% not just set things back to `undefined`. Also, makes the tests re-orderable as they always
%% set everything they need, and don't depend on a previous test to make changes.
-record(config, {
vnode_overload_threshold = 10000,
vnode_check_interval = 5000,
vnode_check_request_interval = 2500,
fsm_limit=undefined}).
default_config() ->
default_config(#config{}).
default_config(#config{
vnode_overload_threshold=VnodeOverloadThreshold,
vnode_check_interval = VnodeCheckInterval,
vnode_check_request_interval = VnodeCheckRequestInterval,
fsm_limit = FsmLimit
}) ->
[{riak_core, [{ring_creation_size, 8},
{default_bucket_props, [{n_val, 5}]},
{vnode_management_timer, 1000},
{enable_health_checks, false},
{enable_consensus, true},
{vnode_overload_threshold, undefined}]},
{riak_kv, [{fsm_limit, undefined},
{vnode_overload_threshold, VnodeOverloadThreshold},
{vnode_check_interval, VnodeCheckInterval},
{vnode_check_request_interval, VnodeCheckRequestInterval}]},
{riak_kv, [{fsm_limit, FsmLimit},
{storage_backend, riak_kv_eleveldb_backend},
{anti_entropy_build_limit, {100, 1000}},
{anti_entropy_concurrency, 100},
@ -86,23 +75,60 @@ default_config() ->
{anti_entropy_timeout, 5000}]},
{riak_api, [{pb_backlog, 1024}]}].
test_no_overload_protection(_Nodes, _BKV, true) ->
confirm() ->
Nodes = setup(),
ok = create_bucket_type(Nodes, ?NORMAL_TYPE, [{n_val, 3}]),
ok = create_bucket_type(Nodes, ?CONSISTENT_TYPE, [{consistent, true}, {n_val, 5}]),
ok = create_bucket_type(Nodes, ?WRITE_ONCE_TYPE, [{write_once, true}, {n_val, 1}]),
rt:wait_until(ring_manager_check_fun(hd(Nodes))),
Node1 = hd(Nodes),
write_once(Node1, ?NORMAL_BKV),
write_once(Node1, ?CONSISTENT_BKV),
write_once(Node1, ?WRITE_ONCE_BKV),
Tests = [test_no_overload_protection,
test_vnode_protection,
test_fsm_protection],
[begin
lager:info("Starting Test ~p for ~p~n", [Test, BKV]),
ok = erlang:apply(?MODULE, Test, [Nodes, BKV])
end || Test <- Tests,
BKV <- [?NORMAL_BKV,
?CONSISTENT_BKV,
?WRITE_ONCE_BKV]],
%% Test cover queries doesn't depend on bucket/key/value, just run it once
test_cover_queries_overload(Nodes),
pass.
setup() ->
ensemble_util:build_cluster(5, default_config(), 5).
test_no_overload_protection(_Nodes, ?CONSISTENT_BKV) ->
ok;
test_no_overload_protection(Nodes, BKV, ConsistentType) ->
test_no_overload_protection(Nodes, BKV) ->
lager:info("Setting default configuration for no overload protection test."),
rt:pmap(fun(Node) ->
rt:update_app_config(Node, default_config())
end, Nodes),
lager:info("Testing with no overload protection"),
ProcFun = build_predicate_eq(test_no_overload_protection, ?NUM_REQUESTS,
"ProcFun", "Procs"),
QueueFun = build_predicate_gte(test_no_overload_protection, ?NUM_REQUESTS,
"QueueFun", "Queue Size"),
verify_test_results(run_test(Nodes, BKV), ConsistentType, ProcFun, QueueFun).
verify_test_results(run_test(Nodes, BKV), BKV, ProcFun, QueueFun).
verify_test_results({_NumProcs, QueueLen}, true, _, QueueFun) ->
verify_test_results({_NumProcs, QueueLen}, ?CONSISTENT_BKV, _ProcFun, QueueFun) ->
?assert(QueueFun(QueueLen));
verify_test_results({NumProcs, QueueLen}, false, ProcFun, QueueFun) ->
verify_test_results({NumProcs, QueueLen}, _BKV, ProcFun, QueueFun) ->
?assert(ProcFun(NumProcs)),
?assert(QueueFun(QueueLen)).
test_vnode_protection(Nodes, BKV, ConsistentType) ->
test_vnode_protection(Nodes, BKV) ->
%% Setting check_interval to one ensures that process_info is called
%% to check the queue length on each vnode send.
%% This allows us to artificially raise vnode queue lengths with dummy
@ -111,14 +137,13 @@ test_vnode_protection(Nodes, BKV, ConsistentType) ->
lager:info("Testing with vnode queue protection enabled"),
lager:info("Setting vnode overload threshold to ~b", [?THRESHOLD]),
lager:info("Setting vnode check interval to 1"),
Config = [{riak_core, [{vnode_overload_threshold, ?THRESHOLD},
{vnode_check_interval, 1}]}],
Config = default_config(#config{vnode_overload_threshold=?THRESHOLD, vnode_check_interval=1}),
rt:pmap(fun(Node) ->
rt:update_app_config(Node, Config)
end, Nodes),
ProcFun = build_predicate_lt(test_vnode_protection, (?NUM_REQUESTS+1), "ProcFun", "Procs"),
QueueFun = build_predicate_lt(test_vnode_protection, (?NUM_REQUESTS), "QueueFun", "QueueSize"),
verify_test_results(run_test(Nodes, BKV), ConsistentType, ProcFun, QueueFun),
QueueFun = build_predicate_lte(test_vnode_protection, (?THRESHOLD+1), "QueueFun", "QueueSize"),
verify_test_results(run_test(Nodes, BKV), BKV, ProcFun, QueueFun),
[Node1 | _] = Nodes,
CheckInterval = ?THRESHOLD div 2,
@ -132,36 +157,31 @@ test_vnode_protection(Nodes, BKV, ConsistentType) ->
Pid = suspend_vnode_proxy(Victim),
ProcFun2 = build_predicate_gte("test_vnode_protection after suspend",
(?NUM_REQUESTS), "ProcFun", "Procs"),
QueueFun2 = build_predicate_lt("test_vnode_protection after suspend",
(?NUM_REQUESTS), "QueueFun", "QueueSize"),
verify_test_results(run_test(Nodes, BKV), ConsistentType, ProcFun2, QueueFun2),
QueueFun2 = build_predicate_lte("test_vnode_protection after suspend",
(?THRESHOLD+1), "QueueFun", "QueueSize"),
verify_test_results(run_test(Nodes, BKV), BKV, ProcFun2, QueueFun2),
Pid ! resume,
ok.
%% Don't check on fast path
test_fsm_protection(_, {{<<"write_once_type">>, _}, _, _}, _) ->
test_fsm_protection(_, ?WRITE_ONCE_BKV) ->
ok;
%% Or consistent path - doesn't use FSMs either
test_fsm_protection(_, _, true) ->
%% Or consistent gets, as they don't use the FSM either
test_fsm_protection(_, ?CONSISTENT_BKV) ->
ok;
test_fsm_protection(Nodes, BKV, false) ->
test_fsm_protection(Nodes, BKV) ->
lager:info("Testing with coordinator protection enabled"),
lager:info("Setting FSM limit to ~b", [?THRESHOLD]),
Config = [{riak_kv, [
{fsm_limit, ?THRESHOLD},
{vnode_overload_threshold, undefined},
{vnode_check_interval, undefined}]}],
%% Set FSM limit and reset other changes from previous tests.
Config = default_config(#config{fsm_limit=?THRESHOLD}),
rt:pmap(fun(Node) ->
rt:update_app_config(Node, Config)
end, Nodes),
Node1 = hd(Nodes),
%% TODO: Figure out why just using rt:wait_for_service completely breaks this test,
%% but not waiting for riak_kv leaves us open to a race where the resource doesn't exist yet.
%% Do the retry dance instead for now inside get_calculated_sj_limit.
rt:wait_for_cluster_service(Nodes, riak_kv),
rt:load_modules_on_nodes([?MODULE], Nodes),
{ok, ExpectedFsms} = get_calculated_sj_limit(Node1, riak_kv_get_fsm_sj),
{ok, ExpectedFsms} = get_calculated_sj_limit(Node1, riak_kv_get_fsm_sj, 1),
%% We expect exactly ExpectedFsms, but because of a race in SideJob we sometimes get 1 more
%% Adding 2 (the highest observed race to date) to the lte predicate to handle the occasional case.
@ -170,7 +190,7 @@ test_fsm_protection(Nodes, BKV, false) ->
"ProcFun", "Procs"),
QueueFun = build_predicate_lt(test_fsm_protection, (?NUM_REQUESTS),
"QueueFun", "QueueSize"),
verify_test_results(run_test(Nodes, BKV), false, ProcFun, QueueFun),
verify_test_results(run_test(Nodes, BKV), BKV, ProcFun, QueueFun),
ok.
@ -191,16 +211,14 @@ get_calculated_sj_limit(Node, ResourceName, Retries) when Retries > 0 ->
get_calculated_sj_limit(Node, ResourceName, Retries) when Retries == 0 ->
{error, io_lib:format("Failed to retrieve sidejob limit from ~p for resource ~p. Giving up.", [Node, ResourceName])}.
test_cover_queries_overload(_Nodes, _, true) ->
ok;
test_cover_queries_overload(Nodes, _, false) ->
test_cover_queries_overload(Nodes) ->
lager:info("Testing cover queries with vnode queue protection enabled"),
lager:info("Setting vnode overload threshold to ~b", [?THRESHOLD]),
lager:info("Setting vnode check interval to 1"),
Config = [{riak_core, [{vnode_overload_threshold, ?THRESHOLD},
{vnode_check_request_interval, 2},
{vnode_check_interval, 1}]}],
Config = default_config(#config{vnode_overload_threshold=?THRESHOLD,
vnode_check_request_interval=2,
vnode_check_interval=1}),
rt:pmap(fun(Node) ->
rt:update_app_config(Node, Config)
end, Nodes),
@ -363,17 +381,17 @@ pb_get_fun(Node, Bucket, Key, TestPid) ->
PBC = rt:pbc(Node),
Result = case catch riakc_pb_socket:get(PBC, Bucket, Key) of
{error, <<"overload">>} ->
lager:info("overload detected in pb_get, continuing..."),
lager:debug("overload detected in pb_get, continuing..."),
true;
%% we expect timeouts in this test as we've shut down a vnode - return true in this case
{error, timeout} ->
lager:info("timeout detected in pb_get, continuing..."),
lager:debug("timeout detected in pb_get, continuing..."),
true;
{error, <<"timeout">>} ->
lager:info("timeout detected in pb_get, continuing..."),
lager:debug("timeout detected in pb_get, continuing..."),
true;
{ok, Res} ->
lager:info("riakc_pb_socket:get(~p, ~p, ~p) succeeded, Res:~p", [PBC, Bucket, Key, Res]),
lager:debug("riakc_pb_socket:get(~p, ~p, ~p) succeeded, Res:~p", [PBC, Bucket, Key, Res]),
true;
{error, Type} ->
lager:error("riakc_pb_socket threw error ~p reading {~p, ~p}, retrying...", [Type, Bucket, Key]),