mirror of
https://github.com/valitydev/riak_test.git
synced 2024-11-06 16:45:29 +00:00
Make tests pass regardless of order run by always writing default values for configuration settings, rather than trying to only change certain values.
- Always set values to keys that have no defaults in schema, as setting a value to `undefined` does not return the "default" value when application:get_env/3 is called - Provide an easier way to get the configuration by creating the config record, with appropriate defaults - Code cleanup on BKV1/2/3 - make them easier to read/understand and for future work in refactoring the tests and removing some of the complexity. - Now that test is running more consistently, tighten tests to actually test something in test_vnode_protection. - Bump vnode protection to ?THRESHOLD+1 as 2.0 seems to have one extra message. - Convert lager:info to lager:debug in successful get cases to reduce noise. - Remove use of ConsistentType (use macros & pass BKV down to validate).
This commit is contained in:
parent
b863efcd76
commit
818645ec83
@ -31,53 +31,42 @@
|
||||
-define(GET_RETRIES, 1000).
|
||||
-define(BUCKET, <<"test">>).
|
||||
-define(KEY, <<"hotkey">>).
|
||||
-define(NORMAL_TYPE, <<"normal_type">>).
|
||||
-define(CONSISTENT_TYPE, <<"consistent_type">>).
|
||||
-define(WRITE_ONCE_TYPE, <<"write_once_type">>).
|
||||
-define(NORMAL_BKV, {{?NORMAL_TYPE, ?BUCKET}, ?KEY, <<"test">>}).
|
||||
-define(CONSISTENT_BKV, {{?CONSISTENT_TYPE, ?BUCKET}, ?KEY, <<"test">>}).
|
||||
-define(WRITE_ONCE_BKV, {{?WRITE_ONCE_TYPE, ?BUCKET}, ?KEY, <<"test">>}).
|
||||
|
||||
confirm() ->
|
||||
Nodes = setup(),
|
||||
|
||||
NormalType = <<"normal_type">>,
|
||||
ConsistentType = <<"consistent_type">>,
|
||||
WriteOnceType = <<"write_once_type">>,
|
||||
|
||||
ok = create_bucket_type(Nodes, NormalType, [{n_val, 3}]),
|
||||
ok = create_bucket_type(Nodes, ConsistentType, [{consistent, true}, {n_val, 5}]),
|
||||
ok = create_bucket_type(Nodes, WriteOnceType, [{write_once, true}, {n_val, 1}]),
|
||||
rt:wait_until(ring_manager_check_fun(hd(Nodes))),
|
||||
|
||||
BKV1 = {{NormalType, ?BUCKET}, ?KEY, <<"test">>},
|
||||
BKV2 = {{ConsistentType, ?BUCKET}, ?KEY, <<"test">>},
|
||||
BKV3 = {{WriteOnceType, ?BUCKET}, ?KEY, <<"test">>},
|
||||
Node1 = hd(Nodes),
|
||||
write_once(Node1, BKV1),
|
||||
write_once(Node1, BKV2),
|
||||
write_once(Node1, BKV3),
|
||||
|
||||
Tests = [test_no_overload_protection,
|
||||
test_vnode_protection,
|
||||
test_fsm_protection,
|
||||
test_cover_queries_overload],
|
||||
|
||||
[begin
|
||||
lager:info("Starting Test ~p for ~p~n", [Test, BKV]),
|
||||
ok = erlang:apply(?MODULE, Test, [Nodes, BKV, IsConsistent])
|
||||
end || Test <- Tests,
|
||||
{BKV, IsConsistent} <- [{BKV1, false},
|
||||
{BKV2, true},
|
||||
{BKV3, false}]],
|
||||
pass.
|
||||
|
||||
|
||||
setup() ->
|
||||
ensemble_util:build_cluster(5, default_config(), 5).
|
||||
%% This record contains the default values for config settings if they were not set
|
||||
%% in the advanced.config file - because setting something to `undefined` is not the same
|
||||
%% as not setting it at all, we need to make sure to overwrite with defaults for each test,
|
||||
%% not just set things back to `undefined`. Also, makes the tests re-orderable as they always
|
||||
%% set everything they need, and don't depend on a previous test to make changes.
|
||||
-record(config, {
|
||||
vnode_overload_threshold = 10000,
|
||||
vnode_check_interval = 5000,
|
||||
vnode_check_request_interval = 2500,
|
||||
fsm_limit=undefined}).
|
||||
|
||||
default_config() ->
|
||||
default_config(#config{}).
|
||||
|
||||
default_config(#config{
|
||||
vnode_overload_threshold=VnodeOverloadThreshold,
|
||||
vnode_check_interval = VnodeCheckInterval,
|
||||
vnode_check_request_interval = VnodeCheckRequestInterval,
|
||||
fsm_limit = FsmLimit
|
||||
}) ->
|
||||
[{riak_core, [{ring_creation_size, 8},
|
||||
{default_bucket_props, [{n_val, 5}]},
|
||||
{vnode_management_timer, 1000},
|
||||
{enable_health_checks, false},
|
||||
{enable_consensus, true},
|
||||
{vnode_overload_threshold, undefined}]},
|
||||
{riak_kv, [{fsm_limit, undefined},
|
||||
{vnode_overload_threshold, VnodeOverloadThreshold},
|
||||
{vnode_check_interval, VnodeCheckInterval},
|
||||
{vnode_check_request_interval, VnodeCheckRequestInterval}]},
|
||||
{riak_kv, [{fsm_limit, FsmLimit},
|
||||
{storage_backend, riak_kv_eleveldb_backend},
|
||||
{anti_entropy_build_limit, {100, 1000}},
|
||||
{anti_entropy_concurrency, 100},
|
||||
@ -86,23 +75,60 @@ default_config() ->
|
||||
{anti_entropy_timeout, 5000}]},
|
||||
{riak_api, [{pb_backlog, 1024}]}].
|
||||
|
||||
test_no_overload_protection(_Nodes, _BKV, true) ->
|
||||
confirm() ->
|
||||
Nodes = setup(),
|
||||
|
||||
ok = create_bucket_type(Nodes, ?NORMAL_TYPE, [{n_val, 3}]),
|
||||
ok = create_bucket_type(Nodes, ?CONSISTENT_TYPE, [{consistent, true}, {n_val, 5}]),
|
||||
ok = create_bucket_type(Nodes, ?WRITE_ONCE_TYPE, [{write_once, true}, {n_val, 1}]),
|
||||
rt:wait_until(ring_manager_check_fun(hd(Nodes))),
|
||||
|
||||
|
||||
Node1 = hd(Nodes),
|
||||
write_once(Node1, ?NORMAL_BKV),
|
||||
write_once(Node1, ?CONSISTENT_BKV),
|
||||
write_once(Node1, ?WRITE_ONCE_BKV),
|
||||
|
||||
Tests = [test_no_overload_protection,
|
||||
test_vnode_protection,
|
||||
test_fsm_protection],
|
||||
|
||||
[begin
|
||||
lager:info("Starting Test ~p for ~p~n", [Test, BKV]),
|
||||
ok = erlang:apply(?MODULE, Test, [Nodes, BKV])
|
||||
end || Test <- Tests,
|
||||
BKV <- [?NORMAL_BKV,
|
||||
?CONSISTENT_BKV,
|
||||
?WRITE_ONCE_BKV]],
|
||||
%% Test cover queries doesn't depend on bucket/keyvalue, just run it once
|
||||
test_cover_queries_overload(Nodes),
|
||||
pass.
|
||||
|
||||
|
||||
setup() ->
|
||||
ensemble_util:build_cluster(5, default_config(), 5).
|
||||
|
||||
test_no_overload_protection(_Nodes, ?CONSISTENT_BKV) ->
|
||||
ok;
|
||||
test_no_overload_protection(Nodes, BKV, ConsistentType) ->
|
||||
test_no_overload_protection(Nodes, BKV) ->
|
||||
lager:info("Setting default configuration for no overload protestion test."),
|
||||
rt:pmap(fun(Node) ->
|
||||
rt:update_app_config(Node, default_config())
|
||||
end, Nodes),
|
||||
lager:info("Testing with no overload protection"),
|
||||
ProcFun = build_predicate_eq(test_no_overload_protection, ?NUM_REQUESTS,
|
||||
"ProcFun", "Procs"),
|
||||
QueueFun = build_predicate_gte(test_no_overload_protection, ?NUM_REQUESTS,
|
||||
"QueueFun", "Queue Size"),
|
||||
verify_test_results(run_test(Nodes, BKV), ConsistentType, ProcFun, QueueFun).
|
||||
verify_test_results(run_test(Nodes, BKV), BKV, ProcFun, QueueFun).
|
||||
|
||||
verify_test_results({_NumProcs, QueueLen}, true, _, QueueFun) ->
|
||||
verify_test_results({_NumProcs, QueueLen}, ?CONSISTENT_BKV, _ProcFun, QueueFun) ->
|
||||
?assert(QueueFun(QueueLen));
|
||||
verify_test_results({NumProcs, QueueLen}, false, ProcFun, QueueFun) ->
|
||||
verify_test_results({NumProcs, QueueLen}, _BKV, ProcFun, QueueFun) ->
|
||||
?assert(ProcFun(NumProcs)),
|
||||
?assert(QueueFun(QueueLen)).
|
||||
|
||||
test_vnode_protection(Nodes, BKV, ConsistentType) ->
|
||||
test_vnode_protection(Nodes, BKV) ->
|
||||
%% Setting check_interval to one ensures that process_info is called
|
||||
%% to check the queue length on each vnode send.
|
||||
%% This allows us to artificially raise vnode queue lengths with dummy
|
||||
@ -111,14 +137,13 @@ test_vnode_protection(Nodes, BKV, ConsistentType) ->
|
||||
lager:info("Testing with vnode queue protection enabled"),
|
||||
lager:info("Setting vnode overload threshold to ~b", [?THRESHOLD]),
|
||||
lager:info("Setting vnode check interval to 1"),
|
||||
Config = [{riak_core, [{vnode_overload_threshold, ?THRESHOLD},
|
||||
{vnode_check_interval, 1}]}],
|
||||
Config = default_config(#config{vnode_overload_threshold=?THRESHOLD, vnode_check_interval=1}),
|
||||
rt:pmap(fun(Node) ->
|
||||
rt:update_app_config(Node, Config)
|
||||
end, Nodes),
|
||||
ProcFun = build_predicate_lt(test_vnode_protection, (?NUM_REQUESTS+1), "ProcFun", "Procs"),
|
||||
QueueFun = build_predicate_lt(test_vnode_protection, (?NUM_REQUESTS), "QueueFun", "QueueSize"),
|
||||
verify_test_results(run_test(Nodes, BKV), ConsistentType, ProcFun, QueueFun),
|
||||
QueueFun = build_predicate_lte(test_vnode_protection, (?THRESHOLD+1), "QueueFun", "QueueSize"),
|
||||
verify_test_results(run_test(Nodes, BKV), BKV, ProcFun, QueueFun),
|
||||
|
||||
[Node1 | _] = Nodes,
|
||||
CheckInterval = ?THRESHOLD div 2,
|
||||
@ -132,36 +157,31 @@ test_vnode_protection(Nodes, BKV, ConsistentType) ->
|
||||
Pid = suspend_vnode_proxy(Victim),
|
||||
ProcFun2 = build_predicate_gte("test_vnode_protection after suspend",
|
||||
(?NUM_REQUESTS), "ProcFun", "Procs"),
|
||||
QueueFun2 = build_predicate_lt("test_vnode_protection after suspend",
|
||||
(?NUM_REQUESTS), "QueueFun", "QueueSize"),
|
||||
verify_test_results(run_test(Nodes, BKV), ConsistentType, ProcFun2, QueueFun2),
|
||||
QueueFun2 = build_predicate_lte("test_vnode_protection after suspend",
|
||||
(?THRESHOLD+1), "QueueFun", "QueueSize"),
|
||||
verify_test_results(run_test(Nodes, BKV), BKV, ProcFun2, QueueFun2),
|
||||
Pid ! resume,
|
||||
ok.
|
||||
|
||||
%% Don't check on fast path
|
||||
test_fsm_protection(_, {{<<"write_once_type">>, _}, _, _}, _) ->
|
||||
test_fsm_protection(_, ?WRITE_ONCE_BKV) ->
|
||||
ok;
|
||||
%% Or consistent path - doesn't use FSMs either
|
||||
test_fsm_protection(_, _, true) ->
|
||||
%% Or consistent gets, as they don't use the FSM either
|
||||
test_fsm_protection(_, ?CONSISTENT_BKV) ->
|
||||
ok;
|
||||
test_fsm_protection(Nodes, BKV, false) ->
|
||||
test_fsm_protection(Nodes, BKV) ->
|
||||
lager:info("Testing with coordinator protection enabled"),
|
||||
lager:info("Setting FSM limit to ~b", [?THRESHOLD]),
|
||||
Config = [{riak_kv, [
|
||||
{fsm_limit, ?THRESHOLD},
|
||||
{vnode_overload_threshold, undefined},
|
||||
{vnode_check_interval, undefined}]}],
|
||||
%% Set FSM limit and reset other changes from previous tests.
|
||||
Config = default_config(#config{fsm_limit=?THRESHOLD}),
|
||||
rt:pmap(fun(Node) ->
|
||||
rt:update_app_config(Node, Config)
|
||||
end, Nodes),
|
||||
Node1 = hd(Nodes),
|
||||
|
||||
%% TODO: Figure out why just using rt:wait_for_service completely breaks this test,
|
||||
%% but not waiting for riak_kv leaves us open to a race where the resource doesn't exist yet.
|
||||
%% Do the retry dance instead for now inside get_calculated_sj_limit.
|
||||
rt:wait_for_cluster_service(Nodes, riak_kv),
|
||||
rt:load_modules_on_nodes([?MODULE], Nodes),
|
||||
{ok, ExpectedFsms} = get_calculated_sj_limit(Node1, riak_kv_get_fsm_sj),
|
||||
{ok, ExpectedFsms} = get_calculated_sj_limit(Node1, riak_kv_get_fsm_sj, 1),
|
||||
|
||||
%% We expect exactly ExpectedFsms, but because of a race in SideJob we sometimes get 1 more
|
||||
%% Adding 2 (the highest observed rasce to date) to the lte predicate to handle the occasional case.
|
||||
@ -170,7 +190,7 @@ test_fsm_protection(Nodes, BKV, false) ->
|
||||
"ProcFun", "Procs"),
|
||||
QueueFun = build_predicate_lt(test_fsm_protection, (?NUM_REQUESTS),
|
||||
"QueueFun", "QueueSize"),
|
||||
verify_test_results(run_test(Nodes, BKV), false, ProcFun, QueueFun),
|
||||
verify_test_results(run_test(Nodes, BKV), BKV, ProcFun, QueueFun),
|
||||
|
||||
ok.
|
||||
|
||||
@ -191,16 +211,14 @@ get_calculated_sj_limit(Node, ResourceName, Retries) when Retries > 0 ->
|
||||
get_calculated_sj_limit(Node, ResourceName, Retries) when Retries == 0 ->
|
||||
{error, io_lib:format("Failed to retrieve sidejob limit from ~p for resource ~p. Giving up.", [Node, ResourceName])}.
|
||||
|
||||
test_cover_queries_overload(_Nodes, _, true) ->
|
||||
ok;
|
||||
test_cover_queries_overload(Nodes, _, false) ->
|
||||
test_cover_queries_overload(Nodes) ->
|
||||
lager:info("Testing cover queries with vnode queue protection enabled"),
|
||||
lager:info("Setting vnode overload threshold to ~b", [?THRESHOLD]),
|
||||
lager:info("Setting vnode check interval to 1"),
|
||||
|
||||
Config = [{riak_core, [{vnode_overload_threshold, ?THRESHOLD},
|
||||
{vnode_check_request_interval, 2},
|
||||
{vnode_check_interval, 1}]}],
|
||||
Config = default_config(#config{vnode_overload_threshold=?THRESHOLD,
|
||||
vnode_check_request_interval=2,
|
||||
vnode_check_interval=1}),
|
||||
rt:pmap(fun(Node) ->
|
||||
rt:update_app_config(Node, Config)
|
||||
end, Nodes),
|
||||
@ -363,17 +381,17 @@ pb_get_fun(Node, Bucket, Key, TestPid) ->
|
||||
PBC = rt:pbc(Node),
|
||||
Result = case catch riakc_pb_socket:get(PBC, Bucket, Key) of
|
||||
{error, <<"overload">>} ->
|
||||
lager:info("overload detected in pb_get, continuing..."),
|
||||
lager:debug("overload detected in pb_get, continuing..."),
|
||||
true;
|
||||
%% we expect timeouts in this test as we've shut down a vnode - return true in this case
|
||||
{error, timeout} ->
|
||||
lager:info("timeout detected in pb_get, continuing..."),
|
||||
lager:debug("timeout detected in pb_get, continuing..."),
|
||||
true;
|
||||
{error, <<"timeout">>} ->
|
||||
lager:info("timeout detected in pb_get, continuing..."),
|
||||
lager:debug("timeout detected in pb_get, continuing..."),
|
||||
true;
|
||||
{ok, Res} ->
|
||||
lager:info("riakc_pb_socket:get(~p, ~p, ~p) succeeded, Res:~p", [PBC, Bucket, Key, Res]),
|
||||
lager:debug("riakc_pb_socket:get(~p, ~p, ~p) succeeded, Res:~p", [PBC, Bucket, Key, Res]),
|
||||
true;
|
||||
{error, Type} ->
|
||||
lager:error("riakc_pb_socket threw error ~p reading {~p, ~p}, retrying...", [Type, Bucket, Key]),
|
||||
|
Loading…
Reference in New Issue
Block a user