Make tests pass regardless of run order by always writing default values for configuration settings, rather than trying to only change certain values.

- Always set values for keys that have no defaults in the schema, as setting a value to `undefined` does not return the "default" value when application:get_env/3 is called
- Provide an easier way to get the configuration by creating the config record, with appropriate defaults
- Code cleanup on BKV1/2/3 - make them easier to read/understand and for future work in refactoring the tests and removing some of the complexity.
- Now that test is running more consistently, tighten tests to actually test something in test_vnode_protection.
- Bump vnode protection to ?THRESHOLD+1 as 2.0 seems to have one extra message.
- Convert lager:info to lager:debug in successful get cases to reduce noise.
- Remove use of ConsistentType (use macros & pass BKV down to validate).
This commit is contained in:
Doug Rohrer 2015-10-22 11:34:38 -04:00
parent b863efcd76
commit 818645ec83

View File

@ -31,53 +31,42 @@
-define(GET_RETRIES, 1000).
-define(BUCKET, <<"test">>).
-define(KEY, <<"hotkey">>).
-define(NORMAL_TYPE, <<"normal_type">>).
-define(CONSISTENT_TYPE, <<"consistent_type">>).
-define(WRITE_ONCE_TYPE, <<"write_once_type">>).
-define(NORMAL_BKV, {{?NORMAL_TYPE, ?BUCKET}, ?KEY, <<"test">>}).
-define(CONSISTENT_BKV, {{?CONSISTENT_TYPE, ?BUCKET}, ?KEY, <<"test">>}).
-define(WRITE_ONCE_BKV, {{?WRITE_ONCE_TYPE, ?BUCKET}, ?KEY, <<"test">>}).
confirm() ->
Nodes = setup(),
NormalType = <<"normal_type">>,
ConsistentType = <<"consistent_type">>,
WriteOnceType = <<"write_once_type">>,
ok = create_bucket_type(Nodes, NormalType, [{n_val, 3}]),
ok = create_bucket_type(Nodes, ConsistentType, [{consistent, true}, {n_val, 5}]),
ok = create_bucket_type(Nodes, WriteOnceType, [{write_once, true}, {n_val, 1}]),
rt:wait_until(ring_manager_check_fun(hd(Nodes))),
BKV1 = {{NormalType, ?BUCKET}, ?KEY, <<"test">>},
BKV2 = {{ConsistentType, ?BUCKET}, ?KEY, <<"test">>},
BKV3 = {{WriteOnceType, ?BUCKET}, ?KEY, <<"test">>},
Node1 = hd(Nodes),
write_once(Node1, BKV1),
write_once(Node1, BKV2),
write_once(Node1, BKV3),
Tests = [test_no_overload_protection,
test_vnode_protection,
test_fsm_protection,
test_cover_queries_overload],
[begin
lager:info("Starting Test ~p for ~p~n", [Test, BKV]),
ok = erlang:apply(?MODULE, Test, [Nodes, BKV, IsConsistent])
end || Test <- Tests,
{BKV, IsConsistent} <- [{BKV1, false},
{BKV2, true},
{BKV3, false}]],
pass.
setup() ->
ensemble_util:build_cluster(5, default_config(), 5).
%% This record contains the default values for config settings if they were not set
%% in the advanced.config file - because setting something to `undefined` is not the same
%% as not setting it at all, we need to make sure to overwrite with defaults for each test,
%% not just set things back to `undefined`. Also, makes the tests re-orderable as they always
%% set everything they need, and don't depend on a previous test to make changes.
-record(config, {
vnode_overload_threshold = 10000,
vnode_check_interval = 5000,
vnode_check_request_interval = 2500,
fsm_limit=undefined}).
default_config() ->
default_config(#config{}).
default_config(#config{
vnode_overload_threshold=VnodeOverloadThreshold,
vnode_check_interval = VnodeCheckInterval,
vnode_check_request_interval = VnodeCheckRequestInterval,
fsm_limit = FsmLimit
}) ->
[{riak_core, [{ring_creation_size, 8},
{default_bucket_props, [{n_val, 5}]},
{vnode_management_timer, 1000},
{enable_health_checks, false},
{enable_consensus, true},
{vnode_overload_threshold, undefined}]},
{riak_kv, [{fsm_limit, undefined},
{vnode_overload_threshold, VnodeOverloadThreshold},
{vnode_check_interval, VnodeCheckInterval},
{vnode_check_request_interval, VnodeCheckRequestInterval}]},
{riak_kv, [{fsm_limit, FsmLimit},
{storage_backend, riak_kv_eleveldb_backend},
{anti_entropy_build_limit, {100, 1000}},
{anti_entropy_concurrency, 100},
@ -86,23 +75,60 @@ default_config() ->
{anti_entropy_timeout, 5000}]},
{riak_api, [{pb_backlog, 1024}]}].
test_no_overload_protection(_Nodes, _BKV, true) ->
confirm() ->
Nodes = setup(),
ok = create_bucket_type(Nodes, ?NORMAL_TYPE, [{n_val, 3}]),
ok = create_bucket_type(Nodes, ?CONSISTENT_TYPE, [{consistent, true}, {n_val, 5}]),
ok = create_bucket_type(Nodes, ?WRITE_ONCE_TYPE, [{write_once, true}, {n_val, 1}]),
rt:wait_until(ring_manager_check_fun(hd(Nodes))),
Node1 = hd(Nodes),
write_once(Node1, ?NORMAL_BKV),
write_once(Node1, ?CONSISTENT_BKV),
write_once(Node1, ?WRITE_ONCE_BKV),
Tests = [test_no_overload_protection,
test_vnode_protection,
test_fsm_protection],
[begin
lager:info("Starting Test ~p for ~p~n", [Test, BKV]),
ok = erlang:apply(?MODULE, Test, [Nodes, BKV])
end || Test <- Tests,
BKV <- [?NORMAL_BKV,
?CONSISTENT_BKV,
?WRITE_ONCE_BKV]],
%% Test cover queries doesn't depend on bucket/key/value, just run it once
test_cover_queries_overload(Nodes),
pass.
setup() ->
ensemble_util:build_cluster(5, default_config(), 5).
test_no_overload_protection(_Nodes, ?CONSISTENT_BKV) ->
ok;
test_no_overload_protection(Nodes, BKV, ConsistentType) ->
test_no_overload_protection(Nodes, BKV) ->
lager:info("Setting default configuration for no overload protection test."),
rt:pmap(fun(Node) ->
rt:update_app_config(Node, default_config())
end, Nodes),
lager:info("Testing with no overload protection"),
ProcFun = build_predicate_eq(test_no_overload_protection, ?NUM_REQUESTS,
"ProcFun", "Procs"),
QueueFun = build_predicate_gte(test_no_overload_protection, ?NUM_REQUESTS,
"QueueFun", "Queue Size"),
verify_test_results(run_test(Nodes, BKV), ConsistentType, ProcFun, QueueFun).
verify_test_results(run_test(Nodes, BKV), BKV, ProcFun, QueueFun).
verify_test_results({_NumProcs, QueueLen}, true, _, QueueFun) ->
verify_test_results({_NumProcs, QueueLen}, ?CONSISTENT_BKV, _ProcFun, QueueFun) ->
?assert(QueueFun(QueueLen));
verify_test_results({NumProcs, QueueLen}, false, ProcFun, QueueFun) ->
verify_test_results({NumProcs, QueueLen}, _BKV, ProcFun, QueueFun) ->
?assert(ProcFun(NumProcs)),
?assert(QueueFun(QueueLen)).
test_vnode_protection(Nodes, BKV, ConsistentType) ->
test_vnode_protection(Nodes, BKV) ->
%% Setting check_interval to one ensures that process_info is called
%% to check the queue length on each vnode send.
%% This allows us to artificially raise vnode queue lengths with dummy
@ -111,14 +137,13 @@ test_vnode_protection(Nodes, BKV, ConsistentType) ->
lager:info("Testing with vnode queue protection enabled"),
lager:info("Setting vnode overload threshold to ~b", [?THRESHOLD]),
lager:info("Setting vnode check interval to 1"),
Config = [{riak_core, [{vnode_overload_threshold, ?THRESHOLD},
{vnode_check_interval, 1}]}],
Config = default_config(#config{vnode_overload_threshold=?THRESHOLD, vnode_check_interval=1}),
rt:pmap(fun(Node) ->
rt:update_app_config(Node, Config)
end, Nodes),
ProcFun = build_predicate_lt(test_vnode_protection, (?NUM_REQUESTS+1), "ProcFun", "Procs"),
QueueFun = build_predicate_lt(test_vnode_protection, (?NUM_REQUESTS), "QueueFun", "QueueSize"),
verify_test_results(run_test(Nodes, BKV), ConsistentType, ProcFun, QueueFun),
QueueFun = build_predicate_lte(test_vnode_protection, (?THRESHOLD+1), "QueueFun", "QueueSize"),
verify_test_results(run_test(Nodes, BKV), BKV, ProcFun, QueueFun),
[Node1 | _] = Nodes,
CheckInterval = ?THRESHOLD div 2,
@ -132,36 +157,31 @@ test_vnode_protection(Nodes, BKV, ConsistentType) ->
Pid = suspend_vnode_proxy(Victim),
ProcFun2 = build_predicate_gte("test_vnode_protection after suspend",
(?NUM_REQUESTS), "ProcFun", "Procs"),
QueueFun2 = build_predicate_lt("test_vnode_protection after suspend",
(?NUM_REQUESTS), "QueueFun", "QueueSize"),
verify_test_results(run_test(Nodes, BKV), ConsistentType, ProcFun2, QueueFun2),
QueueFun2 = build_predicate_lte("test_vnode_protection after suspend",
(?THRESHOLD+1), "QueueFun", "QueueSize"),
verify_test_results(run_test(Nodes, BKV), BKV, ProcFun2, QueueFun2),
Pid ! resume,
ok.
%% Don't check on fast path
test_fsm_protection(_, {{<<"write_once_type">>, _}, _, _}, _) ->
test_fsm_protection(_, ?WRITE_ONCE_BKV) ->
ok;
%% Or consistent path - doesn't use FSMs either
test_fsm_protection(_, _, true) ->
%% Or consistent gets, as they don't use the FSM either
test_fsm_protection(_, ?CONSISTENT_BKV) ->
ok;
test_fsm_protection(Nodes, BKV, false) ->
test_fsm_protection(Nodes, BKV) ->
lager:info("Testing with coordinator protection enabled"),
lager:info("Setting FSM limit to ~b", [?THRESHOLD]),
Config = [{riak_kv, [
{fsm_limit, ?THRESHOLD},
{vnode_overload_threshold, undefined},
{vnode_check_interval, undefined}]}],
%% Set FSM limit and reset other changes from previous tests.
Config = default_config(#config{fsm_limit=?THRESHOLD}),
rt:pmap(fun(Node) ->
rt:update_app_config(Node, Config)
end, Nodes),
Node1 = hd(Nodes),
%% TODO: Figure out why just using rt:wait_for_service completely breaks this test,
%% but not waiting for riak_kv leaves us open to a race where the resource doesn't exist yet.
%% Do the retry dance instead for now inside get_calculated_sj_limit.
rt:wait_for_cluster_service(Nodes, riak_kv),
rt:load_modules_on_nodes([?MODULE], Nodes),
{ok, ExpectedFsms} = get_calculated_sj_limit(Node1, riak_kv_get_fsm_sj),
{ok, ExpectedFsms} = get_calculated_sj_limit(Node1, riak_kv_get_fsm_sj, 1),
%% We expect exactly ExpectedFsms, but because of a race in SideJob we sometimes get 1 more
%% Adding 2 (the highest observed race to date) to the lte predicate to handle the occasional case.
@ -170,7 +190,7 @@ test_fsm_protection(Nodes, BKV, false) ->
"ProcFun", "Procs"),
QueueFun = build_predicate_lt(test_fsm_protection, (?NUM_REQUESTS),
"QueueFun", "QueueSize"),
verify_test_results(run_test(Nodes, BKV), false, ProcFun, QueueFun),
verify_test_results(run_test(Nodes, BKV), BKV, ProcFun, QueueFun),
ok.
@ -191,16 +211,14 @@ get_calculated_sj_limit(Node, ResourceName, Retries) when Retries > 0 ->
get_calculated_sj_limit(Node, ResourceName, Retries) when Retries == 0 ->
{error, io_lib:format("Failed to retrieve sidejob limit from ~p for resource ~p. Giving up.", [Node, ResourceName])}.
test_cover_queries_overload(_Nodes, _, true) ->
ok;
test_cover_queries_overload(Nodes, _, false) ->
test_cover_queries_overload(Nodes) ->
lager:info("Testing cover queries with vnode queue protection enabled"),
lager:info("Setting vnode overload threshold to ~b", [?THRESHOLD]),
lager:info("Setting vnode check interval to 1"),
Config = [{riak_core, [{vnode_overload_threshold, ?THRESHOLD},
{vnode_check_request_interval, 2},
{vnode_check_interval, 1}]}],
Config = default_config(#config{vnode_overload_threshold=?THRESHOLD,
vnode_check_request_interval=2,
vnode_check_interval=1}),
rt:pmap(fun(Node) ->
rt:update_app_config(Node, Config)
end, Nodes),
@ -363,17 +381,17 @@ pb_get_fun(Node, Bucket, Key, TestPid) ->
PBC = rt:pbc(Node),
Result = case catch riakc_pb_socket:get(PBC, Bucket, Key) of
{error, <<"overload">>} ->
lager:info("overload detected in pb_get, continuing..."),
lager:debug("overload detected in pb_get, continuing..."),
true;
%% we expect timeouts in this test as we've shut down a vnode - return true in this case
{error, timeout} ->
lager:info("timeout detected in pb_get, continuing..."),
lager:debug("timeout detected in pb_get, continuing..."),
true;
{error, <<"timeout">>} ->
lager:info("timeout detected in pb_get, continuing..."),
lager:debug("timeout detected in pb_get, continuing..."),
true;
{ok, Res} ->
lager:info("riakc_pb_socket:get(~p, ~p, ~p) succeeded, Res:~p", [PBC, Bucket, Key, Res]),
lager:debug("riakc_pb_socket:get(~p, ~p, ~p) succeeded, Res:~p", [PBC, Bucket, Key, Res]),
true;
{error, Type} ->
lager:error("riakc_pb_socket threw error ~p reading {~p, ~p}, retrying...", [Type, Bucket, Key]),