riak_test/tests/partition_repair.erl
Commit bff9ddc872 by Nick Marino: Fix race condition in partition_repair
If we just wait for the old vnode to die, we are not guaranteed that the
new one will have yet been started and registered with the vnode
manager, so it's possible we will end up trying to do a call into the
old dead vnode in the subsequent test code. We saw a couple of test
failures in giddyup recently which I believe were caused by this race
condition.

To fix, we can wait for the vnode manager to return a new pid instead of
just waiting for the old pid to die.
2016-09-22 13:02:04 -04:00
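
Roughly, the change swaps a wait on the old pid for a wait on the vnode manager. The sketch below is illustrative only: the actual check lives in kill_repair_verify/3 in the module below, and the "before" line is a plausible reconstruction of the old wait, not the exact original code.

    %% Before (racy, illustrative): only wait for the killed vnode pid to go away.
    %% rt:wait_until(fun() -> not rpc:call(Node, erlang, is_process_alive, [Pid]) end),

    %% After: wait until the vnode manager hands out a pid different from the one we killed.
    rt:wait_until(fun() ->
                      {ok, Pid} =/= rpc:call(Node, riak_core_vnode_manager,
                                             get_vnode_pid, [Partition, VNodeName])
                  end),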

%% -------------------------------------------------------------------
%%
%% Copyright (c) 2012 Basho Technologies, Inc.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(partition_repair).
-compile(export_all).
-include_lib("eunit/include/eunit.hrl").
-behavior(riak_test).
-export([confirm/0]).
-define(FMT(S, L), lists:flatten(io_lib:format(S, L))).

%% @doc This test verifies that partition repair successfully repairs
%% all data after it has been wiped out by a simulated disk crash.
confirm() ->
    SpamDir = rt_config:config_or_os_env(spam_dir),
    RingSize = list_to_integer(rt_config:config_or_os_env(ring_size, "16")),
    NVal = rt_config:config_or_os_env(n_val, undefined),
    TestMetaData = riak_test_runner:metadata(),
    KVBackend = proplists:get_value(backend, TestMetaData),
    NumNodes = rt_config:config_or_os_env(num_nodes, 4),
    HOConcurrency = rt_config:config_or_os_env(ho_concurrency, 2),
    {_KVBackendMod, KVDataDir} = backend_mod_dir(KVBackend),
    Bucket = <<"scotts_spam">>,

    lager:info("Build a cluster"),
    lager:info("riak_search enabled: true"),
    lager:info("ring_creation_size: ~p", [RingSize]),
    lager:info("n_val: ~p", [NVal]),
    lager:info("num nodes: ~p", [NumNodes]),
    lager:info("riak_core handoff_concurrency: ~p", [HOConcurrency]),
    lager:info("riak_core vnode_management_timer 1000"),
    Conf = [
            {riak_core,
             [
              {ring_creation_size, RingSize},
              {handoff_manager_timeout, 1000},
              {vnode_management_timer, 1000},
              {handoff_concurrency, HOConcurrency}
             ]},
            {riak_search,
             [
              {enabled, true}
             ]},
            %% @TODO This is only to test whether the test failure happens
            %% without AAE. The AAE errors found in the logs could be unrelated
            {riak_kv,
             [
              {anti_entropy, {off, []}}
             ]
            }
            %% {lager,
            %%  [{handlers,
            %%    [{lager_file_backend,
            %%      [{"./log/console.log",debug,10485760,"$D0",5}]}]}]}
           ],

    Nodes = rt:build_cluster(NumNodes, Conf),

    case NVal of
        undefined ->
            ok;
        _ ->
            lager:info("Set n_val to ~p", [NVal]),
            set_search_schema_nval(Bucket, NVal)
    end,

    lager:info("Enable search hook"),
    rt:enable_search_hook(hd(Nodes), Bucket),

    lager:info("Insert Scott's spam emails"),
    Pbc = rt:pbc(hd(Nodes)),
    rt:pbc_put_dir(Pbc, Bucket, SpamDir),

    lager:info("Stash ITFs for each partition"),
    %% @todo Should riak_test guarantee that the scratch pad is clean instead?
    ?assertCmd("rm -rf " ++ base_stash_path()),
    %% need to load the module so riak can see the fold fun
    rt:load_modules_on_nodes([?MODULE], Nodes),
    Ring = rt:get_ring(hd(Nodes)),
    Owners = riak_core_ring:all_owners(Ring),
    [stash_data(riak_search, Owner) || Owner <- Owners],

    lager:info("Stash KV data for each partition"),
    [stash_data(riak_kv, Owner) || Owner <- Owners],

    lager:info("Emulate data loss for riak_search, repair, verify correct data"),
    [kill_repair_verify(Owner, "merge_index", riak_search) || Owner <- Owners],

    %% TODO: parameterize backend
    lager:info("Emulate data loss for riak_kv, repair, verify correct data"),
    [kill_repair_verify(Owner, KVDataDir, riak_kv) || Owner <- Owners],

    lager:info("TEST PASSED"),
    pass.
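
%% @doc Simulate a disk crash for Partition on Node by deleting its data
%% under DataSuffix, restart the vnode, run a repair, and then verify that
%% everything in the stash file for that partition is back. The two source
%% partitions that feed the repair are also checked to make sure they still
%% hold their own stashed data.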
kill_repair_verify({Partition, Node}, DataSuffix, Service) ->
    StashPath = stash_path(Service, Partition),
    {ok, [Stash]} = file:consult(StashPath),
    ExpectToVerify = dict:size(Stash),
    VNodeName = list_to_atom(atom_to_list(Service) ++ "_vnode"),

    %% kill the partition data
    Path = DataSuffix ++ "/" ++ integer_to_list(Partition),
    lager:info("Killing data for ~p on ~p at ~s", [Partition, Node, Path]),
    rt:clean_data_dir([Node], Path),

    %% force restart of vnode since some data is kept in memory
    lager:info("Restarting ~p vnode for ~p on ~p", [Service, Partition, Node]),
    {ok, Pid} = rpc:call(Node, riak_core_vnode_manager, get_vnode_pid,
                         [Partition, VNodeName]),
    ?assert(rpc:call(Node, erlang, exit, [Pid, kill_for_test])),

    %% We used to wait for the old pid to die here, but there is a delay between
    %% the vnode process dying and a new one being registered with the vnode
    %% manager. If we don't wait for the manager to return a new vnode pid, it's
    %% possible for the test to fail with a gen_server:call timeout.
    rt:wait_until(fun() ->
                      {ok, Pid} =/=
                          rpc:call(Node, riak_core_vnode_manager, get_vnode_pid,
                                   [Partition, VNodeName])
                  end),

    lager:info("Verify data is missing"),
    ?assertEqual(0, count_data(Service, {Partition, Node})),

    %% repair the partition, ignore return for now
    lager:info("Invoking repair for ~p on ~p", [Partition, Node]),
    %% TODO: Don't ignore return, check version of Riak and if greater
    %% or equal to 1.x then expect OK.
    Return =
        case Service of
            riak_kv ->
                rpc:call(Node, riak_kv_vnode, repair, [Partition]);
            riak_search ->
                rpc:call(Node, riak_search_vnode, repair, [Partition])
        end,

    %% Kill sending vnode to verify HO sender is killed
    %% {ok, [{KPart, KNode}|_]} = Return,
    %% {ok, NewPid} = rpc:call(KNode, riak_core_vnode_manager, get_vnode_pid,
    %%                         [KPart, VNodeName]),
    %% lager:info("killing src pid: ~p/~p ~p", [KNode, KPart, NewPid]),
    %% KR = rpc:call(KNode, erlang, exit, [NewPid, kill]),
    %% lager:info("result of kill: ~p", [KR]),
    %% timer:sleep(1000),
    %% ?assertNot(rpc:call(KNode, erlang, is_process_alive, [NewPid])),

    lager:info("return value of repair_index ~p", [Return]),

    lager:info("Wait for repair to finish"),
    wait_for_repair(Service, {Partition, Node}, 30),

    lager:info("Verify ~p on ~p is fully repaired", [Partition, Node]),
    Data2 = get_data(Service, {Partition, Node}),
    {Verified, NotFound} = dict:fold(verify(Service, Data2), {0, []}, Stash),
    case NotFound of
        [] -> ok;
        _ ->
            NF = StashPath ++ ".notfound",
            lager:info("Some data not found, writing that to ~s", [NF]),
            ?assertEqual(ok, file:write_file(NF, io_lib:format("~p.", [NotFound])))
    end,
    %% NOTE: If the following assert fails then check the .notfound
    %% file written above...it contains all postings that were in the
    %% stash that weren't found after the repair.
    ?assertEqual({Service, ExpectToVerify}, {Service, Verified}),

    {ok, [{BeforeP, _BeforeOwner}=B, _, {AfterP, _AfterOwner}=A]} = Return,

    lager:info("Verify before src partition ~p still has data", [B]),
    StashPathB = stash_path(Service, BeforeP),
    {ok, [StashB]} = file:consult(StashPathB),
    ExpectToVerifyB = dict:size(StashB),
    BeforeData = get_data(Service, B),
    {VerifiedB, NotFoundB} = dict:fold(verify(Service, BeforeData), {0, []}, StashB),
    case NotFoundB of
        [] -> ok;
        _ ->
            NFB = StashPathB ++ ".notfound",
            ?assertEqual(ok, file:write_file(NFB, io_lib:format("~p.", [NotFoundB]))),
            throw({src_partition_missing_data, NFB})
    end,
    ?assertEqual(ExpectToVerifyB, VerifiedB),

    lager:info("Verify after src partition ~p still has data", [A]),
    StashPathA = stash_path(Service, AfterP),
    {ok, [StashA]} = file:consult(StashPathA),
    ExpectToVerifyA = dict:size(StashA),
    AfterData = get_data(Service, A),
    {VerifiedA, NotFoundA} = dict:fold(verify(Service, AfterData), {0, []}, StashA),
    case NotFoundA of
        [] -> ok;
        _ ->
            NFA = StashPathA ++ ".notfound",
            ?assertEqual(ok, file:write_file(NFA, io_lib:format("~p.", [NotFoundA]))),
            throw({src_partition_missing_data, NFA})
    end,
    ?assertEqual(ExpectToVerifyA, VerifiedA).
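
%% Return a fold fun for dict:fold/3 that checks each stashed entry against
%% the data read back after repair, counting entries that match and
%% accumulating the ones that are missing or differ.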
verify(riak_kv, DataAfterRepair) ->
    fun(BKey, StashedValue, {Verified, NotFound}) ->
            StashedData = {BKey, StashedValue},
            case dict:find(BKey, DataAfterRepair) of
                error -> {Verified, [StashedData|NotFound]};
                {ok, Value} ->
                    if Value == StashedValue -> {Verified+1, NotFound};
                       true -> {Verified, [StashedData|NotFound]}
                    end
            end
    end;
verify(riak_search, PostingsAfterRepair) ->
    fun(IFT, StashedPostings, {Verified, NotFound}) ->
            StashedPosting = {IFT, StashedPostings},
            case dict:find(IFT, PostingsAfterRepair) of
                error -> {Verified, [StashedPosting|NotFound]};
                {ok, RepairedPostings} ->
                    case lists:all(fun is_true/1,
                                   [lists:member(P, RepairedPostings)
                                    || P <- StashedPostings]) of
                        true -> {Verified+1, NotFound};
                        false -> {Verified, [StashedPosting|NotFound]}
                    end
            end
    end.

is_true(X) ->
    X == true.
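
%% get_data/2 runs a synchronous fold over the vnode (via its vnode master)
%% and collects the partition's contents into a dict; count_data/2 simply
%% returns the number of entries in that dict.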
count_data(Service, {Partition, Node}) ->
    dict:size(get_data(Service, {Partition, Node})).

get_data(Service, {Partition, Node}) ->
    VMaster =
        case Service of
            riak_kv -> riak_kv_vnode_master;
            riak_search -> riak_search_vnode_master
        end,
    %% TODO: add compile time support for riak_test
    Req =
        case Service of
            riak_kv ->
                {riak_core_fold_req_v1, fun stash_kv/3, dict:new()};
            riak_search ->
                {riak_core_fold_req_v1, fun stash_search/3, dict:new()}
        end,
    Data = riak_core_vnode_master:sync_command({Partition, Node},
                                               Req,
                                               VMaster,
                                               rt_config:get(rt_max_wait_time)),
    Data.
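
%% Write the partition's current contents to a per-partition stash file as a
%% single Erlang term, so it can be read back later with file:consult/1.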
stash_data(Service, {Partition, Node}) ->
    File = stash_path(Service, Partition),
    ?assertEqual(ok, filelib:ensure_dir(File)),
    lager:info("Stashing ~p/~p at ~p to ~p", [Service, Partition, Node, File]),
    Postings = get_data(Service, {Partition, Node}),
    ?assertEqual(ok, file:write_file(File, io_lib:format("~p.", [Postings]))).

stash_kv(Key, Value, Stash) ->
    dict:store(Key, Value, Stash).

stash_search({_I,{_F,_T}}=K, _Postings=V, Stash) ->
    dict:append_list(K, V, Stash).

base_stash_path() ->
    rt_config:get(rt_scratch_dir) ++ "/dev/data_stash/".

stash_path(Service, Partition) ->
    base_stash_path() ++ atom_to_list(Service) ++ "/"
        ++ integer_to_list(Partition) ++ ".stash".

file_list(Dir) ->
    filelib:wildcard(Dir ++ "/*").
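
%% Poll repair_status for the partition once per second until it reports
%% not_found (repair finished); give up after Tries attempts.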
wait_for_repair(_, _, 0) ->
    throw(wait_for_repair_max_tries);
wait_for_repair(Service, {Partition, Node}, Tries) ->
    Reply =
        case Service of
            riak_kv ->
                rpc:call(Node, riak_kv_vnode, repair_status, [Partition]);
            riak_search ->
                rpc:call(Node, riak_search_vnode, repair_status, [Partition])
        end,
    case Reply of
        not_found -> ok;
        in_progress ->
            timer:sleep(timer:seconds(1)),
            wait_for_repair(Service, {Partition, Node}, Tries - 1)
    end.

data_path(Node, Suffix, Partition) ->
    [Name, _] = string:tokens(atom_to_list(Node), "@"),
    Base = rt_config:get('rtdev_path.current') ++ "/dev/" ++ Name ++ "/data",
    Base ++ "/" ++ Suffix ++ "/" ++ integer_to_list(Partition).

backend_mod_dir(undefined) ->
    %% riak_test defaults to bitcask when undefined
    backend_mod_dir(bitcask);
backend_mod_dir(bitcask) ->
    {riak_kv_bitcask_backend, "bitcask"};
backend_mod_dir(eleveldb) ->
    {riak_kv_eleveldb_backend, "leveldb"}.

-spec set_search_schema_nval(binary(), pos_integer()) -> ok.
set_search_schema_nval(Bucket, NVal) ->
    %% TODO: Search currently offers no easy way to programmatically
    %% change a schema and save it. This is because the external and
    %% internal formats of the schema are different. The parser reads
    %% the external format and an internal representation is created
    %% which is then stored/accessed via `riak_search_config'. Rather
    %% than allowing the internal format to be modified and set, you
    %% must send the update in the external format.
    BucketStr = binary_to_list(Bucket),
    SearchCmd = ?FMT("~s/dev/dev1/bin/search-cmd", [rt_config:get('rtdev_path.current')]),
    GetSchema = ?FMT("~s show-schema ~s > current-schema",
                     [SearchCmd, BucketStr]),
    ModifyNVal = ?FMT("sed -E 's/n_val, [0-9]+/n_val, ~s/' "
                      "current-schema > new-schema",
                      [NVal]),
    SetSchema = ?FMT("~s set-schema ~s new-schema", [SearchCmd, BucketStr]),
    ClearCache = ?FMT("~s clear-schema-cache", [SearchCmd]),
    ?assertCmd(GetSchema),
    ?assertCmd(ModifyNVal),
    ?assertCmd(SetSchema),
    ?assertCmd(ClearCache).