riak_test/tests/partition_repair.erl
Commit bff9ddc872 by Nick Marino: Fix race condition in partition_repair
If we just wait for the old vnode to die, we are not guaranteed that the
new one will have yet been started and registered with the vnode
manager, so it's possible we will end up trying to do a call into the
old dead vnode in the subsequent test code. We saw a couple of test
failures in giddyup recently which I believe were caused by this race
condition.

To fix, we can wait for the vnode manager to return a new pid instead of
just waiting for the old pid to die.
2016-09-22 13:02:04 -04:00
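
Roughly, the change swaps a wait on the old pid for a wait on the vnode manager. The sketch below is illustrative only: the actual check lives in kill_repair_verify/3 in the module below, and the "before" line is a plausible reconstruction of the old wait, not the exact original code.

    %% Before (racy, illustrative): only wait for the killed vnode pid to go away.
    %% rt:wait_until(fun() -> not rpc:call(Node, erlang, is_process_alive, [Pid]) end),

    %% After: wait until the vnode manager hands out a pid different from the one we killed.
    rt:wait_until(fun() ->
                      {ok, Pid} =/= rpc:call(Node, riak_core_vnode_manager,
                                             get_vnode_pid, [Partition, VNodeName])
                  end),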

%% -------------------------------------------------------------------
%%
%% Copyright (c) 2012 Basho Technologies, Inc.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(partition_repair).
-compile(export_all).
-include_lib("eunit/include/eunit.hrl").
-behavior(riak_test).
-export([confirm/0]).
-define(FMT(S, L), lists:flatten(io_lib:format(S, L))).

%% @doc This test verifies that partition repair successfully repairs
%% all data after it has been wiped out by a simulated disk crash.
confirm() ->
    SpamDir = rt_config:config_or_os_env(spam_dir),
    RingSize = list_to_integer(rt_config:config_or_os_env(ring_size, "16")),
    NVal = rt_config:config_or_os_env(n_val, undefined),
    TestMetaData = riak_test_runner:metadata(),
    KVBackend = proplists:get_value(backend, TestMetaData),
    NumNodes = rt_config:config_or_os_env(num_nodes, 4),
    HOConcurrency = rt_config:config_or_os_env(ho_concurrency, 2),
    {_KVBackendMod, KVDataDir} = backend_mod_dir(KVBackend),
    Bucket = <<"scotts_spam">>,

    lager:info("Build a cluster"),
    lager:info("riak_search enabled: true"),
    lager:info("ring_creation_size: ~p", [RingSize]),
    lager:info("n_val: ~p", [NVal]),
    lager:info("num nodes: ~p", [NumNodes]),
    lager:info("riak_core handoff_concurrency: ~p", [HOConcurrency]),
    lager:info("riak_core vnode_management_timer 1000"),
    Conf = [
            {riak_core,
             [
              {ring_creation_size, RingSize},
              {handoff_manager_timeout, 1000},
              {vnode_management_timer, 1000},
              {handoff_concurrency, HOConcurrency}
             ]},
            {riak_search,
             [
              {enabled, true}
             ]},
            %% @TODO This is only to test whether the test failure happens
            %% without AAE. The AAE errors found in the logs could be unrelated
            {riak_kv,
             [
              {anti_entropy, {off, []}}
             ]
            }
            %% {lager,
            %%  [{handlers,
            %%    [{lager_file_backend,
            %%      [{"./log/console.log",debug,10485760,"$D0",5}]}]}]}
           ],

    Nodes = rt:build_cluster(NumNodes, Conf),

    case NVal of
        undefined ->
            ok;
        _ ->
            lager:info("Set n_val to ~p", [NVal]),
            set_search_schema_nval(Bucket, NVal)
    end,

    lager:info("Enable search hook"),
    rt:enable_search_hook(hd(Nodes), Bucket),

    lager:info("Insert Scott's spam emails"),
    Pbc = rt:pbc(hd(Nodes)),
    rt:pbc_put_dir(Pbc, Bucket, SpamDir),

    lager:info("Stash ITFs for each partition"),
    %% @todo Should riak_test guarantee that the scratch pad is clean instead?
    ?assertCmd("rm -rf " ++ base_stash_path()),
    %% need to load the module so riak can see the fold fun
    rt:load_modules_on_nodes([?MODULE], Nodes),
    Ring = rt:get_ring(hd(Nodes)),
    Owners = riak_core_ring:all_owners(Ring),
    [stash_data(riak_search, Owner) || Owner <- Owners],

    lager:info("Stash KV data for each partition"),
    [stash_data(riak_kv, Owner) || Owner <- Owners],

    lager:info("Emulate data loss for riak_search, repair, verify correct data"),
    [kill_repair_verify(Owner, "merge_index", riak_search) || Owner <- Owners],

    %% TODO: parameterize backend
    lager:info("Emulate data loss for riak_kv, repair, verify correct data"),
    [kill_repair_verify(Owner, KVDataDir, riak_kv) || Owner <- Owners],

    lager:info("TEST PASSED"),
    pass.
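
%% @doc Simulate a disk crash for Partition on Node by deleting its data
%% under DataSuffix, restart the vnode, run a repair, and then verify that
%% everything in the stash file for that partition is back. The two source
%% partitions that feed the repair are also checked to make sure they still
%% hold their own stashed data.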
kill_repair_verify({Partition, Node}, DataSuffix, Service) ->
    StashPath = stash_path(Service, Partition),
    {ok, [Stash]} = file:consult(StashPath),
    ExpectToVerify = dict:size(Stash),
    VNodeName = list_to_atom(atom_to_list(Service) ++ "_vnode"),

    %% kill the partition data
    Path = DataSuffix ++ "/" ++ integer_to_list(Partition),
    lager:info("Killing data for ~p on ~p at ~s", [Partition, Node, Path]),
    rt:clean_data_dir([Node], Path),

    %% force restart of vnode since some data is kept in memory
    lager:info("Restarting ~p vnode for ~p on ~p", [Service, Partition, Node]),
    {ok, Pid} = rpc:call(Node, riak_core_vnode_manager, get_vnode_pid,
                         [Partition, VNodeName]),
    ?assert(rpc:call(Node, erlang, exit, [Pid, kill_for_test])),

    %% We used to wait for the old pid to die here, but there is a delay between
    %% the vnode process dying and a new one being registered with the vnode
    %% manager. If we don't wait for the manager to return a new vnode pid, it's
    %% possible for the test to fail with a gen_server:call timeout.
    rt:wait_until(fun() ->
                      {ok, Pid} =/=
                          rpc:call(Node, riak_core_vnode_manager, get_vnode_pid,
                                   [Partition, VNodeName])
                  end),

    lager:info("Verify data is missing"),
    ?assertEqual(0, count_data(Service, {Partition, Node})),

    %% repair the partition, ignore return for now
    lager:info("Invoking repair for ~p on ~p", [Partition, Node]),
    %% TODO: Don't ignore return, check version of Riak and if greater
    %% or equal to 1.x then expect OK.
    Return =
        case Service of
            riak_kv ->
                rpc:call(Node, riak_kv_vnode, repair, [Partition]);
            riak_search ->
                rpc:call(Node, riak_search_vnode, repair, [Partition])
        end,

    %% Kill sending vnode to verify HO sender is killed
    %% {ok, [{KPart, KNode}|_]} = Return,
    %% {ok, NewPid} = rpc:call(KNode, riak_core_vnode_manager, get_vnode_pid,
    %%                         [KPart, VNodeName]),
    %% lager:info("killing src pid: ~p/~p ~p", [KNode, KPart, NewPid]),
    %% KR = rpc:call(KNode, erlang, exit, [NewPid, kill]),
    %% lager:info("result of kill: ~p", [KR]),
    %% timer:sleep(1000),
    %% ?assertNot(rpc:call(KNode, erlang, is_process_alive, [NewPid])),

    lager:info("return value of repair_index ~p", [Return]),

    lager:info("Wait for repair to finish"),
    wait_for_repair(Service, {Partition, Node}, 30),

    lager:info("Verify ~p on ~p is fully repaired", [Partition, Node]),
    Data2 = get_data(Service, {Partition, Node}),
    {Verified, NotFound} = dict:fold(verify(Service, Data2), {0, []}, Stash),
    case NotFound of
        [] -> ok;
        _ ->
            NF = StashPath ++ ".notfound",
            lager:info("Some data not found, writing that to ~s", [NF]),
            ?assertEqual(ok, file:write_file(NF, io_lib:format("~p.", [NotFound])))
    end,
    %% NOTE: If the following assert fails then check the .notfound
    %% file written above...it contains all postings that were in the
    %% stash that weren't found after the repair.
    ?assertEqual({Service, ExpectToVerify}, {Service, Verified}),

    {ok, [{BeforeP, _BeforeOwner}=B, _, {AfterP, _AfterOwner}=A]} = Return,

    lager:info("Verify before src partition ~p still has data", [B]),
    StashPathB = stash_path(Service, BeforeP),
    {ok, [StashB]} = file:consult(StashPathB),
    ExpectToVerifyB = dict:size(StashB),
    BeforeData = get_data(Service, B),
    {VerifiedB, NotFoundB} = dict:fold(verify(Service, BeforeData), {0, []}, StashB),
    case NotFoundB of
        [] -> ok;
        _ ->
            NFB = StashPathB ++ ".notfound",
            ?assertEqual(ok, file:write_file(NFB, io_lib:format("~p.", [NotFoundB]))),
            throw({src_partition_missing_data, NFB})
    end,
    ?assertEqual(ExpectToVerifyB, VerifiedB),

    lager:info("Verify after src partition ~p still has data", [A]),
    StashPathA = stash_path(Service, AfterP),
    {ok, [StashA]} = file:consult(StashPathA),
    ExpectToVerifyA = dict:size(StashA),
    AfterData = get_data(Service, A),
    {VerifiedA, NotFoundA} = dict:fold(verify(Service, AfterData), {0, []}, StashA),
    case NotFoundA of
        [] -> ok;
        _ ->
            NFA = StashPathA ++ ".notfound",
            ?assertEqual(ok, file:write_file(NFA, io_lib:format("~p.", [NotFoundA]))),
            throw({src_partition_missing_data, NFA})
    end,
    ?assertEqual(ExpectToVerifyA, VerifiedA).
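
%% Return a fold fun for dict:fold/3 that checks each stashed entry against
%% the data read back after repair, counting entries that match and
%% accumulating the ones that are missing or differ.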
verify(riak_kv, DataAfterRepair) ->
    fun(BKey, StashedValue, {Verified, NotFound}) ->
            StashedData = {BKey, StashedValue},
            case dict:find(BKey, DataAfterRepair) of
                error -> {Verified, [StashedData|NotFound]};
                {ok, Value} ->
                    if Value == StashedValue -> {Verified+1, NotFound};
                       true -> {Verified, [StashedData|NotFound]}
                    end
            end
    end;
verify(riak_search, PostingsAfterRepair) ->
    fun(IFT, StashedPostings, {Verified, NotFound}) ->
            StashedPosting = {IFT, StashedPostings},
            case dict:find(IFT, PostingsAfterRepair) of
                error -> {Verified, [StashedPosting|NotFound]};
                {ok, RepairedPostings} ->
                    case lists:all(fun is_true/1,
                                   [lists:member(P, RepairedPostings)
                                    || P <- StashedPostings]) of
                        true -> {Verified+1, NotFound};
                        false -> {Verified, [StashedPosting|NotFound]}
                    end
            end
    end.

is_true(X) ->
    X == true.
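
%% get_data/2 runs a synchronous fold over the vnode (via its vnode master)
%% and collects the partition's contents into a dict; count_data/2 simply
%% returns the number of entries in that dict.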
count_data(Service, {Partition, Node}) ->
    dict:size(get_data(Service, {Partition, Node})).

get_data(Service, {Partition, Node}) ->
    VMaster =
        case Service of
            riak_kv -> riak_kv_vnode_master;
            riak_search -> riak_search_vnode_master
        end,
    %% TODO: add compile time support for riak_test
    Req =
        case Service of
            riak_kv ->
                {riak_core_fold_req_v1, fun stash_kv/3, dict:new()};
            riak_search ->
                {riak_core_fold_req_v1, fun stash_search/3, dict:new()}
        end,
    Data = riak_core_vnode_master:sync_command({Partition, Node},
                                               Req,
                                               VMaster,
                                               rt_config:get(rt_max_wait_time)),
    Data.
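
%% Write the partition's current contents to a per-partition stash file as a
%% single Erlang term, so it can be read back later with file:consult/1.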
stash_data(Service, {Partition, Node}) ->
    File = stash_path(Service, Partition),
    ?assertEqual(ok, filelib:ensure_dir(File)),
    lager:info("Stashing ~p/~p at ~p to ~p", [Service, Partition, Node, File]),
    Postings = get_data(Service, {Partition, Node}),
    ?assertEqual(ok, file:write_file(File, io_lib:format("~p.", [Postings]))).

stash_kv(Key, Value, Stash) ->
    dict:store(Key, Value, Stash).

stash_search({_I,{_F,_T}}=K, _Postings=V, Stash) ->
    dict:append_list(K, V, Stash).

base_stash_path() ->
    rt_config:get(rt_scratch_dir) ++ "/dev/data_stash/".

stash_path(Service, Partition) ->
    base_stash_path() ++ atom_to_list(Service) ++ "/"
        ++ integer_to_list(Partition) ++ ".stash".

file_list(Dir) ->
    filelib:wildcard(Dir ++ "/*").
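
%% Poll repair_status for the partition once per second until it reports
%% not_found (repair finished); give up after Tries attempts.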
wait_for_repair(_, _, 0) ->
    throw(wait_for_repair_max_tries);
wait_for_repair(Service, {Partition, Node}, Tries) ->
    Reply =
        case Service of
            riak_kv ->
                rpc:call(Node, riak_kv_vnode, repair_status, [Partition]);
            riak_search ->
                rpc:call(Node, riak_search_vnode, repair_status, [Partition])
        end,
    case Reply of
        not_found -> ok;
        in_progress ->
            timer:sleep(timer:seconds(1)),
            wait_for_repair(Service, {Partition, Node}, Tries - 1)
    end.

data_path(Node, Suffix, Partition) ->
    [Name, _] = string:tokens(atom_to_list(Node), "@"),
    Base = rt_config:get('rtdev_path.current') ++ "/dev/" ++ Name ++ "/data",
    Base ++ "/" ++ Suffix ++ "/" ++ integer_to_list(Partition).

backend_mod_dir(undefined) ->
    %% riak_test defaults to bitcask when undefined
    backend_mod_dir(bitcask);
backend_mod_dir(bitcask) ->
    {riak_kv_bitcask_backend, "bitcask"};
backend_mod_dir(eleveldb) ->
    {riak_kv_eleveldb_backend, "leveldb"}.

-spec set_search_schema_nval(binary(), pos_integer()) -> ok.
set_search_schema_nval(Bucket, NVal) ->
    %% TODO: Search currently offers no easy way to programmatically
    %% change a schema and save it. This is because the external and
    %% internal formats of the schema are different. The parser reads
    %% the external format and an internal representation is created
    %% which is then stored/accessed via `riak_search_config'. Rather
    %% than allowing the internal format to be modified and set, you
    %% must send the update in the external format.
    BucketStr = binary_to_list(Bucket),
    SearchCmd = ?FMT("~s/dev/dev1/bin/search-cmd", [rt_config:get('rtdev_path.current')]),
    GetSchema = ?FMT("~s show-schema ~s > current-schema",
                     [SearchCmd, BucketStr]),
    ModifyNVal = ?FMT("sed -E 's/n_val, [0-9]+/n_val, ~s/' "
                      "current-schema > new-schema",
                      [NVal]),
    SetSchema = ?FMT("~s set-schema ~s new-schema", [SearchCmd, BucketStr]),
    ClearCache = ?FMT("~s clear-schema-cache", [SearchCmd]),
    ?assertCmd(GetSchema),
    ?assertCmd(ModifyNVal),
    ?assertCmd(SetSchema),
    ?assertCmd(ClearCache).