From bff9ddc872917a55fd154a8d2b536f6f07da512c Mon Sep 17 00:00:00 2001
From: Nick Marino
Date: Thu, 22 Sep 2016 13:02:04 -0400
Subject: [PATCH] Fix race condition in partition_repair

If we just wait for the old vnode to die, we are not guaranteed that the
new one will have been started and registered with the vnode manager yet,
so it's possible we will end up trying to do a call into the old dead
vnode in the subsequent test code.

We saw a couple of test failures in giddyup recently which I believe were
caused by this race condition.

To fix, we can wait for the vnode manager to return a new pid instead of
just waiting for the old pid to die.
---
 tests/partition_repair.erl | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tests/partition_repair.erl b/tests/partition_repair.erl
index 62ba5806..b5c54da6 100644
--- a/tests/partition_repair.erl
+++ b/tests/partition_repair.erl
@@ -128,7 +128,14 @@ kill_repair_verify({Partition, Node}, DataSuffix, Service) ->
                [Partition, VNodeName]),
     ?assert(rpc:call(Node, erlang, exit, [Pid, kill_for_test])),
 
-    rt:wait_until(Node, fun(N) -> not(rpc:call(N, erlang, is_process_alive, [Pid])) end),
+    %% We used to wait for the old pid to die here, but there is a delay between
+    %% the vnode process dying and a new one being registered with the vnode
+    %% manager. If we don't wait for the manager to return a new vnode pid, it's
+    %% possible for the test to fail with a gen_server:call timeout.
+    rt:wait_until(fun() -> {ok, Pid} =/=
+                               rpc:call(Node, riak_core_vnode_manager, get_vnode_pid,
+                                        [Partition, VNodeName])
+                  end),
 
     lager:info("Verify data is missing"),
     ?assertEqual(0, count_data(Service, {Partition, Node})),
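
For reference, below is a minimal, self-contained sketch of the polling
pattern the new test code relies on: keep asking the vnode manager for the
vnode's pid until it differs from the one we killed. The module name,
helper names, retry budget, and sleep interval are illustrative assumptions
for this sketch only, not riak_test's actual rt:wait_until implementation.

%% wait_for_new_vnode.erl -- illustrative sketch, not part of the patch
-module(wait_for_new_vnode).
-export([wait_until/3, new_vnode_started/3]).

%% Poll Fun (a 0-arity fun returning a boolean) until it returns true,
%% retrying up to Retries more times with DelayMs milliseconds in between.
wait_until(Fun, 0, _DelayMs) ->
    case Fun() of
        true  -> ok;
        false -> {error, timeout}
    end;
wait_until(Fun, Retries, DelayMs) when Retries > 0 ->
    case Fun() of
        true  -> ok;
        false ->
            timer:sleep(DelayMs),
            wait_until(Fun, Retries - 1, DelayMs)
    end.

%% True once the vnode manager on Node hands out a pid other than OldPid
%% for the given partition index and vnode module. A {badrpc, _} result
%% also compares unequal, so a transient rpc error simply triggers a retry.
new_vnode_started(Node, {Partition, VNodeName}, OldPid) ->
    {ok, OldPid} =/=
        rpc:call(Node, riak_core_vnode_manager, get_vnode_pid,
                 [Partition, VNodeName]).

%% Example usage, with hypothetical bindings for Node, Partition,
%% VNodeName, and OldPid:
%%   ok = wait_for_new_vnode:wait_until(
%%          fun() -> wait_for_new_vnode:new_vnode_started(
%%                     Node, {Partition, VNodeName}, OldPid)
%%          end, 60, 1000).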