From bff9ddc872917a55fd154a8d2b536f6f07da512c Mon Sep 17 00:00:00 2001
From: Nick Marino
Date: Thu, 22 Sep 2016 13:02:04 -0400
Subject: [PATCH] Fix race condition in partition_repair

If we just wait for the old vnode to die, we are not guaranteed that the
new one will have been started and registered with the vnode manager yet,
so it's possible we will end up trying to do a call into the old dead
vnode in the subsequent test code.

We saw a couple of test failures in giddyup recently which I believe were
caused by this race condition.

To fix, we can wait for the vnode manager to return a new pid instead of
just waiting for the old pid to die.
---
 tests/partition_repair.erl | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tests/partition_repair.erl b/tests/partition_repair.erl
index 62ba5806..b5c54da6 100644
--- a/tests/partition_repair.erl
+++ b/tests/partition_repair.erl
@@ -128,7 +128,14 @@ kill_repair_verify({Partition, Node}, DataSuffix, Service) ->
                [Partition, VNodeName]),
     ?assert(rpc:call(Node, erlang, exit, [Pid, kill_for_test])),
 
-    rt:wait_until(Node, fun(N) -> not(rpc:call(N, erlang, is_process_alive, [Pid])) end),
+    %% We used to wait for the old pid to die here, but there is a delay between
+    %% the vnode process dying and a new one being registered with the vnode
+    %% manager. If we don't wait for the manager to return a new vnode pid, it's
+    %% possible for the test to fail with a gen_server:call timeout.
+    rt:wait_until(fun() -> {ok, Pid} =/=
+                               rpc:call(Node, riak_core_vnode_manager, get_vnode_pid,
+                                        [Partition, VNodeName])
+                  end),
 
     lager:info("Verify data is missing"),
     ?assertEqual(0, count_data(Service, {Partition, Node})),
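
For reference, below is a minimal, self-contained sketch of the polling
pattern the new test code relies on: keep asking the vnode manager for the
vnode's pid until it differs from the one we killed. The module name,
helper names, retry budget, and sleep interval are illustrative assumptions
for this sketch only, not riak_test's actual rt:wait_until implementation.

%% wait_for_new_vnode.erl -- illustrative sketch, not part of the patch
-module(wait_for_new_vnode).
-export([wait_until/3, new_vnode_started/3]).

%% Poll Fun (a 0-arity fun returning a boolean) until it returns true,
%% retrying up to Retries more times with DelayMs milliseconds in between.
wait_until(Fun, 0, _DelayMs) ->
    case Fun() of
        true  -> ok;
        false -> {error, timeout}
    end;
wait_until(Fun, Retries, DelayMs) when Retries > 0 ->
    case Fun() of
        true  -> ok;
        false ->
            timer:sleep(DelayMs),
            wait_until(Fun, Retries - 1, DelayMs)
    end.

%% True once the vnode manager on Node hands out a pid other than OldPid
%% for the given partition index and vnode module. A {badrpc, _} result
%% also compares unequal, so a transient rpc error simply triggers a retry.
new_vnode_started(Node, {Partition, VNodeName}, OldPid) ->
    {ok, OldPid} =/=
        rpc:call(Node, riak_core_vnode_manager, get_vnode_pid,
                 [Partition, VNodeName]).

%% Example usage, with hypothetical bindings for Node, Partition,
%% VNodeName, and OldPid:
%%   ok = wait_for_new_vnode:wait_until(
%%          fun() -> wait_for_new_vnode:new_vnode_started(
%%                     Node, {Partition, VNodeName}, OldPid)
%%          end, 60, 1000).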