Fix race in deleting lingering writers

Deleting a writer with unacknowledged data present in its WHC causes it
to linger for a configurable duration.  Once it is lingering, there are
two routes to actually deleting the writer: because the samples get
acknowledged, or because the linger duration elapses.

When these two happen roughly concurrently, both could succeed in
looking up the writer by its GUID, in which case one of them would then
assert on removing it from the entity index (if assertions are enabled;
if not, things are worse).

This fixes that by ensuring only one of the two actually does something,
as was always the intent.

Signed-off-by: Erik Boasson <eb@ilities.com>
This commit is contained in:
Erik Boasson 2020-03-27 10:31:42 +01:00 committed by eboasson
parent 5d53e74029
commit 1c31fba043
2 changed files with 17 additions and 2 deletions

View file

@ -3393,8 +3393,23 @@ dds_return_t writer_wait_for_acks (struct writer *wr, dds_time_t abstimeout)
dds_return_t delete_writer_nolinger_locked (struct writer *wr) dds_return_t delete_writer_nolinger_locked (struct writer *wr)
{ {
ELOGDISC (wr, "delete_writer_nolinger(guid "PGUIDFMT") ...\n", PGUID (wr->e.guid));
ASSERT_MUTEX_HELD (&wr->e.lock); ASSERT_MUTEX_HELD (&wr->e.lock);
/* We can get here via multiple paths in parallel, in particular: because all data got
ACK'd while lingering, and because the linger timeout elapses. Those two race each
other, the first calling this function directly, the second calling from
handle_xevk_delete_writer via delete_writer_nolinger.
There are two practical options to decide whether to ignore the call: one is to check
whether the writer is still in the GUID hashes, the second to check whether the state
is WRST_DELETING. The latter seems a bit less surprising. */
if (wr->state == WRST_DELETING)
{
ELOGDISC (wr, "delete_writer_nolinger(guid "PGUIDFMT") already done\n", PGUID (wr->e.guid));
return 0;
}
ELOGDISC (wr, "delete_writer_nolinger(guid "PGUIDFMT") ...\n", PGUID (wr->e.guid));
builtintopic_write (wr->e.gv->builtin_topic_interface, &wr->e, ddsrt_time_wallclock(), false); builtintopic_write (wr->e.gv->builtin_topic_interface, &wr->e, ddsrt_time_wallclock(), false);
local_reader_ary_setinvalid (&wr->rdary); local_reader_ary_setinvalid (&wr->rdary);
entidx_remove_writer_guid (wr->e.gv->entity_index, wr); entidx_remove_writer_guid (wr->e.gv->entity_index, wr);

View file

@ -1149,7 +1149,7 @@ static void handle_xevk_pmd_update (struct thread_state1 * const ts1, struct nn_
static void handle_xevk_delete_writer (UNUSED_ARG (struct nn_xpack *xp), struct xevent *ev, UNUSED_ARG (ddsrt_mtime_t tnow)) static void handle_xevk_delete_writer (UNUSED_ARG (struct nn_xpack *xp), struct xevent *ev, UNUSED_ARG (ddsrt_mtime_t tnow))
{ {
/* don't worry if the writer is already gone by the time we get here. */ /* don't worry if the writer is already gone by the time we get here, delete_writer_nolinger checks for that. */
struct ddsi_domaingv * const gv = ev->evq->gv; struct ddsi_domaingv * const gv = ev->evq->gv;
GVTRACE ("handle_xevk_delete_writer: "PGUIDFMT"\n", PGUID (ev->u.delete_writer.guid)); GVTRACE ("handle_xevk_delete_writer: "PGUIDFMT"\n", PGUID (ev->u.delete_writer.guid));
delete_writer_nolinger (gv, &ev->u.delete_writer.guid); delete_writer_nolinger (gv, &ev->u.delete_writer.guid);