Make expiry_liveliness_kinds more robust

At least part of the flakiness of this test was caused by failures
checking the number of remote writers that were still alive: sometimes
the number of writers that had lost its liveliness was as expected, at
which point the number that had retained its liveliness should also be
as expected, and yet the second read for the latter gave an unexpected
value.

The hypothesis is that the transitions happened in the short space of
time between reading the two.  Some factors appear to be in play:

* Sometimes a test stressing the discovery path gets scheduled in
  parallel, which can delay the liveliness updates, or perhaps even
  cause it to be lost in transit.

* If the second happens to be the case, the relatively short lease
  duration (200ms) makes the retransmit tight if the timers are at
  default values and it so happens that it is the last packet with those
  updates that got lost (in that case, the only trigger is a 100ms
  timer).

* The multiplication factor of exactly 2 means that the lease expiry is
  likely to happen around the time these checks get done, and so can
  easily cause a very odd-looking result.

This commit restarts the test with longer lease durations in case either
of the two counts is wrong, where before it would restart if the "not
alive" count was wrong, but fail immediately if the "alive" count was.
It also slightly stretches the check duration.  Finally it prints all
the actual and expected counts that cause the test to pass or fail,
improving the chances of making sense of future failures.

Signed-off-by: Erik Boasson <eb@ilities.com>
This commit is contained in:
Erik Boasson 2020-07-05 12:00:46 +02:00 committed by eboasson
parent 1f8c4f2a4c
commit 41cc5953e0

View file

@ -233,7 +233,7 @@ CU_Theory((dds_liveliness_kind_t kind, uint32_t ldur, double mult), ddsc_livelin
*/ */
CU_TheoryDataPoints(ddsc_liveliness, expire_liveliness_kinds) = { CU_TheoryDataPoints(ddsc_liveliness, expire_liveliness_kinds) = {
CU_DataPoints(uint32_t, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200), /* lease duration for initial test run (increased for each retry when test fails) */ CU_DataPoints(uint32_t, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200), /* lease duration for initial test run (increased for each retry when test fails) */
CU_DataPoints(double, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 2, 2, 2, 2, 2, 2, 2, 2, 2), /* delay (n times lease duration) */ CU_DataPoints(double, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1), /* delay (n times lease duration) */
CU_DataPoints(uint32_t, 1, 0, 2, 0, 1, 0, 0, 1, 1, 2, 0, 5, 0, 15, 15), /* number of writers with automatic liveliness */ CU_DataPoints(uint32_t, 1, 0, 2, 0, 1, 0, 0, 1, 1, 2, 0, 5, 0, 15, 15), /* number of writers with automatic liveliness */
CU_DataPoints(uint32_t, 1, 1, 2, 2, 0, 0, 0, 1, 0, 2, 2, 5, 10, 0, 15), /* number of writers with manual-by-participant liveliness */ CU_DataPoints(uint32_t, 1, 1, 2, 2, 0, 0, 0, 1, 0, 2, 2, 5, 10, 0, 15), /* number of writers with manual-by-participant liveliness */
CU_DataPoints(uint32_t, 1, 1, 2, 2, 1, 1, 1, 1, 0, 1, 1, 2, 5, 0, 10), /* number of writers with manual-by-topic liveliness */ CU_DataPoints(uint32_t, 1, 1, 2, 2, 1, 1, 1, 1, 0, 1, 1, 2, 5, 0, 10), /* number of writers with manual-by-topic liveliness */
@ -257,7 +257,7 @@ static void test_expire_liveliness_kinds(uint32_t ldur, double mult, uint32_t wr
do do
{ {
tstart = dds_time(); tstart = dds_time();
printf("%d.%06d running test: lease duration %d, delay %f, auto/man-by-part/man-by-topic %u/%u/%u\n, %s reader", printf("%d.%06d running test: lease duration %d, delay %f, auto/man-by-part/man-by-topic %u/%u/%u\n, %s reader\n",
(int32_t)(tstart / DDS_NSECS_IN_SEC), (int32_t)(tstart % DDS_NSECS_IN_SEC) / 1000, (int32_t)(tstart / DDS_NSECS_IN_SEC), (int32_t)(tstart % DDS_NSECS_IN_SEC) / 1000,
ldur, mult, wr_cnt_auto, wr_cnt_man_pp, wr_cnt_man_tp, remote_reader ? "remote" : "local"); ldur, mult, wr_cnt_auto, wr_cnt_man_pp, wr_cnt_man_tp, remote_reader ? "remote" : "local");
@ -326,18 +326,19 @@ static void test_expire_liveliness_kinds(uint32_t ldur, double mult, uint32_t wr
(int32_t)(t / DDS_NSECS_IN_SEC), (int32_t)(t % DDS_NSECS_IN_SEC) / 1000, stopped); (int32_t)(t / DDS_NSECS_IN_SEC), (int32_t)(t % DDS_NSECS_IN_SEC) / 1000, stopped);
size_t exp_stopped = mult < 1 ? 0 : (wr_cnt_man_pp + wr_cnt_man_tp); size_t exp_stopped = mult < 1 ? 0 : (wr_cnt_man_pp + wr_cnt_man_tp);
if (stopped != exp_stopped) size_t exp_alive = mult < 1 ? wr_cnt : wr_cnt_auto;
{ CU_ASSERT_EQUAL_FATAL(dds_get_liveliness_changed_status(reader, &lstatus), DDS_RETCODE_OK);
ldur *= 10 / (run + 1); printf("%d.%06d writers alive: %u (exp: %u) not-alive: %u (exp %u)\n",
printf("%d.%06d incorrect number of stopped writers\n", (int32_t)(t / DDS_NSECS_IN_SEC), (int32_t)(t % DDS_NSECS_IN_SEC) / 1000,
(int32_t)(t / DDS_NSECS_IN_SEC), (int32_t)(t % DDS_NSECS_IN_SEC) / 1000); lstatus.alive_count, (unsigned) exp_alive,
} lstatus.not_alive_count, (unsigned) exp_stopped);
if (stopped == exp_stopped && lstatus.alive_count == exp_alive)
test_finished = true;
else else
{ {
/* check alive count */ ldur *= 10 / (run + 1);
CU_ASSERT_EQUAL_FATAL(dds_get_liveliness_changed_status(reader, &lstatus), DDS_RETCODE_OK); printf("%d.%06d incorrect number of stopped/alive writers\n",
CU_ASSERT_EQUAL(lstatus.alive_count, mult < 1 ? wr_cnt : wr_cnt_auto); (int32_t)(t / DDS_NSECS_IN_SEC), (int32_t)(t % DDS_NSECS_IN_SEC) / 1000);
test_finished = true;
} }
} }