From 032b7b49c9f50fa8e4e049d066e5f1ddb6295d89 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Sun, 30 Jun 2024 10:35:09 +1000 Subject: [PATCH] ctdb-scripts: Only consider statistics on timeout Checking statistics is only really relevant to timeouts. That is, if an rpcinfo times out it is worth checking if the service making progress. If the RPC service is not registered then the statistics don't need to be checked because they shouldn't be changing. The 2 previously added tests added to check statistics progress now behave identically and fail on all iterations. To support testing with "timeouts", an optional TIMEOUT flag can now be added to the RPC service passed to nfs_iterate_test(). 2 new tests are added to exercise the new behaviour. The 2 new "if" statements in nfs_iterate_test() could be combined. However, a subsequent commit would split them and would be more difficult to read. Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs --- ctdb/config/events/legacy/60.nfs.script | 6 ++ ctdb/tests/UNIT/eventscripts/60.nfs.monitor.117.sh | 24 +++++++ ctdb/tests/UNIT/eventscripts/60.nfs.monitor.118.sh | 24 +++++++ ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh | 77 +++++++++++++++++----- ctdb/tests/UNIT/eventscripts/stubs/rpcinfo | 13 +++- 5 files changed, 127 insertions(+), 17 deletions(-) create mode 100755 ctdb/tests/UNIT/eventscripts/60.nfs.monitor.117.sh create mode 100755 ctdb/tests/UNIT/eventscripts/60.nfs.monitor.118.sh diff --git a/ctdb/config/events/legacy/60.nfs.script b/ctdb/config/events/legacy/60.nfs.script index 246a856bca8..bdea3d63fa1 100755 --- a/ctdb/config/events/legacy/60.nfs.script +++ b/ctdb/config/events/legacy/60.nfs.script @@ -167,7 +167,13 @@ nfs_check_service() fi eval "$service_stats_cmd" >"$_curr" 2>&1 + # Only consider statistics on timeout. This + # is done below by checking if this string is + # contained in $_err. + _t="rpcinfo: RPC: Timed out" + if ! $_ok && + [ "${_err#*"${_t}"}" != "$_err" ] && ! cmp "$_prev" "$_curr" >/dev/null 2>&1; then # Stats always implicitly change on # the first monitor event, since diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.117.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.117.sh new file mode 100755 index 00000000000..9d96bf1891a --- /dev/null +++ b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.117.sh @@ -0,0 +1,24 @@ +#!/bin/sh + +. "${TEST_SCRIPTS_DIR}/unit.sh" + +define_test "NFS RPC service timeout, stats change, 10 iterations" + +setup + +cat >"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <"${CTDB_BASE}/nfs-checks.d/20.nfs.check" </dev/null 2>&1; then - + _why="" + _ri_out=$(rpcinfo -T tcp localhost "$_rpc_service" 2>&1) + # Check exit code separately for readability + # shellcheck disable=SC2181 + if [ $? -eq 0 ]; then echo 0 >"$_failcount_file" exit # from subshell - elif nfs_stats_check_changed \ - "$_rpc_service" "$_iteration"; then + elif rpcinfo_timed_out "$_ri_out"; then + _why="Timed out" - rpc_failure \ - "WARNING: statistics changed but" \ - "$_rpc_service" \ - "$_ver" \ - >"$_out" - echo 0 >"$_failcount_file" - exit # from subshell + if nfs_stats_check_changed \ + "$_rpc_service" "$_iteration"; then + + rpc_failure \ + "WARNING: statistics changed but" \ + "$_rpc_service" \ + "$_ver" \ + "$_why" \ + >"$_out" + echo 0 >"$_failcount_file" + exit # from subshell + fi fi _numfails=$((_numfails + 1)) @@ -367,6 +395,7 @@ rpc_set_service_failure_response() "ERROR:" \ "$_rpc_service" \ "$_ver" \ + "$_why" \ >"$_out" else _unhealthy=false @@ -379,6 +408,7 @@ rpc_set_service_failure_response() "WARNING:" \ "$_rpc_service" \ "$_ver" \ + "$_why" \ >"$_out" fi @@ -424,7 +454,8 @@ program_stack_traces() # # - 1st argument is the number of iterations. # -# - 2nd argument is the NFS/RPC service being tested +# - 2nd argument is the NFS/RPC service being tested, with optional +# TIMEOUT flag # # This service is marked down before the 1st iteration. # @@ -454,7 +485,23 @@ nfs_iterate_test() debug <&2 +echo "$_fail_msg" >&2 if [ -n "$v" ]; then echo "program ${p} version ${v} is not available" else -- 2.11.4.GIT