From 032b7b49c9f50fa8e4e049d066e5f1ddb6295d89 Mon Sep 17 00:00:00 2001
From: Martin Schwenke <mschwenke@ddn.com>
Date: Sun, 30 Jun 2024 10:35:09 +1000
Subject: [PATCH] ctdb-scripts: Only consider statistics on timeout

Checking statistics is only really relevant to timeouts.  That is, if
an rpcinfo times out it is worth checking if the service making
progress.  If the RPC service is not registered then the statistics
don't need to be checked because they shouldn't be changing.

The 2 previously added tests added to check statistics progress now
behave identically and fail on all iterations.  To support testing
with "timeouts", an optional TIMEOUT flag can now be added to the RPC
service passed to nfs_iterate_test().  2 new tests are added to
exercise the new behaviour.

The 2 new "if" statements in nfs_iterate_test() could be combined.
However, a subsequent commit would split them and would be more
difficult to read.

Signed-off-by: Martin Schwenke <mschwenke@ddn.com>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
---
 ctdb/config/events/legacy/60.nfs.script            |  6 ++
 ctdb/tests/UNIT/eventscripts/60.nfs.monitor.117.sh | 24 +++++++
 ctdb/tests/UNIT/eventscripts/60.nfs.monitor.118.sh | 24 +++++++
 ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh     | 77 +++++++++++++++++-----
 ctdb/tests/UNIT/eventscripts/stubs/rpcinfo         | 13 +++-
 5 files changed, 127 insertions(+), 17 deletions(-)
 create mode 100755 ctdb/tests/UNIT/eventscripts/60.nfs.monitor.117.sh
 create mode 100755 ctdb/tests/UNIT/eventscripts/60.nfs.monitor.118.sh

diff --git a/ctdb/config/events/legacy/60.nfs.script b/ctdb/config/events/legacy/60.nfs.script
index 246a856bca8..bdea3d63fa1 100755
--- a/ctdb/config/events/legacy/60.nfs.script
+++ b/ctdb/config/events/legacy/60.nfs.script
@@ -167,7 +167,13 @@ nfs_check_service()
 			fi
 			eval "$service_stats_cmd" >"$_curr" 2>&1
 
+			# Only consider statistics on timeout.  This
+			# is done below by checking if this string is
+			# contained in $_err.
+			_t="rpcinfo: RPC: Timed out"
+
 			if ! $_ok &&
+				[ "${_err#*"${_t}"}" != "$_err" ] &&
 				! cmp "$_prev" "$_curr" >/dev/null 2>&1; then
 				# Stats always implicitly change on
 				# the first monitor event, since
diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.117.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.117.sh
new file mode 100755
index 00000000000..9d96bf1891a
--- /dev/null
+++ b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.117.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "NFS RPC service timeout, stats change, 10 iterations"
+
+setup
+
+cat >"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <<EOF
+# nfs
+version="3"
+restart_every=10
+unhealthy_after=2
+service_stop_cmd="\$CTDB_NFS_CALLOUT stop nfs"
+service_start_cmd="\$CTDB_NFS_CALLOUT start nfs"
+service_debug_cmd="program_stack_traces nfsd 5"
+# Dummy pipeline confirms that pipelines work in this context
+service_stats_cmd="date --rfc-3339=ns | grep ."
+EOF
+
+# Test flag to indicate that stats are expected to change
+nfs_stats_set_changed "nfs" "status"
+
+nfs_iterate_test 10 "nfs:TIMEOUT"
diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.118.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.118.sh
new file mode 100755
index 00000000000..76013f10eae
--- /dev/null
+++ b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.118.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "NFS RPC service timeout, stats don't change, 10 iterations"
+
+setup
+
+cat >"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <<EOF
+# nfs
+version="3"
+restart_every=10
+unhealthy_after=2
+service_stop_cmd="\$CTDB_NFS_CALLOUT stop nfs"
+service_start_cmd="\$CTDB_NFS_CALLOUT start nfs"
+service_debug_cmd="program_stack_traces nfsd 5"
+# Dummy pipeline confirms that pipelines work in this context
+service_stats_cmd="echo 'hello world' | grep ."
+EOF
+
+# Test flag to indicate that stats are expected to change
+nfs_stats_set_changed "status"
+
+nfs_iterate_test 10 "nfs:TIMEOUT"
diff --git a/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh b/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh
index cabcc17a6cc..24f30970424 100644
--- a/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh
+++ b/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh
@@ -82,6 +82,21 @@ _rpc_services_down()
 	FAKE_RPCINFO_SERVICES="$_out"
 }
 
+_rpc_services_timeout()
+{
+	_out=""
+	for _s in $FAKE_RPCINFO_SERVICES; do
+		for _i; do
+			if [ "$_i" = "${_s%%:*}" ]; then
+				debug "Marking RPC service \"${_i}\" as TIMEOUT"
+				_s="${_s}:TIMEOUT"
+			fi
+		done
+		_out="${_out}${_out:+ }${_s}"
+	done
+	FAKE_RPCINFO_SERVICES="$_out"
+}
+
 _rpc_services_up()
 {
 	_out="$FAKE_RPCINFO_SERVICES"
@@ -146,6 +161,11 @@ nfs_stats_check_changed()
 	return 1
 }
 
+rpcinfo_timed_out()
+{
+	echo "$1" | grep -q "Timed out"
+}
+
 guess_output()
 {
 	case "$1" in
@@ -256,10 +276,11 @@ rpc_failure()
 	_err_or_warn="$1"
 	_rpc_service="$2"
 	_ver="$3"
+	_why="${4:-Program not registered}"
 
 	cat <<EOF
 ${_err_or_warn} ${_rpc_service} failed RPC check:
-rpcinfo: RPC: Program not registered
+rpcinfo: RPC: ${_why}
 program ${_rpc_service}${_ver:+ version }${_ver} is not available
 EOF
 }
@@ -339,21 +360,28 @@ rpc_set_service_failure_response()
 			esac
 		fi
 
-		if rpcinfo -T tcp localhost "$_rpc_service" \
-			>/dev/null 2>&1; then
-
+		_why=""
+		_ri_out=$(rpcinfo -T tcp localhost "$_rpc_service" 2>&1)
+		# Check exit code separately for readability
+		# shellcheck disable=SC2181
+		if [ $? -eq 0 ]; then
 			echo 0 >"$_failcount_file"
 			exit # from subshell
-		elif nfs_stats_check_changed \
-			"$_rpc_service" "$_iteration"; then
+		elif rpcinfo_timed_out "$_ri_out"; then
+			_why="Timed out"
 
-			rpc_failure \
-				"WARNING: statistics changed but" \
-				"$_rpc_service" \
-				"$_ver" \
-				>"$_out"
-			echo 0 >"$_failcount_file"
-			exit # from subshell
+			if nfs_stats_check_changed \
+				"$_rpc_service" "$_iteration"; then
+
+				rpc_failure \
+					"WARNING: statistics changed but" \
+					"$_rpc_service" \
+					"$_ver" \
+					"$_why" \
+					>"$_out"
+				echo 0 >"$_failcount_file"
+				exit # from subshell
+			fi
 		fi
 
 		_numfails=$((_numfails + 1))
@@ -367,6 +395,7 @@ rpc_set_service_failure_response()
 				"ERROR:" \
 				"$_rpc_service" \
 				"$_ver" \
+				"$_why" \
 				>"$_out"
 		else
 			_unhealthy=false
@@ -379,6 +408,7 @@ rpc_set_service_failure_response()
 					"WARNING:" \
 					"$_rpc_service" \
 					"$_ver" \
+					"$_why" \
 					>"$_out"
 			fi
 
@@ -424,7 +454,8 @@ program_stack_traces()
 #
 # - 1st argument is the number of iterations.
 #
-# - 2nd argument is the NFS/RPC service being tested
+# - 2nd argument is the NFS/RPC service being tested, with optional
+#   TIMEOUT flag
 #
 #   This service is marked down before the 1st iteration.
 #
@@ -454,7 +485,23 @@ nfs_iterate_test()
 		debug <<EOF
 --------------------------------------------------
 EOF
-		_rpc_services_down "$_rpc_service"
+
+		_action="${_rpc_service#*:}"
+		if [ "$_action" != "$_rpc_service" ]; then
+			_rpc_service="${_rpc_service%:*}"
+		else
+			_action=""
+		fi
+
+		if [ -n "$_action" ]; then
+			case "$_action" in
+			TIMEOUT)
+				_rpc_services_timeout "$_rpc_service"
+				;;
+			esac
+		else
+			_rpc_services_down "$_rpc_service"
+		fi
 	fi
 
 	debug <<EOF
diff --git a/ctdb/tests/UNIT/eventscripts/stubs/rpcinfo b/ctdb/tests/UNIT/eventscripts/stubs/rpcinfo
index 8732751396e..86b809411c4 100755
--- a/ctdb/tests/UNIT/eventscripts/stubs/rpcinfo
+++ b/ctdb/tests/UNIT/eventscripts/stubs/rpcinfo
@@ -40,6 +40,8 @@ parse_options()
 
 parse_options "$@"
 
+_fail_msg="rpcinfo: RPC: Program not registered"
+
 for i in ${FAKE_RPCINFO_SERVICES}; do
 	# This is stupidly cumulative, but needs to happen after the
 	# initial split of the list above.
@@ -47,9 +49,16 @@ for i in ${FAKE_RPCINFO_SERVICES}; do
 	# Want glob expansion
 	# shellcheck disable=SC2086
 	set -- $i
-	# $1 = program, $2 = low version, $3 = high version
+	# $1 = program, $2 = low version, $3 = high version, $4 = flag
 
 	if [ "$1" = "$p" ]; then
+		case "$4" in
+		TIMEOUT)
+			_fail_msg="rpcinfo: RPC: Timed out"
+			break
+			;;
+		esac
+
 		if [ -n "$v" ]; then
 			if [ "$2" -le "$v" ] && [ "$v" -le "$3" ]; then
 				echo "program ${p} version ${v} ready and waiting"
@@ -68,7 +77,7 @@ for i in ${FAKE_RPCINFO_SERVICES}; do
 	fi
 done
 
-echo "rpcinfo: RPC: Program not registered" >&2
+echo "$_fail_msg" >&2
 if [ -n "$v" ]; then
 	echo "program ${p} version ${v} is not available"
 else
-- 
2.11.4.GIT