ctdb/tools/statd_callout_helper

   1 #!/bin/sh
   2
   3 # statd must be configured to use this script as its high availability call-out.
   4 #
   5 # Modern NFS utils versions use /etc/nfs.conf:
   6 #
   7 #   [statd]
   8 #     name = mycluster
   9 #     ha-callout = /usr/local/libexec/ctdb/statd_callout
  10 #
  11 # Older Linux versions may use something like the following...
  12 #
  13 # /etc/sysconfig/nfs (Red Hat) or /etc/default/nfs-common (Debian):
  14 #   STATD_HOSTNAME="mycluster -H /usr/local/libexec/ctdb/statd_callout"
  15 #
  16 # If using Linux kernel NFS then the following should also be set in
  17 # /etc/nfs.conf:
  18 #
  19 #   [sm-notify]
  20 #      lift-grace = n
  21 #
  22 # See sm-notify(8) for details.  This doesn't matter when using
  23 # NFS-Ganesha because sm-notify's attempt to lift grace will fail
  24 # silently if /proc/fs/lockd/nlm_end_grace is not found.
  25 #
  26
  27 if [ -z "$CTDB_BASE" ] ; then
  28         export CTDB_BASE="/usr/local/etc/ctdb"
  29 fi
  30
  31 . "${CTDB_BASE}/functions"
  32
  33 # Overwrite this so we get some logging
  34 die()
  35 {
  36         script_log "statd_callout_helper" "$@"
  37         exit 1
  38 }
  39
  40 ############################################################
  41
  42 ctdb_setup_state_dir "service" "nfs"
  43
  44 find_statd_sm_dir()
  45 {
  46         if [ -n "$CTDB_TEST_MODE" ]; then
  47                 _f="${CTDB_TEST_TMP_DIR}/sm"
  48                 mkdir -p "$_f" "${_f}.bak"
  49                 echo "$_f"
  50                 return
  51         fi
  52
  53         for _sm_dir in /var/lib/nfs/statd/sm /var/lib/nfs/sm; do
  54                 if [ -d "$_sm_dir" ]; then
  55                         echo "$_sm_dir"
  56                         break
  57                 fi
  58         done
  59 }
  60
  61 # Ensure the state directory exists and can be written when called as
  62 # a non-root user.  Assume the user to run as is the owner of the
  63 # system statd sm directory, since both rpc.statd and sm-notify run as
  64 # this directory's owner, so it can read and modify the directory.
  65 create_add_del_client_dir()
  66 {
  67         _dir="$1"
  68
  69         if [ ! -d "$_dir" ]; then
  70                 mkdir -p "$_dir" || die "Failed to create directory \"${_dir}\""
  71                 ref=$(find_statd_sm_dir)
  72                 [ -n "$ref" ] || die "Failed to find statd sm directory"
  73                 chown --reference="$ref" "$_dir"
  74         fi
  75 }
  76
  77 # script_state_dir set by ctdb_setup_state_dir()
  78 # shellcheck disable=SC2154
  79 statd_callout_state_dir="${script_state_dir}/statd_callout"
  80
  81 statd_callout_db="ctdb.tdb"
  82 statd_callout_queue_dir="${statd_callout_state_dir}/queue"
  83
  84 ############################################################
  85
  86 # Read pairs of:
  87 #   server-IP client-IP
  88 # from stdin and send associated SM_NOTIFY packets.
  89 send_notifies()
  90 {
  91         # State must monotonically increase, across the entire
  92         # cluster.  Use seconds since epoch and assume the time is in
  93         # sync across nodes.  Even numbers mean service is shut down,
  94         # odd numbers mean service is up.  However, sm-notify always
  95         # reads the state and converts it to odd (if necessary, by
  96         # adding 1 when it is even) because it only sends "up"
  97         # notifications.  Note that there is a 2038 issue here but we
  98         # will get to that later.
  99         _state=$(date '+%s')
 100
 101         _helper="${CTDB_HELPER_BINDIR}/ctdb_smnotify_helper"
 102
 103         _notify_dir="${statd_callout_state_dir}/sm-notify"
 104         mkdir -p "$_notify_dir"
 105
 106         while read -r _sip _cip; do
 107                 # Create a directory per server IP containing a file
 108                 # for each client IP
 109                 mkdir -p \
 110                         "${_notify_dir}/${_sip}/sm" \
 111                         "${_notify_dir}/${_sip}/sm.bak"
 112
 113                 _out="${_notify_dir}/${_sip}/sm/${_cip}"
 114                 "$_helper" "monitor" "$_cip" "$_sip" >"$_out"
 115         done
 116
 117         # Send notifications for server startup
 118         _ref=$(find_statd_sm_dir)
 119         for _sip_dir in "$_notify_dir"/*; do
 120                 if [ "$_sip_dir" = "${_notify_dir}/*" ]; then
 121                         break
 122                 fi
 123
 124                 _sip="${_sip_dir##*/}" # basename
 125
 126                 # Write the state as a host order 32-bit integer.  See
 127                 # note at top of function about state.
 128                 _out="${_sip_dir}/state"
 129                 "$_helper" "state" "$_state" >"$_out"
 130
 131                 # The ownership of the directory and contents should
 132                 # match the system's statd sm directory, so that
 133                 # sm-notify drops privileges and switches to run as
 134                 # the directory owner.
 135                 chown -R --reference="$_ref" "$_sip_dir"
 136                 timeout 10 sm-notify -d -f -m 0 -n -P "$_sip_dir" -v "$_sip"
 137
 138                 rm -rf "$_sip_dir"
 139         done
 140 }
 141
 142 delete_records()
 143 {
 144         while read -r _sip _cip; do
 145                 _key="statd-state@${_sip}@${_cip}"
 146                 echo "\"${_key}\" \"\""
 147         done | $CTDB ptrans "$statd_callout_db"
 148 }
 149
 150 ############################################################
 151
 152 # Keep a file per server-IP/client-IP pair, to keep track of the last
  153 # "add-client" or "del-client".  These get pushed to a database during
 154 # "update", which will generally be run once each "monitor" cycle.  In
 155 # this way we avoid scalability problems with flood of persistent
 156 # transactions after a "notify" when all the clients re-take their
 157 # locks.
 158
 159 startup()
 160 {
 161         create_add_del_client_dir "$statd_callout_queue_dir"
 162
 163         $CTDB attach "$statd_callout_db" persistent
 164
 165         _default="${CTDB_SCRIPT_VARDIR}/statd_callout.conf"
 166         _config_file="${CTDB_STATD_CALLOUT_CONFIG_FILE:-"${_default}"}"
 167         cat >"$_config_file" <<EOF
 168 persistent_db
 169 ${statd_callout_queue_dir}
 170 ${CTDB_MY_PUBLIC_IPS_CACHE}
 171 EOF
 172 }
 173
 174 ############################################################
 175
 176 case "$1" in
 177 startup)
 178         startup
 179         ;;
 180
 181 update)
 182         cd "$statd_callout_queue_dir" ||
 183                 die "Failed to change directory to \"${statd_callout_queue_dir}\""
 184         files=$(echo statd-state@*)
 185         if [ "$files" = "statd-state@*" ]; then
 186                 # No files!
 187                 exit 0
 188         fi
 189         sed_expr=$(awk '{
 190                 ip = $1; gsub(/\./, "\\.", ip);
 191                 printf "/statd-state@%s@/p\n", ip }' "$CTDB_MY_PUBLIC_IPS_CACHE")
 192         # Intentional multi-word expansion for multiple files
 193         # shellcheck disable=SC2086
 194         items=$(sed -n "$sed_expr" $files)
 195         if [ -n "$items" ]; then
 196                 if echo "$items" | $CTDB ptrans "$statd_callout_db"; then
 197                         # shellcheck disable=SC2086
 198                         rm $files
 199                 fi
 200         fi
 201         ;;
 202
 203 notify)
 204         # we must restart the lockmanager (on all nodes) so that we get
 205         # a clusterwide grace period (so other clients don't take out
 206         # conflicting locks through other nodes before all locks have been
 207         # reclaimed)
 208
 209         # we need these settings to make sure that no tcp connections survive
 210         # across a very fast failover/failback
 211         #echo 10 > /proc/sys/net/ipv4/tcp_fin_timeout
 212         #echo 0 > /proc/sys/net/ipv4/tcp_max_tw_buckets
 213         #echo 0 > /proc/sys/net/ipv4/tcp_max_orphans
 214
 215         # Delete the notification list for statd, we don't want it to
 216         # ping any clients
 217         dir=$(find_statd_sm_dir)
 218         rm -f "${dir}/"* "${dir}.bak/"*
 219
 220         # We must also let some time pass between stopping and
 221         # restarting the lock manager.  Otherwise there is a window
 222         # where the lock manager will respond "strangely" immediately
 223         # after restarting it, which causes clients to fail to reclaim
 224         # their locks.
 225         nfs_callout_init
 226         "$CTDB_NFS_CALLOUT" "stop" "nlockmgr" >/dev/null 2>&1
 227         sleep 2
 228         "$CTDB_NFS_CALLOUT" "start" "nlockmgr" >/dev/null 2>&1
 229
 230         # we now need to send out additional statd notifications to ensure
 231         # that clients understand that the lockmanager has restarted.
 232         # we have three cases:
 233         # 1, clients that ignore the ip address the stat notification came from
 234         #    and ONLY care about the 'name' in the notify packet.
 235         #    these clients ONLY work with lock failover IFF that name
 236         #    can be resolved into an ipaddress that matches the one used
 237         #    to mount the share.  (==linux clients)
 238         #    This is handled when starting lockmanager above,  but those
 239         #    packets are sent from the "wrong" ip address, something linux
 240         #    clients are ok with, buth other clients will barf at.
 241         # 2, Some clients only accept statd packets IFF they come from the
 242         #    'correct' ip address.
 243         #    Send out the notification using the 'correct' ip address and also
 244         #    specify the 'correct' hostname in the statd packet.
 245         #    Some clients require both the correct source address and also the
 246         #    correct name. (these clients also ONLY work if the ip addresses
 247         #    used to map the share can be resolved into the name returned in
 248         #    the notify packet.)
 249         #
 250         # For all IPs we serve, collect info and push to the config database
 251
 252         # Construct a sed expression to take catdb output and produce pairs of:
 253         #   server-IP client-IP
 254         # but only for the server-IPs that are hosted on this node.
 255         sed_expr=$(awk '{
 256                        ip = $1; gsub(/\./, "\\.", ip);
 257                        printf "s/^key.*=.*statd-state@\\(%s\\)@\\([^\"]*\\).*/\\1 \\2/p\n", ip }' \
 258                                "$CTDB_MY_PUBLIC_IPS_CACHE")
 259
 260         statd_state=$($CTDB catdb "$statd_callout_db" |
 261                 sed -n "$sed_expr" |
 262                 sort)
 263         [ -n "$statd_state" ] || exit 0
 264
 265         echo "$statd_state" | send_notifies
 266         echo "$statd_state" | delete_records
 267
 268         # Remove any stale touch files (i.e. for IPs not currently
 269         # hosted on this node and created since the last "update").
 270         # There's nothing else we can do with them at this stage.
 271         pnn=$(ctdb_get_pnn)
 272         $CTDB ip all |
 273                 tail -n +2 |
 274                 awk -v pnn="$pnn" 'pnn != $2 { print $1 }' |
 275                 while read -r sip; do
 276                         rm -f "${statd_callout_queue_dir}/statd-state@${sip}@"*
 277                 done
 278         ;;
 279 esac