Improve command and option handling
[ganeti-roll-helper.git] / ganeti-roll-helper.sh
blob 080fa5d440a9edfbafbb185a4bc4ca1892ce05c0
1 #!/bin/sh
3 # COPYRIGHT & LICENSE
5 # Copyright 2017 Easy Connect AS
7 # Permission is hereby granted, free of charge, to any person
8 # obtaining a copy of this software and associated documentation files
9 # (the "Software"), to deal in the Software without restriction,
10 # including without limitation the rights to use, copy, modify, merge,
11 # publish, distribute, sublicense, and/or sell copies of the Software,
12 # and to permit persons to whom the Software is furnished to do so,
13 # subject to the following conditions:
15 # The above copyright notice and this permission notice shall be
16 # included in all copies or substantial portions of the Software.
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
22 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
23 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
24 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 # SOFTWARE.
27 # AUTHORS
29 # Knut Arne Bjørndal <knut.arne.bjorndal@easyconnect.no>
# Treat the expansion of an unset variable as an error.
set -o nounset
# Run a state-changing command, or only announce it in dry-run mode.
#
# Globals:   dryrun (read) - non-empty means "print the command, do not run it"
# Arguments: the command to run followed by its arguments
# Outputs:   a "DRY-RUN: " or "Executing:" line on stdout, then whatever the
#            command itself prints
# Returns:   the exit status of the command when it is executed
_run_invasive() {
    if [ -z "$dryrun" ]; then
        echo "Executing:" "$@"
        "$@"
        # A bare return hands back the status of the command just run.
        return
    fi

    echo "DRY-RUN: " "$@"
}
# Refuse to continue when hcheck reports a HCHECK_INIT_CLUSTER_OFFLINE_PRI
# line that does not contain "=0"; otherwise run "gnt-cluster verify"
# through _run_invasive.
#
# Outputs: an explanatory message on stdout when aborting
# Returns: 1 when aborting, otherwise the status of the verify step
verify_cluster() {
    if ! hcheck -L --no-simulation --machine-readable \
        | grep '^HCHECK_INIT_CLUSTER_OFFLINE_PRI=' \
        | grep -v '=0' >/dev/null; then
        # No offending line was found: go on to the cluster verification.
        _run_invasive gnt-cluster verify
        return
    fi

    echo "hcheck reports instances with offline or drained primary nodes, aborting"
    return 1
}
# Succeed only when hcheck's machine-readable output has a
# HCHECK_INIT_CLUSTER_N1_FAIL line containing "=0".
#
# Returns: the exit status of the final grep (0 when such a line exists)
check_cluster_nplus1() {
    hcheck -L --no-simulation --machine-readable \
        | grep '^HCHECK_INIT_CLUSTER_N1_FAIL=' \
        | grep '=0' >/dev/null
}
# Report whether every node group is balanced according to hbal.
#
# For each group UUID printed by "gnt-group list", hbal is run and the
# number after "Solution length=" is read from its output.
#
# Returns: 1 as soon as one group has a solution length above zero,
#          0 when no group does
check_balanced() {
    for nodegroup in $(gnt-group list --no-headers -o uuid); do
        solution_length=$(hbal -L -G "$nodegroup" \
            | grep '^Solution length=' \
            | cut -d= -f2)

        # hbal may print no "Solution length=" line at all (presumably when
        # it stops early because there is nothing to do - TODO confirm).
        # Count that as zero moves instead of handing an empty string to
        # the numeric test, which would print an error.
        if [ "${solution_length:-0}" -gt 0 ]; then
            return 1
        fi
    done

    # If we get here all the node groups are balanced
    return 0
}
# Print node names, one per line, via "gnt-node list".
#
# Arguments: optional; when present they are passed on after -F
# Returns:   the exit status of gnt-node
list_nodes() {
    case $# in
        0)
            gnt-node list --no-headers -o name
            ;;
        *)
            gnt-node list --no-headers -o name -F "$@"
            ;;
    esac
}
# Add one tag to each of the given nodes.
#
# Arguments: $1 - tag to add; remaining arguments - node names
# Returns:   the status of the first failing "gnt-node add-tags" call
#            (later nodes are then left untouched), otherwise 0
tag_nodes() {
    tag=$1
    shift

    while [ $# -gt 0 ]; do
        n=$1
        shift
        _run_invasive gnt-node add-tags "$n" "$tag" || return
    done
}
# Remove one tag from each of the given nodes.
#
# Arguments: $1 - tag to remove; remaining arguments - node names
# Returns:   the status of the first failing "gnt-node remove-tags" call
#            (later nodes are then left untouched), otherwise 0
untag_nodes() {
    tag=$1
    shift

    while [ $# -gt 0 ]; do
        n=$1
        shift
        _run_invasive gnt-node remove-tags "$n" "$tag" || return
    done
}
# Print what hroller proposes as the next step (--one-step-only), limited
# to nodes carrying the "needs maintenance" tag.
#
# Globals: needs_maintenance_tag (read)
# Returns: the exit status of hroller
get_rebootgroup() {
    hroller -L \
        --node-tags "$needs_maintenance_tag" \
        --one-step-only \
        --no-headers
}
# Fail when the cluster master is one of the given nodes.
#
# Arguments: node names of the next reboot group
# Outputs:   an instruction to fail over the master, on stdout, when the
#            master is in the group
# Returns:   1 when the master is in the group, the status of
#            "gnt-cluster getmaster" when that lookup fails, otherwise 0
abort_if_master_in_rebootgroup() {
    rebootgroup="$*"
    master=$(gnt-cluster getmaster) || return

    for n in $rebootgroup; do
        [ "$n" = "$master" ] || continue

        echo "Master ($master) is in next rebootgroup, please failover to a different master candidate and re-run this script there to continue"
        return 1
    done

    return 0
}
# Flag a node as drained ("gnt-node modify -D yes").
#
# Arguments: $1 - node name
# Returns:   the status of _run_invasive
set_node_drained() {
    # Build the full command line first, then hand it over in one piece.
    set -- gnt-node modify -D yes "$1"
    _run_invasive "$@"
}
# Flag a node as offline ("gnt-node modify -O yes").
#
# Arguments: $1 - node name
# Returns:   the status of _run_invasive
set_node_offline() {
    # Build the full command line first, then hand it over in one piece.
    set -- gnt-node modify -O yes "$1"
    _run_invasive "$@"
}
# Clear the offline flag of a node ("gnt-node modify -O no").
#
# Arguments: $1 - node name
# Returns:   the status of _run_invasive
unset_node_offline() {
    # Build the full command line first, then hand it over in one piece.
    set -- gnt-node modify -O no "$1"
    _run_invasive "$@"
}
# Run "hbal -X" for every node group, through _run_invasive.
#
# Arguments: extra options appended to each hbal call
#            (e.g. --no-disk-moves)
# Returns:   the status of "gnt-group list" when listing the groups fails,
#            the status of the first failing hbal run, otherwise 0
balance_cluster() {
    # Fetch the group list up front: inside a "for ... in $(...)" header a
    # failing gnt-group would be ignored and the function would report
    # success without having balanced anything.
    nodegroups=$(gnt-group list --no-headers -o uuid) || return $?

    for nodegroup in $nodegroups; do
        _run_invasive hbal -L -G "$nodegroup" -X "$@" || return $?
    done
}
# "tag" command: mark nodes as needing maintenance, then list every node
# that carries the tag.
#
# Globals:   needs_maintenance_tag (read)
# Arguments: optional node filter, handed to list_nodes unchanged
# Returns:   the status of the first failing step, otherwise the status of
#            the final listing
#
# $nodes is left unquoted on purpose so each node name becomes its own
# argument.
# shellcheck disable=SC2086
tag() {
    nodes=$(list_nodes "$@") || return
    tag_nodes "$needs_maintenance_tag" $nodes || return

    echo "Nodes with $needs_maintenance_tag tag:"
    list_nodes "'$needs_maintenance_tag' in tags"
}
# Pick the next maintenance group and tag its nodes as scheduled.
#
# The cluster is verified first; the group comes from get_rebootgroup and is
# rejected when it contains the master node.
#
# Globals: next_maintenance_tag (read), rebootgroup (written)
# Outputs: the chosen group on stdout
# Returns: the status of the first failing step, otherwise the status of
#          the tagging
#
# $rebootgroup is left unquoted on purpose so each node name becomes its own
# argument.
# shellcheck disable=SC2086
next_tag_maintenance_group() {
    verify_cluster || return

    rebootgroup=$(get_rebootgroup) || return
    echo "Next maintenance group:"
    echo "$rebootgroup"

    abort_if_master_in_rebootgroup $rebootgroup || return

    tag_nodes "$next_maintenance_tag" $rebootgroup
}
# Drain and migrate every node scheduled for maintenance, verify the
# cluster, then mark those nodes offline and move them from the "next
# maintenance" tag to the "in maintenance" tag.
#
# Globals: next_maintenance_tag, in_maintenance_tag (read); nodes (written)
# Outputs: progress from the individual steps and the final node list
# Returns: the status of the first failing step, otherwise 0
next_evacuate_nodes() {
    nodes="$(list_nodes "'$next_maintenance_tag' in tags")" || return $?

    for n in $nodes; do
        set_node_drained "$n" || return $?

        _run_invasive gnt-node migrate -f "$n" || return $?
    done

    # Routed through _run_invasive, as verify_cluster does, so that dry-run
    # mode only announces the verification instead of really running it.
    _run_invasive gnt-cluster verify --no-nplus1-mem || return $?

    for n in $nodes; do
        set_node_offline "$n" || return $?
        tag_nodes "$in_maintenance_tag" "$n" || return $?
        untag_nodes "$next_maintenance_tag" "$n" || return $?
    done

    echo "Nodes drained and ready for maintenance:"
    echo "$nodes"
}
# Show the nodes currently tagged as in maintenance and block until the
# operator presses enter.
#
# Globals: in_maintenance_tag (read)
# Inputs:  one line from stdin; its content is discarded
# Returns: the exit status of read (non-zero at end of input)
interactive_confirm_finished_maintenance() {
    echo "Press enter when the following nodes are online again after maintenance:"
    list_nodes "'$in_maintenance_tag' in tags"

    read -r _
}
# Bring nodes back after maintenance and rebalance the cluster.
#
# Every node tagged as in maintenance that answers a single ping is set
# online and loses both the "in maintenance" and the "needs maintenance"
# tag. Nodes that do not answer are reported on stderr and left alone.
# Then ganeti-watcher is run and the cluster is balanced without disk
# moves; if check_cluster_nplus1 still fails, a second balancing run
# without that restriction follows.
#
# Globals: in_maintenance_tag, needs_maintenance_tag (read);
#          in_maintenance (written)
# Returns: the status of the first failing step, otherwise 0
next_clear_offline() {
    in_maintenance="$(list_nodes "'$in_maintenance_tag' in tags")" || return

    for n in $in_maintenance; do
        if ! ping -c 1 "$n" >/dev/null; then
            echo "$n not reachable, leaving in offline state" >&2
            continue
        fi

        unset_node_offline "$n" || return
        untag_nodes "$in_maintenance_tag" "$n" || return
        untag_nodes "$needs_maintenance_tag" "$n" || return
    done

    _run_invasive ganeti-watcher || return
    balance_cluster --no-disk-moves || return

    if ! check_cluster_nplus1; then
        echo "hbal --no-disk-moves did not leave the cluster N+1, trying again"
        balance_cluster || return
    fi
}
# Print how many nodes carry the given tag (line count of list_nodes).
_count_tagged_nodes() {
    list_nodes "'$1' in tags" | wc -l
}

# "next" command: perform the first applicable maintenance step.
#
# Order of checks: nodes scheduled for maintenance are evacuated; nodes in
# maintenance are confirmed and brought back; nodes needing maintenance get
# a new group scheduled; an unbalanced cluster is balanced. When none
# applies, "finished" is set.
#
# Globals:   next_maintenance_tag, in_maintenance_tag,
#            needs_maintenance_tag (read); finished (written)
# Arguments: none accepted
# Returns:   64 when called with arguments, the status of a failing step,
#            otherwise 0
next() {
    if [ $# -ne 0 ]; then
        echo "next (and run) does not take any arguments" >&2
        return 64
    fi

    if [ "$(_count_tagged_nodes "$next_maintenance_tag")" -gt 0 ]; then
        next_evacuate_nodes || return
    elif [ "$(_count_tagged_nodes "$in_maintenance_tag")" -gt 0 ]; then
        interactive_confirm_finished_maintenance || return
        next_clear_offline || return
    elif [ "$(_count_tagged_nodes "$needs_maintenance_tag")" -gt 0 ]; then
        echo "Finding the next maintenance group"
        next_tag_maintenance_group || return
    elif ! check_balanced; then
        balance_cluster || return
    else
        echo "nothing more to do"
        finished=1
    fi
}
# "run" command: keep calling next until it sets "finished".
#
# Globals:   finished (read)
# Arguments: handed to next unchanged
# Returns:   the status of the first failing next call, otherwise 0
run() {
    until [ -n "$finished" ]; do
        next "$@" || return
    done
}
# Print the help text, optionally preceded by an error message, and
# optionally exit.
#
# Call forms:
#   usage               print the help text and return
#   usage CODE          print the help text and exit with CODE
#   usage MESSAGE CODE  print MESSAGE on stderr, then as "usage CODE"
#
# The help text lists every command the dispatcher accepts and every option
# spelling the option parser accepts, including the short forms and
# -h/--help.
usage() {
    if [ $# -eq 2 ]; then
        echo "$1" >&2
        shift
    fi

    cat <<EOF
Usage: $0 [options] COMMAND

COMMANDS:

  tag [filter]
      Tag all nodes (matching optional filter) as needing maintenance

  next
      Does the first of these steps that are applicable:
      * If there are nodes scheduled for maintenance: Drain them and mark them as offline
      * If there are nodes down for maintenance: Mark them as online after asking for confirmation
      * If there are nodes tagged as needing maintenance: Find a group using hroller and schedule them for maintenance
      * If the cluster is unbalanced: Balance it

  run
      Runs "next" in a loop until there is nothing more to do

OPTIONS:
  --needs-maintenance-tag TAG
  -t TAG
      Tag used to indicate that a node needs
      maintenance. Defaults to needsreboot

  --in-maintenance-tag TAG
      Tag used to indicate that a node is currently
      undergoing maintenance. Defaults to in_maintenance

  --next-maintenance-tag TAG
      Tag used to indicate the next set of nodes scheduled
      for maintenance. Defaults to next_maintenance_group

  --dry-run
  --dryrun
  -n
      Don't actually run any commands that change the state
      of the cluster

  --help
  -h
      Show this help text and exit
EOF

    # After the optional shift a single remaining argument is the exit code.
    if [ $# -eq 1 ]; then
        exit "$1"
    fi
}
# Default tag names; each one can be replaced by the matching command line
# option.
needs_maintenance_tag='needsreboot'
next_maintenance_tag='next_maintenance_group'
in_maintenance_tag='in_maintenance'

# Non-empty switches on dry-run mode (set by --dry-run).
dryrun=

# Set to 1 by "next" once nothing is left to do; ends the loop in "run".
finished=
# Consume leading options; stop at the first argument that is not an option
# (the command name).
#
# Options that take a value check that the value is actually present. With
# "set -u" in effect, expanding a missing $2 would otherwise abort the
# script with an "unbound variable" error instead of a usage message.
while [ $# -gt 0 ]; do
    case "$1" in
        --needs-maintenance-tag|-t)
            [ $# -ge 2 ] || usage "Option $1 requires an argument" 64
            needs_maintenance_tag="$2"
            shift 2
            ;;
        --in-maintenance-tag)
            [ $# -ge 2 ] || usage "Option $1 requires an argument" 64
            in_maintenance_tag="$2"
            shift 2
            ;;
        --next-maintenance-tag)
            [ $# -ge 2 ] || usage "Option $1 requires an argument" 64
            next_maintenance_tag="$2"
            shift 2
            ;;
        --dry-run|--dryrun|-n)
            dryrun=1
            shift 1
            ;;
        -h|--help)
            usage 0
            ;;
        -*)
            usage "Unknown option $1" 64
            ;;
        *)
            # First non-option argument: leave it for the command dispatch.
            break
            ;;
    esac
done
# A command name is mandatory once the options have been consumed.
if [ $# -eq 0 ]; then
    usage "Missing command" 64
fi

# Map the command name onto the function that implements it; whatever
# follows the name is handed to that function.
command=''
case "$1" in
    tag|next|run)
        command=$1
        shift
        ;;
    *)
        usage "Unknown command $1" 64
        ;;
esac

"$command" "$@" || exit $?