# Copyright 2017 Easy Connect AS
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# Knut Arne Bjørndal <knut.arne.bjorndal@easyconnect.no>
# Dry-run handling: the test below is true when $dryrun is non-empty, and the
# echo prints "Executing:" followed by every positional parameter.
# NOTE(review): the enclosing function header and the lines between and after
# these two statements are not visible in this excerpt; presumably this is the
# body of _run_invasive -- confirm against the full file.
35 if [ -n "$dryrun" ]; then
38 echo "Executing:" "$@"
# Pre-flight check. The pipeline keeps hcheck's
# HCHECK_INIT_CLUSTER_OFFLINE_PRI= line and then discards it if it contains
# "=0", so the condition succeeds only when such a line without "=0" remains;
# in that case the "aborting" message is printed. Afterwards
# "gnt-cluster verify" is run through _run_invasive and its failure status is
# returned to the caller.
# NOTE(review): the function header, the body after the echo and the closing
# lines are not visible in this excerpt; presumably this is verify_cluster.
44 if hcheck
-L --no-simulation --machine-readable |
grep ^HCHECK_INIT_CLUSTER_OFFLINE_PRI
= |
grep -v =0 >/dev
/null
; then
45 echo "hcheck reports instances with offline or drained primary nodes, aborting";
48 _run_invasive gnt-cluster verify ||
return $?
# Check hcheck's N+1 failure counter for the cluster.
#
# Succeeds (status 0) when hcheck's machine-readable output contains a
# HCHECK_INIT_CLUSTER_N1_FAIL= line that matches "=0"; fails otherwise.
# Takes no arguments and prints nothing.
check_cluster_nplus1() {
    hcheck -L --no-simulation --machine-readable |
        grep '^HCHECK_INIT_CLUSTER_N1_FAIL=' |
        grep '=0' >/dev/null
}
# For every node group UUID printed by "gnt-group list", run hbal for that
# group and test whether the number after "Solution length=" is greater
# than 0.
# NOTE(review): the function header, the body of the "then" branch and the
# loop/function terminators are not visible in this excerpt; presumably this
# is check_balanced (it is called by that name further down) -- confirm.
56 for nodegroup
in $
(gnt-group list
--no-headers -o uuid
); do
57 if [ "$(hbal -L -G "$nodegroup" | egrep '^Solution length=.*' | cut -d= -f2)" -gt 0 ]; then
61 # If we get here all the node groups are balanced
# Two variants of the node-name listing: the first hands the positional
# parameters to "gnt-node list" after -F, the second lists the names with no
# -F argument.
# NOTE(review): the function header and the condition choosing between the
# two calls are not visible in this excerpt; presumably this is list_nodes.
67 gnt-node list
--no-headers -o name
-F "$@"
69 gnt-node list
--no-headers -o name
# Adds tag $tag to node $n with "gnt-node add-tags" (through _run_invasive)
# and returns the failure status if that command fails.
# NOTE(review): the function header and the code that sets $tag and $n are
# not visible in this excerpt; presumably this is the body of tag_nodes.
76 _run_invasive gnt-node add-tags
"$n" "$tag" ||
return $?
# Removes tag $tag from node $n with "gnt-node remove-tags" (through
# _run_invasive) and returns the failure status if that command fails.
# NOTE(review): the function header and the code that sets $tag and $n are
# not visible in this excerpt; presumably this is the body of untag_nodes.
83 _run_invasive gnt-node remove-tags
"$n" "$tag" ||
return $?
# Runs hroller restricted to nodes tagged $needs_maintenance_tag, asking for
# one step only and no header line.
# NOTE(review): the enclosing function header is not visible in this excerpt;
# presumably this is the body of get_rebootgroup -- confirm.
88 hroller
-L --node-tags "$needs_maintenance_tag" --one-step-only --no-headers
# Looks up the current master with "gnt-cluster getmaster" (returning that
# command's failure status on error) and compares it with each word of
# $rebootgroup; on a match it prints a message asking the operator to fail
# over to another master candidate and re-run the script there.
# NOTE(review): the line directly after the header and everything after the
# echo (including how the function returns) are not visible in this excerpt.
91 abort_if_master_in_rebootgroup
() {
93 master
=$
(gnt-cluster getmaster
) ||
return $?
94 for n
in $rebootgroup; do
95 if [ "$n" = "$master" ]; then
96 echo "Master ($master) is in next rebootgroup, please failover to a different master candidate and re-run this script there to continue";
# Runs "gnt-node modify -D yes" for the node named in $1, through
# _run_invasive.
# NOTE(review): the enclosing function header is not visible in this excerpt;
# presumably this is the body of set_node_drained.
105 _run_invasive gnt-node modify
-D yes "$1"
# Runs "gnt-node modify -O yes" for the node named in $1, through
# _run_invasive.
# NOTE(review): the enclosing function header is not visible in this excerpt;
# presumably this is the body of set_node_offline.
109 _run_invasive gnt-node modify
-O yes "$1"
# Clear the offline flag of a node.
#
# Arguments: $1 - node name handed to "gnt-node modify -O no"
# Returns:   the exit status of _run_invasive
unset_node_offline() {
    _run_invasive gnt-node modify -O no "$1"
}
# For every node group UUID printed by "gnt-group list", runs
# "hbal -L -G <group> -X" plus any positional parameters through
# _run_invasive, returning the failure status as soon as one run fails.
# NOTE(review): the function header and the loop/function terminators are not
# visible in this excerpt; presumably this is the body of balance_cluster.
117 for nodegroup
in $
(gnt-group list
--no-headers -o uuid
); do
118 _run_invasive hbal
-L -G "$nodegroup" -X "$@" ||
return $?
# Collects node names from list_nodes (passing the positional parameters
# through), tags each of them with $needs_maintenance_tag via tag_nodes
# ($nodes is deliberately left unquoted so that it splits into one argument
# per node), then prints the nodes that now carry the tag.
# NOTE(review): the function header is not visible in this excerpt;
# presumably this is the "tag" command -- confirm against the full file.
122 # shellcheck disable=SC2086
124 nodes
=$
(list_nodes
"$@") ||
return $?
;
125 tag_nodes
"$needs_maintenance_tag" $nodes ||
return $?
;
127 echo "Nodes with $needs_maintenance_tag tag:"
128 list_nodes
"'$needs_maintenance_tag' in tags";
# Picks the next group of nodes for maintenance: runs verify_cluster, stores
# the output of get_rebootgroup in $rebootgroup, calls
# abort_if_master_in_rebootgroup with that list, and finally tags the listed
# nodes with $next_maintenance_tag. Every step returns its failure status to
# the caller. $rebootgroup is deliberately unquoted so that it splits into
# one argument per node.
# NOTE(review): some lines of this function (for example whatever follows the
# "Next maintenance group:" echo) and its closing brace are not visible in
# this excerpt.
131 # shellcheck disable=SC2086
132 next_tag_maintenance_group
() {
133 verify_cluster ||
return $?
;
135 rebootgroup
=$
(get_rebootgroup
) ||
return $?
;
136 echo "Next maintenance group:";
139 abort_if_master_in_rebootgroup
$rebootgroup ||
return $?
;
141 tag_nodes
"$next_maintenance_tag" $rebootgroup ||
return $?
;
# Evacuates the nodes tagged $next_maintenance_tag. The visible steps, each
# returning its failure status to the caller, are: set_node_drained for node
# $n, "gnt-node migrate -f" for $n via _run_invasive,
# "gnt-cluster verify --no-nplus1-mem", set_node_offline for $n, tagging $n
# with $in_maintenance_tag and removing $next_maintenance_tag from it.
# NOTE(review): the loop that sets $n, several intermediate lines and the
# function's tail are not visible in this excerpt -- confirm the exact
# ordering and grouping against the full file.
144 next_evacuate_nodes
() {
145 nodes
="$(list_nodes "'$next_maintenance_tag' in tags
" )" ||
return $?
;
148 set_node_drained
"$n" ||
return $?
150 _run_invasive gnt-node migrate
-f "$n" ||
return $?
153 gnt-cluster verify
--no-nplus1-mem ||
return $?
156 set_node_offline
"$n" ||
return $?
;
157 tag_nodes
"$in_maintenance_tag" "$n" ||
return $?
;
158 untag_nodes
"$next_maintenance_tag" "$n" ||
return $?
;
161 echo "Nodes drained and ready for maintenance:";
# Prompts the operator: prints a "Press enter ..." message followed by the
# nodes that carry $in_maintenance_tag.
# NOTE(review): the statement that actually waits for input and the closing
# brace are not visible in this excerpt.
165 interactive_confirm_finished_maintenance
() {
166 echo "Press enter when the following nodes are online again after maintenance:";
167 list_nodes
"'$in_maintenance_tag' in tags"
# Bring nodes that were under maintenance back into service.
#
# For every node carrying $in_maintenance_tag: if it answers a single ping,
# clear its offline flag and remove both $in_maintenance_tag and
# $needs_maintenance_tag from it; otherwise leave it offline and warn on
# stderr. Afterwards run ganeti-watcher and rebalance with --no-disk-moves;
# if check_cluster_nplus1 still fails, rebalance once more without that
# restriction.
#
# Globals: in_maintenance_tag, needs_maintenance_tag (read);
#          in_maintenance, n (written)
# Returns: 0 on success, otherwise the status of the first failing step.
next_clear_offline() {
    in_maintenance="$(list_nodes "'$in_maintenance_tag' in tags")" || return $?

    # Unquoted on purpose: one word per node name.
    for n in $in_maintenance; do
        if ping -c 1 "$n" >/dev/null; then
            unset_node_offline "$n" || return $?
            untag_nodes "$in_maintenance_tag" "$n" || return $?
            untag_nodes "$needs_maintenance_tag" "$n" || return $?
        else
            echo "$n not reachable, leaving in offline state" >&2
        fi
    done

    _run_invasive ganeti-watcher || return $?
    balance_cluster --no-disk-moves || return $?
    check_cluster_nplus1 || {
        echo "hbal --no-disk-moves did not leave the cluster N+1, trying again"
        balance_cluster || return $?
    }
}
# One maintenance step. After an error message about unexpected arguments,
# the if/elif chain takes the first branch that applies:
#   * nodes tagged $next_maintenance_tag exist -> next_evacuate_nodes
#   * nodes tagged $in_maintenance_tag exist   ->
#       interactive_confirm_finished_maintenance, then next_clear_offline
#   * nodes tagged $needs_maintenance_tag exist -> next_tag_maintenance_group
#   * check_balanced fails                      -> balance_cluster
# and "nothing more to do" is printed further down. Each branch is selected
# by counting the lines list_nodes prints for the tag filter.
# NOTE(review): the function header, the condition guarding the first echo,
# the final else/fi and anything that marks completion are not visible in
# this excerpt; presumably this is the "next" command.
194 echo "next (and run) does not take any arguments" >&2
198 if [ "$(list_nodes "'$next_maintenance_tag' in tags
" | wc -l)" -gt 0 ]; then
199 next_evacuate_nodes ||
return $?
;
200 elif [ "$(list_nodes "'$in_maintenance_tag' in tags
" | wc -l)" -gt 0 ]; then
201 interactive_confirm_finished_maintenance ||
return $?
;
202 next_clear_offline ||
return $?
;
203 elif [ "$(list_nodes "'$needs_maintenance_tag' in tags
" | wc -l)" -gt 0 ]; then
204 echo "Finding the next maintenance group";
205 next_tag_maintenance_group ||
return $?
206 elif ! check_balanced
; then
207 balance_cluster ||
return $?
;
209 echo "nothing more to do";
# Calls "next" with the positional parameters repeatedly for as long as
# $finished is empty, returning the failure status as soon as one call fails.
# NOTE(review): the function header, the loop terminator and the place where
# $finished gets set are not visible in this excerpt; presumably this is the
# "run" command.
215 while [ ! "$finished" ]; do
216 next
"$@" ||
return $?
# Help output. The first test is true when exactly two arguments were given
# and the last one when exactly one was given; the lines in between are the
# help text describing the commands and the options.
# NOTE(review): the function header, the bodies of both "if" statements and
# the construct that prints the help text are not visible in this excerpt;
# presumably this is usage, which callers invoke as: usage "message" 64.
221 if [ $# -eq 2 ]; then
227 Usage: $0 [options] COMMAND
232 Tag all nodes (matching optional filter) as needing maintenance
235 Does the first of these steps that are applicable:
236 * If there are nodes scheduled for maintenance: Drain them and mark them as offline
237 * If there are nodes down for maintenance: Mark them as online after asking for confirmation
238 * If there are nodes tagged as needing maintenance: Find a group using hroller and schedule them for maintenance
239 * If the cluster is unbalanced: Balance it
242 Runs "next" in a loop until there is nothing more to do
245 --needs-maintenance-tag
247 Tag used to indicate that a node needs
248 maintenance. Defaults to needsreboot
252 Tag used to indicate that a node is currently
253 undergoing maintenance. Defaults to in_maintenance
255 --next-maintenance-tag
257 Tag used to indicate the next set of nodes scheduled
258 for maintenance. Defaults to next_maintenance_group
262 Don't actually run any commands that change the state
267 if [ $# -eq 1 ]; then
# Default tag names. The --needs-maintenance-tag/-t, --next-maintenance-tag
# and --in-maintenance-tag options overwrite the corresponding variable.
needs_maintenance_tag=needsreboot
next_maintenance_tag=next_maintenance_group
in_maintenance_tag=in_maintenance
# Option parsing: loops while positional parameters remain. The visible arms
# store "$2" in needs_maintenance_tag (--needs-maintenance-tag or -t),
# in_maintenance_tag (--in-maintenance-tag) and next_maintenance_tag
# (--next-maintenance-tag), recognise --dry-run/--dryrun/-n, and call usage
# with "Unknown option $1" and 64.
# NOTE(review): the case/esac lines, the shift statements, the body of the
# dry-run arm and the loop terminator are not visible in this excerpt.
279 while [ $# -gt 0 ]; do
281 --needs-maintenance-tag|
-t)
282 needs_maintenance_tag
="$2";
285 --in-maintenance-tag)
286 in_maintenance_tag
="$2";
289 --next-maintenance-tag)
290 next_maintenance_tag
="$2";
293 --dry-run|
--dryrun|
-n)
301 usage
"Unknown option $1" 64;
# No positional parameter left means no command was given: report that
# through usage, passing 64 as its second argument.
test $# -eq 0 && usage "Missing command" 64
# Command selection: the arms for tag, next and run store the command name in
# $command and shift it off the argument list; the remaining statement calls
# usage with "Unknown command $1" and 64.
# NOTE(review): the opening "case" line, the pattern of the fallback arm and
# the closing "esac" are not visible in this excerpt.
313 tag
) command='tag'; shift ;;
314 next
) command='next'; shift ;;
315 run
) command='run'; shift ;;
317 usage
"Unknown command $1" 64;
# Run the selected command with the remaining arguments and exit with its
# status if it fails.
"$command" "$@" || exit $?