HBASE-26811 Secondary replica may be disabled for read forever (#4182)
[hbase.git] / bin / rolling-restart.sh
blob46d5cba51b20a1dd90d8c2d7e3ba214c478f11d7
1 #!/usr/bin/env bash
3 #/**
4 # * Licensed to the Apache Software Foundation (ASF) under one
5 # * or more contributor license agreements. See the NOTICE file
6 # * distributed with this work for additional information
7 # * regarding copyright ownership. The ASF licenses this file
8 # * to you under the Apache License, Version 2.0 (the
9 # * "License"); you may not use this file except in compliance
10 # * with the License. You may obtain a copy of the License at
11 # *
12 # * http://www.apache.org/licenses/LICENSE-2.0
13 # *
14 # * Unless required by applicable law or agreed to in writing, software
15 # * distributed under the License is distributed on an "AS IS" BASIS,
16 # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 # * See the License for the specific language governing permissions and
18 # * limitations under the License.
19 # */
21 # Perform a rolling restart of the HBase master(s) and/or regionservers.
23 # Environment Variables
25 # HBASE_REGIONSERVERS File naming remote hosts.
26 # Default is ${HADOOP_CONF_DIR}/regionservers
27 # HADOOP_CONF_DIR Alternate conf dir. Default is ${HADOOP_HOME}/conf.
28 # HBASE_CONF_DIR Alternate hbase conf dir. Default is ${HBASE_HOME}/conf.
29 # HBASE_SLAVE_SLEEP Seconds to sleep between spawning remote commands.
30 # HBASE_SLAVE_TIMEOUT Seconds to wait for timing out a remote command.
31 # HBASE_SSH_OPTS Options passed to ssh when running remote commands.
33 # Modelled after $HADOOP_HOME/bin/slaves.sh.
# Usage text printed by --help and after a bad argument.
# "$0" is quoted so a script path containing spaces still yields a clean name,
# and every option that takes a value shows a placeholder for it.
usage_str="Usage: $(basename "$0") [--config <hbase-confdir>]\
 [--autostart-window-size <window size in hours>]\
 [--autostart-window-retry-limit <retry count limit for autostart>]\
 [--autostart] [--rs-only] [--master-only] [--graceful]\
 [--maxthreads xx] [--noack] [--movetimeout xx]"

# Print the usage text on stdout.
function usage() {
  echo "${usage_str}"
}
# Resolve the directory holding this script to an absolute path. Bail out
# instead of silently falling back to the caller's working directory when the
# directory cannot be entered.
bin=$(dirname "$0")
bin=$(cd "$bin" >/dev/null && pwd) || exit 1

# default autostart args value indicating infinite window size and no retry limit
AUTOSTART_WINDOW_SIZE=0
AUTOSTART_WINDOW_RETRY_LIMIT=0

# NOTE(review): --config and the --autostart-window-* options listed in the
# usage text are not handled by this script's own option loop; presumably
# hbase-config.sh consumes them - confirm against that file.
. "$bin"/hbase-config.sh

# Abort with the same status if sourcing hbase-config.sh failed.
errCode=$?
if [ $errCode -ne 0 ]
then
  exit $errCode
fi

# What to restart: by default the master(s) and every regionserver.
RR_RS=1
RR_MASTER=1
RR_GRACEFUL=0
# Values forwarded to graceful_stop.sh when --graceful is selected.
RR_MAXTHREADS=1
RR_MOVE_TIMEOUT=2147483647
# Sub-commands handed to hbase-daemon.sh / hbase-daemons.sh; --autostart
# replaces them with their auto-restarting variants.
START_CMD_NON_DIST_MODE=restart
START_CMD_DIST_MODE=start
RESTART_CMD_REGIONSERVER=restart
# Reject a missing or non-numeric option value.
#   $1 - option name (used in the error message)
#   $2 - value supplied for the option (may be empty)
# Prints an error plus the usage text and exits 1 on a bad value.
function rr_require_int() {
  case "$2" in
    ''|*[!0-9]*)
      echo "Bad argument: $1 requires a non-negative integer value"
      usage
      exit 1
      ;;
  esac
}

# Parse this script's own command-line options into the RR_* and *_CMD_*
# globals. The three selection flags are mutually exclusive; the last one
# given wins. Exits 0 after --help and 1 on an unknown or malformed option.
function rr_parse_args() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --rs-only|-r)
        RR_RS=1
        RR_MASTER=0
        RR_GRACEFUL=0
        shift
        ;;
      --autostart)
        # Capture the autostart window settings as they are at parse time.
        START_CMD_NON_DIST_MODE="--autostart-window-size ${AUTOSTART_WINDOW_SIZE} --autostart-window-retry-limit ${AUTOSTART_WINDOW_RETRY_LIMIT} autorestart"
        START_CMD_DIST_MODE="--autostart-window-size ${AUTOSTART_WINDOW_SIZE} --autostart-window-retry-limit ${AUTOSTART_WINDOW_RETRY_LIMIT} autostart"
        RESTART_CMD_REGIONSERVER="--autostart-window-size ${AUTOSTART_WINDOW_SIZE} --autostart-window-retry-limit ${AUTOSTART_WINDOW_RETRY_LIMIT} autorestart"
        shift
        ;;
      --master-only)
        RR_RS=0
        RR_MASTER=1
        RR_GRACEFUL=0
        shift
        ;;
      --graceful)
        RR_RS=0
        RR_MASTER=0
        RR_GRACEFUL=1
        shift
        ;;
      --maxthreads)
        # Validate before storing: the value is later expanded unquoted into
        # the graceful_stop.sh command line, where an empty value would make
        # the following option be read as the thread count.
        rr_require_int "$1" "${2:-}"
        RR_MAXTHREADS=$2
        shift 2
        ;;
      --noack)
        RR_NOACK="--noack"
        shift
        ;;
      --movetimeout)
        rr_require_int "$1" "${2:-}"
        RR_MOVE_TIMEOUT=$2
        shift 2
        ;;
      --help|-h)
        usage
        exit 0
        ;;
      *)
        echo "Bad argument: $1"
        usage
        exit 1
        ;;
    esac
  done
}

rr_parse_args "$@"
# Read hbase.cluster.distributed from the HBase configuration.
# HBASE-6504 - only take the first line of the output in case verbose gc is on
distMode=$(HBASE_CONF_DIR=${HBASE_CONF_DIR} "$bin"/hbase org.apache.hadoop.hbase.util.HBaseConfTool hbase.cluster.distributed | head -n 1)
if [ "$distMode" == 'false' ]; then
  # Non-distributed mode: only a full restart through the master daemon is
  # supported, so reject any selective mode.
  if [ "$RR_RS" -ne 1 ] || [ "$RR_MASTER" -ne 1 ]; then
    echo "Cant do selective rolling restart if not running distributed"
    exit 1
  fi
  # START_CMD_NON_DIST_MODE is deliberately unquoted: after --autostart it
  # holds several words that must be passed as separate arguments.
  "$bin"/hbase-daemon.sh ${START_CMD_NON_DIST_MODE} master
else
  # HBaseConfTool prints "null" for an unset key; fall back to the defaults.
  zparent=$("$bin"/hbase org.apache.hadoop.hbase.util.HBaseConfTool zookeeper.znode.parent)
  if [ "$zparent" == "null" ]; then zparent="/hbase"; fi

  if [ "$RR_MASTER" -eq 1 ]; then
    # stop all masters before re-start to avoid races for master znode
    "$bin"/hbase-daemon.sh --config "${HBASE_CONF_DIR}" stop master
    "$bin"/hbase-daemons.sh --config "${HBASE_CONF_DIR}" \
      --hosts "${HBASE_BACKUP_MASTERS}" stop master-backup

    # make sure the master znode has been deleted before continuing
    zmaster=$("$bin"/hbase org.apache.hadoop.hbase.util.HBaseConfTool zookeeper.znode.master)
    if [ "$zmaster" == "null" ]; then zmaster="master"; fi
    zmaster=$zparent/$zmaster
    echo -n "Waiting for Master ZNode ${zmaster} to expire"
    echo
    # Poll once a second until zkcli reports that the znode is gone.
    while ! "$bin"/hbase zkcli stat "$zmaster" 2>&1 | grep "Node does not exist"; do
      echo -n "."
      sleep 1
    done
    echo #force a newline

    # all masters are down, now restart
    # (the start command is unquoted on purpose, see START_CMD_NON_DIST_MODE)
    "$bin"/hbase-daemon.sh --config "${HBASE_CONF_DIR}" ${START_CMD_DIST_MODE} master
    "$bin"/hbase-daemons.sh --config "${HBASE_CONF_DIR}" \
      --hosts "${HBASE_BACKUP_MASTERS}" ${START_CMD_DIST_MODE} master-backup

    echo "Wait a minute for master to come up join cluster"
    sleep 60

    # A master joining the cluster starts by cleaning out regions in transition.
    # Wait until the master has cleaned out regions in transition before
    # giving it a bunch of work to do; master is vulnerable during startup
    zunassigned=$("$bin"/hbase org.apache.hadoop.hbase.util.HBaseConfTool zookeeper.znode.unassigned)
    if [ "$zunassigned" == "null" ]; then zunassigned="region-in-transition"; fi
    zunassigned="$zparent/$zunassigned"
    # Check whether the region-in-transition znode exists at all; the last
    # line of the zkcli output says so when it does not.
    if "$bin"/hbase zkcli stat "${zunassigned}" 2>&1 | tail -1 \
        | grep -q "Node does not exist:"; then
      echo "Znode ${zunassigned} does not exist"
    else
      echo -n "Waiting for ${zunassigned} to empty"
      while true; do
        # Extract only the digits of numChildren so that a missing or
        # malformed line yields an empty string rather than breaking the
        # numeric comparison below.
        unassigned=$("$bin"/hbase zkcli stat "${zunassigned}" 2>&1 \
          | sed -n -e 's,.*numChildren = \([0-9][0-9]*\).*,\1,p' | head -n 1)
        if [ -n "${unassigned}" ] && [ "${unassigned}" -eq 0 ]; then
          echo
          break
        else
          echo -n " ${unassigned}"
        fi
        sleep 1
      done
    fi
  fi

  if [ "$RR_RS" -eq 1 ]; then
    # unlike the masters, roll all regionservers one-at-a-time
    export HBASE_SLAVE_PARALLEL=false
    "$bin"/hbase-daemons.sh --config "${HBASE_CONF_DIR}" \
      --hosts "${HBASE_REGIONSERVERS}" ${RESTART_CMD_REGIONSERVER} regionserver
  fi

  if [ "$RR_GRACEFUL" -eq 1 ]; then
    # gracefully restart all online regionservers
    masterport=$("$bin"/hbase org.apache.hadoop.hbase.util.HBaseConfTool hbase.master.port)
    if [ "$masterport" == "null" ]; then masterport="16000"; fi
    zkrs=$("$bin"/hbase org.apache.hadoop.hbase.util.HBaseConfTool zookeeper.znode.rs)
    if [ "$zkrs" == "null" ]; then zkrs="rs"; fi
    zkrs="$zparent/$zkrs"
    # The last line of "zkcli ls" is the bracketed child list; strip the
    # brackets and let word splitting separate the entries.
    online_regionservers=$("$bin"/hbase zkcli ls "$zkrs" 2>&1 | tail -1 \
      | sed -e 's/\[//' -e 's/\]//')
    echo "Disabling load balancer"
    HBASE_BALANCER_STATE=$(echo 'balance_switch false' | "$bin"/hbase --config "${HBASE_CONF_DIR}" shell -n | tail -1)
    echo "Previous balancer state was $HBASE_BALANCER_STATE"

    # Intentionally unquoted: one loop iteration per whitespace-separated entry.
    for rs in $online_regionservers
    do
      # Each entry is comma-separated; the first two fields are used as host
      # and port. The expansion is unquoted so the fields split into the array.
      rs_parts=(${rs//,/ })
      hostname=${rs_parts[0]}
      port=${rs_parts[1]}
      if [ "$port" -eq "$masterport" ]; then
        echo "Skipping regionserver on master machine $hostname:$port"
        continue
      else
        echo "Gracefully restarting: $hostname"
        # RR_NOACK stays unquoted so that it vanishes entirely when unset.
        "$bin"/graceful_stop.sh --config "${HBASE_CONF_DIR}" --restart --reload -nob --maxthreads \
          "${RR_MAXTHREADS}" ${RR_NOACK} --movetimeout "${RR_MOVE_TIMEOUT}" "$hostname"
        sleep 1
      fi
    done
    # Re-enable the balancer unless it was already off before the restart.
    if [ "$HBASE_BALANCER_STATE" != "false" ]; then
      echo "Restoring balancer state to $HBASE_BALANCER_STATE"
      echo "balance_switch $HBASE_BALANCER_STATE" | "$bin"/hbase --config "${HBASE_CONF_DIR}" shell &> /dev/null
    fi
  fi
fi