2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
18 package org
.apache
.hadoop
.hbase
.master
;
20 import static org
.junit
.Assert
.assertEquals
;
21 import static org
.junit
.Assert
.assertNotEquals
;
22 import static org
.junit
.Assert
.assertNotNull
;
23 import java
.io
.IOException
;
24 import java
.util
.List
;
25 import java
.util
.concurrent
.TimeUnit
;
26 import org
.apache
.hadoop
.conf
.Configuration
;
27 import org
.apache
.hadoop
.hbase
.ClusterMetrics
;
28 import org
.apache
.hadoop
.hbase
.HBaseClassTestRule
;
29 import org
.apache
.hadoop
.hbase
.HBaseConfiguration
;
30 import org
.apache
.hadoop
.hbase
.HBaseTestingUtility
;
31 import org
.apache
.hadoop
.hbase
.LocalHBaseCluster
;
32 import org
.apache
.hadoop
.hbase
.MiniHBaseCluster
;
33 import org
.apache
.hadoop
.hbase
.StartMiniClusterOption
;
34 import org
.apache
.hadoop
.hbase
.client
.RetriesExhaustedException
;
35 import org
.apache
.hadoop
.hbase
.exceptions
.ConnectionClosedException
;
36 import org
.apache
.hadoop
.hbase
.testclassification
.LargeTests
;
37 import org
.apache
.hadoop
.hbase
.testclassification
.MasterTests
;
38 import org
.apache
.hadoop
.hbase
.util
.JVMClusterUtil
.MasterThread
;
39 import org
.apache
.hadoop
.hbase
.zookeeper
.ReadOnlyZKClient
;
40 import org
.junit
.Before
;
41 import org
.junit
.ClassRule
;
42 import org
.junit
.Test
;
43 import org
.junit
.experimental
.categories
.Category
;
44 import org
.slf4j
.Logger
;
45 import org
.slf4j
.LoggerFactory
;
46 import org
.apache
.hbase
.thirdparty
.org
.apache
.commons
.collections4
.CollectionUtils
;
48 @Category({MasterTests
.class, LargeTests
.class})
49 public class TestMasterShutdown
{
// SLF4J logger for this test class.
50 private static final Logger LOG
= LoggerFactory
.getLogger(TestMasterShutdown
.class);
// Class-level test rule obtained from HBaseClassTestRule.forClass for this class.
53 public static final HBaseClassTestRule CLASS_RULE
=
54 HBaseClassTestRule
.forClass(TestMasterShutdown
.class);
// Testing utility assigned by each test method; shutdownCluster() also calls
// shutdownMiniCluster() on it.
56 private HBaseTestingUtility htu
;
// Logs a warning that a previous test did not terminate cleanly and shuts down the mini cluster
// held in htu.
// NOTE(review): the warning text ("found non-null TestingUtility") implies these statements are
// guarded by a null check on htu; the guard and the closing braces are not visible in this view.
// Confirm against the full file.
59 public void shutdownCluster() throws IOException
{
61 // an extra check in case the test cluster was not terminated after HBaseClassTestRule's
62 // Timeout interrupted the test thread.
63 LOG
.warn("found non-null TestingUtility -- previous test did not terminate cleanly.");
64 htu
.shutdownMiniCluster();
69 * Simple test of shutdown.
71 * Starts with three masters. Tells the active master to shut down the cluster.
72 * Verifies that all masters are properly shut down.
// Starts a mini cluster, waits until at least three master threads are alive, checks that the
// active master reports two backup masters, then waits for every master thread and every region
// server thread to exit before shutting the mini cluster down.
75 public void testMasterShutdown() throws Exception
{
76 // Create config to use for this cluster
77 Configuration conf
= HBaseConfiguration
.create();
81 htu
= new HBaseTestingUtility(conf
);
// NOTE(review): the options chained after builder() are not visible in this view. The checks
// below expect three masters (one active, two backups) -- confirm the builder requests that.
82 StartMiniClusterOption option
= StartMiniClusterOption
.builder()
87 final MiniHBaseCluster cluster
= htu
.startMiniCluster(option
);
89 // wait for all master threads to spawn and start their run loop.
// Each wait below polls every second for up to thirty seconds; a result of -1 fails the test
// (presumably -1 is waitFor's timeout indicator -- confirm against HBaseTestingUtility).
90 final long thirtySeconds
= TimeUnit
.SECONDS
.toMillis(30);
91 final long oneSecond
= TimeUnit
.SECONDS
.toMillis(1);
92 assertNotEquals(-1, htu
.waitFor(thirtySeconds
, oneSecond
, () -> {
93 final List
<MasterThread
> masterThreads
= cluster
.getMasterThreads();
// Satisfied once the list is non-null, holds at least three threads, and all are alive.
94 return masterThreads
!= null
95 && masterThreads
.size() >= 3
96 && masterThreads
.stream().allMatch(Thread
::isAlive
);
99 // find the active master
100 final HMaster active
= cluster
.getMaster();
101 assertNotNull(active
);
103 // make sure the other two are backup masters
104 ClusterMetrics status
= active
.getClusterMetrics();
105 assertEquals(2, status
.getBackupMasterNames().size());
107 // tell the active master to shutdown the cluster
// NOTE(review): the call that actually requests the shutdown is not visible in this view; only
// the waits for its effect follow.
// Wait until no live master threads remain.
109 assertNotEquals(-1, htu
.waitFor(thirtySeconds
, oneSecond
,
110 () -> CollectionUtils
.isEmpty(cluster
.getLiveMasterThreads())));
// Wait until no live region server threads remain.
111 assertNotEquals(-1, htu
.waitFor(thirtySeconds
, oneSecond
,
112 () -> CollectionUtils
.isEmpty(cluster
.getLiveRegionServerThreads())));
// Tear down the mini cluster started above.
115 htu
.shutdownMiniCluster();
122 * This test appears to be an intentional race between a thread that issues a shutdown RPC to the
123 * master, while the master is concurrently realizing it cannot initialize because there are no
124 * region servers available to it. The expected behavior is that master initialization is
125 * interruptible via said shutdown RPC.
// Starts DFS, ZooKeeper and a LocalHBaseCluster by hand, starts only the first master thread,
// waits for that master's server manager to become non-null, sends a shutdown RPC through the
// admin client, and finally shuts down whatever was started.
128 public void testMasterShutdownBeforeStartingAnyRegionServer() throws Exception
{
// Stays null until the LocalHBaseCluster is constructed; the cleanup below checks for that.
129 LocalHBaseCluster hbaseCluster
= null;
131 htu
= new HBaseTestingUtility(
132 createMasterShutdownBeforeStartingAnyRegionServerConfiguration());
134 // configure a cluster with explicit master and region server classes
// NOTE(review): other options chained on this builder (e.g. master / region server counts) are
// not visible in this view.
135 final StartMiniClusterOption options
= StartMiniClusterOption
.builder()
139 .masterClass(HMaster
.class)
140 .rsClass(MiniHBaseCluster
.MiniHBaseClusterRegionServer
.class)
144 // Can't simply `htu.startMiniCluster(options)` because that method waits for the master to
145 // start completely. However, this test's premise is that a partially started master should
146 // still respond to a shutdown RPC. So instead, we manage each component lifecycle
// independently: DFS first, then ZooKeeper, then the HBase cluster object itself.
148 // I think it's not worth refactoring HTU's helper methods just for this class.
149 htu
.startMiniDFSCluster(options
.getNumDataNodes());
150 htu
.startMiniZKCluster(options
.getNumZkServers());
152 hbaseCluster
= new LocalHBaseCluster(htu
.getConfiguration(), options
.getNumMasters(),
153 options
.getNumRegionServers(), options
.getMasterClass(), options
.getRsClass());
// Only the first master thread is started here; no region server start is visible in this test.
154 final MasterThread masterThread
= hbaseCluster
.getMasters().get(0);
156 masterThread
.start();
157 // Switching to master registry exacerbated a race in the master bootstrap that can result
158 // in a lost shutdown command (HBASE-8422, HBASE-23836). The race is essentially because
159 // the server manager in HMaster is not initialized by the time shutdown() RPC (below) is
160 // made to the master. The suspected reason as to why it was uncommon before HBASE-18095
161 // is because the connection creation with ZK registry is so slow that by then the server
162 // manager is usually init'ed in time for the RPC to be made. For now, adding an explicit
163 // wait() in the test, waiting for the server manager to become available.
// Allow up to ten minutes for getServerManager() to return non-null; -1 fails the test with the
// timeout message below.
164 final long timeout
= TimeUnit
.MINUTES
.toMillis(10);
165 assertNotEquals("timeout waiting for server manager to become available.", -1,
166 htu
.waitFor(timeout
, () -> masterThread
.getMaster().getServerManager() != null));
168 // Master has come up far enough that we can terminate it without creating a zombie.
170 // HBASE-24327 : (Resolve Flaky connection issues)
171 // shutdown() RPC can have flaky ZK connection issues.
173 // ERROR [RpcServer.priority.RWQ.Fifo.read.handler=1,queue=1,port=53033]
174 // master.HMaster(2878): ZooKeeper exception trying to set cluster as down in ZK
175 // org.apache.zookeeper.KeeperException$SystemErrorException:
176 // KeeperErrorCode = SystemError
178 // However, even when above flakes happen, shutdown call does get completed even if
179 // RPC call has failure. Hence, subsequent retries will never succeed as HMaster is
180 // already shutdown. Hence, it can fail. To resolve it, after making one shutdown()
181 // call, we are ignoring IOException.
182 htu
.getConnection().getAdmin().shutdown();
// NOTE(review): the visible handler catches RetriesExhaustedException and logs at INFO when its
// cause is a ConnectionClosedException. The enclosing try and any other branch are not visible in
// this view -- confirm what happens for other causes.
183 } catch (RetriesExhaustedException e
) {
184 if (e
.getCause() instanceof ConnectionClosedException
) {
185 LOG
.info("Connection is Closed to the cluster. The cluster is already down.", e
);
190 LOG
.info("Shutdown RPC sent.");
// Cleanup: shut down the LocalHBaseCluster if it was created, then the mini cluster held by htu.
193 if (hbaseCluster
!= null) {
194 hbaseCluster
.shutdown();
197 htu
.shutdownMiniCluster();
204 * Create a cluster configuration suitable for
205 * {@link #testMasterShutdownBeforeStartingAnyRegionServer()}.
207 private static Configuration
createMasterShutdownBeforeStartingAnyRegionServerConfiguration() {
208 final Configuration conf
= HBaseConfiguration
.create();
209 // make sure the master will wait forever in the absence of a RS.
210 conf
.setInt(ServerManager
.WAIT_ON_REGIONSERVERS_MINTOSTART
, 1);
211 // don't need a long write pipeline for this test.
212 conf
.setInt("dfs.replication", 1);
213 // reduce client retries
214 conf
.setInt("hbase.client.retries.number", 1);
215 // Recoverable ZK configs are tuned more aggressively
216 conf
.setInt(ReadOnlyZKClient
.RECOVERY_RETRY
, 3);
217 conf
.setInt(ReadOnlyZKClient
.RECOVERY_RETRY_INTERVAL_MILLIS
, 100);