HBASE-25032 Do not assign regions to region server which has not called regionServerR...
[hbase.git] / hbase-server / src / test / java / org / apache / hadoop / hbase / master / TestMasterShutdown.java
blobcd8c4d2a16c6c4618dbde704147bcf1f67e99056
1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
18 package org.apache.hadoop.hbase.master;
20 import static org.junit.Assert.assertEquals;
21 import static org.junit.Assert.assertNotEquals;
22 import static org.junit.Assert.assertNotNull;
23 import java.io.IOException;
24 import java.util.List;
25 import java.util.concurrent.TimeUnit;
26 import org.apache.hadoop.conf.Configuration;
27 import org.apache.hadoop.hbase.ClusterMetrics;
28 import org.apache.hadoop.hbase.HBaseClassTestRule;
29 import org.apache.hadoop.hbase.HBaseConfiguration;
30 import org.apache.hadoop.hbase.HBaseTestingUtility;
31 import org.apache.hadoop.hbase.LocalHBaseCluster;
32 import org.apache.hadoop.hbase.MiniHBaseCluster;
33 import org.apache.hadoop.hbase.StartMiniClusterOption;
34 import org.apache.hadoop.hbase.client.RetriesExhaustedException;
35 import org.apache.hadoop.hbase.exceptions.ConnectionClosedException;
36 import org.apache.hadoop.hbase.testclassification.LargeTests;
37 import org.apache.hadoop.hbase.testclassification.MasterTests;
38 import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
39 import org.apache.hadoop.hbase.zookeeper.ReadOnlyZKClient;
40 import org.junit.Before;
41 import org.junit.ClassRule;
42 import org.junit.Test;
43 import org.junit.experimental.categories.Category;
44 import org.slf4j.Logger;
45 import org.slf4j.LoggerFactory;
46 import org.apache.hbase.thirdparty.org.apache.commons.collections4.CollectionUtils;
48 @Category({MasterTests.class, LargeTests.class})
49 public class TestMasterShutdown {
50 private static final Logger LOG = LoggerFactory.getLogger(TestMasterShutdown.class);
52 @ClassRule
53 public static final HBaseClassTestRule CLASS_RULE =
54 HBaseClassTestRule.forClass(TestMasterShutdown.class);
56 private HBaseTestingUtility htu;
58 @Before
59 public void shutdownCluster() throws IOException {
60 if (htu != null) {
61 // an extra check in case the test cluster was not terminated after HBaseClassTestRule's
62 // Timeout interrupted the test thread.
63 LOG.warn("found non-null TestingUtility -- previous test did not terminate cleanly.");
64 htu.shutdownMiniCluster();
68 /**
69 * Simple test of shutdown.
70 * <p>
71 * Starts with three masters. Tells the active master to shutdown the cluster.
72 * Verifies that all masters are properly shutdown.
74 @Test
75 public void testMasterShutdown() throws Exception {
76 // Create config to use for this cluster
77 Configuration conf = HBaseConfiguration.create();
79 // Start the cluster
80 try {
81 htu = new HBaseTestingUtility(conf);
82 StartMiniClusterOption option = StartMiniClusterOption.builder()
83 .numMasters(3)
84 .numRegionServers(1)
85 .numDataNodes(1)
86 .build();
87 final MiniHBaseCluster cluster = htu.startMiniCluster(option);
89 // wait for all master thread to spawn and start their run loop.
90 final long thirtySeconds = TimeUnit.SECONDS.toMillis(30);
91 final long oneSecond = TimeUnit.SECONDS.toMillis(1);
92 assertNotEquals(-1, htu.waitFor(thirtySeconds, oneSecond, () -> {
93 final List<MasterThread> masterThreads = cluster.getMasterThreads();
94 return masterThreads != null
95 && masterThreads.size() >= 3
96 && masterThreads.stream().allMatch(Thread::isAlive);
97 }));
99 // find the active master
100 final HMaster active = cluster.getMaster();
101 assertNotNull(active);
103 // make sure the other two are backup masters
104 ClusterMetrics status = active.getClusterMetrics();
105 assertEquals(2, status.getBackupMasterNames().size());
107 // tell the active master to shutdown the cluster
108 active.shutdown();
109 assertNotEquals(-1, htu.waitFor(thirtySeconds, oneSecond,
110 () -> CollectionUtils.isEmpty(cluster.getLiveMasterThreads())));
111 assertNotEquals(-1, htu.waitFor(thirtySeconds, oneSecond,
112 () -> CollectionUtils.isEmpty(cluster.getLiveRegionServerThreads())));
113 } finally {
114 if (htu != null) {
115 htu.shutdownMiniCluster();
116 htu = null;
122 * This test appears to be an intentional race between a thread that issues a shutdown RPC to the
123 * master, while the master is concurrently realizing it cannot initialize because there are no
124 * region servers available to it. The expected behavior is that master initialization is
125 * interruptable via said shutdown RPC.
127 @Test
128 public void testMasterShutdownBeforeStartingAnyRegionServer() throws Exception {
129 LocalHBaseCluster hbaseCluster = null;
130 try {
131 htu = new HBaseTestingUtility(
132 createMasterShutdownBeforeStartingAnyRegionServerConfiguration());
134 // configure a cluster with
135 final StartMiniClusterOption options = StartMiniClusterOption.builder()
136 .numDataNodes(1)
137 .numMasters(1)
138 .numRegionServers(0)
139 .masterClass(HMaster.class)
140 .rsClass(MiniHBaseCluster.MiniHBaseClusterRegionServer.class)
141 .createRootDir(true)
142 .build();
144 // Can't simply `htu.startMiniCluster(options)` because that method waits for the master to
145 // start completely. However, this test's premise is that a partially started master should
146 // still respond to a shutdown RPC. So instead, we manage each component lifecycle
147 // independently.
148 // I think it's not worth refactoring HTU's helper methods just for this class.
149 htu.startMiniDFSCluster(options.getNumDataNodes());
150 htu.startMiniZKCluster(options.getNumZkServers());
151 htu.createRootDir();
152 hbaseCluster = new LocalHBaseCluster(htu.getConfiguration(), options.getNumMasters(),
153 options.getNumRegionServers(), options.getMasterClass(), options.getRsClass());
154 final MasterThread masterThread = hbaseCluster.getMasters().get(0);
156 masterThread.start();
157 // Switching to master registry exacerbated a race in the master bootstrap that can result
158 // in a lost shutdown command (HBASE-8422, HBASE-23836). The race is essentially because
159 // the server manager in HMaster is not initialized by the time shutdown() RPC (below) is
160 // made to the master. The suspected reason as to why it was uncommon before HBASE-18095
161 // is because the connection creation with ZK registry is so slow that by then the server
162 // manager is usually init'ed in time for the RPC to be made. For now, adding an explicit
163 // wait() in the test, waiting for the server manager to become available.
164 final long timeout = TimeUnit.MINUTES.toMillis(10);
165 assertNotEquals("timeout waiting for server manager to become available.", -1,
166 htu.waitFor(timeout, () -> masterThread.getMaster().getServerManager() != null));
168 // Master has come up far enough that we can terminate it without creating a zombie.
169 try {
170 // HBASE-24327 : (Resolve Flaky connection issues)
171 // shutdown() RPC can have flaky ZK connection issues.
172 // e.g
173 // ERROR [RpcServer.priority.RWQ.Fifo.read.handler=1,queue=1,port=53033]
174 // master.HMaster(2878): ZooKeeper exception trying to set cluster as down in ZK
175 // org.apache.zookeeper.KeeperException$SystemErrorException:
176 // KeeperErrorCode = SystemError
178 // However, even when above flakes happen, shutdown call does get completed even if
179 // RPC call has failure. Hence, subsequent retries will never succeed as HMaster is
180 // already shutdown. Hence, it can fail. To resolve it, after making one shutdown()
181 // call, we are ignoring IOException.
182 htu.getConnection().getAdmin().shutdown();
183 } catch (RetriesExhaustedException e) {
184 if (e.getCause() instanceof ConnectionClosedException) {
185 LOG.info("Connection is Closed to the cluster. The cluster is already down.", e);
186 } else {
187 throw e;
190 LOG.info("Shutdown RPC sent.");
191 masterThread.join();
192 } finally {
193 if (hbaseCluster != null) {
194 hbaseCluster.shutdown();
196 if (htu != null) {
197 htu.shutdownMiniCluster();
198 htu = null;
204 * Create a cluster configuration suitable for
205 * {@link #testMasterShutdownBeforeStartingAnyRegionServer()}.
207 private static Configuration createMasterShutdownBeforeStartingAnyRegionServerConfiguration() {
208 final Configuration conf = HBaseConfiguration.create();
209 // make sure the master will wait forever in the absence of a RS.
210 conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
211 // don't need a long write pipeline for this test.
212 conf.setInt("dfs.replication", 1);
213 // reduce client retries
214 conf.setInt("hbase.client.retries.number", 1);
215 // Recoverable ZK configs are tuned more aggressively
216 conf.setInt(ReadOnlyZKClient.RECOVERY_RETRY, 3);
217 conf.setInt(ReadOnlyZKClient.RECOVERY_RETRY_INTERVAL_MILLIS, 100);
218 return conf;