HBASE-23949 refactor loadBalancer implements for rsgroup balance by table to achieve...
[hbase.git] / hbase-server / src / test / java / org / apache / hadoop / hbase / master / TestRollingRestart.java
blob0aba48747829a8d1fbaa3ddcdcd9b890cbbea218
1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
18 package org.apache.hadoop.hbase.master;
20 import static org.junit.Assert.assertEquals;
22 import java.io.IOException;
23 import java.util.Arrays;
24 import java.util.Collection;
25 import java.util.List;
26 import java.util.NavigableSet;
27 import java.util.Set;
28 import java.util.TreeSet;
29 import org.apache.hadoop.conf.Configuration;
30 import org.apache.hadoop.hbase.HBaseClassTestRule;
31 import org.apache.hadoop.hbase.HBaseConfiguration;
32 import org.apache.hadoop.hbase.HBaseTestingUtility;
33 import org.apache.hadoop.hbase.HConstants;
34 import org.apache.hadoop.hbase.MiniHBaseCluster;
35 import org.apache.hadoop.hbase.ServerName;
36 import org.apache.hadoop.hbase.StartMiniClusterOption;
37 import org.apache.hadoop.hbase.TableName;
38 import org.apache.hadoop.hbase.client.RegionInfo;
39 import org.apache.hadoop.hbase.client.RegionLocator;
40 import org.apache.hadoop.hbase.client.Table;
41 import org.apache.hadoop.hbase.testclassification.LargeTests;
42 import org.apache.hadoop.hbase.testclassification.MasterTests;
43 import org.apache.hadoop.hbase.util.Bytes;
44 import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
45 import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
46 import org.junit.ClassRule;
47 import org.junit.Rule;
48 import org.junit.Test;
49 import org.junit.experimental.categories.Category;
50 import org.junit.rules.TestName;
51 import org.junit.runner.RunWith;
52 import org.junit.runners.Parameterized;
53 import org.slf4j.Logger;
54 import org.slf4j.LoggerFactory;
56 import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
/**
 * Tests restarting every master and region server in turn, as is done during a rolling restart.
 */
61 @RunWith(Parameterized.class)
62 @Category({MasterTests.class, LargeTests.class})
63 public class TestRollingRestart {
65 @ClassRule
66 public static final HBaseClassTestRule CLASS_RULE =
67 HBaseClassTestRule.forClass(TestRollingRestart.class);
69 private static final Logger LOG = LoggerFactory.getLogger(TestRollingRestart.class);
71 @Rule
72 public TestName name = new TestName();
74 @Parameterized.Parameter
75 public boolean splitWALCoordinatedByZK;
77 @Test
78 public void testBasicRollingRestart() throws Exception {
80 // Start a cluster with 2 masters and 4 regionservers
81 final int NUM_MASTERS = 2;
82 final int NUM_RS = 3;
83 final int NUM_REGIONS_TO_CREATE = 20;
85 int expectedNumRS = 3;
87 // Start the cluster
88 log("Starting cluster");
89 Configuration conf = HBaseConfiguration.create();
90 conf.setBoolean(HConstants.HBASE_SPLIT_WAL_COORDINATED_BY_ZK,
91 splitWALCoordinatedByZK);
92 HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
93 StartMiniClusterOption option = StartMiniClusterOption.builder()
94 .numMasters(NUM_MASTERS).numRegionServers(NUM_RS).numDataNodes(NUM_RS).build();
95 TEST_UTIL.startMiniCluster(option);
96 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
97 log("Waiting for active/ready master");
98 cluster.waitForActiveAndReadyMaster();
100 // Create a table with regions
101 final TableName tableName =
102 TableName.valueOf(name.getMethodName().replaceAll("[\\[|\\]]", "-"));
103 byte [] family = Bytes.toBytes("family");
104 log("Creating table with " + NUM_REGIONS_TO_CREATE + " regions");
105 Table ht = TEST_UTIL.createMultiRegionTable(tableName, family, NUM_REGIONS_TO_CREATE);
106 int numRegions = -1;
107 try (RegionLocator r = TEST_UTIL.getConnection().getRegionLocator(tableName)) {
108 numRegions = r.getStartKeys().length;
110 numRegions += 1; // catalogs
111 log("Waiting for no more RIT\n");
112 TEST_UTIL.waitUntilNoRegionsInTransition(60000);
113 log("Disabling table\n");
114 TEST_UTIL.getAdmin().disableTable(tableName);
115 log("Waiting for no more RIT\n");
116 TEST_UTIL.waitUntilNoRegionsInTransition(60000);
117 NavigableSet<String> regions = HBaseTestingUtility.getAllOnlineRegions(cluster);
118 log("Verifying only catalog region is assigned\n");
119 if (regions.size() != 1) {
120 for (String oregion : regions) {
121 log("Region still online: " + oregion);
124 assertEquals(1, regions.size());
125 log("Enabling table\n");
126 TEST_UTIL.getAdmin().enableTable(tableName);
127 log("Waiting for no more RIT\n");
128 TEST_UTIL.waitUntilNoRegionsInTransition(60000);
129 log("Verifying there are " + numRegions + " assigned on cluster\n");
130 regions = HBaseTestingUtility.getAllOnlineRegions(cluster);
131 assertRegionsAssigned(cluster, regions);
132 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
134 // Add a new regionserver
135 log("Adding a fourth RS");
136 RegionServerThread restarted = cluster.startRegionServer();
137 expectedNumRS++;
138 restarted.waitForServerOnline();
139 log("Additional RS is online");
140 log("Waiting for no more RIT");
141 TEST_UTIL.waitUntilNoRegionsInTransition(60000);
142 log("Verifying there are " + numRegions + " assigned on cluster");
143 assertRegionsAssigned(cluster, regions);
144 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
146 // Master Restarts
147 List<MasterThread> masterThreads = cluster.getMasterThreads();
148 MasterThread activeMaster = null;
149 MasterThread backupMaster = null;
150 assertEquals(2, masterThreads.size());
151 if (masterThreads.get(0).getMaster().isActiveMaster()) {
152 activeMaster = masterThreads.get(0);
153 backupMaster = masterThreads.get(1);
154 } else {
155 activeMaster = masterThreads.get(1);
156 backupMaster = masterThreads.get(0);
159 // Bring down the backup master
160 log("Stopping backup master\n\n");
161 backupMaster.getMaster().stop("Stop of backup during rolling restart");
162 cluster.hbaseCluster.waitOnMaster(backupMaster);
164 // Bring down the primary master
165 log("Stopping primary master\n\n");
166 activeMaster.getMaster().stop("Stop of active during rolling restart");
167 cluster.hbaseCluster.waitOnMaster(activeMaster);
169 // Start primary master
170 log("Restarting primary master\n\n");
171 activeMaster = cluster.startMaster();
172 cluster.waitForActiveAndReadyMaster();
174 // Start backup master
175 log("Restarting backup master\n\n");
176 backupMaster = cluster.startMaster();
178 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
180 // RegionServer Restarts
182 // Bring them down, one at a time, waiting between each to complete
183 List<RegionServerThread> regionServers =
184 cluster.getLiveRegionServerThreads();
185 int num = 1;
186 int total = regionServers.size();
187 for (RegionServerThread rst : regionServers) {
188 ServerName serverName = rst.getRegionServer().getServerName();
189 log("Stopping region server " + num + " of " + total + " [ " +
190 serverName + "]");
191 rst.getRegionServer().stop("Stopping RS during rolling restart");
192 cluster.hbaseCluster.waitOnRegionServer(rst);
193 log("Waiting for RS shutdown to be handled by master");
194 waitForRSShutdownToStartAndFinish(activeMaster, serverName);
195 log("RS shutdown done, waiting for no more RIT");
196 TEST_UTIL.waitUntilNoRegionsInTransition(60000);
197 log("Verifying there are " + numRegions + " assigned on cluster");
198 assertRegionsAssigned(cluster, regions);
199 expectedNumRS--;
200 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
201 log("Restarting region server " + num + " of " + total);
202 restarted = cluster.startRegionServer();
203 restarted.waitForServerOnline();
204 expectedNumRS++;
205 log("Region server " + num + " is back online");
206 log("Waiting for no more RIT");
207 TEST_UTIL.waitUntilNoRegionsInTransition(60000);
208 log("Verifying there are " + numRegions + " assigned on cluster");
209 assertRegionsAssigned(cluster, regions);
210 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
211 num++;
213 Thread.sleep(1000);
214 assertRegionsAssigned(cluster, regions);
216 // TODO: Bring random 3 of 4 RS down at the same time
218 ht.close();
219 // Stop the cluster
220 TEST_UTIL.shutdownMiniCluster();
223 private void waitForRSShutdownToStartAndFinish(MasterThread activeMaster,
224 ServerName serverName) throws InterruptedException {
225 ServerManager sm = activeMaster.getMaster().getServerManager();
226 // First wait for it to be in dead list
227 while (!sm.getDeadServers().isDeadServer(serverName)) {
228 log("Waiting for [" + serverName + "] to be listed as dead in master");
229 Thread.sleep(1);
231 log("Server [" + serverName + "] marked as dead, waiting for it to " +
232 "finish dead processing");
233 while (sm.areDeadServersInProgress()) {
234 log("Server [" + serverName + "] still being processed, waiting");
235 Thread.sleep(100);
237 log("Server [" + serverName + "] done with server shutdown processing");
240 private void log(String msg) {
241 LOG.debug("\n\nTRR: " + msg + "\n");
244 private int getNumberOfOnlineRegions(MiniHBaseCluster cluster) {
245 int numFound = 0;
246 for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
247 numFound += rst.getRegionServer().getNumberOfOnlineRegions();
249 for (MasterThread mt : cluster.getMasterThreads()) {
250 numFound += mt.getMaster().getNumberOfOnlineRegions();
252 return numFound;
255 private void assertRegionsAssigned(MiniHBaseCluster cluster,
256 Set<String> expectedRegions) throws IOException {
257 int numFound = getNumberOfOnlineRegions(cluster);
258 if (expectedRegions.size() > numFound) {
259 log("Expected to find " + expectedRegions.size() + " but only found"
260 + " " + numFound);
261 NavigableSet<String> foundRegions =
262 HBaseTestingUtility.getAllOnlineRegions(cluster);
263 for (String region : expectedRegions) {
264 if (!foundRegions.contains(region)) {
265 log("Missing region: " + region);
268 assertEquals(expectedRegions.size(), numFound);
269 } else if (expectedRegions.size() < numFound) {
270 int doubled = numFound - expectedRegions.size();
271 log("Expected to find " + expectedRegions.size() + " but found"
272 + " " + numFound + " (" + doubled + " double assignments?)");
273 NavigableSet<String> doubleRegions = getDoubleAssignedRegions(cluster);
274 for (String region : doubleRegions) {
275 log("Region is double assigned: " + region);
277 assertEquals(expectedRegions.size(), numFound);
278 } else {
279 log("Success! Found expected number of " + numFound + " regions");
283 private NavigableSet<String> getDoubleAssignedRegions(
284 MiniHBaseCluster cluster) throws IOException {
285 NavigableSet<String> online = new TreeSet<>();
286 NavigableSet<String> doubled = new TreeSet<>();
287 for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
288 for (RegionInfo region : ProtobufUtil.getOnlineRegions(
289 rst.getRegionServer().getRSRpcServices())) {
290 if(!online.add(region.getRegionNameAsString())) {
291 doubled.add(region.getRegionNameAsString());
295 return doubled;
299 @Parameterized.Parameters
300 public static Collection coordinatedByZK() {
301 return Arrays.asList(false, true);