HBASE-24033 Add ut for loading the corrupt recovered hfiles (#1322)
[hbase.git] / hbase-server / src / test / java / org / apache / hadoop / hbase / master / TestClusterRestartFailover.java
blob1cce2acf9a91ac2cbb859f17bd73543f26f075eb
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
18 package org.apache.hadoop.hbase.master;
20 import static org.junit.Assert.assertFalse;
21 import static org.junit.Assert.assertNotNull;
22 import static org.junit.Assert.assertNull;
23 import static org.junit.Assert.assertTrue;
25 import java.io.IOException;
26 import java.util.List;
27 import java.util.Optional;
28 import java.util.concurrent.CountDownLatch;
29 import java.util.stream.Collectors;
31 import org.apache.hadoop.conf.Configuration;
32 import org.apache.hadoop.hbase.HBaseClassTestRule;
33 import org.apache.hadoop.hbase.ServerName;
34 import org.apache.hadoop.hbase.StartMiniClusterOption;
35 import org.apache.hadoop.hbase.TableName;
36 import org.apache.hadoop.hbase.client.RegionInfo;
37 import org.apache.hadoop.hbase.client.Table;
38 import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
39 import org.apache.hadoop.hbase.master.assignment.ServerState;
40 import org.apache.hadoop.hbase.master.assignment.ServerStateNode;
41 import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
42 import org.apache.hadoop.hbase.procedure2.Procedure;
43 import org.apache.hadoop.hbase.regionserver.HRegionServer;
44 import org.apache.hadoop.hbase.testclassification.LargeTests;
45 import org.apache.hadoop.hbase.testclassification.MasterTests;
46 import org.apache.hadoop.hbase.util.JVMClusterUtil;
47 import org.apache.zookeeper.KeeperException;
48 import org.junit.ClassRule;
49 import org.junit.Test;
50 import org.junit.experimental.categories.Category;
51 import org.slf4j.Logger;
52 import org.slf4j.LoggerFactory;
54 @Category({ MasterTests.class, LargeTests.class })
55 public class TestClusterRestartFailover extends AbstractTestRestartCluster {
57 @ClassRule
58 public static final HBaseClassTestRule CLASS_RULE =
59 HBaseClassTestRule.forClass(TestClusterRestartFailover.class);
61 private static final Logger LOG = LoggerFactory.getLogger(TestClusterRestartFailover.class);
63 private volatile static CountDownLatch SCP_LATCH;
64 private static ServerName SERVER_FOR_TEST;
66 @Override
67 protected boolean splitWALCoordinatedByZk() {
68 return true;
71 private ServerStateNode getServerStateNode(ServerName serverName) {
72 return UTIL.getHBaseCluster().getMaster().getAssignmentManager().getRegionStates()
73 .getServerNode(serverName);
76 /**
77 * Test for HBASE-22964
79 @Test
80 public void test() throws Exception {
81 setupCluster();
82 setupTable();
84 // Find server that does not have hbase:namespace on it. This tests holds up SCPs. If it
85 // holds up the server w/ hbase:namespace, the Master initialization will be held up
86 // because this table is not online and test fails.
87 for (JVMClusterUtil.RegionServerThread rst:
88 UTIL.getHBaseCluster().getLiveRegionServerThreads()) {
89 HRegionServer rs = rst.getRegionServer();
90 if (rs.getRegions(TableName.NAMESPACE_TABLE_NAME).isEmpty()) {
91 SERVER_FOR_TEST = rs.getServerName();
94 UTIL.waitFor(60000, () -> getServerStateNode(SERVER_FOR_TEST) != null);
95 ServerStateNode serverNode = getServerStateNode(SERVER_FOR_TEST);
96 assertNotNull(serverNode);
97 assertTrue("serverNode should be ONLINE when cluster runs normally",
98 serverNode.isInState(ServerState.ONLINE));
100 SCP_LATCH = new CountDownLatch(1);
102 // Shutdown cluster and restart
103 List<Integer> ports =
104 UTIL.getHBaseCluster().getMaster().getServerManager().getOnlineServersList().stream()
105 .map(serverName -> serverName.getPort()).collect(Collectors.toList());
106 LOG.info("Shutting down cluster");
107 UTIL.getHBaseCluster().killAll();
108 UTIL.getHBaseCluster().waitUntilShutDown();
109 LOG.info("Restarting cluster");
110 UTIL.restartHBaseCluster(StartMiniClusterOption.builder().masterClass(HMasterForTest.class)
111 .numMasters(1).numRegionServers(3).rsPorts(ports).build());
112 LOG.info("Started cluster");
113 UTIL.waitFor(60000, () -> UTIL.getHBaseCluster().getMaster().isInitialized());
114 LOG.info("Started cluster master, waiting for {}", SERVER_FOR_TEST);
115 UTIL.waitFor(60000, () -> getServerStateNode(SERVER_FOR_TEST) != null);
116 serverNode = getServerStateNode(SERVER_FOR_TEST);
117 assertFalse("serverNode should not be ONLINE during SCP processing",
118 serverNode.isInState(ServerState.ONLINE));
119 Optional<Procedure<?>> procedure = UTIL.getHBaseCluster().getMaster().getProcedures().stream()
120 .filter(p -> (p instanceof ServerCrashProcedure) &&
121 ((ServerCrashProcedure) p).getServerName().equals(SERVER_FOR_TEST)).findAny();
122 assertTrue("Should have one SCP for " + SERVER_FOR_TEST, procedure.isPresent());
123 assertTrue("Submit the SCP for the same serverName " + SERVER_FOR_TEST + " which should fail",
124 UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST) ==
125 Procedure.NO_PROC_ID);
127 // Wait the SCP to finish
128 LOG.info("Waiting on latch");
129 SCP_LATCH.countDown();
130 UTIL.waitFor(60000, () -> procedure.get().isFinished());
132 assertFalse("Even when the SCP is finished, the duplicate SCP should not be scheduled for " +
133 SERVER_FOR_TEST,
134 UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST) ==
135 Procedure.NO_PROC_ID);
136 serverNode = UTIL.getHBaseCluster().getMaster().getAssignmentManager().getRegionStates()
137 .getServerNode(SERVER_FOR_TEST);
138 assertNull("serverNode should be deleted after SCP finished", serverNode);
141 private void setupCluster() throws Exception {
142 LOG.info("Setup cluster");
143 UTIL.startMiniCluster(
144 StartMiniClusterOption.builder().masterClass(HMasterForTest.class).numMasters(1)
145 .numRegionServers(3).build());
146 LOG.info("Cluster is up");
147 UTIL.waitFor(60000, () -> UTIL.getMiniHBaseCluster().getMaster().isInitialized());
148 LOG.info("Master is up");
149 // wait for all SCPs finished
150 UTIL.waitFor(60000, () -> UTIL.getHBaseCluster().getMaster().getProcedures().stream()
151 .noneMatch(p -> p instanceof ServerCrashProcedure));
152 LOG.info("No SCPs");
155 private void setupTable() throws Exception {
156 TableName tableName = TABLES[0];
157 UTIL.createMultiRegionTable(tableName, FAMILY);
158 UTIL.waitTableAvailable(tableName);
159 Table table = UTIL.getConnection().getTable(tableName);
160 for (int i = 0; i < 100; i++) {
161 UTIL.loadTable(table, FAMILY);
165 public static final class HMasterForTest extends HMaster {
167 public HMasterForTest(Configuration conf) throws IOException {
168 super(conf);
171 @Override
172 protected AssignmentManager createAssignmentManager(MasterServices master) {
173 return new AssignmentManagerForTest(master);
177 private static final class AssignmentManagerForTest extends AssignmentManager {
179 public AssignmentManagerForTest(MasterServices master) {
180 super(master);
183 @Override
184 public List<RegionInfo> getRegionsOnServer(ServerName serverName) {
185 List<RegionInfo> regions = super.getRegionsOnServer(serverName);
186 // ServerCrashProcedure will call this method, so wait the CountDownLatch here
187 if (SCP_LATCH != null && SERVER_FOR_TEST != null && serverName.equals(SERVER_FOR_TEST)) {
188 try {
189 LOG.info("ServerCrashProcedure wait the CountDownLatch here");
190 SCP_LATCH.await();
191 LOG.info("Continue the ServerCrashProcedure");
192 SCP_LATCH = null;
193 } catch (InterruptedException e) {
194 throw new RuntimeException(e);
197 return regions;