HBASE-26416 Implement a new method for region replication instead of using replay...
[hbase.git] / hbase-server / src / test / java / org / apache / hadoop / hbase / master / TestMasterFailover.java
blob9e0333b51031f0c5f836b6208ceac960faa2f156
1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
18 package org.apache.hadoop.hbase.master;
20 import static org.junit.Assert.assertEquals;
21 import static org.junit.Assert.assertNotNull;
22 import static org.junit.Assert.assertTrue;
23 import java.util.List;
24 import java.util.concurrent.TimeUnit;
25 import org.apache.hadoop.hbase.ClusterMetrics;
26 import org.apache.hadoop.hbase.HBaseClassTestRule;
27 import org.apache.hadoop.hbase.HBaseTestingUtil;
28 import org.apache.hadoop.hbase.ServerName;
29 import org.apache.hadoop.hbase.SingleProcessHBaseCluster;
30 import org.apache.hadoop.hbase.StartTestingClusterOption;
31 import org.apache.hadoop.hbase.master.RegionState.State;
32 import org.apache.hadoop.hbase.regionserver.HRegionServer;
33 import org.apache.hadoop.hbase.testclassification.FlakeyTests;
34 import org.apache.hadoop.hbase.testclassification.LargeTests;
35 import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
36 import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
37 import org.junit.ClassRule;
38 import org.junit.Rule;
39 import org.junit.Test;
40 import org.junit.experimental.categories.Category;
41 import org.junit.rules.TestName;
42 import org.slf4j.Logger;
43 import org.slf4j.LoggerFactory;
45 @Category({FlakeyTests.class, LargeTests.class})
46 public class TestMasterFailover {
48 @ClassRule
49 public static final HBaseClassTestRule CLASS_RULE =
50 HBaseClassTestRule.forClass(TestMasterFailover.class);
52 private static final Logger LOG = LoggerFactory.getLogger(TestMasterFailover.class);
53 @Rule public TestName name = new TestName();
55 /**
56 * Simple test of master failover.
57 * <p>
58 * Starts with three masters. Kills a backup master. Then kills the active
59 * master. Ensures the final master becomes active and we can still contact
60 * the cluster.
62 @Test
63 public void testSimpleMasterFailover() throws Exception {
64 final int NUM_MASTERS = 3;
65 final int NUM_RS = 3;
67 // Start the cluster
68 HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();
69 try {
70 StartTestingClusterOption option = StartTestingClusterOption.builder()
71 .numMasters(NUM_MASTERS).numRegionServers(NUM_RS).numDataNodes(NUM_RS).build();
72 TEST_UTIL.startMiniCluster(option);
73 SingleProcessHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
75 // get all the master threads
76 List<MasterThread> masterThreads = cluster.getMasterThreads();
78 // wait for each to come online
79 for (MasterThread mt : masterThreads) {
80 assertTrue(mt.isAlive());
83 // verify only one is the active master and we have right number
84 int numActive = 0;
85 int activeIndex = -1;
86 ServerName activeName = null;
87 HMaster active = null;
88 for (int i = 0; i < masterThreads.size(); i++) {
89 if (masterThreads.get(i).getMaster().isActiveMaster()) {
90 numActive++;
91 activeIndex = i;
92 active = masterThreads.get(activeIndex).getMaster();
93 activeName = active.getServerName();
96 assertEquals(1, numActive);
97 assertEquals(NUM_MASTERS, masterThreads.size());
98 LOG.info("Active master " + activeName);
100 // Check that ClusterStatus reports the correct active and backup masters
101 assertNotNull(active);
102 ClusterMetrics status = active.getClusterMetrics();
103 assertEquals(activeName, status.getMasterName());
104 assertEquals(2, status.getBackupMasterNames().size());
106 // attempt to stop one of the inactive masters
107 int backupIndex = (activeIndex == 0 ? 1 : activeIndex - 1);
108 HMaster master = cluster.getMaster(backupIndex);
109 LOG.debug("\n\nStopping a backup master: " + master.getServerName() + "\n");
110 cluster.stopMaster(backupIndex, false);
111 cluster.waitOnMaster(backupIndex);
113 // Verify still one active master and it's the same
114 for (int i = 0; i < masterThreads.size(); i++) {
115 if (masterThreads.get(i).getMaster().isActiveMaster()) {
116 assertEquals(activeName, masterThreads.get(i).getMaster().getServerName());
117 activeIndex = i;
118 active = masterThreads.get(activeIndex).getMaster();
121 assertEquals(1, numActive);
122 assertEquals(2, masterThreads.size());
123 int rsCount = masterThreads.get(activeIndex).getMaster().getClusterMetrics()
124 .getLiveServerMetrics().size();
125 LOG.info("Active master " + active.getServerName() + " managing " + rsCount +
126 " regions servers");
127 assertEquals(3, rsCount);
129 // wait for the active master to acknowledge loss of the backup from ZK
130 final HMaster activeFinal = active;
131 TEST_UTIL.waitFor(
132 TimeUnit.MINUTES.toMillis(5), () -> activeFinal.getBackupMasters().size() == 1);
134 // Check that ClusterStatus reports the correct active and backup masters
135 assertNotNull(active);
136 status = active.getClusterMetrics();
137 assertEquals(activeName, status.getMasterName());
138 assertEquals(1, status.getBackupMasterNames().size());
140 // kill the active master
141 LOG.debug("\n\nStopping the active master " + active.getServerName() + "\n");
142 cluster.stopMaster(activeIndex, false);
143 cluster.waitOnMaster(activeIndex);
145 // wait for an active master to show up and be ready
146 assertTrue(cluster.waitForActiveAndReadyMaster());
148 LOG.debug("\n\nVerifying backup master is now active\n");
149 // should only have one master now
150 assertEquals(1, masterThreads.size());
152 // and he should be active
153 active = masterThreads.get(0).getMaster();
154 assertNotNull(active);
155 status = active.getClusterMetrics();
156 ServerName masterName = status.getMasterName();
157 assertNotNull(masterName);
158 assertEquals(active.getServerName(), masterName);
159 assertTrue(active.isActiveMaster());
160 assertEquals(0, status.getBackupMasterNames().size());
161 int rss = status.getLiveServerMetrics().size();
162 LOG.info("Active master {} managing {} region servers", masterName.getServerName(), rss);
163 assertEquals(3, rss);
164 } finally {
165 // Stop the cluster
166 TEST_UTIL.shutdownMiniCluster();
171 * Test meta in transition when master failover.
172 * This test used to manipulate region state up in zk. That is not allowed any more in hbase2
173 * so I removed that messing. That makes this test anemic.
175 @Test
176 public void testMetaInTransitionWhenMasterFailover() throws Exception {
177 // Start the cluster
178 HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();
179 TEST_UTIL.startMiniCluster();
180 try {
181 SingleProcessHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
182 LOG.info("Cluster started");
184 HMaster activeMaster = cluster.getMaster();
185 ServerName metaServerName = cluster.getServerHoldingMeta();
186 HRegionServer hrs = cluster.getRegionServer(metaServerName);
188 // Now kill master, meta should remain on rs, where we placed it before.
189 LOG.info("Aborting master");
190 activeMaster.abort("test-kill");
191 cluster.waitForMasterToStop(activeMaster.getServerName(), 30000);
192 LOG.info("Master has aborted");
194 // meta should remain where it was
195 RegionState metaState = MetaTableLocator.getMetaRegionState(hrs.getZooKeeper());
196 assertEquals("hbase:meta should be online on RS",
197 metaState.getServerName(), metaServerName);
198 assertEquals("hbase:meta should be online on RS", State.OPEN, metaState.getState());
200 // Start up a new master
201 LOG.info("Starting up a new master");
202 activeMaster = cluster.startMaster().getMaster();
203 LOG.info("Waiting for master to be ready");
204 cluster.waitForActiveAndReadyMaster();
205 LOG.info("Master is ready");
207 // ensure meta is still deployed on RS
208 metaState = MetaTableLocator.getMetaRegionState(activeMaster.getZooKeeper());
209 assertEquals("hbase:meta should be online on RS",
210 metaState.getServerName(), metaServerName);
211 assertEquals("hbase:meta should be online on RS", State.OPEN, metaState.getState());
213 // Done, shutdown the cluster
214 } finally {
215 TEST_UTIL.shutdownMiniCluster();