2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
18 package org
.apache
.hadoop
.hbase
.regionserver
;
20 import static org
.junit
.Assert
.assertEquals
;
21 import static org
.junit
.Assert
.assertTrue
;
23 import java
.io
.IOException
;
24 import java
.util
.List
;
26 import java
.util
.concurrent
.atomic
.AtomicBoolean
;
27 import java
.util
.concurrent
.atomic
.AtomicReference
;
28 import org
.apache
.hadoop
.conf
.Configuration
;
29 import org
.apache
.hadoop
.hbase
.HBaseClassTestRule
;
30 import org
.apache
.hadoop
.hbase
.HBaseConfiguration
;
31 import org
.apache
.hadoop
.hbase
.HBaseTestingUtil
;
32 import org
.apache
.hadoop
.hbase
.LocalHBaseCluster
;
33 import org
.apache
.hadoop
.hbase
.ServerName
;
34 import org
.apache
.hadoop
.hbase
.SingleProcessHBaseCluster
;
35 import org
.apache
.hadoop
.hbase
.client
.RegionInfo
;
36 import org
.apache
.hadoop
.hbase
.master
.HMaster
;
37 import org
.apache
.hadoop
.hbase
.master
.ServerListener
;
38 import org
.apache
.hadoop
.hbase
.master
.ServerManager
;
39 import org
.apache
.hadoop
.hbase
.testclassification
.MediumTests
;
40 import org
.apache
.hadoop
.hbase
.testclassification
.RegionServerTests
;
41 import org
.apache
.hadoop
.hbase
.util
.Bytes
;
42 import org
.apache
.hadoop
.hbase
.util
.JVMClusterUtil
.MasterThread
;
43 import org
.apache
.hadoop
.hbase
.util
.Threads
;
44 import org
.junit
.ClassRule
;
45 import org
.junit
.Ignore
;
46 import org
.junit
.Rule
;
47 import org
.junit
.Test
;
48 import org
.junit
.experimental
.categories
.Category
;
49 import org
.junit
.rules
.TestName
;
50 import org
.slf4j
.Logger
;
51 import org
.slf4j
.LoggerFactory
;
53 import org
.apache
.hadoop
.hbase
.shaded
.protobuf
.generated
.RegionServerStatusProtos
.RegionServerStartupResponse
;
56 * Tests that a regionserver that dies after reporting for duty gets removed
57 * from list of online regions. See HBASE-9593.
59 @Category({RegionServerTests
.class, MediumTests
.class})
60 @Ignore("See HBASE-19515")
61 public class TestRSKilledWhenInitializing
{
64 public static final HBaseClassTestRule CLASS_RULE
=
65 HBaseClassTestRule
.forClass(TestRSKilledWhenInitializing
.class);
67 private static final Logger LOG
= LoggerFactory
.getLogger(TestRSKilledWhenInitializing
.class);
70 public TestName testName
= new TestName();
72 // This boolean needs to be globally available. It is used below in our
73 // mocked up regionserver so it knows when to die.
74 private static AtomicBoolean masterActive
= new AtomicBoolean(false);
75 // Ditto for this variable. It also is used in the mocked regionserver class.
76 private static final AtomicReference
<ServerName
> killedRS
= new AtomicReference
<ServerName
>();
78 private static final int NUM_MASTERS
= 1;
79 private static final int NUM_RS
= 2;
82 * Test verifies whether a region server is removed from online servers list in master if it went
83 * down after registering with master. Test will TIMEOUT if an error!!!!
87 public void testRSTerminationAfterRegisteringToMasterBeforeCreatingEphemeralNode()
89 // Create config to use for this cluster
90 Configuration conf
= HBaseConfiguration
.create();
91 conf
.setInt(ServerManager
.WAIT_ON_REGIONSERVERS_MINTOSTART
, 1);
93 final HBaseTestingUtil TEST_UTIL
= new HBaseTestingUtil(conf
);
94 TEST_UTIL
.startMiniDFSCluster(3);
95 TEST_UTIL
.startMiniZKCluster();
96 TEST_UTIL
.createRootDir();
97 final LocalHBaseCluster cluster
= new LocalHBaseCluster(conf
, NUM_MASTERS
, NUM_RS
,
98 HMaster
.class, RegisterAndDieRegionServer
.class);
99 final MasterThread master
= startMaster(cluster
.getMasters().get(0));
101 // Master is up waiting on RegionServers to check in. Now start RegionServers.
102 for (int i
= 0; i
< NUM_RS
; i
++) {
103 cluster
.getRegionServers().get(i
).start();
105 // Expected total regionservers depends on whether Master can host regions or not.
106 int expectedTotalRegionServers
= NUM_RS
;
107 List
<ServerName
> onlineServersList
= null;
109 onlineServersList
= master
.getMaster().getServerManager().getOnlineServersList();
110 } while (onlineServersList
.size() < expectedTotalRegionServers
);
111 // Wait until killedRS is set. Means RegionServer is starting to go down.
112 while (killedRS
.get() == null) {
115 // Wait on the RegionServer to fully die.
116 while (cluster
.getLiveRegionServers().size() >= expectedTotalRegionServers
) {
119 // Make sure Master is fully up before progressing. Could take a while if regions
121 while (!master
.getMaster().isInitialized()) {
125 // Now in steady state. How many regions open? Master should have too many regionservers
126 // showing still. The downed RegionServer should still be showing as registered.
127 assertTrue(master
.getMaster().getServerManager().isServerOnline(killedRS
.get()));
128 // Find non-meta region (namespace?) and assign to the killed server. That'll trigger cleanup.
129 Map
<RegionInfo
, ServerName
> assignments
= null;
131 assignments
= master
.getMaster().getAssignmentManager().getRegionStates().getRegionAssignments();
132 } while (assignments
== null || assignments
.size() < 2);
133 RegionInfo hri
= null;
134 for (Map
.Entry
<RegionInfo
, ServerName
> e
: assignments
.entrySet()) {
135 if (e
.getKey().isMetaRegion()) continue;
139 // Try moving region to the killed server. It will fail. As by-product, we will
140 // remove the RS from Master online list because no corresponding znode.
141 assertEquals(expectedTotalRegionServers
,
142 master
.getMaster().getServerManager().getOnlineServersList().size());
143 LOG
.info("Move " + hri
.getEncodedName() + " to " + killedRS
.get());
144 master
.getMaster().move(hri
.getEncodedNameAsBytes(),
145 Bytes
.toBytes(killedRS
.get().toString()));
147 // TODO: This test could do more to verify fix. It could create a table
148 // and do round-robin assign. It should fail if zombie RS. HBASE-19515.
150 // Wait until the RS no longer shows as registered in Master.
151 while (onlineServersList
.size() > (NUM_RS
+ 1)) {
153 onlineServersList
= master
.getMaster().getServerManager().getOnlineServersList();
156 // Shutdown is messy with complaints about fs being closed. Why? TODO.
159 TEST_UTIL
.shutdownMiniDFSCluster();
160 TEST_UTIL
.shutdownMiniZKCluster();
161 TEST_UTIL
.cleanupTestDir();
166 * Start Master. Get as far as the state where Master is waiting on
167 * RegionServers to check in, then return.
169 private MasterThread
startMaster(MasterThread master
) {
171 // It takes a while until ServerManager creation to happen inside Master startup.
172 while (master
.getMaster().getServerManager() == null) {
175 // Set a listener for the waiting-on-RegionServers state. We want to wait
176 // until this condition before we leave this method and start regionservers.
177 final AtomicBoolean waiting
= new AtomicBoolean(false);
178 if (master
.getMaster().getServerManager() == null) throw new NullPointerException("SM");
179 master
.getMaster().getServerManager().registerListener(new ServerListener() {
181 public void waiting() {
185 // Wait until the Master gets to place where it is waiting on RegionServers to check in.
186 while (!waiting
.get()) {
189 // Set the global master-is-active; gets picked up by regionservers later.
190 masterActive
.set(true);
195 * A RegionServer that reports for duty and then immediately dies if it is the first to receive
196 * the response to a reportForDuty. When it dies, it clears its ephemeral znode which the master
197 * notices and so removes the region from its set of online regionservers.
199 static class RegisterAndDieRegionServer
200 extends SingleProcessHBaseCluster
.MiniHBaseClusterRegionServer
{
201 public RegisterAndDieRegionServer(Configuration conf
) throws IOException
, InterruptedException
{
206 protected void handleReportForDutyResponse(RegionServerStartupResponse c
)
208 if (killedRS
.compareAndSet(null, getServerName())) {
209 // Make sure Master is up so it will see the removal of the ephemeral znode for this RS.
210 while (!masterActive
.get()) {
215 super.handleReportForDutyResponse(c
);