2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
18 package org
.apache
.hadoop
.hbase
.master
;
20 import java
.io
.IOException
;
21 import org
.apache
.hadoop
.hbase
.CatalogFamilyFormat
;
22 import org
.apache
.hadoop
.hbase
.HBaseClassTestRule
;
23 import org
.apache
.hadoop
.hbase
.HBaseTestingUtil
;
24 import org
.apache
.hadoop
.hbase
.HConstants
;
25 import org
.apache
.hadoop
.hbase
.TableName
;
26 import org
.apache
.hadoop
.hbase
.client
.Durability
;
27 import org
.apache
.hadoop
.hbase
.client
.Put
;
28 import org
.apache
.hadoop
.hbase
.client
.RegionInfo
;
29 import org
.apache
.hadoop
.hbase
.client
.RegionLocator
;
30 import org
.apache
.hadoop
.hbase
.client
.Result
;
31 import org
.apache
.hadoop
.hbase
.client
.ResultScanner
;
32 import org
.apache
.hadoop
.hbase
.client
.Scan
;
33 import org
.apache
.hadoop
.hbase
.client
.Table
;
34 import org
.apache
.hadoop
.hbase
.testclassification
.LargeTests
;
35 import org
.apache
.hadoop
.hbase
.testclassification
.MasterTests
;
36 import org
.apache
.hadoop
.hbase
.util
.Bytes
;
37 import org
.junit
.AfterClass
;
38 import org
.junit
.Assert
;
39 import org
.junit
.Before
;
40 import org
.junit
.BeforeClass
;
41 import org
.junit
.ClassRule
;
42 import org
.junit
.Ignore
;
43 import org
.junit
.Test
;
44 import org
.junit
.experimental
.categories
.Category
;
45 import org
.slf4j
.Logger
;
46 import org
.slf4j
.LoggerFactory
;
49 * Test transitions of state across the master. Sets up the cluster once and
50 * then runs a couple of tests.
52 @Category({MasterTests
.class, LargeTests
.class})
53 public class TestMasterTransitions
{
// HBase class-level test rule, built for this test class.
56 public static final HBaseClassTestRule CLASS_RULE
=
57 HBaseClassTestRule
.forClass(TestMasterTransitions
.class);
// Logger for this test class.
59 private static final Logger LOG
= LoggerFactory
.getLogger(TestMasterTransitions
.class);
// Shared testing utility; the lifecycle methods use it to start and stop the mini cluster.
60 private static final HBaseTestingUtil TEST_UTIL
= new HBaseTestingUtil();
// Name of the table these tests create and write to.
61 private static final TableName TABLENAME
= TableName
.valueOf("master_transitions");
// The three column families ("a", "b", "c") the test table is created with.
62 private static final byte [][] FAMILIES
= new byte [][] {Bytes
.toBytes("a"),
63 Bytes
.toBytes("b"), Bytes
.toBytes("c")};
66 * Start up a mini cluster and put a small table of many empty regions into it.
// One-time setup: calls startMiniCluster(2), creates the multi-region test table,
// waits until its regions are assigned, then calls addToEachStartKey with the
// region count.
69 @BeforeClass public static void beforeAllTests() throws Exception
{
70 TEST_UTIL
.startMiniCluster(2);
71 // Create a table of three families. This will assign a region.
72 TEST_UTIL
.createMultiRegionTable(TABLENAME
, FAMILIES
);
// NOTE(review): no use or close of t is visible in this excerpt — confirm it is released.
73 Table t
= TEST_UTIL
.getConnection().getTable(TABLENAME
);
// -1 is only a placeholder; the value is overwritten inside the try block below.
74 int countOfRegions
= -1;
// The region count is taken as the number of start keys the locator reports;
// the locator is closed by try-with-resources.
75 try (RegionLocator r
= TEST_UTIL
.getConnection().getRegionLocator(TABLENAME
)) {
76 countOfRegions
= r
.getStartKeys().length
;
78 TEST_UTIL
.waitUntilAllRegionsAssigned(TABLENAME
);
// addToEachStartKey asserts that the number of rows it handled matches this count.
79 addToEachStartKey(countOfRegions
);
// One-time teardown: shuts down the mini cluster started in beforeAllTests.
83 @AfterClass public static void afterAllTests() throws Exception
{
84 TEST_UTIL
.shutdownMiniCluster();
// Runs before each test; calls ensureSomeRegionServersAvailable(2) on the shared utility.
// NOTE(review): presumably so every test starts with at least two region servers — confirm.
87 @Before public void setup() throws IOException
{
88 TEST_UTIL
.ensureSomeRegionServersAvailable(2);
92 * Listener for regionserver events testing hbase-2428 (Infinite loop of
93 * region closes if hbase:meta region is offline). In particular, listen
94 * for the close of the 'metaServer' and when it comes in, requeue it with a
95 * delay as though there were an issue processing the shutdown. As part of
96 * the requeuing, send over a close of a region on 'otherServer' so it comes
97 * into a master that has its meta region marked as offline.
100 static class HBase2428Listener implements RegionServerOperationListener {
101 // Set of operations we've already delayed so we don't delay them repeatedly.
102 private final Set<RegionServerOperation> postponed =
103 new CopyOnWriteArraySet<RegionServerOperation>();
104 private boolean done = false;;
105 private boolean metaShutdownReceived = false;
106 private final HServerAddress metaAddress;
107 private final MiniHBaseCluster cluster;
108 private final int otherServerIndex;
109 private final RegionInfo hri;
110 private int closeCount = 0;
111 static final int SERVER_DURATION = 3 * 1000;
112 static final int CLOSE_DURATION = 1 * 1000;
114 HBase2428Listener(final MiniHBaseCluster c, final HServerAddress metaAddress,
115 final RegionInfo closingHRI, final int otherServerIndex) {
117 this.metaAddress = metaAddress;
118 this.hri = closingHRI;
119 this.otherServerIndex = otherServerIndex;
123 public boolean process(final RegionServerOperation op) throws IOException {
124 // If this is a regionserver shutdown and it is of the meta server, then we want to
125 // delay the processing of the shutdown and send off a close of a region on
127 boolean result = true;
128 if (op instanceof ProcessServerShutdown) {
129 ProcessServerShutdown pss = (ProcessServerShutdown)op;
130 if (pss.getDeadServerAddress().equals(this.metaAddress)) {
131 // Don't postpone more than once.
132 if (!this.postponed.contains(pss)) {
133 // Close some region.
134 this.cluster.addMessageToSendRegionServer(this.otherServerIndex,
135 new HMsg(HMsg.Type.MSG_REGION_CLOSE, hri,
136 Bytes.toBytes("Forcing close in test")));
137 this.postponed.add(pss);
138 // Put off the processing of the regionserver shutdown.
139 pss.setDelay(SERVER_DURATION);
140 this.metaShutdownReceived = true;
141 // Return false. This will add this op to the delayed queue.
146 // Have the close run frequently.
147 if (isWantedCloseOperation(op) != null) {
148 op.setDelay(CLOSE_DURATION);
149 // Count how many times it comes through here.
156 public void processed(final RegionServerOperation op) {
157 if (isWantedCloseOperation(op) != null) return;
163 * @return Null if not the wanted ProcessRegionClose, else <code>op</code>
164 * cast as a ProcessRegionClose.
167 private ProcessRegionClose isWantedCloseOperation(final RegionServerOperation op) {
168 // Count every time we get a close operation.
169 if (op instanceof ProcessRegionClose) {
170 ProcessRegionClose c = (ProcessRegionClose)op;
171 if (c.regionInfo.equals(hri)) {
182 boolean isMetaShutdownReceived() {
183 return metaShutdownReceived;
186 int getCloseCount() {
187 return this.closeCount;
191 public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
197 * In 2428, the meta region has just been set offline and then a close comes
199 * @see <a href="https://issues.apache.org/jira/browse/HBASE-2428">HBASE-2428</a>
202 public void testRegionCloseWhenNoMetaHBase2428()
205 LOG.info("Running testRegionCloseWhenNoMetaHBase2428");
206 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
207 final HMaster master = cluster.getMaster();
208 int metaIndex = cluster.getServerWithMeta();
209 // Figure out the index of a server that is not serving hbase:meta
210 int otherServerIndex = -1;
211 for (int i = 0; i < cluster.getRegionServerThreads().size(); i++) {
212 if (i == metaIndex) continue;
213 otherServerIndex = i;
216 final HRegionServer otherServer = cluster.getRegionServer(otherServerIndex);
217 final HRegionServer metaHRS = cluster.getRegionServer(metaIndex);
219 // Get a region out on the otherServer.
220 final RegionInfo hri =
221 otherServer.getOnlineRegions().iterator().next().getRegionInfo();
223 // Add our RegionServerOperationsListener
224 HBase2428Listener listener = new HBase2428Listener(cluster,
225 metaHRS.getHServerInfo().getServerAddress(), hri, otherServerIndex);
226 master.getRegionServerOperationQueue().
227 registerRegionServerOperationListener(listener);
229 // Now close the server carrying meta.
230 cluster.abortRegionServer(metaIndex);
232 // First wait on receipt of meta server shutdown message.
233 while(!listener.metaShutdownReceived) Threads.sleep(100);
234 while(!listener.isDone()) Threads.sleep(10);
235 // We should not have retried the close more times than it took for the
236 // server shutdown message to exit the delay queue and get processed
237 // (Multiply by two to add in some slop in case of GC or something).
238 assertTrue(listener.getCloseCount() > 1);
239 assertTrue(listener.getCloseCount() <
240 ((HBase2428Listener.SERVER_DURATION/HBase2428Listener.CLOSE_DURATION) * 2));
242 // Assert the closed region came back online
243 assertRegionIsBackOnline(hri);
245 master.getRegionServerOperationQueue().
246 unregisterRegionServerOperationListener(listener);
252 * Test adding in a new server before old one on same host+port is dead.
253 * Make the test more onerous by having the server under test carry the meta.
254 * If confusion between old and new, purportedly meta never comes back. Test
255 * that meta gets redeployed.
258 public void testAddingServerBeforeOldIsDead2413()
261 LOG.info("Running testAddingServerBeforeOldIsDead2413");
262 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
264 int metaIndex = cluster.getServerWithMeta();
265 MiniHBaseClusterRegionServer metaHRS =
266 (MiniHBaseClusterRegionServer)cluster.getRegionServer(metaIndex);
267 int port = metaHRS.getServerInfo().getServerAddress().getPort();
268 Configuration c = TEST_UTIL.getConfiguration();
269 String oldPort = c.get(HConstants.REGIONSERVER_PORT, "0");
271 LOG.info("KILLED=" + metaHRS);
273 c.set(HConstants.REGIONSERVER_PORT, Integer.toString(port));
274 // Try and start new regionserver. It might clash with the old
275 // regionserver port so keep trying to get past the BindException.
276 HRegionServer hrs = null;
279 hrs = cluster.startRegionServer().getRegionServer();
281 } catch (IOException e) {
282 if (e.getCause() != null && e.getCause() instanceof InvocationTargetException) {
283 InvocationTargetException ee = (InvocationTargetException)e.getCause();
284 if (ee.getCause() != null && ee.getCause() instanceof BindException) {
285 LOG.info("BindException; retrying: " + e.toString());
290 LOG.info("STARTED=" + hrs);
291 // Wait until he's been given at least 3 regions before we go on to try
292 // and count rows in table.
293 while (hrs.getOnlineRegions().size() < 3) Threads.sleep(100);
294 LOG.info(hrs.toString() + " has " + hrs.getOnlineRegions().size() +
296 assertEquals(count, count());
298 c.set(HConstants.REGIONSERVER_PORT, oldPort);
304 * HBase2482 is about outstanding region openings. If any are outstanding
305 * when a regionserver goes down, then they'll never deploy. They'll be
306 * stuck in the regions-in-transition list for ever. This listener looks
307 * for a region opening HMsg and if it's from the server passed on construction,
308 * then we kill it. It also looks out for a close message on the victim
309 * server because that signifies start of the fireworks.
312 static class HBase2482Listener implements RegionServerOperationListener {
313 private final HRegionServer victim;
314 private boolean abortSent = false;
315 // We closed regions on new server.
316 private volatile boolean closed = false;
317 // Copy of regions on new server
318 private final Collection<HRegion> copyOfOnlineRegions;
319 // This is the region that was in transition on the server we aborted. Test
320 // passes if this region comes back online successfully.
321 private RegionInfo regionToFind;
323 HBase2482Listener(final HRegionServer victim) {
324 this.victim = victim;
325 // Copy regions currently open on this server so I can notice when
327 this.copyOfOnlineRegions =
328 this.victim.getCopyOfOnlineRegionsSortedBySize().values();
332 public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
333 if (!victim.getServerInfo().equals(serverInfo) ||
334 this.abortSent || !this.closed) {
337 if (!incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN)) return true;
338 // Save the region that is in transition so can test later it came back.
339 this.regionToFind = incomingMsg.getRegionInfo();
340 String msg = "ABORTING " + this.victim + " because got a " +
341 HMsg.Type.MSG_REPORT_PROCESS_OPEN + " on this server for " +
342 incomingMsg.getRegionInfo().getRegionNameAsString();
343 this.victim.abort(msg);
344 this.abortSent = true;
349 public boolean process(RegionServerOperation op) throws IOException {
354 public void processed(RegionServerOperation op) {
355 if (this.closed || !(op instanceof ProcessRegionClose)) return;
356 ProcessRegionClose close = (ProcessRegionClose)op;
357 for (HRegion r: this.copyOfOnlineRegions) {
358 if (r.getRegionInfo().equals(close.regionInfo)) {
359 // We've closed one of the regions that was on the victim server.
360 // Now can start testing for when all regions are back online again
361 LOG.info("Found close of " +
362 r.getRegionInfo().getRegionNameAsString() +
363 "; setting close happened flag");
372 * In 2482, a RS with an opening region on it dies. The said region is then
373 * stuck in the master's regions-in-transition and never leaves it. This
374 * test works by bringing up a new regionserver, waiting for the load
375 * balancer to give it some regions. Then, we close all on the new server.
376 * After sending all the close messages, we send the new regionserver the
377 * special blocking message so it can not process any more messages.
378 * Meantime reopening of the just-closed regions is backed up on the new
379 * server. Soon as master gets an opening region from the new regionserver,
380 * we kill it. We then wait on all regions to come back on line. If bug
381 * is fixed, this should happen soon as the processing of the killed server is
383 * @see <a href="https://issues.apache.org/jira/browse/HBASE-2482">HBASE-2482</a>
386 public void testKillRSWithOpeningRegion2482()
389 LOG.info("Running testKillRSWithOpeningRegion2482");
390 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
391 if (cluster.getLiveRegionServerThreads().size() < 2) {
392 // Need at least two servers.
393 cluster.startRegionServer();
395 // Count how many regions are online. They need to be all back online for
396 // this test to succeed.
397 int countOfMetaRegions = countOfMetaRegions();
398 // Add a listener on the server.
399 HMaster m = cluster.getMaster();
400 // Start new regionserver.
401 MiniHBaseClusterRegionServer hrs =
402 (MiniHBaseClusterRegionServer)cluster.startRegionServer().getRegionServer();
403 LOG.info("Started new regionserver: " + hrs.toString());
404 // Wait until has some regions before proceeding. Balancer will give it some.
406 countOfMetaRegions/(cluster.getRegionServerThreads().size() * 2);
407 while (hrs.getOnlineRegions().size() < minimumRegions) Threads.sleep(100);
408 // Set the listener only after some regions have been opened on new server.
409 HBase2482Listener listener = new HBase2482Listener(hrs);
410 m.getRegionServerOperationQueue().
411 registerRegionServerOperationListener(listener);
413 // Go close all non-catalog regions on this new server
414 closeAllNonCatalogRegions(cluster, hrs);
415 // After all closes, add blocking message before the region opens start to
417 cluster.addMessageToSendRegionServer(hrs,
418 new HMsg(HMsg.Type.TESTING_BLOCK_REGIONSERVER));
419 // Wait till one of the above close messages has an effect before we start
420 // wait on all regions back online.
421 while (!listener.closed) Threads.sleep(100);
422 LOG.info("Past close");
423 // Make sure the abort server message was sent.
424 while(!listener.abortSent) Threads.sleep(100);
425 LOG.info("Past abort send; waiting on all regions to redeploy");
426 // Now wait for regions to come back online.
427 assertRegionIsBackOnline(listener.regionToFind);
429 m.getRegionServerOperationQueue().
430 unregisterRegionServerOperationListener(listener);
436 * @return Count of all non-catalog regions on the designated server
439 private int closeAllNonCatalogRegions(final MiniHBaseCluster cluster,
440 final MiniHBaseCluster.MiniHBaseClusterRegionServer hrs)
442 int countOfRegions = 0;
443 for (HRegion r: hrs.getOnlineRegions()) {
444 if (r.getRegionInfo().isMetaRegion()) continue;
445 cluster.addMessageToSendRegionServer(hrs,
446 new HMsg(HMsg.Type.MSG_REGION_CLOSE, r.getRegionInfo()));
447 LOG.info("Sent close of " + r.getRegionInfo().getRegionNameAsString() +
448 " on " + hrs.toString());
451 return countOfRegions;
454 private void assertRegionIsBackOnline(final RegionInfo hri)
456 // Region should have a row at its start key because of addToEachStartKey.
457 byte [] row = getStartKey(hri);
458 HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
459 Get g = new Get(row);
460 assertTrue((t.get(g)).size() > 0);
464 * @return Count of regions in meta table.
465 * @throws IOException
468 private static int countOfMetaRegions()
470 HTable meta = new HTable(TEST_UTIL.getConfiguration(),
471 HConstants.META_TABLE_NAME);
473 Scan scan = new Scan();
474 scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
475 ResultScanner s = meta.getScanner(scan);
476 for (Result r = null; (r = s.next()) != null;) {
478 r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
479 if (b == null || b.length <= 0) break;
487 * Add a value to each of the regions found in hbase:meta. Key is the start row of the
488 * region (except it's 'aaa' for the first region). Actual value is the row name.
491 * @throws IOException
// Scans the REGIONINFO column of the meta table and, for each result that parses
// to a RegionInfo, builds a Put keyed by the region's start key (via getStartKey)
// whose cell value is that same key, with durability set to SKIP_WAL. Finally
// asserts that `rows` equals `expected`.
// NOTE(review): the declaration and update of `rows`, the bodies of the two table
// checks, the actual put, any close calls and the return statement are not visible
// in this excerpt — confirm against the full file.
493 private static int addToEachStartKey(final int expected
) throws IOException
{
494 Table t
= TEST_UTIL
.getConnection().getTable(TABLENAME
);
495 Table meta
= TEST_UTIL
.getConnection().getTable(TableName
.META_TABLE_NAME
);
// Restrict the meta scan to the column holding the serialized region info.
497 Scan scan
= new Scan();
498 scan
.addColumn(HConstants
.CATALOG_FAMILY
, HConstants
.REGIONINFO_QUALIFIER
);
499 ResultScanner s
= meta
.getScanner(scan
);
// Iterate until the scanner is exhausted (next() returns null).
500 for (Result r
= null; (r
= s
.next()) != null;) {
501 RegionInfo hri
= CatalogFamilyFormat
.getRegionInfo(r
);
// Stop scanning at the first result that does not yield a RegionInfo.
502 if (hri
== null) break;
// Regions of tables other than TABLENAME take this branch (body not visible here).
503 if (!hri
.getTable().equals(TABLENAME
)) {
507 // If the start key is empty, getStartKey substitutes 'aaa'.
// NOTE(review): same condition as the check just above — looks redundant; confirm
// against the branch bodies that are not visible in this excerpt.
508 if(!hri
.getTable().equals(TABLENAME
)) {
511 byte [] row
= getStartKey(hri
);
512 Put p
= new Put(row
);
// Skip the write-ahead log for this put.
513 p
.setDurability(Durability
.SKIP_WAL
);
// The cell value is the row key itself.
514 p
.addColumn(getTestFamily(), getTestQualifier(), row
);
// The count accumulated in `rows` must match what the caller expected.
519 Assert
.assertEquals(expected
, rows
);
527 * @return Start key for hri (if the start key is '', then 'aaa' is returned).
529 private static byte [] getStartKey(final RegionInfo hri
) {
530 return Bytes
.equals(HConstants
.EMPTY_START_ROW
, hri
.getStartKey())?
531 Bytes
.toBytes("aaa"): hri
.getStartKey();
// Column family used for the cells written by addToEachStartKey.
// NOTE(review): the body is not visible here; presumably returns one of FAMILIES — confirm.
534 private static byte [] getTestFamily() {
// Column qualifier used for the cells written by addToEachStartKey; it reuses the
// bytes returned by getTestFamily().
538 private static byte [] getTestQualifier() {
539 return getTestFamily();