/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import java.io.IOException;
import org.apache.hadoop.hbase.CatalogFamilyFormat;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtil;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
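
// Best-effort additional imports for the legacy (@Ignore'd) tests and helper methods retained
// below. Several other types that legacy code references (HMsg, HServerAddress, HServerInfo,
// HTable, MiniHBaseCluster, RegionServerOperationListener and friends) were removed from HBase
// long ago and cannot be imported, so those sections no longer compile against current HBase.
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.lang.reflect.InvocationTargetException;
import java.net.BindException;
import java.util.Collection;
import java.util.Set;
import java.util.concurrent.CopyOnWriteArraySet;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.util.Threads;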

/**
 * Test transitions of state across the master. Sets up the cluster once and
 * then runs a couple of tests.
 */
@Category({MasterTests.class, LargeTests.class})
public class TestMasterTransitions {

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
    HBaseClassTestRule.forClass(TestMasterTransitions.class);

  private static final Logger LOG = LoggerFactory.getLogger(TestMasterTransitions.class);
  private static final HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();
  private static final TableName TABLENAME = TableName.valueOf("master_transitions");
  private static final byte [][] FAMILIES = new byte [][] {Bytes.toBytes("a"),
    Bytes.toBytes("b"), Bytes.toBytes("c")};
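
  // Test fixture: 'master_transitions' is a multi-region table with the three families above.
  // beforeAllTests() seeds one marker row per region (see addToEachStartKey()), keyed on each
  // region's start key, so tests can verify a region came back online by Get'ing its marker row.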

  /**
   * Start up a mini cluster and put a small table of many empty regions into it.
   * @throws Exception
   */
  @BeforeClass public static void beforeAllTests() throws Exception {
    TEST_UTIL.startMiniCluster(2);
    // Create a table of three families. This will assign a region.
    TEST_UTIL.createMultiRegionTable(TABLENAME, FAMILIES);
    Table t = TEST_UTIL.getConnection().getTable(TABLENAME);
    int countOfRegions = -1;
    try (RegionLocator r = TEST_UTIL.getConnection().getRegionLocator(TABLENAME)) {
      countOfRegions = r.getStartKeys().length;
    }
    TEST_UTIL.waitUntilAllRegionsAssigned(TABLENAME);
    addToEachStartKey(countOfRegions);
    t.close();
  }

  @AfterClass public static void afterAllTests() throws Exception {
    TEST_UTIL.shutdownMiniCluster();
  }

  @Before public void setup() throws IOException {
    TEST_UTIL.ensureSomeRegionServersAvailable(2);
  }
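
  // Everything between here and addToEachStartKey() below targets the old HMsg/
  // RegionServerOperationListener-based master API and appears to be kept only for historical
  // reference: the listener classes and helpers no longer compile against current HBase and the
  // tests themselves are @Ignore'd.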

  /**
   * Listener for regionserver events testing hbase-2428 (Infinite loop of
   * region closes if hbase:meta region is offline). In particular, listen
   * for the close of the 'metaServer' and when it comes in, requeue it with a
   * delay as though there were an issue processing the shutdown. As part of
   * the requeuing, send over a close of a region on 'otherServer' so it comes
   * into a master that has its meta region marked as offline.
   */
  static class HBase2428Listener implements RegionServerOperationListener {
    // Map of what we've delayed so we don't do repeated delays.
    private final Set<RegionServerOperation> postponed =
      new CopyOnWriteArraySet<RegionServerOperation>();
    private boolean done = false;
    private boolean metaShutdownReceived = false;
    private final HServerAddress metaAddress;
    private final MiniHBaseCluster cluster;
    private final int otherServerIndex;
    private final RegionInfo hri;
    private int closeCount = 0;
    static final int SERVER_DURATION = 3 * 1000;
    static final int CLOSE_DURATION = 1 * 1000;

    HBase2428Listener(final MiniHBaseCluster c, final HServerAddress metaAddress,
        final RegionInfo closingHRI, final int otherServerIndex) {
      this.cluster = c;
      this.metaAddress = metaAddress;
      this.hri = closingHRI;
      this.otherServerIndex = otherServerIndex;
    }

    @Override
    public boolean process(final RegionServerOperation op) throws IOException {
      // If a regionserver shutdown comes in and it is for the meta server, then we want to
      // delay the processing of the shutdown and send off a close of a region on
      // the 'otherServer'.
      boolean result = true;
      if (op instanceof ProcessServerShutdown) {
        ProcessServerShutdown pss = (ProcessServerShutdown)op;
        if (pss.getDeadServerAddress().equals(this.metaAddress)) {
          // Don't postpone more than once.
          if (!this.postponed.contains(pss)) {
            // Close some region.
            this.cluster.addMessageToSendRegionServer(this.otherServerIndex,
              new HMsg(HMsg.Type.MSG_REGION_CLOSE, hri,
                Bytes.toBytes("Forcing close in test")));
            this.postponed.add(pss);
            // Put off the processing of the regionserver shutdown.
            pss.setDelay(SERVER_DURATION);
            this.metaShutdownReceived = true;
            // Return false. This will add this op to the delayed queue.
            result = false;
          }
        }
      } else {
        // Have the close run frequently.
        if (isWantedCloseOperation(op) != null) {
          op.setDelay(CLOSE_DURATION);
          // Count how many times it comes through here.
          this.closeCount++;
        }
      }
      return result;
    }

    public void processed(final RegionServerOperation op) {
      if (isWantedCloseOperation(op) != null) return;
      this.done = true;
    }

    /**
     * @param op
     * @return Null if not the wanted ProcessRegionClose, else <code>op</code>
     * cast as a ProcessRegionClose.
     */
    private ProcessRegionClose isWantedCloseOperation(final RegionServerOperation op) {
      // Count every time we get a close operation.
      if (op instanceof ProcessRegionClose) {
        ProcessRegionClose c = (ProcessRegionClose)op;
        if (c.regionInfo.equals(hri)) {
          return c;
        }
      }
      return null;
    }

    boolean isDone() {
      return this.done;
    }

    boolean isMetaShutdownReceived() {
      return metaShutdownReceived;
    }

    int getCloseCount() {
      return this.closeCount;
    }

    @Override
    public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
      return true;
    }
  }

  /**
   * In 2428, the meta region has just been set offline and then a close comes
   * in.
   * @see <a href="https://issues.apache.org/jira/browse/HBASE-2428">HBASE-2428</a>
   */
  @Ignore @Test
  public void testRegionCloseWhenNoMetaHBase2428()
  throws Exception {
    LOG.info("Running testRegionCloseWhenNoMetaHBase2428");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    final HMaster master = cluster.getMaster();
    int metaIndex = cluster.getServerWithMeta();
    // Figure the index of the server that is not serving hbase:meta.
    int otherServerIndex = -1;
    for (int i = 0; i < cluster.getRegionServerThreads().size(); i++) {
      if (i == metaIndex) continue;
      otherServerIndex = i;
      break;
    }
    final HRegionServer otherServer = cluster.getRegionServer(otherServerIndex);
    final HRegionServer metaHRS = cluster.getRegionServer(metaIndex);

    // Get a region out on the otherServer.
    final RegionInfo hri =
      otherServer.getOnlineRegions().iterator().next().getRegionInfo();

    // Add our RegionServerOperationsListener
    HBase2428Listener listener = new HBase2428Listener(cluster,
      metaHRS.getHServerInfo().getServerAddress(), hri, otherServerIndex);
    master.getRegionServerOperationQueue().
      registerRegionServerOperationListener(listener);
    try {
      // Now close the server carrying meta.
      cluster.abortRegionServer(metaIndex);

      // First wait on receipt of meta server shutdown message.
      while (!listener.metaShutdownReceived) Threads.sleep(100);
      while (!listener.isDone()) Threads.sleep(10);
      // We should not have retried the close more times than it took for the
      // server shutdown message to exit the delay queue and get processed
      // (Multiply by two to add in some slop in case of GC or something).
      assertTrue(listener.getCloseCount() > 1);
      assertTrue(listener.getCloseCount() <
        ((HBase2428Listener.SERVER_DURATION / HBase2428Listener.CLOSE_DURATION) * 2));

      // Assert the closed region came back online
      assertRegionIsBackOnline(hri);
    } finally {
      master.getRegionServerOperationQueue().
        unregisterRegionServerOperationListener(listener);
    }
  }

  /**
   * Test adding in a new server before the old one on the same host+port is dead.
   * Make the test more onerous by having the server under test carry the meta.
   * If the old and new servers get confused with each other, meta purportedly
   * never comes back. Test that meta gets redeployed.
   */
  @Ignore @Test
  public void testAddingServerBeforeOldIsDead2413()
  throws IOException {
    LOG.info("Running testAddingServerBeforeOldIsDead2413");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    int count = count();
    int metaIndex = cluster.getServerWithMeta();
    MiniHBaseClusterRegionServer metaHRS =
      (MiniHBaseClusterRegionServer)cluster.getRegionServer(metaIndex);
    int port = metaHRS.getServerInfo().getServerAddress().getPort();
    Configuration c = TEST_UTIL.getConfiguration();
    String oldPort = c.get(HConstants.REGIONSERVER_PORT, "0");
    try {
      LOG.info("KILLED=" + metaHRS);
      metaHRS.kill();
      c.set(HConstants.REGIONSERVER_PORT, Integer.toString(port));
      // Try and start a new regionserver. It might clash with the old
      // regionserver port so keep trying to get past the BindException.
      HRegionServer hrs = null;
      while (true) {
        try {
          hrs = cluster.startRegionServer().getRegionServer();
          break;
        } catch (IOException e) {
          if (e.getCause() != null && e.getCause() instanceof InvocationTargetException) {
            InvocationTargetException ee = (InvocationTargetException)e.getCause();
            if (ee.getCause() != null && ee.getCause() instanceof BindException) {
              LOG.info("BindException; retrying: " + e.toString());
            }
          }
        }
      }
      LOG.info("STARTED=" + hrs);
      // Wait until it has been given at least 3 regions before we go on to try
      // and count rows in the table.
      while (hrs.getOnlineRegions().size() < 3) Threads.sleep(100);
      LOG.info(hrs.toString() + " has " + hrs.getOnlineRegions().size() +
        " regions");
      assertEquals(count, count());
    } finally {
      c.set(HConstants.REGIONSERVER_PORT, oldPort);
    }
  }

  /**
   * HBase2482 is about outstanding region openings. If any are outstanding
   * when a regionserver goes down, then they'll never deploy. They'll be
   * stuck in the regions-in-transition list forever. This listener looks
   * for a region-opening HMsg and, if it's from the server passed on construction,
   * we kill that server. It also looks out for a close message on the victim
   * server because that signifies the start of the fireworks.
   */
  static class HBase2482Listener implements RegionServerOperationListener {
    private final HRegionServer victim;
    private boolean abortSent = false;
    // We closed regions on the new server.
    private volatile boolean closed = false;
    // Copy of regions on the new server
    private final Collection<HRegion> copyOfOnlineRegions;
    // This is the region that was in transition on the server we aborted. Test
    // passes if this region comes back online successfully.
    private RegionInfo regionToFind;

    HBase2482Listener(final HRegionServer victim) {
      this.victim = victim;
      // Copy regions currently open on this server so I can notice when
      // there is a close.
      this.copyOfOnlineRegions =
        this.victim.getCopyOfOnlineRegionsSortedBySize().values();
    }

    @Override
    public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
      if (!victim.getServerInfo().equals(serverInfo) ||
          this.abortSent || !this.closed) {
        return true;
      }
      if (!incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN)) return true;
      // Save the region that is in transition so we can test later that it came back.
      this.regionToFind = incomingMsg.getRegionInfo();
      String msg = "ABORTING " + this.victim + " because got a " +
        HMsg.Type.MSG_REPORT_PROCESS_OPEN + " on this server for " +
        incomingMsg.getRegionInfo().getRegionNameAsString();
      this.victim.abort(msg);
      this.abortSent = true;
      return true;
    }

    @Override
    public boolean process(RegionServerOperation op) throws IOException {
      return true;
    }

    @Override
    public void processed(RegionServerOperation op) {
      if (this.closed || !(op instanceof ProcessRegionClose)) return;
      ProcessRegionClose close = (ProcessRegionClose)op;
      for (HRegion r: this.copyOfOnlineRegions) {
        if (r.getRegionInfo().equals(close.regionInfo)) {
          // We've closed one of the regions that was on the victim server.
          // Now we can start testing for when all regions are back online again.
          LOG.info("Found close of " +
            r.getRegionInfo().getRegionNameAsString() +
            "; setting close happened flag");
          this.closed = true;
          break;
        }
      }
    }
  }

  /**
   * In 2482, a RS with an opening region on it dies. The said region is then
   * stuck in the master's regions-in-transition and never leaves it. This
   * test works by bringing up a new regionserver and waiting for the load
   * balancer to give it some regions. Then, we close all regions on the new server.
   * After sending all the close messages, we send the new regionserver the
   * special blocking message so it cannot process any more messages.
   * Meantime, reopening of the just-closed regions is backed up on the new
   * server. As soon as the master gets an opening region from the new regionserver,
   * we kill it. We then wait on all regions to come back online. If the bug
   * is fixed, this should happen as soon as the processing of the killed server is
   * done.
   * @see <a href="https://issues.apache.org/jira/browse/HBASE-2482">HBASE-2482</a>
   */
  @Ignore @Test
  public void testKillRSWithOpeningRegion2482()
  throws Exception {
    LOG.info("Running testKillRSWithOpeningRegion2482");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    if (cluster.getLiveRegionServerThreads().size() < 2) {
      // Need at least two servers.
      cluster.startRegionServer();
    }
    // Count how many regions are online. They need to be all back online for
    // this test to succeed.
    int countOfMetaRegions = countOfMetaRegions();
    // Add a listener on the server.
    HMaster m = cluster.getMaster();
    // Start a new regionserver.
    MiniHBaseClusterRegionServer hrs =
      (MiniHBaseClusterRegionServer)cluster.startRegionServer().getRegionServer();
    LOG.info("Started new regionserver: " + hrs.toString());
    // Wait until it has some regions before proceeding. Balancer will give it some.
    int minimumRegions =
      countOfMetaRegions / (cluster.getRegionServerThreads().size() * 2);
    while (hrs.getOnlineRegions().size() < minimumRegions) Threads.sleep(100);
    // Set the listener only after some regions have been opened on the new server.
    HBase2482Listener listener = new HBase2482Listener(hrs);
    m.getRegionServerOperationQueue().
      registerRegionServerOperationListener(listener);
    try {
      // Go close all non-catalog regions on this new server.
      closeAllNonCatalogRegions(cluster, hrs);
      // After all closes, add the blocking message before the region opens start to
      // come in.
      cluster.addMessageToSendRegionServer(hrs,
        new HMsg(HMsg.Type.TESTING_BLOCK_REGIONSERVER));
      // Wait till one of the above close messages has an effect before we start
      // waiting on all regions coming back online.
      while (!listener.closed) Threads.sleep(100);
      LOG.info("Past close");
      // Make sure the abort server message was sent.
      while (!listener.abortSent) Threads.sleep(100);
      LOG.info("Past abort send; waiting on all regions to redeploy");
      // Now wait for regions to come back online.
      assertRegionIsBackOnline(listener.regionToFind);
    } finally {
      m.getRegionServerOperationQueue().
        unregisterRegionServerOperationListener(listener);
    }
  }

  /**
   * Send a close message for every non-catalog region on the designated server.
   * @return Count of all non-catalog regions on the designated server
   */
  private int closeAllNonCatalogRegions(final MiniHBaseCluster cluster,
      final MiniHBaseCluster.MiniHBaseClusterRegionServer hrs)
  throws IOException {
    int countOfRegions = 0;
    for (HRegion r: hrs.getOnlineRegions()) {
      if (r.getRegionInfo().isMetaRegion()) continue;
      cluster.addMessageToSendRegionServer(hrs,
        new HMsg(HMsg.Type.MSG_REGION_CLOSE, r.getRegionInfo()));
      LOG.info("Sent close of " + r.getRegionInfo().getRegionNameAsString() +
        " on " + hrs.toString());
      countOfRegions++;
    }
    return countOfRegions;
  }

  private void assertRegionIsBackOnline(final RegionInfo hri)
  throws IOException {
    // Region should have an entry at its start key because of addToEachStartKey.
    byte [] row = getStartKey(hri);
    HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
    Get g = new Get(row);
    assertTrue((t.get(g)).size() > 0);
  }

  /**
   * @return Count of regions in meta table.
   * @throws IOException
   */
  private static int countOfMetaRegions()
  throws IOException {
    HTable meta = new HTable(TEST_UTIL.getConfiguration(),
      HConstants.META_TABLE_NAME);
    int rows = 0;
    Scan scan = new Scan();
    scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
    ResultScanner s = meta.getScanner(scan);
    for (Result r = null; (r = s.next()) != null;) {
      byte [] b =
        r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
      if (b == null || b.length <= 0) break;
      rows++;
    }
    s.close();
    return rows;
  }

  /**
   * Add a row to the test table for each of its regions found in hbase:meta. The row key is the
   * start key of the region (except it's 'aaa' for the first region), and the value is the row
   * key itself.
   * @param expected Expected number of regions, and therefore of rows added.
   * @return Number of rows added, one per region of TABLENAME.
   * @throws IOException
   */
  private static int addToEachStartKey(final int expected) throws IOException {
    Table t = TEST_UTIL.getConnection().getTable(TABLENAME);
    Table meta = TEST_UTIL.getConnection().getTable(TableName.META_TABLE_NAME);
    int rows = 0;
    Scan scan = new Scan();
    scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
    ResultScanner s = meta.getScanner(scan);
    for (Result r = null; (r = s.next()) != null;) {
      RegionInfo hri = CatalogFamilyFormat.getRegionInfo(r);
      if (hri == null) break;
      if (!hri.getTable().equals(TABLENAME)) {
        continue;
      }
      // If this is the first region (empty start key), use 'aaa' as the row; see getStartKey.
      byte [] row = getStartKey(hri);
      Put p = new Put(row);
      p.setDurability(Durability.SKIP_WAL);
      p.addColumn(getTestFamily(), getTestQualifier(), row);
      t.put(p);
      rows++;
    }
    s.close();
    Assert.assertEquals(expected, rows);
    t.close();
    meta.close();
    return rows;
  }

  /**
   * @param hri
   * @return Start key for hri (if the start key is '', then return 'aaa').
   */
  private static byte [] getStartKey(final RegionInfo hri) {
    return Bytes.equals(HConstants.EMPTY_START_ROW, hri.getStartKey())?
      Bytes.toBytes("aaa"): hri.getStartKey();
  }

  private static byte [] getTestFamily() {
    return FAMILIES[0];
  }

  private static byte [] getTestQualifier() {
    return getTestFamily();
  }
}