/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.regionserver;

import static org.junit.Assert.*;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.impl.Log4JLogger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.Waiter.Predicate;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Consistency;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.RpcRetryingCallerImpl;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.replication.regionserver.TestRegionReplicaReplicationEndpoint;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
import org.apache.log4j.Level;
import org.junit.After;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.TestName;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;

/**
 * Tests failover of secondary region replicas.
 */
@RunWith(Parameterized.class)
@Category(LargeTests.class)
public class TestRegionReplicaFailover {

  private static final Log LOG = LogFactory.getLog(TestRegionReplicaReplicationEndpoint.class);

  static {
    ((Log4JLogger)RpcRetryingCallerImpl.LOG).getLogger().setLevel(Level.ALL);
  }

  private static final HBaseTestingUtility HTU = new HBaseTestingUtility();

  private static final int NB_SERVERS = 3;

  protected final byte[][] families = new byte[][] {HBaseTestingUtility.fam1,
      HBaseTestingUtility.fam2, HBaseTestingUtility.fam3};
  protected final byte[] fam = HBaseTestingUtility.fam1;
  protected final byte[] qual1 = Bytes.toBytes("qual1");
  protected final byte[] value1 = Bytes.toBytes("value1");
  protected final byte[] row = Bytes.toBytes("rowA");
  protected final byte[] row2 = Bytes.toBytes("rowB");

  @Rule public TestName name = new TestName();

  private HTableDescriptor htd;

  /**
   * We are testing with distributed log split and distributed log replay separately
   */
  @Parameters
  public static Collection<Object[]> getParameters() {
    Object[][] params =
        new Boolean[][] { /*{true}, Disable DLR!!! It is going to be removed*/ {false} };
    return Arrays.asList(params);
  }

  @Parameterized.Parameter(0)
  public boolean distributedLogReplay;

  @Before
  public void before() throws Exception {
    Configuration conf = HTU.getConfiguration();
    // Up the handlers; this test needs more than usual.
    conf.setInt(HConstants.REGION_SERVER_HIGH_PRIORITY_HANDLER_COUNT, 10);
    conf.setBoolean(ServerRegionReplicaUtil.REGION_REPLICA_REPLICATION_CONF_KEY, true);
    conf.setBoolean(ServerRegionReplicaUtil.REGION_REPLICA_WAIT_FOR_PRIMARY_FLUSH_CONF_KEY, true);
    conf.setInt("replication.stats.thread.period.seconds", 5);
    conf.setBoolean("hbase.tests.use.shortcircuit.reads", false);
    conf.setBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY, distributedLogReplay);

    HTU.startMiniCluster(NB_SERVERS);
    htd = HTU.createTableDescriptor(
      name.getMethodName().substring(0, name.getMethodName().length()-3));
    htd.setRegionReplication(3);
    HTU.getAdmin().createTable(htd);
  }
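
  /*
   * A minimal sketch, not exercised by this test, of how the same region-replica setup would be
   * applied to a user table outside the test harness (the table and family names are
   * illustrative, and connection is assumed to be an open Connection):
   *
   *   HTableDescriptor desc = new HTableDescriptor(TableName.valueOf("my_table"));
   *   desc.addFamily(new HColumnDescriptor("fam"));
   *   desc.setRegionReplication(3);            // one primary plus two secondary replicas
   *   connection.getAdmin().createTable(desc);
   */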

  @After
  public void after() throws Exception {
    HTU.deleteTableIfAny(htd.getTableName());
    HTU.shutdownMiniCluster();
  }

  /**
   * Tests that for a newly created table with region replicas and no data, the secondary
   * region replicas are available to read immediately.
   */
  @Test(timeout = 60000)
  public void testSecondaryRegionWithEmptyRegion() throws IOException {
    // Create a new table with region replication, don't put any data. Test that the secondary
    // region replica is available to read.
    try (Connection connection = ConnectionFactory.createConnection(HTU.getConfiguration());
        Table table = connection.getTable(htd.getTableName())) {

      Get get = new Get(row);
      get.setConsistency(Consistency.TIMELINE);
      get.setReplicaId(1);
      table.get(get); // this should not block
    }
  }
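
  /*
   * A minimal sketch, not part of this test, of how a caller can tell whether a TIMELINE read
   * was actually served by a secondary replica; Result#isStale() is the client-side signal:
   *
   *   Get get = new Get(row);
   *   get.setConsistency(Consistency.TIMELINE);
   *   Result result = table.get(get);
   *   if (result.isStale()) {
   *     // answered by a secondary replica, so the data may lag the primary
   *   }
   */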

  /**
   * Tests that if there is some data in the primary region, reopening the region replicas
   * (enable/disable table, etc.) makes the region replicas readable.
   * @throws IOException
   */
  @Test(timeout = 60000)
  public void testSecondaryRegionWithNonEmptyRegion() throws IOException {
    // Create a new table with region replication and load some data,
    // then disable and enable the table again and verify the data from the secondary
    try (Connection connection = ConnectionFactory.createConnection(HTU.getConfiguration());
        Table table = connection.getTable(htd.getTableName())) {

      HTU.loadNumericRows(table, fam, 0, 1000);

      HTU.getAdmin().disableTable(htd.getTableName());
      HTU.getAdmin().enableTable(htd.getTableName());

      HTU.verifyNumericRows(table, fam, 0, 1000, 1);
    }
  }

  /**
   * Tests that data unflushed at the time the primary region is killed is recovered, and stays
   * readable from the primary and the secondary replicas.
   */
  @Test (timeout = 120000)
  public void testPrimaryRegionKill() throws Exception {
    try (Connection connection = ConnectionFactory.createConnection(HTU.getConfiguration());
        Table table = connection.getTable(htd.getTableName())) {

      HTU.loadNumericRows(table, fam, 0, 1000);

      // wal replication is async, we have to wait until the replication catches up, or we timeout
      verifyNumericRowsWithTimeout(table, fam, 0, 1000, 1, 30000);
      verifyNumericRowsWithTimeout(table, fam, 0, 1000, 2, 30000);

      // we should not have flushed files now, but data in memstores of primary and secondary.
      // Kill the primary region replica now, and ensure that when it comes back up, we can still
      // read the same data from the primary and the secondaries
      boolean aborted = false;
      for (RegionServerThread rs : HTU.getMiniHBaseCluster().getRegionServerThreads()) {
        for (Region r : rs.getRegionServer().getOnlineRegions(htd.getTableName())) {
          if (r.getRegionInfo().getReplicaId() == 0) {
            LOG.info("Aborting region server hosting primary region replica");
            rs.getRegionServer().abort("for test");
            aborted = true;
            break;
          }
        }
      }
      assertTrue(aborted);

      // wal replication is async, we have to wait until the replication catches up, or we timeout
      verifyNumericRowsWithTimeout(table, fam, 0, 1000, 0, 30000);
      verifyNumericRowsWithTimeout(table, fam, 0, 1000, 1, 30000);
      verifyNumericRowsWithTimeout(table, fam, 0, 1000, 2, 30000);
    }

    // restart the region server
    HTU.getMiniHBaseCluster().startRegionServer();
  }

  /**
   * wal replication is async, we have to wait until the replication catches up, or we timeout
   */
  private void verifyNumericRowsWithTimeout(final Table table, final byte[] f, final int startRow,
      final int endRow, final int replicaId, final long timeout) throws Exception {
    try {
      HTU.waitFor(timeout, new Predicate<Exception>() {
        @Override
        public boolean evaluate() throws Exception {
          try {
            HTU.verifyNumericRows(table, f, startRow, endRow, replicaId);
            return true;
          } catch (AssertionError ae) {
            return false;
          }
        }
      });
    } catch (Throwable t) {
      // ignore this, but redo the verify to get the actual exception
      HTU.verifyNumericRows(table, f, startRow, endRow, replicaId);
    }
  }
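
  /*
   * The helper above is an instance of the polling pattern used throughout the HBase test
   * utilities: evaluate a Waiter.Predicate repeatedly until it holds or the timeout expires.
   * A minimal sketch of the same idea against an arbitrary condition (someConditionHolds() is
   * hypothetical):
   *
   *   HTU.waitFor(30000, new Predicate<Exception>() {
   *     @Override
   *     public boolean evaluate() throws Exception {
   *       return someConditionHolds();
   *     }
   *   });
   */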

  /**
   * Tests that killing a secondary region replica with unflushed data recovers, and the replica
   * becomes available to read again shortly.
   */
  @Test (timeout = 120000)
  public void testSecondaryRegionKill() throws Exception {
    try (Connection connection = ConnectionFactory.createConnection(HTU.getConfiguration());
        Table table = connection.getTable(htd.getTableName())) {
      HTU.loadNumericRows(table, fam, 0, 1000);

      // wait for some time to ensure that async wal replication does its magic
      verifyNumericRowsWithTimeout(table, fam, 0, 1000, 1, 30000);
      verifyNumericRowsWithTimeout(table, fam, 0, 1000, 2, 30000);

      // we should not have flushed files now, but data in memstores of primary and secondary.
      // Kill the secondary region replica now, and ensure that when it comes back up, we can
      // still read the same data from it
      boolean aborted = false;
      for (RegionServerThread rs : HTU.getMiniHBaseCluster().getRegionServerThreads()) {
        for (Region r : rs.getRegionServer().getOnlineRegions(htd.getTableName())) {
          if (r.getRegionInfo().getReplicaId() == 1) {
            LOG.info("Aborting region server hosting secondary region replica");
            rs.getRegionServer().abort("for test");
            aborted = true;
            break;
          }
        }
      }
      assertTrue(aborted);

      Threads.sleep(5000);

      HTU.verifyNumericRows(table, fam, 0, 1000, 1);
      HTU.verifyNumericRows(table, fam, 0, 1000, 2);
    }

    // restart the region server
    HTU.getMiniHBaseCluster().startRegionServer();
  }

  /**
   * Tests the case where there are 3 region replicas and the primary is continuously accepting
   * new writes while one of the secondaries is killed. Verification is done for both of the
   * secondary replicas.
   */
  @Test (timeout = 120000)
  public void testSecondaryRegionKillWhilePrimaryIsAcceptingWrites() throws Exception {
    try (Connection connection = ConnectionFactory.createConnection(HTU.getConfiguration());
        Table table = connection.getTable(htd.getTableName());
        Admin admin = connection.getAdmin()) {
      // start a thread to do the loading of primary
      HTU.loadNumericRows(table, fam, 0, 1000); // start with some base
      admin.flush(table.getName());
      HTU.loadNumericRows(table, fam, 1000, 2000);

      final AtomicReference<Throwable> ex = new AtomicReference<>(null);
      final AtomicBoolean done = new AtomicBoolean(false);
      final AtomicInteger key = new AtomicInteger(2000);

      Thread loader = new Thread() {
        @Override
        public void run() {
          while (!done.get()) {
            try {
              HTU.loadNumericRows(table, fam, key.get(), key.get() + 1000);
              key.addAndGet(1000);
            } catch (Throwable e) {
              ex.compareAndSet(null, e);
            }
          }
        }
      };
      loader.start();

      Thread aborter = new Thread() {
        @Override
        public void run() {
          try {
            boolean aborted = false;
            for (RegionServerThread rs : HTU.getMiniHBaseCluster().getRegionServerThreads()) {
              for (Region r : rs.getRegionServer().getOnlineRegions(htd.getTableName())) {
                if (r.getRegionInfo().getReplicaId() == 1) {
                  LOG.info("Aborting region server hosting secondary region replica");
                  rs.getRegionServer().abort("for test");
                  aborted = true;
                }
              }
            }
            assertTrue(aborted);
          } catch (Throwable e) {
            ex.compareAndSet(null, e);
          }
        }
      };
      aborter.start();
      aborter.join();
      done.set(true);
      loader.join();

      assertNull(ex.get());

      assertTrue(key.get() > 1000); // assert that the test is working as designed
      LOG.info("Loaded up to key: " + key.get());
      verifyNumericRowsWithTimeout(table, fam, 0, key.get(), 0, 30000);
      verifyNumericRowsWithTimeout(table, fam, 0, key.get(), 1, 30000);
      verifyNumericRowsWithTimeout(table, fam, 0, key.get(), 2, 30000);
    }

    // restart the region server
    HTU.getMiniHBaseCluster().startRegionServer();
  }

  /**
   * Tests the case where we are creating a table with a lot of regions and replicas. Opening
   * region replicas should not block handlers on the RS indefinitely.
   */
  @Test (timeout = 120000)
  public void testLotsOfRegionReplicas() throws IOException {
    int numRegions = NB_SERVERS * 20;
    int regionReplication = 10;
    String tableName = htd.getTableName().getNameAsString() + "2";
    htd = HTU.createTableDescriptor(tableName);
    htd.setRegionReplication(regionReplication);

    // don't care about the split points themselves too much
    byte[] startKey = Bytes.toBytes("aaa");
    byte[] endKey = Bytes.toBytes("zzz");
    byte[][] splits = HTU.getRegionSplitStartKeys(startKey, endKey, numRegions);
    HTU.getAdmin().createTable(htd, startKey, endKey, numRegions);

    try (Connection connection = ConnectionFactory.createConnection(HTU.getConfiguration());
        Table table = connection.getTable(htd.getTableName())) {

      for (int i = 1; i < splits.length; i++) {
        for (int j = 0; j < regionReplication; j++) {
          Get get = new Get(splits[i]);
          get.setConsistency(Consistency.TIMELINE);
          get.setReplicaId(j);
          table.get(get); // this should not block. Regions should be coming online
        }
      }
    }

    HTU.deleteTableIfAny(TableName.valueOf(tableName));
  }
}