HBASE-13591 Attempt to stabilize TestHBaseFsck
[hbase.git] hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java
blob fa160686d379cb8029f5340d1bda887bd3cc33bb
1 /**
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
19 package org.apache.hadoop.hbase.util;
21 import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertErrors;
22 import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertNoErrors;
23 import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.doFsck;
24 import static org.junit.Assert.assertEquals;
25 import static org.junit.Assert.assertFalse;
26 import static org.junit.Assert.assertNotEquals;
27 import static org.junit.Assert.assertNotNull;
28 import static org.junit.Assert.assertTrue;
29 import static org.junit.Assert.fail;
31 import java.io.IOException;
32 import java.util.ArrayList;
33 import java.util.Arrays;
34 import java.util.Collection;
35 import java.util.HashMap;
36 import java.util.HashSet;
37 import java.util.LinkedList;
38 import java.util.List;
39 import java.util.Map;
40 import java.util.NavigableMap;
41 import java.util.Set;
42 import java.util.concurrent.Callable;
43 import java.util.concurrent.CountDownLatch;
44 import java.util.concurrent.ExecutorService;
45 import java.util.concurrent.Executors;
46 import java.util.concurrent.Future;
47 import java.util.concurrent.ScheduledThreadPoolExecutor;
48 import java.util.concurrent.SynchronousQueue;
49 import java.util.concurrent.ThreadPoolExecutor;
50 import java.util.concurrent.TimeUnit;
51 import java.util.concurrent.atomic.AtomicBoolean;
53 import org.apache.commons.io.IOUtils;
54 import org.apache.commons.logging.Log;
55 import org.apache.commons.logging.LogFactory;
56 import org.apache.hadoop.conf.Configuration;
57 import org.apache.hadoop.fs.FileStatus;
58 import org.apache.hadoop.fs.FileSystem;
59 import org.apache.hadoop.fs.Path;
60 import org.apache.hadoop.hbase.ClusterStatus;
61 import org.apache.hadoop.hbase.HBaseTestingUtility;
62 import org.apache.hadoop.hbase.HColumnDescriptor;
63 import org.apache.hadoop.hbase.HConstants;
64 import org.apache.hadoop.hbase.HRegionInfo;
65 import org.apache.hadoop.hbase.HRegionLocation;
66 import org.apache.hadoop.hbase.HTableDescriptor;
67 import org.apache.hadoop.hbase.MiniHBaseCluster;
68 import org.apache.hadoop.hbase.ServerName;
69 import org.apache.hadoop.hbase.TableName;
70 import org.apache.hadoop.hbase.MetaTableAccessor;
71 import org.apache.hadoop.hbase.client.Admin;
72 import org.apache.hadoop.hbase.client.ClusterConnection;
73 import org.apache.hadoop.hbase.client.Connection;
74 import org.apache.hadoop.hbase.client.ConnectionFactory;
75 import org.apache.hadoop.hbase.client.Delete;
76 import org.apache.hadoop.hbase.client.Durability;
77 import org.apache.hadoop.hbase.client.Get;
78 import org.apache.hadoop.hbase.client.HBaseAdmin;
79 import org.apache.hadoop.hbase.client.HConnection;
80 import org.apache.hadoop.hbase.client.HTable;
81 import org.apache.hadoop.hbase.client.Put;
82 import org.apache.hadoop.hbase.client.RegionReplicaUtil;
83 import org.apache.hadoop.hbase.client.Result;
84 import org.apache.hadoop.hbase.client.ResultScanner;
85 import org.apache.hadoop.hbase.client.Scan;
86 import org.apache.hadoop.hbase.client.Table;
87 import org.apache.hadoop.hbase.coprocessor.BaseMasterObserver;
88 import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
89 import org.apache.hadoop.hbase.coprocessor.MasterCoprocessorEnvironment;
90 import org.apache.hadoop.hbase.coprocessor.ObserverContext;
91 import org.apache.hadoop.hbase.io.hfile.TestHFile;
92 import org.apache.hadoop.hbase.master.AssignmentManager;
93 import org.apache.hadoop.hbase.master.HMaster;
94 import org.apache.hadoop.hbase.master.RegionState;
95 import org.apache.hadoop.hbase.master.RegionStates;
96 import org.apache.hadoop.hbase.master.TableLockManager;
97 import org.apache.hadoop.hbase.master.TableLockManager.TableLock;
98 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
99 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
100 import org.apache.hadoop.hbase.regionserver.HRegion;
101 import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
102 import org.apache.hadoop.hbase.regionserver.HRegionServer;
103 import org.apache.hadoop.hbase.regionserver.SplitTransactionFactory;
104 import org.apache.hadoop.hbase.regionserver.SplitTransactionImpl;
105 import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction;
106 import org.apache.hadoop.hbase.security.access.AccessControlClient;
107 import org.apache.hadoop.hbase.testclassification.LargeTests;
108 import org.apache.hadoop.hbase.testclassification.MiscTests;
109 import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter;
110 import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
111 import org.apache.hadoop.hbase.util.HBaseFsck.HbckInfo;
112 import org.apache.hadoop.hbase.util.HBaseFsck.PrintingErrorReporter;
113 import org.apache.hadoop.hbase.util.HBaseFsck.TableInfo;
114 import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
115 import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil;
116 import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
117 import org.apache.zookeeper.KeeperException;
118 import org.junit.AfterClass;
119 import org.junit.Assert;
120 import org.junit.Before;
121 import org.junit.BeforeClass;
122 import org.junit.Ignore;
123 import org.junit.Test;
124 import org.junit.experimental.categories.Category;
125 import org.junit.rules.TestName;
127 import com.google.common.collect.Multimap;
130 * This tests HBaseFsck's ability to detect reasons for inconsistent tables.
132 @Category({MiscTests.class, LargeTests.class})
133 public class TestHBaseFsck {
134 static final int POOL_SIZE = 7;
135 private static final Log LOG = LogFactory.getLog(TestHBaseFsck.class);
136 private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
137 private final static Configuration conf = TEST_UTIL.getConfiguration();
138 private final static String FAM_STR = "fam";
139 private final static byte[] FAM = Bytes.toBytes(FAM_STR);
140 private final static int REGION_ONLINE_TIMEOUT = 800;
141 private static RegionStates regionStates;
142 private static ExecutorService tableExecutorService;
143 private static ScheduledThreadPoolExecutor hbfsckExecutorService;
144 private static ClusterConnection connection;
145 private static Admin admin;
147 // per-test-instance state, reset every test run
148 private HTable tbl;
149 private final static byte[][] SPLITS = new byte[][] { Bytes.toBytes("A"),
150 Bytes.toBytes("B"), Bytes.toBytes("C") };
151 // two rows per region.
152 private final static byte[][] ROWKEYS= new byte[][] {
153 Bytes.toBytes("00"), Bytes.toBytes("50"), Bytes.toBytes("A0"), Bytes.toBytes("A5"),
154 Bytes.toBytes("B0"), Bytes.toBytes("B5"), Bytes.toBytes("C0"), Bytes.toBytes("C5") };
156 @BeforeClass
157 public static void setUpBeforeClass() throws Exception {
158 TEST_UTIL.getConfiguration().set(CoprocessorHost.MASTER_COPROCESSOR_CONF_KEY,
159 MasterSyncObserver.class.getName());
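// Tune the mini-cluster: few regionserver handlers, bounded client thread pools,
// and hbck close/RPC timeouts derived from REGION_ONLINE_TIMEOUT.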
161 conf.setInt("hbase.regionserver.handler.count", 2);
162 conf.setInt("hbase.regionserver.metahandler.count", 30);
164 conf.setInt("hbase.htable.threads.max", POOL_SIZE);
165 conf.setInt("hbase.hconnection.threads.max", 2 * POOL_SIZE);
166 conf.setInt("hbase.hconnection.threads.core", POOL_SIZE);
167 conf.setInt("hbase.hbck.close.timeout", 2 * REGION_ONLINE_TIMEOUT);
168 conf.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY, 8 * REGION_ONLINE_TIMEOUT);
169 TEST_UTIL.startMiniCluster(3);
171 tableExecutorService = new ThreadPoolExecutor(1, POOL_SIZE, 60, TimeUnit.SECONDS,
172 new SynchronousQueue<Runnable>(), Threads.newDaemonThreadFactory("testhbck"));
174 hbfsckExecutorService = new ScheduledThreadPoolExecutor(POOL_SIZE);
176 AssignmentManager assignmentManager =
177 TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager();
178 regionStates = assignmentManager.getRegionStates();
180 connection = (ClusterConnection) TEST_UTIL.getConnection();
182 admin = connection.getAdmin();
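// Disable the balancer so regions stay where the tests put them.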
183 admin.setBalancerRunning(false, true);
185 TEST_UTIL.waitUntilAllRegionsAssigned(TableName.META_TABLE_NAME);
186 TEST_UTIL.waitUntilAllRegionsAssigned(TableName.NAMESPACE_TABLE_NAME);
189 @AfterClass
190 public static void tearDownAfterClass() throws Exception {
191 tableExecutorService.shutdown();
192 hbfsckExecutorService.shutdown();
193 admin.close();
194 TEST_UTIL.shutdownMiniCluster();
197 @Before
198 public void setUp() {
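// Reset any injected environment edge (test clock) back to the default system clock.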
199 EnvironmentEdgeManager.reset();
202 @Test (timeout=180000)
203 public void testHBaseFsck() throws Exception {
204 assertNoErrors(doFsck(conf, false));
205 TableName table = TableName.valueOf("tableBadMetaAssign");
206 HTableDescriptor desc = new HTableDescriptor(table);
207 HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
208 desc.addFamily(hcd); // If a table has no CFs it doesn't get checked
209 createTable(TEST_UTIL, desc, null);
211 // We created 1 table, should be fine
212 assertNoErrors(doFsck(conf, false));
214 // Now let's mess it up and change the assignment in hbase:meta to
215 // point to a different region server
216 Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
217 Scan scan = new Scan();
218 scan.setStartRow(Bytes.toBytes(table+",,"));
219 ResultScanner scanner = meta.getScanner(scan);
220 HRegionInfo hri = null;
222 Result res = scanner.next();
223 ServerName currServer =
224 ServerName.parseFrom(res.getValue(HConstants.CATALOG_FAMILY,
225 HConstants.SERVER_QUALIFIER));
226 long startCode = Bytes.toLong(res.getValue(HConstants.CATALOG_FAMILY,
227 HConstants.STARTCODE_QUALIFIER));
229 for (JVMClusterUtil.RegionServerThread rs :
230 TEST_UTIL.getHBaseCluster().getRegionServerThreads()) {
232 ServerName sn = rs.getRegionServer().getServerName();
234 // When we find a diff RS, change the assignment and break
235 if (!currServer.getHostAndPort().equals(sn.getHostAndPort()) ||
236 startCode != sn.getStartcode()) {
237 Put put = new Put(res.getRow());
238 put.setDurability(Durability.SKIP_WAL);
239 put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
240 Bytes.toBytes(sn.getHostAndPort()));
241 put.add(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER,
242 Bytes.toBytes(sn.getStartcode()));
243 meta.put(put);
244 hri = MetaTableAccessor.getHRegionInfo(res);
245 break;
249 // Try to fix the data
250 assertErrors(doFsck(conf, true), new ERROR_CODE[]{
251 ERROR_CODE.SERVER_DOES_NOT_MATCH_META});
253 TEST_UTIL.getHBaseCluster().getMaster()
254 .getAssignmentManager().waitForAssignment(hri);
256 // Should be fixed now
257 assertNoErrors(doFsck(conf, false));
259 // Sanity check: open and close a scanner on the table after the fix.
260 Table t = connection.getTable(table, tableExecutorService);
261 ResultScanner s = t.getScanner(new Scan());
262 s.close();
263 t.close();
265 scanner.close();
266 meta.close();
269 @Test(timeout=180000)
270 public void testFixAssignmentsWhenMETAinTransition() throws Exception {
271 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
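// Simulate hbase:meta in transition: close the meta region, mark it offline in the
// master's region states, and delete its location znode.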
272 admin.closeRegion(cluster.getServerHoldingMeta(), HRegionInfo.FIRST_META_REGIONINFO);
273 regionStates.regionOffline(HRegionInfo.FIRST_META_REGIONINFO);
274 new MetaTableLocator().deleteMetaLocation(cluster.getMaster().getZooKeeper());
275 assertFalse(regionStates.isRegionOnline(HRegionInfo.FIRST_META_REGIONINFO));
276 HBaseFsck hbck = doFsck(conf, true);
277 assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.UNKNOWN, ERROR_CODE.NO_META_REGION,
278 ERROR_CODE.NULL_META_REGION });
279 assertNoErrors(doFsck(conf, false));
283 * Create a new region in META.
285 private HRegionInfo createRegion(final HTableDescriptor
286 htd, byte[] startKey, byte[] endKey)
287 throws IOException {
288 Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
289 HRegionInfo hri = new HRegionInfo(htd.getTableName(), startKey, endKey);
290 MetaTableAccessor.addRegionToMeta(meta, hri);
291 meta.close();
292 return hri;
296 * Debugging method to dump the contents of meta.
298 private void dumpMeta(TableName tableName) throws IOException {
299 List<byte[]> metaRows = TEST_UTIL.getMetaTableRows(tableName);
300 for (byte[] row : metaRows) {
301 LOG.info(Bytes.toString(row));
306 * This method is used to undeploy a region -- close it and attempt to
307 * remove its state from the Master.
309 private void undeployRegion(Connection conn, ServerName sn,
310 HRegionInfo hri) throws IOException, InterruptedException {
311 try {
312 HBaseFsckRepair.closeRegionSilentlyAndWait((HConnection) conn, sn, hri);
313 if (!hri.isMetaTable()) {
314 admin.offline(hri.getRegionName());
316 } catch (IOException ioe) {
317 LOG.warn("Got exception when attempting to offline region "
318 + Bytes.toString(hri.getRegionName()), ioe);
322 * Delete a region from assignments, meta, or completely from hdfs.
323 * @param unassign if true unassign region if assigned
324 * @param metaRow if true remove region's row from META
325 * @param hdfs if true remove region's dir in HDFS
327 private void deleteRegion(Configuration conf, final HTableDescriptor htd,
328 byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
329 boolean hdfs) throws IOException, InterruptedException {
330 deleteRegion(conf, htd, startKey, endKey, unassign, metaRow, hdfs, false, HRegionInfo.DEFAULT_REPLICA_ID);
334 * Delete a region from assignments, meta, or completely from hdfs.
335 * @param unassign if true unassign region if assigned
336 * @param metaRow if true remove region's row from META
337 * @param hdfs if true remove region's dir in HDFS
338 * @param regionInfoOnly if true remove a region dir's .regioninfo file
339 * @param replicaId replica id
341 private void deleteRegion(Configuration conf, final HTableDescriptor htd,
342 byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
343 boolean hdfs, boolean regionInfoOnly, int replicaId)
344 throws IOException, InterruptedException {
345 LOG.info("** Before delete:");
346 dumpMeta(htd.getTableName());
348 List<HRegionLocation> locations = tbl.getAllRegionLocations();
349 for (HRegionLocation location : locations) {
350 HRegionInfo hri = location.getRegionInfo();
351 ServerName hsa = location.getServerName();
352 if (Bytes.compareTo(hri.getStartKey(), startKey) == 0
353 && Bytes.compareTo(hri.getEndKey(), endKey) == 0
354 && hri.getReplicaId() == replicaId) {
356 LOG.info("RegionName: " +hri.getRegionNameAsString());
357 byte[] deleteRow = hri.getRegionName();
359 if (unassign) {
360 LOG.info("Undeploying region " + hri + " from server " + hsa);
361 undeployRegion(connection, hsa, hri);
364 if (regionInfoOnly) {
365 LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
366 Path rootDir = FSUtils.getRootDir(conf);
367 FileSystem fs = rootDir.getFileSystem(conf);
368 Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
369 hri.getEncodedName());
370 Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
371 fs.delete(hriPath, true);
374 if (hdfs) {
375 LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
376 Path rootDir = FSUtils.getRootDir(conf);
377 FileSystem fs = rootDir.getFileSystem(conf);
378 Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
379 hri.getEncodedName());
380 HBaseFsck.debugLsr(conf, p);
381 boolean success = fs.delete(p, true);
382 LOG.info("Deleted " + p + " sucessfully? " + success);
383 HBaseFsck.debugLsr(conf, p);
386 if (metaRow) {
387 try (Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService)) {
388 Delete delete = new Delete(deleteRow);
389 meta.delete(delete);
393 LOG.info(hri.toString() + hsa.toString());
396 TEST_UTIL.getMetaTableRows(htd.getTableName());
397 LOG.info("*** After delete:");
398 dumpMeta(htd.getTableName());
402 * Set up a clean table before we start mucking with it.
404 * It sets {@code tbl}, which must be closed after the test.
406 * @throws IOException
407 * @throws InterruptedException
408 * @throws KeeperException
410 void setupTable(TableName tablename) throws Exception {
411 setupTableWithRegionReplica(tablename, 1);
415 * Set up a clean table with the given region replica count.
417 * It sets {@code tbl}, which must be closed after the test.
419 * @param tableName
420 * @param replicaCount
421 * @throws Exception
423 void setupTableWithRegionReplica(TableName tablename, int replicaCount) throws Exception {
424 HTableDescriptor desc = new HTableDescriptor(tablename);
425 desc.setRegionReplication(replicaCount);
426 HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
427 desc.addFamily(hcd); // If a table has no CFs it doesn't get checked
428 createTable(TEST_UTIL, desc, SPLITS);
430 tbl = (HTable) connection.getTable(tablename, tableExecutorService);
431 List<Put> puts = new ArrayList<Put>();
432 for (byte[] row : ROWKEYS) {
433 Put p = new Put(row);
434 p.add(FAM, Bytes.toBytes("val"), row);
435 puts.add(p);
437 tbl.put(puts);
438 tbl.flushCommits();
442 * Counts the number of rows to verify data loss or the absence of data loss.
444 int countRows() throws IOException {
445 Scan s = new Scan();
446 ResultScanner rs = tbl.getScanner(s);
447 int i = 0;
448 while(rs.next() !=null) {
449 i++;
451 return i;
455 * Delete the table in preparation for the next test.
457 * @param tablename
458 * @throws IOException
460 void cleanupTable(TableName tablename) throws Exception {
461 if (tbl != null) {
462 tbl.close();
463 tbl = null;
466 ((ClusterConnection) connection).clearRegionCache();
467 deleteTable(TEST_UTIL, tablename);
471 * This creates a clean table and confirms that the table is clean.
473 @Test (timeout=180000)
474 public void testHBaseFsckClean() throws Exception {
475 assertNoErrors(doFsck(conf, false));
476 TableName table = TableName.valueOf("tableClean");
477 try {
478 HBaseFsck hbck = doFsck(conf, false);
479 assertNoErrors(hbck);
481 setupTable(table);
482 assertEquals(ROWKEYS.length, countRows());
484 // We created 1 table, should be fine
485 hbck = doFsck(conf, false);
486 assertNoErrors(hbck);
487 assertEquals(0, hbck.getOverlapGroups(table).size());
488 assertEquals(ROWKEYS.length, countRows());
489 } finally {
490 cleanupTable(table);
495 * Test thread pooling in the case where there are more regions than threads
497 @Test (timeout=180000)
498 public void testHbckThreadpooling() throws Exception {
499 TableName table =
500 TableName.valueOf("tableDupeStartKey");
501 try {
502 // Create table with 4 regions
503 setupTable(table);
505 // limit number of threads to 1.
506 Configuration newconf = new Configuration(conf);
507 newconf.setInt("hbasefsck.numthreads", 1);
508 assertNoErrors(doFsck(newconf, false));
510 // We should pass without triggering a RejectedExecutionException
511 } finally {
512 cleanupTable(table);
516 @Test (timeout=180000)
517 public void testHbckFixOrphanTable() throws Exception {
518 TableName table = TableName.valueOf("tableInfo");
519 FileSystem fs = null;
520 Path tableinfo = null;
521 try {
522 setupTable(table);
524 Path hbaseTableDir = FSUtils.getTableDir(
525 FSUtils.getRootDir(conf), table);
526 fs = hbaseTableDir.getFileSystem(conf);
527 FileStatus status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
528 tableinfo = status.getPath();
529 fs.rename(tableinfo, new Path("/.tableinfo"));
531 // hbck should report an error because .tableinfo is missing.
532 HBaseFsck hbck = doFsck(conf, false);
533 assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_TABLEINFO_FILE });
535 // fix OrphanTable with default .tableinfo (htd not yet cached on master)
536 hbck = doFsck(conf, true);
537 assertNoErrors(hbck);
538 status = null;
539 status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
540 assertNotNull(status);
542 HTableDescriptor htd = admin.getTableDescriptor(table);
543 htd.setValue("NOT_DEFAULT", "true");
544 admin.disableTable(table);
545 admin.modifyTable(table, htd);
546 admin.enableTable(table);
547 fs.delete(status.getPath(), true);
549 // fix OrphanTable with cache
550 htd = admin.getTableDescriptor(table); // warms up cached htd on master
551 hbck = doFsck(conf, true);
552 assertNoErrors(hbck);
553 status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
554 assertNotNull(status);
555 htd = admin.getTableDescriptor(table);
556 assertEquals("true", htd.getValue("NOT_DEFAULT"));
557 } finally {
558 fs.rename(new Path("/.tableinfo"), tableinfo);
559 cleanupTable(table);
564 * This test makes sure that parallel instances of hbck cannot run at the same time.
566 * @throws Exception
568 @Test (timeout=180000)
569 public void testParallelHbck() throws Exception {
570 final ExecutorService service;
571 final Future<HBaseFsck> hbck1,hbck2;
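// Each callable runs a full hbck with only one attempt at the exclusive hbck lock
// file, so exactly one of the two concurrent runs is expected to win.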
573 class RunHbck implements Callable<HBaseFsck>{
574 boolean fail = true;
575 @Override
576 public HBaseFsck call(){
577 Configuration c = new Configuration(conf);
578 c.setInt("hbase.hbck.lockfile.attempts", 1);
579 try{
580 return doFsck(c, false);
581 } catch(Exception e){
582 if (e.getMessage().contains("Duplicate hbck")) {
583 fail = false;
586 // If we reach here, then an exception was caught
587 if (fail) fail();
588 return null;
591 service = Executors.newFixedThreadPool(2);
592 hbck1 = service.submit(new RunHbck());
593 hbck2 = service.submit(new RunHbck());
594 service.shutdown();
595 // wait up to 15 seconds for both hbck calls to finish
596 service.awaitTermination(15, TimeUnit.SECONDS);
597 HBaseFsck h1 = hbck1.get();
598 HBaseFsck h2 = hbck2.get();
599 // Make sure only one of the calls was successful
600 assert(h1 == null || h2 == null);
601 if (h1 != null) {
602 assert(h1.getRetCode() >= 0);
604 if (h2 != null) {
605 assert(h2.getRetCode() >= 0);
610 * This test makes sure that, with 10 retries, both parallel instances
611 * of hbck complete successfully.
613 * @throws Exception
615 @Test (timeout=180000)
616 public void testParallelWithRetriesHbck() throws Exception {
617 final ExecutorService service;
618 final Future<HBaseFsck> hbck1,hbck2;
620 class RunHbck implements Callable<HBaseFsck>{
622 @Override
623 public HBaseFsck call() throws Exception {
624 // Increase retry attempts to make sure the non-active hbck doesn't get starved
625 Configuration c = new Configuration(conf);
626 c.setInt("hbase.hbck.lockfile.attempts", 10);
627 return doFsck(c, false);
630 service = Executors.newFixedThreadPool(2);
631 hbck1 = service.submit(new RunHbck());
632 hbck2 = service.submit(new RunHbck());
633 service.shutdown();
634 // wait up to 25 seconds for both hbck calls to finish
635 service.awaitTermination(25, TimeUnit.SECONDS);
636 HBaseFsck h1 = hbck1.get();
637 HBaseFsck h2 = hbck2.get();
638 // Both should be successful
639 assertNotNull(h1);
640 assertNotNull(h2);
641 assert(h1.getRetCode() >= 0);
642 assert(h2.getRetCode() >= 0);
647 * This creates and fixes a bad table with regions that have a duplicate
648 * start key
650 @Test (timeout=180000)
651 public void testDupeStartKey() throws Exception {
652 TableName table =
653 TableName.valueOf("tableDupeStartKey");
654 try {
655 setupTable(table);
656 assertNoErrors(doFsck(conf, false));
657 assertEquals(ROWKEYS.length, countRows());
659 // Now let's mess it up, by adding a region with a duplicate startkey
660 HRegionInfo hriDupe =
661 createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A"), Bytes.toBytes("A2"));
662 TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
663 TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
664 .waitForAssignment(hriDupe);
665 ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
666 TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
668 HBaseFsck hbck = doFsck(conf, false);
669 assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
670 ERROR_CODE.DUPE_STARTKEYS});
671 assertEquals(2, hbck.getOverlapGroups(table).size());
672 assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
674 // fix the duplicate-startkey region.
675 doFsck(conf,true);
677 // check that the duplicate region is gone and there is no data loss
678 HBaseFsck hbck2 = doFsck(conf,false);
679 assertNoErrors(hbck2);
680 assertEquals(0, hbck2.getOverlapGroups(table).size());
681 assertEquals(ROWKEYS.length, countRows());
682 } finally {
683 cleanupTable(table);
688 * This creates a table with region_replica > 1 and verifies hbck runs
689 * successfully
691 @Test (timeout=180000)
692 public void testHbckWithRegionReplica() throws Exception {
693 TableName table =
694 TableName.valueOf("testHbckWithRegionReplica");
695 try {
696 setupTableWithRegionReplica(table, 2);
697 admin.flush(table);
698 assertNoErrors(doFsck(conf, false));
699 } finally {
700 cleanupTable(table);
704 @Test (timeout=180000)
705 public void testHbckWithFewerReplica() throws Exception {
706 TableName table =
707 TableName.valueOf("testHbckWithFewerReplica");
708 try {
709 setupTableWithRegionReplica(table, 2);
710 admin.flush(table);
711 assertNoErrors(doFsck(conf, false));
712 assertEquals(ROWKEYS.length, countRows());
713 deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
714 Bytes.toBytes("C"), true, false, false, false, 1); // unassign one replica
715 // check that problem exists
716 HBaseFsck hbck = doFsck(conf, false);
717 assertErrors(hbck, new ERROR_CODE[]{ERROR_CODE.NOT_DEPLOYED});
718 // fix the problem
719 hbck = doFsck(conf, true);
720 // run hbck again to make sure we don't see any errors
721 hbck = doFsck(conf, false);
722 assertErrors(hbck, new ERROR_CODE[]{});
723 } finally {
724 cleanupTable(table);
728 @Test (timeout=180000)
729 public void testHbckWithExcessReplica() throws Exception {
730 TableName table =
731 TableName.valueOf("testHbckWithExcessReplica");
732 try {
733 setupTableWithRegionReplica(table, 2);
734 admin.flush(table);
735 assertNoErrors(doFsck(conf, false));
736 assertEquals(ROWKEYS.length, countRows());
737 // the next few lines inject a location in meta for a replica, and then
738 // ask the master to assign the replica (the meta needs to be injected
739 // for the master to treat the request for assignment as valid; the master
740 // checks the region is valid either from its memory or meta)
741 Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
742 List<HRegionInfo> regions = admin.getTableRegions(table);
743 byte[] startKey = Bytes.toBytes("B");
744 byte[] endKey = Bytes.toBytes("C");
745 byte[] metaKey = null;
746 HRegionInfo newHri = null;
747 for (HRegionInfo h : regions) {
748 if (Bytes.compareTo(h.getStartKey(), startKey) == 0 &&
749 Bytes.compareTo(h.getEndKey(), endKey) == 0 &&
750 h.getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
751 metaKey = h.getRegionName();
752 // create an HRI with replicaId 2 (since we already have replicas with replicaIds 0 and 1)
753 newHri = RegionReplicaUtil.getRegionInfoForReplica(h, 2);
754 break;
757 Put put = new Put(metaKey);
758 Collection<ServerName> var = admin.getClusterStatus().getServers();
759 ServerName sn = var.toArray(new ServerName[var.size()])[0];
760 // add a location with replicaId 2 (since we already have replicas with replicaIds 0 and 1)
761 MetaTableAccessor.addLocation(put, sn, sn.getStartcode(), 2);
762 meta.put(put);
763 // assign the new replica
764 HBaseFsckRepair.fixUnassigned(admin, newHri);
765 HBaseFsckRepair.waitUntilAssigned(admin, newHri);
766 // now reset the meta row to its original value
767 Delete delete = new Delete(metaKey);
768 delete.addColumns(HConstants.CATALOG_FAMILY, MetaTableAccessor.getServerColumn(2));
769 delete.addColumns(HConstants.CATALOG_FAMILY, MetaTableAccessor.getStartCodeColumn(2));
770 delete.addColumns(HConstants.CATALOG_FAMILY, MetaTableAccessor.getSeqNumColumn(2));
771 meta.delete(delete);
772 meta.close();
773 // check that problem exists
774 HBaseFsck hbck = doFsck(conf, false);
775 assertErrors(hbck, new ERROR_CODE[]{ERROR_CODE.NOT_IN_META});
776 // fix the problem
777 hbck = doFsck(conf, true);
778 // run hbck again to make sure we don't see any errors
779 hbck = doFsck(conf, false);
780 assertErrors(hbck, new ERROR_CODE[]{});
781 } finally {
782 cleanupTable(table);
786 * Get region info from local cluster.
788 Map<ServerName, List<String>> getDeployedHRIs(final HBaseAdmin admin) throws IOException {
789 ClusterStatus status = admin.getClusterStatus();
790 Collection<ServerName> regionServers = status.getServers();
791 Map<ServerName, List<String>> mm =
792 new HashMap<ServerName, List<String>>();
793 for (ServerName hsi : regionServers) {
794 AdminProtos.AdminService.BlockingInterface server = ((HConnection) connection).getAdmin(hsi);
796 // list all online regions from this region server
797 List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(server);
798 List<String> regionNames = new ArrayList<String>();
799 for (HRegionInfo hri : regions) {
800 regionNames.add(hri.getRegionNameAsString());
802 mm.put(hsi, regionNames);
804 return mm;
808 * Returns the ServerName of the server that a region info is deployed on.
810 ServerName findDeployedHSI(Map<ServerName, List<String>> mm, HRegionInfo hri) {
811 for (Map.Entry<ServerName,List <String>> e : mm.entrySet()) {
812 if (e.getValue().contains(hri.getRegionNameAsString())) {
813 return e.getKey();
816 return null;
820 * This creates and fixes a bad table with a region that duplicates
821 * an existing region (same start and end keys).
823 @Test (timeout=180000)
824 public void testDupeRegion() throws Exception {
825 TableName table =
826 TableName.valueOf("tableDupeRegion");
827 try {
828 setupTable(table);
829 assertNoErrors(doFsck(conf, false));
830 assertEquals(ROWKEYS.length, countRows());
832 // Now let's mess it up, by adding a region with a duplicate startkey
833 HRegionInfo hriDupe =
834 createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A"), Bytes.toBytes("B"));
836 TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
837 TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
838 .waitForAssignment(hriDupe);
839 ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
840 TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
842 // Yikes! The assignment manager can't tell the difference between two
843 // different regions with the same start/end keys since it doesn't
844 // differentiate on ts/regionId! We actually need to recheck
845 // deployments!
846 while (findDeployedHSI(getDeployedHRIs((HBaseAdmin) admin), hriDupe) == null) {
847 Thread.sleep(250);
850 LOG.debug("Finished assignment of dupe region");
852 // TODO why is dupe region different from dupe start keys?
853 HBaseFsck hbck = doFsck(conf, false);
854 assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
855 ERROR_CODE.DUPE_STARTKEYS});
856 assertEquals(2, hbck.getOverlapGroups(table).size());
857 assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
859 // fix the duplicate region.
860 doFsck(conf,true);
862 // check that the duplicate region is gone and there is no data loss
863 HBaseFsck hbck2 = doFsck(conf,false);
864 assertNoErrors(hbck2);
865 assertEquals(0, hbck2.getOverlapGroups(table).size());
866 assertEquals(ROWKEYS.length, countRows());
867 } finally {
868 cleanupTable(table);
873 * This creates and fixes a bad table with a region that has startkey == endkey
875 @Test (timeout=180000)
876 public void testDegenerateRegions() throws Exception {
877 TableName table = TableName.valueOf("tableDegenerateRegions");
878 try {
879 setupTable(table);
880 assertNoErrors(doFsck(conf,false));
881 assertEquals(ROWKEYS.length, countRows());
883 // Now let's mess it up by adding a degenerate region (startkey == endkey)
884 HRegionInfo hriDupe =
885 createRegion(tbl.getTableDescriptor(), Bytes.toBytes("B"), Bytes.toBytes("B"));
886 TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
887 TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
888 .waitForAssignment(hriDupe);
889 ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
890 TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
892 HBaseFsck hbck = doFsck(conf,false);
893 assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DEGENERATE_REGION, ERROR_CODE.DUPE_STARTKEYS,
894 ERROR_CODE.DUPE_STARTKEYS });
895 assertEquals(2, hbck.getOverlapGroups(table).size());
896 assertEquals(ROWKEYS.length, countRows());
898 // fix the degenerate region.
899 doFsck(conf,true);
901 // check that the degenerate region is gone and no data loss
902 HBaseFsck hbck2 = doFsck(conf,false);
903 assertNoErrors(hbck2);
904 assertEquals(0, hbck2.getOverlapGroups(table).size());
905 assertEquals(ROWKEYS.length, countRows());
906 } finally {
907 cleanupTable(table);
912 * This creates and fixes a bad table where a region is completely contained
913 * by another region.
915 @Test (timeout=180000)
916 public void testContainedRegionOverlap() throws Exception {
917 TableName table =
918 TableName.valueOf("tableContainedRegionOverlap");
919 try {
920 setupTable(table);
921 assertEquals(ROWKEYS.length, countRows());
923 // Mess it up by creating an overlap in the metadata
924 HRegionInfo hriOverlap =
925 createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B"));
926 TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
927 TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
928 .waitForAssignment(hriOverlap);
929 ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
930 TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
932 HBaseFsck hbck = doFsck(conf, false);
933 assertErrors(hbck, new ERROR_CODE[] {
934 ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
935 assertEquals(2, hbck.getOverlapGroups(table).size());
936 assertEquals(ROWKEYS.length, countRows());
938 // fix the problem.
939 doFsck(conf, true);
941 // verify that overlaps are fixed
942 HBaseFsck hbck2 = doFsck(conf,false);
943 assertNoErrors(hbck2);
944 assertEquals(0, hbck2.getOverlapGroups(table).size());
945 assertEquals(ROWKEYS.length, countRows());
946 } finally {
947 cleanupTable(table);
952 * This creates and fixes a bad table containing an overlap group of
953 * 3 regions. HBaseFsck.maxMerge is set to 2 to trigger sidelining of the
954 * overlapped region. The meta data is messed with so that closeRegion/offlineRegion
955 * throw exceptions.
957 @Test (timeout=180000)
958 public void testSidelineOverlapRegion() throws Exception {
959 TableName table =
960 TableName.valueOf("testSidelineOverlapRegion");
961 try {
962 setupTable(table);
963 assertEquals(ROWKEYS.length, countRows());
965 // Mess it up by creating an overlap
966 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
967 HMaster master = cluster.getMaster();
968 HRegionInfo hriOverlap1 =
969 createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A"), Bytes.toBytes("AB"));
970 master.assignRegion(hriOverlap1);
971 master.getAssignmentManager().waitForAssignment(hriOverlap1);
972 HRegionInfo hriOverlap2 =
973 createRegion(tbl.getTableDescriptor(), Bytes.toBytes("AB"), Bytes.toBytes("B"));
974 master.assignRegion(hriOverlap2);
975 master.getAssignmentManager().waitForAssignment(hriOverlap2);
977 HBaseFsck hbck = doFsck(conf, false);
978 assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.DUPE_STARTKEYS,
979 ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.OVERLAP_IN_REGION_CHAIN});
980 assertEquals(3, hbck.getOverlapGroups(table).size());
981 assertEquals(ROWKEYS.length, countRows());
983 // mess around the overlapped regions, to trigger NotServingRegionException
984 Multimap<byte[], HbckInfo> overlapGroups = hbck.getOverlapGroups(table);
985 ServerName serverName = null;
986 byte[] regionName = null;
987 for (HbckInfo hbi: overlapGroups.values()) {
988 if ("A".equals(Bytes.toString(hbi.getStartKey()))
989 && "B".equals(Bytes.toString(hbi.getEndKey()))) {
990 regionName = hbi.getRegionName();
992 // get an RS not serving the region to force bad assignment info into META.
993 int k = cluster.getServerWith(regionName);
994 for (int i = 0; i < 3; i++) {
995 if (i != k) {
996 HRegionServer rs = cluster.getRegionServer(i);
997 serverName = rs.getServerName();
998 break;
1002 HBaseFsckRepair.closeRegionSilentlyAndWait((HConnection) connection,
1003 cluster.getRegionServer(k).getServerName(), hbi.getHdfsHRI());
1004 admin.offline(regionName);
1005 break;
1009 assertNotNull(regionName);
1010 assertNotNull(serverName);
1011 try (Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService)) {
1012 Put put = new Put(regionName);
1013 put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
1014 Bytes.toBytes(serverName.getHostAndPort()));
1015 meta.put(put);
1018 // fix the problem.
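// Run hbck with every repair option enabled and maxMerge limited to 2 so the
// third region in the overlap group is sidelined rather than merged.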
1019 HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
1020 fsck.connect();
1021 HBaseFsck.setDisplayFullReport(); // i.e. -details
1022 fsck.setTimeLag(0);
1023 fsck.setFixAssignments(true);
1024 fsck.setFixMeta(true);
1025 fsck.setFixHdfsHoles(true);
1026 fsck.setFixHdfsOverlaps(true);
1027 fsck.setFixHdfsOrphans(true);
1028 fsck.setFixVersionFile(true);
1029 fsck.setSidelineBigOverlaps(true);
1030 fsck.setMaxMerge(2);
1031 fsck.onlineHbck();
1032 fsck.close();
1034 // verify that overlaps are fixed, and there are fewer rows
1035 // since one region is sidelined.
1036 HBaseFsck hbck2 = doFsck(conf,false);
1037 assertNoErrors(hbck2);
1038 assertEquals(0, hbck2.getOverlapGroups(table).size());
1039 assertTrue(ROWKEYS.length > countRows());
1040 } finally {
1041 cleanupTable(table);
1046 * This creates and fixes a bad table where a region is completely contained
1047 * by another region, and there is a hole (sort of like a bad split)
1049 @Test (timeout=180000)
1050 public void testOverlapAndOrphan() throws Exception {
1051 TableName table =
1052 TableName.valueOf("tableOverlapAndOrphan");
1053 try {
1054 setupTable(table);
1055 assertEquals(ROWKEYS.length, countRows());
1057 // Mess it up by creating an overlap in the metadata
1058 admin.disableTable(table);
1059 deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1060 Bytes.toBytes("B"), true, true, false, true, HRegionInfo.DEFAULT_REPLICA_ID);
1061 admin.enableTable(table);
1063 HRegionInfo hriOverlap =
1064 createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B"));
1065 TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
1066 TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
1067 .waitForAssignment(hriOverlap);
1068 ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
1069 TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
1071 HBaseFsck hbck = doFsck(conf, false);
1072 assertErrors(hbck, new ERROR_CODE[] {
1073 ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1074 ERROR_CODE.HOLE_IN_REGION_CHAIN});
1076 // fix the problem.
1077 doFsck(conf, true);
1079 // verify that overlaps are fixed
1080 HBaseFsck hbck2 = doFsck(conf,false);
1081 assertNoErrors(hbck2);
1082 assertEquals(0, hbck2.getOverlapGroups(table).size());
1083 assertEquals(ROWKEYS.length, countRows());
1084 } finally {
1085 cleanupTable(table);
1090 * This creates and fixes a bad table where a region overlaps two regions --
1091 * its start key is contained in another region and its end key is contained in
1092 * yet another region.
1094 @Test (timeout=180000)
1095 public void testCoveredStartKey() throws Exception {
1096 TableName table =
1097 TableName.valueOf("tableCoveredStartKey");
1098 try {
1099 setupTable(table);
1100 assertEquals(ROWKEYS.length, countRows());
1102 // Mess it up by creating an overlap in the metadata
1103 HRegionInfo hriOverlap =
1104 createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B2"));
1105 TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
1106 TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
1107 .waitForAssignment(hriOverlap);
1108 ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
1109 TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
1111 HBaseFsck hbck = doFsck(conf, false);
1112 assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
1113 ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
1114 assertEquals(3, hbck.getOverlapGroups(table).size());
1115 assertEquals(ROWKEYS.length, countRows());
1117 // fix the problem.
1118 doFsck(conf, true);
1120 // verify that overlaps are fixed
1121 HBaseFsck hbck2 = doFsck(conf, false);
1122 assertErrors(hbck2, new ERROR_CODE[0]);
1123 assertEquals(0, hbck2.getOverlapGroups(table).size());
1124 assertEquals(ROWKEYS.length, countRows());
1125 } finally {
1126 cleanupTable(table);
1131 * This creates and fixes a bad table with a missing region -- hole in meta
1132 * and data missing in the fs.
1134 @Test (timeout=180000)
1135 public void testRegionHole() throws Exception {
1136 TableName table =
1137 TableName.valueOf("tableRegionHole");
1138 try {
1139 setupTable(table);
1140 assertEquals(ROWKEYS.length, countRows());
1142 // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1143 admin.disableTable(table);
1144 deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1145 Bytes.toBytes("C"), true, true, true);
1146 admin.enableTable(table);
1148 HBaseFsck hbck = doFsck(conf, false);
1149 assertErrors(hbck, new ERROR_CODE[] {
1150 ERROR_CODE.HOLE_IN_REGION_CHAIN});
1151 // holes are separate from overlap groups
1152 assertEquals(0, hbck.getOverlapGroups(table).size());
1154 // fix hole
1155 doFsck(conf, true);
1157 // check that hole fixed
1158 assertNoErrors(doFsck(conf,false));
1159 assertEquals(ROWKEYS.length - 2, countRows()); // lost a region so lost its two rows
1160 } finally {
1161 cleanupTable(table);
1166 * This creates and fixes a bad table with a missing region -- hole in meta
1167 * and data present but .regioninfo missing (an orphan hdfs region) in the fs.
1169 @Test (timeout=180000)
1170 public void testHDFSRegioninfoMissing() throws Exception {
1171 TableName table = TableName.valueOf("tableHDFSRegioninfoMissing");
1172 try {
1173 setupTable(table);
1174 assertEquals(ROWKEYS.length, countRows());
1176 // Mess it up by leaving a hole in the meta data
1177 admin.disableTable(table);
1178 deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1179 Bytes.toBytes("C"), true, true, false, true, HRegionInfo.DEFAULT_REPLICA_ID);
1180 admin.enableTable(table);
1182 HBaseFsck hbck = doFsck(conf, false);
1183 assertErrors(hbck, new ERROR_CODE[] {
1184 ERROR_CODE.ORPHAN_HDFS_REGION,
1185 ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1186 ERROR_CODE.HOLE_IN_REGION_CHAIN});
1187 // holes are separate from overlap groups
1188 assertEquals(0, hbck.getOverlapGroups(table).size());
1190 // fix hole
1191 doFsck(conf, true);
1193 // check that hole fixed
1194 assertNoErrors(doFsck(conf, false));
1195 assertEquals(ROWKEYS.length, countRows());
1196 } finally {
1197 cleanupTable(table);
1202 * This creates and fixes a bad table with a region that is missing from meta and
1203 * not assigned to a region server.
1205 @Test (timeout=180000)
1206 public void testNotInMetaOrDeployedHole() throws Exception {
1207 TableName table =
1208 TableName.valueOf("tableNotInMetaOrDeployedHole");
1209 try {
1210 setupTable(table);
1211 assertEquals(ROWKEYS.length, countRows());
1213 // Mess it up by leaving a hole in the meta data
1214 admin.disableTable(table);
1215 deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1216 Bytes.toBytes("C"), true, true, false); // don't rm from fs
1217 admin.enableTable(table);
1219 HBaseFsck hbck = doFsck(conf, false);
1220 assertErrors(hbck, new ERROR_CODE[] {
1221 ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1222 // holes are separate from overlap groups
1223 assertEquals(0, hbck.getOverlapGroups(table).size());
1225 // fix hole
1226 assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
1227 ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1229 // check that hole fixed
1230 assertNoErrors(doFsck(conf,false));
1231 assertEquals(ROWKEYS.length, countRows());
1232 } finally {
1233 cleanupTable(table);
1237 @Test (timeout=180000)
1238 public void testCleanUpDaughtersNotInMetaAfterFailedSplit() throws Exception {
1239 TableName table = TableName.valueOf("testCleanUpDaughtersNotInMetaAfterFailedSplit");
1240 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
1241 try {
1242 HTableDescriptor desc = new HTableDescriptor(table);
1243 desc.addFamily(new HColumnDescriptor(Bytes.toBytes("f")));
1244 createTable(TEST_UTIL, desc, null);
1246 tbl = (HTable) connection.getTable(desc.getTableName());
1247 for (int i = 0; i < 5; i++) {
1248 Put p1 = new Put(("r" + i).getBytes());
1249 p1.add(Bytes.toBytes("f"), "q1".getBytes(), "v".getBytes());
1250 tbl.put(p1);
1252 admin.flush(desc.getTableName());
1253 List<HRegion> regions = cluster.getRegions(desc.getTableName());
1254 int serverWith = cluster.getServerWith(regions.get(0).getRegionInfo().getRegionName());
1255 HRegionServer regionServer = cluster.getRegionServer(serverWith);
1256 cluster.getServerWith(regions.get(0).getRegionInfo().getRegionName());
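// Run a split only up to the point of no return: daughter region directories are
// created on HDFS but the daughters are never committed to hbase:meta.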
1257 SplitTransactionImpl st = (SplitTransactionImpl)
1258 new SplitTransactionFactory(TEST_UTIL.getConfiguration())
1259 .create(regions.get(0), Bytes.toBytes("r3"));
1260 st.prepare();
1261 st.stepsBeforePONR(regionServer, regionServer, false);
1262 AssignmentManager am = cluster.getMaster().getAssignmentManager();
1263 Map<String, RegionState> regionsInTransition = am.getRegionStates().getRegionsInTransition();
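// Clear the in-transition state left by the aborted split, then re-assign the parent.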
1264 for (RegionState state : regionsInTransition.values()) {
1265 am.regionOffline(state.getRegion());
1267 Map<HRegionInfo, ServerName> regionsMap = new HashMap<HRegionInfo, ServerName>();
1268 regionsMap.put(regions.get(0).getRegionInfo(), regionServer.getServerName());
1269 am.assign(regionsMap);
1270 am.waitForAssignment(regions.get(0).getRegionInfo());
1271 HBaseFsck hbck = doFsck(conf, false);
1272 assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1273 ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
1274 // holes are separate from overlap groups
1275 assertEquals(0, hbck.getOverlapGroups(table).size());
1277 // fix hole
1278 assertErrors(
1279 doFsck(conf, false, true, false, false, false, false, false, false, false, false, null),
1280 new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1281 ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
1283 // check that hole fixed
1284 assertNoErrors(doFsck(conf, false));
1285 assertEquals(5, countRows());
1286 } finally {
1287 if (tbl != null) {
1288 tbl.close();
1289 tbl = null;
1291 cleanupTable(table);
1296 * This creates and fixes a bad table with a hole in meta.
1298 @Test (timeout=180000)
1299 public void testNotInMetaHole() throws Exception {
1300 TableName table =
1301 TableName.valueOf("tableNotInMetaHole");
1302 try {
1303 setupTable(table);
1304 assertEquals(ROWKEYS.length, countRows());
1306 // Mess it up by leaving a hole in the meta data
1307 admin.disableTable(table);
1308 deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1309 Bytes.toBytes("C"), false, true, false); // don't rm from fs
1310 admin.enableTable(table);
1312 HBaseFsck hbck = doFsck(conf, false);
1313 assertErrors(hbck, new ERROR_CODE[] {
1314 ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1315 // holes are separate from overlap groups
1316 assertEquals(0, hbck.getOverlapGroups(table).size());
1318 // fix hole
1319 assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
1320 ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1322 // check that hole fixed
1323 assertNoErrors(doFsck(conf,false));
1324 assertEquals(ROWKEYS.length, countRows());
1325 } finally {
1326 cleanupTable(table);
1331 * This creates and fixes a bad table with a region that is in meta but has
1332 * no deployment or data in hdfs.
1334 @Test (timeout=180000)
1335 public void testNotInHdfs() throws Exception {
1336 TableName table =
1337 TableName.valueOf("tableNotInHdfs");
1338 try {
1339 setupTable(table);
1340 assertEquals(ROWKEYS.length, countRows());
1342 // make sure the data is in the region files; if it were only in the wal there would be no data loss
1343 admin.flush(table);
1345 // Mess it up by leaving a hole in the hdfs data
1346 deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1347 Bytes.toBytes("C"), false, false, true); // don't rm meta
1349 HBaseFsck hbck = doFsck(conf, false);
1350 assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1351 // holes are separate from overlap groups
1352 assertEquals(0, hbck.getOverlapGroups(table).size());
1354 // fix hole
1355 doFsck(conf, true);
1357 // check that hole fixed
1358 assertNoErrors(doFsck(conf,false));
1359 assertEquals(ROWKEYS.length - 2, countRows());
1360 } finally {
1361 cleanupTable(table);
1366 * This creates and fixes a bad table with a region that is in meta but has
1367 * no deployment or data in hdfs. The table has region_replication set to 2.
1369 @Test (timeout=180000)
1370 public void testNotInHdfsWithReplicas() throws Exception {
1371 TableName table =
1372 TableName.valueOf("tableNotInHdfs");
1373 try {
1374 HRegionInfo[] oldHris = new HRegionInfo[2];
1375 setupTableWithRegionReplica(table, 2);
1376 assertEquals(ROWKEYS.length, countRows());
1377 NavigableMap<HRegionInfo, ServerName> map =
1378 MetaTableAccessor.allTableRegions(TEST_UTIL.getConnection(),
1379 tbl.getName());
1380 int i = 0;
1381 // store the HRIs of the regions we will mess up
1382 for (Map.Entry<HRegionInfo, ServerName> m : map.entrySet()) {
1383 if (m.getKey().getStartKey().length > 0 &&
1384 m.getKey().getStartKey()[0] == Bytes.toBytes("B")[0]) {
1385 LOG.debug("Initially server hosting " + m.getKey() + " is " + m.getValue());
1386 oldHris[i++] = m.getKey();
1389 // make sure data in regions
1390 admin.flush(table);
1392 // Mess it up by leaving a hole in the hdfs data
1393 deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1394 Bytes.toBytes("C"), false, false, true); // don't rm meta
1396 HBaseFsck hbck = doFsck(conf, false);
1397 assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1399 // fix hole
1400 doFsck(conf, true);
1402 // check that hole fixed
1403 assertNoErrors(doFsck(conf,false));
1404 assertEquals(ROWKEYS.length - 2, countRows());
1406 // the following code checks whether the old primary/secondary has
1407 // been unassigned and the new primary/secondary has been assigned
1408 i = 0;
1409 HRegionInfo[] newHris = new HRegionInfo[2];
1410 // get all table's regions from meta
1411 map = MetaTableAccessor.allTableRegions(TEST_UTIL.getConnection(), tbl.getName());
1412 // get the HRIs of the new regions (hbck created new regions for fixing the hdfs mess-up)
1413 for (Map.Entry<HRegionInfo, ServerName> m : map.entrySet()) {
1414 if (m.getKey().getStartKey().length > 0 &&
1415 m.getKey().getStartKey()[0] == Bytes.toBytes("B")[0]) {
1416 newHris[i++] = m.getKey();
1419 // get all the online regions in the regionservers
1420 Collection<ServerName> servers = admin.getClusterStatus().getServers();
1421 Set<HRegionInfo> onlineRegions = new HashSet<HRegionInfo>();
1422 for (ServerName s : servers) {
1423 List<HRegionInfo> list = admin.getOnlineRegions(s);
1424 onlineRegions.addAll(list);
1426 // the new HRIs must be a subset of the online regions
1427 assertTrue(onlineRegions.containsAll(Arrays.asList(newHris)));
1428 // the old HRIs must not be part of the set (removeAll would return false if
1429 // the set didn't change)
1430 assertFalse(onlineRegions.removeAll(Arrays.asList(oldHris)));
1431 } finally {
1432 cleanupTable(table);
1433 admin.close();
1439 * This creates entries in hbase:meta with no hdfs data. This should cleanly
1440 * remove the table.
1442 @Test (timeout=180000)
1443 public void testNoHdfsTable() throws Exception {
1444 TableName table = TableName.valueOf("NoHdfsTable");
1445 setupTable(table);
1446 assertEquals(ROWKEYS.length, countRows());
1448 // make sure the data is in the region files; if it were only in the wal there would be no data loss
1449 admin.flush(table);
1451 // Mess it up by deleting hdfs dirs
1452 deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""),
1453 Bytes.toBytes("A"), false, false, true); // don't rm meta
1454 deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1455 Bytes.toBytes("B"), false, false, true); // don't rm meta
1456 deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1457 Bytes.toBytes("C"), false, false, true); // don't rm meta
1458 deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"),
1459 Bytes.toBytes(""), false, false, true); // don't rm meta
1461 // also remove the table directory in hdfs
1462 deleteTableDir(table);
1464 HBaseFsck hbck = doFsck(conf, false);
1465 assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS,
1466 ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS,
1467 ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.ORPHAN_TABLE_STATE, });
1468 // holes are separate from overlap groups
1469 assertEquals(0, hbck.getOverlapGroups(table).size());
1471 // fix hole
1472 doFsck(conf, true); // detect dangling regions and remove those
1474 // check that hole fixed
1475 assertNoErrors(doFsck(conf,false));
1476 assertFalse("Table " + table + " should have been deleted", admin.tableExists(table));
1479 public void deleteTableDir(TableName table) throws IOException {
1480 Path rootDir = FSUtils.getRootDir(conf);
1481 FileSystem fs = rootDir.getFileSystem(conf);
1482 Path p = FSUtils.getTableDir(rootDir, table);
1483 HBaseFsck.debugLsr(conf, p);
1484 boolean success = fs.delete(p, true);
1485 LOG.info("Deleted " + p + " sucessfully? " + success);
1489 * When the hbase.version file is missing, hbck should fix the fault.
1491 @Test (timeout=180000)
1492 public void testNoVersionFile() throws Exception {
1493 // delete the hbase.version file
1494 Path rootDir = FSUtils.getRootDir(conf);
1495 FileSystem fs = rootDir.getFileSystem(conf);
1496 Path versionFile = new Path(rootDir, HConstants.VERSION_FILE_NAME);
1497 fs.delete(versionFile, true);
1499 // test
1500 HBaseFsck hbck = doFsck(conf, false);
1501 assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_VERSION_FILE });
1502 // fix hbase.version missing
1503 doFsck(conf, true);
1505 // no version file fixed
1506 assertNoErrors(doFsck(conf, false));
1510 * When the table state entry is missing from hbase:meta, hbck should fix the fault.
1512 @Test (timeout=180000)
1513 public void testNoTableState() throws Exception {
1515 TableName table =
1516 TableName.valueOf("testNoTableState");
1517 try {
1518 setupTable(table);
1519 // make sure the data is in the region files; if it were only in the wal there would be no data loss
1520 admin.flush(table);
1522 MetaTableAccessor.deleteTableState(TEST_UTIL.getConnection(), table);
1524 // test
1525 HBaseFsck hbck = doFsck(conf, false);
1526 assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_TABLE_STATE });
1527 // fix table state missing
1528 doFsck(conf, true);
1530 assertNoErrors(doFsck(conf, false));
1531 assertTrue(TEST_UTIL.getHBaseAdmin().isTableEnabled(table));
1532 } finally {
1533 cleanupTable(table);
1538 * The region is not deployed when the table is disabled.
1540 @Test (timeout=180000)
1541 public void testRegionShouldNotBeDeployed() throws Exception {
1542 TableName table =
1543 TableName.valueOf("tableRegionShouldNotBeDeployed");
1544 try {
1545 LOG.info("Starting testRegionShouldNotBeDeployed.");
1546 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
1547 assertTrue(cluster.waitForActiveAndReadyMaster());
1550 byte[][] SPLIT_KEYS = new byte[][] { new byte[0], Bytes.toBytes("aaa"),
1551 Bytes.toBytes("bbb"), Bytes.toBytes("ccc"), Bytes.toBytes("ddd") };
1552 HTableDescriptor htdDisabled = new HTableDescriptor(table);
1553 htdDisabled.addFamily(new HColumnDescriptor(FAM));
1555 // Write the .tableinfo
1556 FSTableDescriptors fstd = new FSTableDescriptors(conf);
1557 fstd.createTableDescriptor(htdDisabled);
1558 List<HRegionInfo> disabledRegions =
1559 TEST_UTIL.createMultiRegionsInMeta(conf, htdDisabled, SPLIT_KEYS);
1561 // Let's just assign everything to first RS
1562 HRegionServer hrs = cluster.getRegionServer(0);
1564 // Create region files.
1565 admin.disableTable(table);
1566 admin.enableTable(table);
1568 // Disable the table and close its regions
1569 admin.disableTable(table);
1570 HRegionInfo region = disabledRegions.remove(0);
1571 byte[] regionName = region.getRegionName();
1573 // The region should not be assigned currently
1574 assertTrue(cluster.getServerWith(regionName) == -1);
1576 // Directly open a region on a region server.
1577 // If going through AM/ZK, the region won't be open.
1578 // Even if it is opened, the AM will close it, which causes
1579 // flakiness of this test.
1580 HRegion r = HRegion.openHRegion(
1581 region, htdDisabled, hrs.getWAL(region), conf);
1582 hrs.addToOnlineRegions(r);
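// The region is now online on the RS even though the table is disabled, so hbck should
// flag it as SHOULD_NOT_BE_DEPLOYED.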
1584 HBaseFsck hbck = doFsck(conf, false);
1585 assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.SHOULD_NOT_BE_DEPLOYED });
1587 // fix this fault
1588 doFsck(conf, true);
1590 // check result
1591 assertNoErrors(doFsck(conf, false));
1592 } finally {
1593 admin.enableTable(table);
1594 cleanupTable(table);
1599 * This creates two tables, messes up both of them, and fixes them one by one
1601 @Test (timeout=180000)
1602 public void testFixByTable() throws Exception {
1603 TableName table1 =
1604 TableName.valueOf("testFixByTable1");
1605 TableName table2 =
1606 TableName.valueOf("testFixByTable2");
1607 try {
1608 setupTable(table1);
1609 // make sure data is in the regions; if it is in the WAL only there is no data loss
1610 admin.flush(table1);
1611 // Mess them up by leaving a hole in the hdfs data
1612 deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1613 Bytes.toBytes("C"), false, false, true); // don't rm meta
1615 setupTable(table2);
1616 // make sure data is in the regions; if it is in the WAL only there is no data loss
1617 admin.flush(table2);
1618 // Mess them up by leaving a hole in the hdfs data
1619 deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1620 Bytes.toBytes("C"), false, false, true); // don't rm meta
1622 HBaseFsck hbck = doFsck(conf, false);
1623 assertErrors(hbck, new ERROR_CODE[] {
1624 ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS});
1626 // fix hole in table 1
1627 doFsck(conf, true, table1);
1628 // check that hole in table 1 fixed
1629 assertNoErrors(doFsck(conf, false, table1));
1630 // check that the hole in table 2 is still there
1631 assertErrors(doFsck(conf, false, table2),
1632 new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1634 // fix hole in table 2
1635 doFsck(conf, true, table2);
1636 // check that hole in both tables fixed
1637 assertNoErrors(doFsck(conf, false));
1638 assertEquals(ROWKEYS.length - 2, countRows());
1639 } finally {
1640 cleanupTable(table1);
1641 cleanupTable(table2);
1645 * A split parent in meta, in hdfs, and not deployed
1647 @Test (timeout=180000)
1648 public void testLingeringSplitParent() throws Exception {
1649 TableName table =
1650 TableName.valueOf("testLingeringSplitParent");
1651 Table meta = null;
1652 try {
1653 setupTable(table);
1654 assertEquals(ROWKEYS.length, countRows());
1656 // make sure data is in the regions; if it is in the WAL only there is no data loss
1657 admin.flush(table);
1658 HRegionLocation location = tbl.getRegionLocation("B");
1660 // Delete one region from meta, but not hdfs, and unassign it.
1661 deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1662 Bytes.toBytes("C"), true, true, false);
1664 // Create a new meta entry to fake it as a split parent.
1665 meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
1666 HRegionInfo hri = location.getRegionInfo();
1668 HRegionInfo a = new HRegionInfo(tbl.getName(),
1669 Bytes.toBytes("B"), Bytes.toBytes("BM"));
1670 HRegionInfo b = new HRegionInfo(tbl.getName(),
1671 Bytes.toBytes("BM"), Bytes.toBytes("C"));
1673 hri.setOffline(true);
1674 hri.setSplit(true);
1676 MetaTableAccessor.addRegionToMeta(meta, hri, a, b);
1677 meta.close();
1678 admin.flush(TableName.META_TABLE_NAME);
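// The parent row in hbase:meta is now marked offline/split with SPLITA/SPLITB references,
// but the fake daughters were never created in HDFS or assigned, so hbck should see a
// lingering split parent plus a hole in the region chain.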
1680 HBaseFsck hbck = doFsck(conf, false);
1681 assertErrors(hbck, new ERROR_CODE[] {
1682 ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1684 // regular repair cannot fix lingering split parent
1685 hbck = doFsck(conf, true);
1686 assertErrors(hbck, new ERROR_CODE[] {
1687 ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN });
1688 assertFalse(hbck.shouldRerun());
1689 hbck = doFsck(conf, false);
1690 assertErrors(hbck, new ERROR_CODE[] {
1691 ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1693 // fix lingering split parent
1694 hbck = new HBaseFsck(conf, hbfsckExecutorService);
1695 hbck.connect();
1696 HBaseFsck.setDisplayFullReport(); // i.e. -details
1697 hbck.setTimeLag(0);
1698 hbck.setFixSplitParents(true);
1699 hbck.onlineHbck();
1700 assertTrue(hbck.shouldRerun());
1701 hbck.close();
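// The above is roughly "hbase hbck -details -fixSplitParents" on the command line (assuming
// the usual option names); after the fix the SPLITA/SPLITB references on the parent row
// should be gone, as asserted below.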
1703 Get get = new Get(hri.getRegionName());
1704 Result result = meta.get(get);
1705 assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
1706 HConstants.SPLITA_QUALIFIER).isEmpty());
1707 assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
1708 HConstants.SPLITB_QUALIFIER).isEmpty());
1709 admin.flush(TableName.META_TABLE_NAME);
1711 // fix other issues
1712 doFsck(conf, true);
1714 // check that all are fixed
1715 assertNoErrors(doFsck(conf, false));
1716 assertEquals(ROWKEYS.length, countRows());
1717 } finally {
1718 cleanupTable(table);
1719 IOUtils.closeQuietly(meta);
1724 * Tests that LINGERING_SPLIT_PARENT is not erroneously reported for
1725 * valid cases where the daughters are there.
1727 @Test (timeout=180000)
1728 public void testValidLingeringSplitParent() throws Exception {
1729 TableName table =
1730 TableName.valueOf("testLingeringSplitParent");
1731 Table meta = null;
1732 try {
1733 setupTable(table);
1734 assertEquals(ROWKEYS.length, countRows());
1736 // make sure data is in the regions; if it is in the WAL only there is no data loss
1737 admin.flush(table);
1738 HRegionLocation location = tbl.getRegionLocation(Bytes.toBytes("B"));
1740 meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
1741 HRegionInfo hri = location.getRegionInfo();
1743 // do a regular split
1744 byte[] regionName = location.getRegionInfo().getRegionName();
1745 admin.splitRegion(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
1746 TestEndToEndSplitTransaction.blockUntilRegionSplit(conf, 60000, regionName, true);
1748 // TODO: fixHdfsHoles does not work against splits, since the parent dir lingers on
1749 // for some time until children references are deleted. HBCK erroneously sees this as
1750 // overlapping regions
1751 HBaseFsck hbck = doFsck(conf, true, true, false, false, false, true, true, true, false, false, null);
1752 assertErrors(hbck, new ERROR_CODE[] {}); //no LINGERING_SPLIT_PARENT reported
1754 // assert that the split hbase:meta entry is still there.
1755 Get get = new Get(hri.getRegionName());
1756 Result result = meta.get(get);
1757 assertNotNull(result);
1758 assertNotNull(MetaTableAccessor.getHRegionInfo(result));
1760 assertEquals(ROWKEYS.length, countRows());
1762 // assert that we still have the split regions
1763 assertEquals(tbl.getStartKeys().length, SPLITS.length + 1 + 1); //SPLITS + 1 is # regions pre-split.
1764 assertNoErrors(doFsck(conf, false));
1765 } finally {
1766 cleanupTable(table);
1767 IOUtils.closeQuietly(meta);
1772 * Split crashed after write to hbase:meta finished for the parent region, but
1773 * failed to write daughters (pre HBASE-7721 codebase)
1775 @Test(timeout=75000)
1776 public void testSplitDaughtersNotInMeta() throws Exception {
1777 TableName table = TableName.valueOf("testSplitdaughtersNotInMeta");
1778 Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
1779 try {
1780 setupTable(table);
1781 assertEquals(ROWKEYS.length, countRows());
1783 // make sure data is in the regions; if it is in the WAL only there is no data loss
1784 admin.flush(table);
1785 HRegionLocation location = tbl.getRegionLocation(Bytes.toBytes("B"));
1787 HRegionInfo hri = location.getRegionInfo();
1789 // do a regular split
1790 byte[] regionName = location.getRegionInfo().getRegionName();
1791 admin.splitRegion(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
1792 TestEndToEndSplitTransaction.blockUntilRegionSplit(conf, 60000, regionName, true);
1794 PairOfSameType<HRegionInfo> daughters =
1795 MetaTableAccessor.getDaughterRegions(meta.get(new Get(regionName)));
1797 // Delete daughter regions from meta, but not hdfs, and unassign them.
1798 Map<HRegionInfo, ServerName> hris = tbl.getRegionLocations();
1799 undeployRegion(connection, hris.get(daughters.getFirst()), daughters.getFirst());
1800 undeployRegion(connection, hris.get(daughters.getSecond()), daughters.getSecond());
1802 List<Delete> deletes = new ArrayList<>();
1803 deletes.add(new Delete(daughters.getFirst().getRegionName()));
1804 deletes.add(new Delete(daughters.getSecond().getRegionName()));
1805 meta.delete(deletes);
1807 // Remove daughters from regionStates
1808 RegionStates regionStates = TEST_UTIL.getMiniHBaseCluster().getMaster().
1809 getAssignmentManager().getRegionStates();
1810 regionStates.deleteRegion(daughters.getFirst());
1811 regionStates.deleteRegion(daughters.getSecond());
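// At this point the daughter regions still have data in HDFS but are gone from hbase:meta,
// undeployed, and unknown to the master, hence the two NOT_IN_META_OR_DEPLOYED errors below.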
1813 HBaseFsck hbck = doFsck(conf, false);
1814 assertErrors(hbck,
1815 new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1816 ERROR_CODE.HOLE_IN_REGION_CHAIN }); //no LINGERING_SPLIT_PARENT
1818 // now fix it. The fix should not revert the region split, but add daughters to META
1819 hbck = doFsck(conf, true, true, false, false, false, false, false, false, false, false, null);
1820 assertErrors(hbck,
1821 new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1822 ERROR_CODE.HOLE_IN_REGION_CHAIN });
1824 // assert that the split hbase:meta entry is still there.
1825 Get get = new Get(hri.getRegionName());
1826 Result result = meta.get(get);
1827 assertNotNull(result);
1828 assertNotNull(MetaTableAccessor.getHRegionInfo(result));
1830 assertEquals(ROWKEYS.length, countRows());
1832 // assert that we still have the split regions
1833 assertEquals(tbl.getStartKeys().length, SPLITS.length + 1 + 1); //SPLITS + 1 is # regions pre-split.
1834 assertNoErrors(doFsck(conf, false)); //should be fixed by now
1835 } finally {
1836 meta.close();
1837 cleanupTable(table);
1842 * This creates and fixes a bad table with a missing region which is the 1st region -- hole in
1843 * meta and data missing in the fs.
1845 @Test(timeout=120000)
1846 public void testMissingFirstRegion() throws Exception {
1847 TableName table = TableName.valueOf("testMissingFirstRegion");
1848 try {
1849 setupTable(table);
1850 assertEquals(ROWKEYS.length, countRows());
1852 // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1853 admin.disableTable(table);
1854 deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""), Bytes.toBytes("A"), true,
1855 true, true);
1856 admin.enableTable(table);
1858 HBaseFsck hbck = doFsck(conf, false);
1859 assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY });
1860 // fix hole
1861 doFsck(conf, true);
1862 // check that hole fixed
1863 assertNoErrors(doFsck(conf, false));
1864 } finally {
1865 cleanupTable(table);
1870 * This creates and fixes a bad table with a region that is still in meta and deployed but
1871 * whose data directory is missing from the fs.
1873 @Test(timeout=120000)
1874 public void testRegionDeployedNotInHdfs() throws Exception {
1875 TableName table =
1876 TableName.valueOf("testSingleRegionDeployedNotInHdfs");
1877 try {
1878 setupTable(table);
1879 admin.flush(table);
1881 // Mess it up by deleting region dir
1882 deleteRegion(conf, tbl.getTableDescriptor(),
1883 HConstants.EMPTY_START_ROW, Bytes.toBytes("A"), false,
1884 false, true);
1886 HBaseFsck hbck = doFsck(conf, false);
1887 assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });
1888 // fix hole
1889 doFsck(conf, true);
1890 // check that hole fixed
1891 assertNoErrors(doFsck(conf, false));
1892 } finally {
1893 cleanupTable(table);
1898 * This creates and fixes a bad table with missing last region -- hole in meta and data missing in
1899 * the fs.
1901 @Test(timeout=120000)
1902 public void testMissingLastRegion() throws Exception {
1903 TableName table =
1904 TableName.valueOf("testMissingLastRegion");
1905 try {
1906 setupTable(table);
1907 assertEquals(ROWKEYS.length, countRows());
1909 // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1910 admin.disableTable(table);
1911 deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"), Bytes.toBytes(""), true,
1912 true, true);
1913 admin.enableTable(table);
1915 HBaseFsck hbck = doFsck(conf, false);
1916 assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY });
1917 // fix hole
1918 doFsck(conf, true);
1919 // check that hole fixed
1920 assertNoErrors(doFsck(conf, false));
1921 } finally {
1922 cleanupTable(table);
1927 * Test that the -noHdfsChecking option can detect and fix assignment issues.
1929 @Test (timeout=180000)
1930 public void testFixAssignmentsAndNoHdfsChecking() throws Exception {
1931 TableName table =
1932 TableName.valueOf("testFixAssignmentsAndNoHdfsChecking");
1933 try {
1934 setupTable(table);
1935 assertEquals(ROWKEYS.length, countRows());
1937 // Mess it up by closing a region
1938 deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1939 Bytes.toBytes("B"), true, false, false, false, HRegionInfo.DEFAULT_REPLICA_ID);
1941 // verify there are no other errors
1942 HBaseFsck hbck = doFsck(conf, false);
1943 assertErrors(hbck, new ERROR_CODE[] {
1944 ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1946 // verify that noHdfsChecking reports the same errors
1947 HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
1948 fsck.connect();
1949 HBaseFsck.setDisplayFullReport(); // i.e. -details
1950 fsck.setTimeLag(0);
1951 fsck.setCheckHdfs(false);
1952 fsck.onlineHbck();
1953 assertErrors(fsck, new ERROR_CODE[] {
1954 ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1955 fsck.close();
1957 // verify that fixAssignments works fine with noHdfsChecking
1958 fsck = new HBaseFsck(conf, hbfsckExecutorService);
1959 fsck.connect();
1960 HBaseFsck.setDisplayFullReport(); // i.e. -details
1961 fsck.setTimeLag(0);
1962 fsck.setCheckHdfs(false);
1963 fsck.setFixAssignments(true);
1964 fsck.onlineHbck();
1965 assertTrue(fsck.shouldRerun());
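// shouldRerun() means the fixing pass queued work (here, re-assigning the region); a second
// onlineHbck() pass is run to confirm the cluster is now consistent.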
1966 fsck.onlineHbck();
1967 assertNoErrors(fsck);
1969 assertEquals(ROWKEYS.length, countRows());
1971 fsck.close();
1972 } finally {
1973 cleanupTable(table);
1978 * Test that the -noHdfsChecking option can detect a region that is deployed but not in meta.
1979 * However, it cannot fix it without checking HDFS, because the region info has to be read
1980 * from HDFS in order to patch meta.
1982 @Test (timeout=180000)
1983 public void testFixMetaNotWorkingWithNoHdfsChecking() throws Exception {
1984 TableName table =
1985 TableName.valueOf("testFixMetaNotWorkingWithNoHdfsChecking");
1986 try {
1987 setupTable(table);
1988 assertEquals(ROWKEYS.length, countRows());
1990 // Mess it up by deleting a region from the metadata
1991 deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1992 Bytes.toBytes("B"), false, true, false, false, HRegionInfo.DEFAULT_REPLICA_ID);
1994 // verify there are no other errors
1995 HBaseFsck hbck = doFsck(conf, false);
1996 assertErrors(hbck,
1997 new ERROR_CODE[] { ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN });
1999 // verify that noHdfsChecking reports the same errors
2000 HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
2001 fsck.connect();
2002 HBaseFsck.setDisplayFullReport(); // i.e. -details
2003 fsck.setTimeLag(0);
2004 fsck.setCheckHdfs(false);
2005 fsck.onlineHbck();
2006 assertErrors(fsck,
2007 new ERROR_CODE[] { ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN });
2008 fsck.close();
2010 // verify that fixMeta doesn't work with noHdfsChecking
2011 fsck = new HBaseFsck(conf, hbfsckExecutorService);
2012 fsck.connect();
2013 HBaseFsck.setDisplayFullReport(); // i.e. -details
2014 fsck.setTimeLag(0);
2015 fsck.setCheckHdfs(false);
2016 fsck.setFixAssignments(true);
2017 fsck.setFixMeta(true);
2018 fsck.onlineHbck();
2019 assertFalse(fsck.shouldRerun());
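// Nothing could be fixed: with HDFS checking disabled there is no region info to patch meta
// from, so hbck does not request a rerun and the same errors remain.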
2020 assertErrors(fsck,
2021 new ERROR_CODE[] { ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN });
2022 fsck.close();
2024 // fix the cluster so other tests won't be impacted
2025 fsck = doFsck(conf, true);
2026 assertTrue(fsck.shouldRerun());
2027 fsck = doFsck(conf, true);
2028 assertNoErrors(fsck);
2029 } finally {
2030 cleanupTable(table);
2035 * Test that -fixHdfsHoles doesn't work with the -noHdfsChecking option,
2036 * and that -noHdfsChecking can't detect an orphan HDFS region.
2038 @Test (timeout=180000)
2039 public void testFixHdfsHolesNotWorkingWithNoHdfsChecking() throws Exception {
2040 TableName table =
2041 TableName.valueOf("testFixHdfsHolesNotWorkingWithNoHdfsChecking");
2042 try {
2043 setupTable(table);
2044 assertEquals(ROWKEYS.length, countRows());
2046 // Mess it up by creating an overlap in the metadata
2047 admin.disableTable(table);
2048 deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
2049 Bytes.toBytes("B"), true, true, false, true, HRegionInfo.DEFAULT_REPLICA_ID);
2050 admin.enableTable(table);
2052 HRegionInfo hriOverlap =
2053 createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B"));
2054 TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
2055 TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
2056 .waitForAssignment(hriOverlap);
2057 ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
2058 TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
2060 HBaseFsck hbck = doFsck(conf, false);
2061 assertErrors(hbck, new ERROR_CODE[] {
2062 ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
2063 ERROR_CODE.HOLE_IN_REGION_CHAIN});
2065 // verify that noHdfsChecking can't detect ORPHAN_HDFS_REGION
2066 HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
2067 fsck.connect();
2068 HBaseFsck.setDisplayFullReport(); // i.e. -details
2069 fsck.setTimeLag(0);
2070 fsck.setCheckHdfs(false);
2071 fsck.onlineHbck();
2072 assertErrors(fsck, new ERROR_CODE[] {
2073 ERROR_CODE.HOLE_IN_REGION_CHAIN});
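// Only the hole is reported: without reading HDFS, hbck cannot see the orphan region
// directory that was left behind.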
2074 fsck.close();
2076 // verify that fixHdfsHoles doesn't work with noHdfsChecking
2077 fsck = new HBaseFsck(conf, hbfsckExecutorService);
2078 fsck.connect();
2079 HBaseFsck.setDisplayFullReport(); // i.e. -details
2080 fsck.setTimeLag(0);
2081 fsck.setCheckHdfs(false);
2082 fsck.setFixHdfsHoles(true);
2083 fsck.setFixHdfsOverlaps(true);
2084 fsck.setFixHdfsOrphans(true);
2085 fsck.onlineHbck();
2086 assertFalse(fsck.shouldRerun());
2087 assertErrors(fsck, new ERROR_CODE[] { ERROR_CODE.HOLE_IN_REGION_CHAIN});
2088 fsck.close();
2089 } finally {
2090 if (admin.isTableDisabled(table)) {
2091 admin.enableTable(table);
2093 cleanupTable(table);
2098 * We don't have an easy way to verify that a flush completed, so we loop until we find a
2099 * legitimate hfile and return it.
2100 * @param fs
2101 * @param table
2102 * @return Path of a flushed hfile.
2103 * @throws IOException
2105 Path getFlushedHFile(FileSystem fs, TableName table) throws IOException {
2106 Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
2107 Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
2108 Path famDir = new Path(regionDir, FAM_STR);
2110 // keep doing this until we get a legit hfile
2111 while (true) {
2112 FileStatus[] hfFss = fs.listStatus(famDir);
2113 if (hfFss.length == 0) {
2114 continue;
2116 for (FileStatus hfs : hfFss) {
2117 if (!hfs.isDirectory()) {
2118 return hfs.getPath();
2125 * This creates a table and then corrupts an hfile. Hbck should quarantine the file.
2127 @Test(timeout=180000)
2128 public void testQuarantineCorruptHFile() throws Exception {
2129 TableName table = TableName.valueOf(name.getMethodName());
2130 try {
2131 setupTable(table);
2132 assertEquals(ROWKEYS.length, countRows());
2133 admin.flush(table); // flush is async.
2135 FileSystem fs = FileSystem.get(conf);
2136 Path hfile = getFlushedHFile(fs, table);
2138 // Mess it up by leaving a hole in the assignment, meta, and hdfs data
2139 admin.disableTable(table);
2141 // create a new corrupt file called deadbeef (a valid hfile name)
2142 Path corrupt = new Path(hfile.getParent(), "deadbeef");
2143 TestHFile.truncateFile(fs, hfile, corrupt);
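// truncateFile presumably writes a truncated copy of the valid hfile to "deadbeef", giving a
// file that is named like an hfile but cannot be read as one.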
2144 LOG.info("Created corrupted file " + corrupt);
2145 HBaseFsck.debugLsr(conf, FSUtils.getRootDir(conf));
2147 // we cannot enable the table here because enabling never finishes due to the corrupt region.
2148 HBaseFsck res = HbckTestingUtil.doHFileQuarantine(conf, table);
2149 assertEquals(res.getRetCode(), 0);
2150 HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
2151 assertEquals(hfcc.getHFilesChecked(), 5);
2152 assertEquals(hfcc.getCorrupted().size(), 1);
2153 assertEquals(hfcc.getFailures().size(), 0);
2154 assertEquals(hfcc.getQuarantined().size(), 1);
2155 assertEquals(hfcc.getMissing().size(), 0);
2157 // It's been fixed; verify that we can enable the table.
2158 admin.enableTable(table);
2159 } finally {
2160 cleanupTable(table);
2165 * Tests that use this should have a timeout, because this method could potentially wait forever.
2167 private void doQuarantineTest(TableName table, HBaseFsck hbck, int check,
2168 int corrupt, int fail, int quar, int missing) throws Exception {
2169 try {
2170 setupTable(table);
2171 assertEquals(ROWKEYS.length, countRows());
2172 admin.flush(table); // flush is async.
2174 // Mess it up by leaving a hole in the assignment, meta, and hdfs data
2175 admin.disableTable(table);
2177 String[] args = {"-sidelineCorruptHFiles", "-repairHoles", "-ignorePreCheckPermission",
2178 table.getNameAsString()};
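// These are the same arguments a user would pass on the command line, roughly
// "hbase hbck -sidelineCorruptHFiles -repairHoles -ignorePreCheckPermission <table>";
// the test just drives them programmatically through exec().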
2179 HBaseFsck res = hbck.exec(hbfsckExecutorService, args);
2181 HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
2182 assertEquals(hfcc.getHFilesChecked(), check);
2183 assertEquals(hfcc.getCorrupted().size(), corrupt);
2184 assertEquals(hfcc.getFailures().size(), fail);
2185 assertEquals(hfcc.getQuarantined().size(), quar);
2186 assertEquals(hfcc.getMissing().size(), missing);
2188 // it's been fixed; verify that we can enable the table
2189 admin.enableTableAsync(table);
2190 while (!admin.isTableEnabled(table)) {
2191 try {
2192 Thread.sleep(250);
2193 } catch (InterruptedException e) {
2194 e.printStackTrace();
2195 fail("Interrupted when trying to enable table " + table);
2198 } finally {
2199 cleanupTable(table);
2204 * This creates a table and simulates the race situation where a concurrent compaction or split
2205 * has removed an hfile after the corruption checker learned about it.
2207 @Test(timeout=180000)
2208 public void testQuarantineMissingHFile() throws Exception {
2209 TableName table = TableName.valueOf(name.getMethodName());
2211 // inject a fault in the hfcc created.
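// The override below deletes the first hfile handed to checkHFile() to simulate a concurrent
// compaction or split removing it after it was listed; the checker should count it as missing
// rather than failing.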
2212 final FileSystem fs = FileSystem.get(conf);
2213 HBaseFsck hbck = new HBaseFsck(conf, hbfsckExecutorService) {
2214 @Override
2215 public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
2216 return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
2217 AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
2218 @Override
2219 protected void checkHFile(Path p) throws IOException {
2220 if (attemptedFirstHFile.compareAndSet(false, true)) {
2221 assertTrue(fs.delete(p, true)); // make sure delete happened.
2223 super.checkHFile(p);
2228 doQuarantineTest(table, hbck, 4, 0, 0, 0, 1); // 4 attempted, but 1 missing.
2229 hbck.close();
2233 * This creates a table and simulates the race situation where a concurrent compaction or split
2234 * has removed a colfam dir before the corruption checker got to it.
2236 // Disabled because it fails sporadically. Is this test right? Timing-wise, there could be no
2237 // files in a column family on initial creation -- as suggested by Matteo.
2238 @Ignore @Test(timeout=180000)
2239 public void testQuarantineMissingFamdir() throws Exception {
2240 TableName table = TableName.valueOf(name.getMethodName());
2241 // inject a fault in the hfcc created.
2242 final FileSystem fs = FileSystem.get(conf);
2243 HBaseFsck hbck = new HBaseFsck(conf, hbfsckExecutorService) {
2244 @Override
2245 public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
2246 return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
2247 AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
2248 @Override
2249 protected void checkColFamDir(Path p) throws IOException {
2250 if (attemptedFirstHFile.compareAndSet(false, true)) {
2251 assertTrue(fs.delete(p, true)); // make sure delete happened.
2253 super.checkColFamDir(p);
2258 doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
2259 hbck.close();
2263 * This creates a table and simulates the race situation where a concurrent compaction or split
2264 * has removed a region dir before the corruption checker got to it.
2266 @Test(timeout=180000)
2267 public void testQuarantineMissingRegionDir() throws Exception {
2268 TableName table = TableName.valueOf(name.getMethodName());
2269 // inject a fault in the hfcc created.
2270 final FileSystem fs = FileSystem.get(conf);
2271 HBaseFsck hbck = new HBaseFsck(conf, hbfsckExecutorService) {
2272 @Override
2273 public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles)
2274 throws IOException {
2275 return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
2276 AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
2277 @Override
2278 protected void checkRegionDir(Path p) throws IOException {
2279 if (attemptedFirstHFile.compareAndSet(false, true)) {
2280 assertTrue(fs.delete(p, true)); // make sure delete happened.
2282 super.checkRegionDir(p);
2287 doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
2288 hbck.close();
2292 * Test fixing lingering reference file.
2294 @Test (timeout=180000)
2295 public void testLingeringReferenceFile() throws Exception {
2296 TableName table =
2297 TableName.valueOf("testLingeringReferenceFile");
2298 try {
2299 setupTable(table);
2300 assertEquals(ROWKEYS.length, countRows());
2302 // Mess it up by creating a fake reference file
2303 FileSystem fs = FileSystem.get(conf);
2304 Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
2305 Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
2306 Path famDir = new Path(regionDir, FAM_STR);
2307 Path fakeReferenceFile = new Path(famDir, "fbce357483ceea.12144538");
2308 fs.create(fakeReferenceFile);
2310 HBaseFsck hbck = doFsck(conf, false);
2311 assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LINGERING_REFERENCE_HFILE });
2312 // fix reference file
2313 doFsck(conf, true);
2314 // check that reference file fixed
2315 assertNoErrors(doFsck(conf, false));
2316 } finally {
2317 cleanupTable(table);
2322 * Test missing REGIONINFO_QUALIFIER in hbase:meta
2324 @Test (timeout=180000)
2325 public void testMissingRegionInfoQualifier() throws Exception {
2326 Connection connection = ConnectionFactory.createConnection(conf);
2327 TableName table = TableName.valueOf("testMissingRegionInfoQualifier");
2328 try {
2329 setupTable(table);
2331 // Mess it up by removing the RegionInfo for one region.
2332 final List<Delete> deletes = new LinkedList<Delete>();
2333 Table meta = connection.getTable(TableName.META_TABLE_NAME, hbfsckExecutorService);
2334 MetaTableAccessor.fullScanRegions(connection, new MetaTableAccessor.Visitor() {
2336 @Override
2337 public boolean visit(Result rowResult) throws IOException {
2338 HRegionInfo hri = MetaTableAccessor.getHRegionInfo(rowResult);
2339 if (hri != null && !hri.getTable().isSystemTable()) {
2340 Delete delete = new Delete(rowResult.getRow());
2341 delete.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
2342 deletes.add(delete);
2344 return true;
2347 meta.delete(deletes);
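// The affected rows now carry server/startcode columns but no HRegionInfo, which is what
// hbck reports as EMPTY_META_CELL.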
2349 // Mess it up by creating a fake hbase:meta entry with no associated RegionInfo
2350 meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
2351 HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER, Bytes.toBytes("node1:60020")));
2352 meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
2353 HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER, Bytes.toBytes(1362150791183L)));
2354 meta.close();
2356 HBaseFsck hbck = doFsck(conf, false);
2357 assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
2359 // fix the empty hbase:meta cells
2360 hbck = doFsck(conf, true);
2362 // check that the empty hbase:meta cells are fixed
2363 assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
2364 } finally {
2365 cleanupTable(table);
2367 connection.close();
2371 * Test pluggable error reporter. It can be plugged in
2372 * from system property or configuration.
2374 @Test (timeout=180000)
2375 public void testErrorReporter() throws Exception {
2376 try {
2377 MockErrorReporter.calledCount = 0;
2378 doFsck(conf, false);
2379 assertEquals(MockErrorReporter.calledCount, 0);
2381 conf.set("hbasefsck.errorreporter", MockErrorReporter.class.getName());
2382 doFsck(conf, false);
2383 assertTrue(MockErrorReporter.calledCount > 20);
2384 } finally {
2385 conf.set("hbasefsck.errorreporter",
2386 PrintingErrorReporter.class.getName());
2387 MockErrorReporter.calledCount = 0;
2391 static class MockErrorReporter implements ErrorReporter {
2392 static int calledCount = 0;
2394 @Override
2395 public void clear() {
2396 calledCount++;
2399 @Override
2400 public void report(String message) {
2401 calledCount++;
2404 @Override
2405 public void reportError(String message) {
2406 calledCount++;
2409 @Override
2410 public void reportError(ERROR_CODE errorCode, String message) {
2411 calledCount++;
2414 @Override
2415 public void reportError(ERROR_CODE errorCode, String message, TableInfo table) {
2416 calledCount++;
2419 @Override
2420 public void reportError(ERROR_CODE errorCode,
2421 String message, TableInfo table, HbckInfo info) {
2422 calledCount++;
2425 @Override
2426 public void reportError(ERROR_CODE errorCode, String message,
2427 TableInfo table, HbckInfo info1, HbckInfo info2) {
2428 calledCount++;
2431 @Override
2432 public int summarize() {
2433 return ++calledCount;
2436 @Override
2437 public void detail(String details) {
2438 calledCount++;
2441 @Override
2442 public ArrayList<ERROR_CODE> getErrorList() {
2443 calledCount++;
2444 return new ArrayList<ERROR_CODE>();
2447 @Override
2448 public void progress() {
2449 calledCount++;
2452 @Override
2453 public void print(String message) {
2454 calledCount++;
2457 @Override
2458 public void resetErrors() {
2459 calledCount++;
2462 @Override
2463 public boolean tableHasErrors(TableInfo table) {
2464 calledCount++;
2465 return false;
2469 @Test(timeout=60000)
2470 public void testCheckTableLocks() throws Exception {
2471 IncrementingEnvironmentEdge edge = new IncrementingEnvironmentEdge(0);
2472 EnvironmentEdgeManager.injectEdge(edge);
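// Inject a controllable clock so the test can advance time past the table lock expiry
// timeout without actually waiting for it.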
2473 // check no errors
2474 HBaseFsck hbck = doFsck(conf, false);
2475 assertNoErrors(hbck);
2477 ServerName mockName = ServerName.valueOf("localhost", 60000, 1);
2478 final TableName tableName = TableName.valueOf("foo");
2480 // obtain one lock
2481 final TableLockManager tableLockManager =
2482 TableLockManager.createTableLockManager(conf, TEST_UTIL.getZooKeeperWatcher(), mockName);
2483 TableLock writeLock = tableLockManager.writeLock(tableName, "testCheckTableLocks");
2484 writeLock.acquire();
2485 hbck = doFsck(conf, false);
2486 assertNoErrors(hbck); // should not have expired, no problems
2488 edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2489 TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2491 hbck = doFsck(conf, false);
2492 assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK});
2494 final CountDownLatch latch = new CountDownLatch(1);
2495 new Thread() {
2496 @Override
2497 public void run() {
2498 TableLock readLock = tableLockManager.writeLock(tableName, "testCheckTableLocks");
2499 try {
2500 latch.countDown();
2501 readLock.acquire();
2502 } catch (IOException ex) {
2503 fail();
2504 } catch (IllegalStateException ex) {
2505 return; // expected, since this will be reaped under us.
2507 fail("should not have come here");
2509 }.start();
2511 latch.await(); // wait until thread starts
2512 Threads.sleep(300); // wait some more to ensure writeLock.acquire() is called
2514 hbck = doFsck(conf, false);
2515 assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK}); // still one expired, one not-expired
2517 edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2518 TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2520 hbck = doFsck(conf, false);
2521 assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK, ERROR_CODE.EXPIRED_TABLE_LOCK}); // both are expired
2523 conf.setLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT, 1); // reaping from ZKInterProcessWriteLock uses znode cTime,
2524 // which is not injectable through EnvironmentEdge
2525 Threads.sleep(10);
2526 hbck = doFsck(conf, true); // now fix both cases
2528 hbck = doFsck(conf, false);
2529 assertNoErrors(hbck);
2531 // ensure that locks are deleted
2532 writeLock = tableLockManager.writeLock(tableName, "should acquire without blocking");
2533 writeLock.acquire(); // this should not block.
2534 writeLock.release(); // release for clean state
2535 tableLockManager.tableDeleted(tableName);
2538 @Test (timeout=180000)
2539 public void testMetaOffline() throws Exception {
2540 // check no errors
2541 HBaseFsck hbck = doFsck(conf, false);
2542 assertNoErrors(hbck);
2543 deleteMetaRegion(conf, true, false, false);
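// unassign=true, hdfs=false, regionInfoOnly=false (see deleteMetaRegion below): hbase:meta is
// only undeployed; its data stays intact in HDFS.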
2544 hbck = doFsck(conf, false);
2545 // ERROR_CODE.UNKNOWN shows up because we reportError with a message describing the hbase:meta
2546 // inconsistency and whether we will be fixing it or not.
2547 assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
2548 hbck = doFsck(conf, true);
2549 assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
2550 hbck = doFsck(conf, false);
2551 assertNoErrors(hbck);
2554 private void deleteMetaRegion(Configuration conf, boolean unassign, boolean hdfs,
2555 boolean regionInfoOnly) throws IOException, InterruptedException {
2556 HRegionLocation metaLocation = connection.getRegionLocator(TableName.META_TABLE_NAME)
2557 .getRegionLocation(HConstants.EMPTY_START_ROW);
2558 ServerName hsa = metaLocation.getServerName();
2559 HRegionInfo hri = metaLocation.getRegionInfo();
2560 if (unassign) {
2561 LOG.info("Undeploying meta region " + hri + " from server " + hsa);
2562 try (Connection unmanagedConnection = ConnectionFactory.createConnection(conf)) {
2563 undeployRegion(unmanagedConnection, hsa, hri);
2567 if (regionInfoOnly) {
2568 LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
2569 Path rootDir = FSUtils.getRootDir(conf);
2570 FileSystem fs = rootDir.getFileSystem(conf);
2571 Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
2572 hri.getEncodedName());
2573 Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
2574 fs.delete(hriPath, true);
2577 if (hdfs) {
2578 LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
2579 Path rootDir = FSUtils.getRootDir(conf);
2580 FileSystem fs = rootDir.getFileSystem(conf);
2581 Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
2582 hri.getEncodedName());
2583 HBaseFsck.debugLsr(conf, p);
2584 boolean success = fs.delete(p, true);
2585 LOG.info("Deleted " + p + " successfully? " + success);
2586 HBaseFsck.debugLsr(conf, p);
2590 @Test (timeout=180000)
2591 public void testTableWithNoRegions() throws Exception {
2592 // We might end up with empty regions in a table
2593 // see also testNoHdfsTable()
2594 TableName table =
2595 TableName.valueOf(name.getMethodName());
2596 try {
2597 // create table with one region
2598 HTableDescriptor desc = new HTableDescriptor(table);
2599 HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
2600 desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
2601 createTable(TEST_UTIL, desc, null);
2602 tbl = (HTable) connection.getTable(table, tableExecutorService);
2604 // Mess it up by leaving a hole in the assignment, meta, and hdfs data
2605 deleteRegion(conf, tbl.getTableDescriptor(), HConstants.EMPTY_START_ROW,
2606 HConstants.EMPTY_END_ROW, false, false, true);
2608 HBaseFsck hbck = doFsck(conf, false);
2609 assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });
2611 doFsck(conf, true);
2613 // fix hole
2614 doFsck(conf, true);
2616 // check that hole fixed
2617 assertNoErrors(doFsck(conf, false));
2618 } finally {
2619 cleanupTable(table);
2624 @Test (timeout=180000)
2625 public void testHbckAfterRegionMerge() throws Exception {
2626 TableName table = TableName.valueOf("testMergeRegionFilesInHdfs");
2627 Table meta = null;
2628 try {
2629 // disable CatalogJanitor
2630 TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(false);
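// Disabling the CatalogJanitor presumably keeps it from cleaning up merge leftovers while
// the test and hbck are still looking at them.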
2631 setupTable(table);
2632 assertEquals(ROWKEYS.length, countRows());
2634 // make sure data in regions, if in wal only there is no data loss
2635 admin.flush(table);
2636 HRegionInfo region1 = tbl.getRegionLocation(Bytes.toBytes("A")).getRegionInfo();
2637 HRegionInfo region2 = tbl.getRegionLocation(Bytes.toBytes("B")).getRegionInfo();
2639 int regionCountBeforeMerge = tbl.getRegionLocations().size();
2641 assertNotEquals(region1, region2);
2643 // do a region merge
2644 admin.mergeRegions(region1.getEncodedNameAsBytes(),
2645 region2.getEncodedNameAsBytes(), false);
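// mergeRegions is asynchronous, so poll below until the region count drops or we time out.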
2647 // wait until region merged
2648 long timeout = System.currentTimeMillis() + 30 * 1000;
2649 while (true) {
2650 if (tbl.getRegionLocations().size() < regionCountBeforeMerge) {
2651 break;
2652 } else if (System.currentTimeMillis() > timeout) {
2653 fail("Timed out waiting for regions " + region1.getEncodedName()
2654 + " and " + region2.getEncodedName() + " to be merged");
2656 Thread.sleep(10);
2659 assertEquals(ROWKEYS.length, countRows());
2661 HBaseFsck hbck = doFsck(conf, false);
2662 assertNoErrors(hbck); // no errors
2664 } finally {
2665 TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(true);
2666 cleanupTable(table);
2667 IOUtils.closeQuietly(meta);
2671 @Test (timeout = 180000)
2672 public void testRegionBoundariesCheck() throws Exception {
2673 HBaseFsck hbck = doFsck(conf, false);
2674 assertNoErrors(hbck); // no errors
2675 try {
2676 hbck.connect(); // need connection to have access to META
2677 hbck.checkRegionBoundaries();
2678 } catch (IllegalArgumentException e) {
2679 if (e.getMessage().endsWith("not a valid DFS filename.")) {
2680 fail("Table directory path is not valid. " + e.getMessage());
2682 } finally {
2683 hbck.close();
2687 @org.junit.Rule
2688 public TestName name = new TestName();
2690 @Test (timeout=180000)
2691 public void testReadOnlyProperty() throws Exception {
2692 HBaseFsck hbck = doFsck(conf, false);
2693 Assert.assertEquals("shouldIgnorePreCheckPermission", true,
2694 hbck.shouldIgnorePreCheckPermission());
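// A read-only run makes no changes, so the permission pre-check is presumably ignored by
// default; a repair run (fix=true) does not ignore it unless explicitly told to, as below.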
2696 hbck = doFsck(conf, true);
2697 Assert.assertEquals("shouldIgnorePreCheckPermission", false,
2698 hbck.shouldIgnorePreCheckPermission());
2700 hbck = doFsck(conf, true);
2701 hbck.setIgnorePreCheckPermission(true);
2702 Assert.assertEquals("shouldIgnorePreCheckPermission", true,
2703 hbck.shouldIgnorePreCheckPermission());
2706 public static class MasterSyncObserver extends BaseMasterObserver {
2707 volatile CountDownLatch tableCreationLatch = null;
2708 volatile CountDownLatch tableDeletionLatch = null;
2710 @Override
2711 public void postCreateTableHandler(final ObserverContext<MasterCoprocessorEnvironment> ctx,
2712 HTableDescriptor desc, HRegionInfo[] regions) throws IOException {
2713 // the AccessController test sometimes calls postCreateTableHandler() directly
2714 if (tableCreationLatch != null) {
2715 tableCreationLatch.countDown();
2719 @Override
2720 public void postDeleteTableHandler(final ObserverContext<MasterCoprocessorEnvironment> ctx,
2721 TableName tableName)
2722 throws IOException {
2723 // the AccessController test sometimes calls postDeleteTableHandler() directly
2724 if (tableDeletionLatch != null) {
2725 tableDeletionLatch.countDown();
2730 public static void createTable(HBaseTestingUtility testUtil, HTableDescriptor htd,
2731 byte [][] splitKeys) throws Exception {
2732 // NOTE: We need a latch because admin operations are not synchronous,
2733 // so the postOp coprocessor method may be called after the admin call has returned.
2734 MasterSyncObserver observer = (MasterSyncObserver)testUtil.getHBaseCluster().getMaster()
2735 .getMasterCoprocessorHost().findCoprocessor(MasterSyncObserver.class.getName());
2736 observer.tableCreationLatch = new CountDownLatch(1);
2737 if (splitKeys != null) {
2738 admin.createTable(htd, splitKeys);
2739 } else {
2740 admin.createTable(htd);
2742 observer.tableCreationLatch.await();
2743 observer.tableCreationLatch = null;
2744 testUtil.waitUntilAllRegionsAssigned(htd.getTableName());
2747 public static void deleteTable(HBaseTestingUtility testUtil, TableName tableName)
2748 throws Exception {
2749 // NOTE: We need a latch because admin operations are not synchronous,
2750 // so the postOp coprocessor method may be called after the admin call has returned.
2751 MasterSyncObserver observer = (MasterSyncObserver)testUtil.getHBaseCluster().getMaster()
2752 .getMasterCoprocessorHost().findCoprocessor(MasterSyncObserver.class.getName());
2753 observer.tableDeletionLatch = new CountDownLatch(1);
2754 try {
2755 admin.disableTable(tableName);
2756 } catch (Exception e) {
2757 LOG.debug("Table: " + tableName + " already disabled, so just deleting it.");
2759 admin.deleteTable(tableName);
2760 observer.tableDeletionLatch.await();
2761 observer.tableDeletionLatch = null;