3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
19 package org
.apache
.hadoop
.hbase
.util
;
21 import static org
.apache
.hadoop
.hbase
.util
.hbck
.HbckTestingUtil
.assertErrors
;
22 import static org
.apache
.hadoop
.hbase
.util
.hbck
.HbckTestingUtil
.assertNoErrors
;
23 import static org
.apache
.hadoop
.hbase
.util
.hbck
.HbckTestingUtil
.doFsck
;
24 import static org
.junit
.Assert
.assertEquals
;
25 import static org
.junit
.Assert
.assertFalse
;
26 import static org
.junit
.Assert
.assertNotEquals
;
27 import static org
.junit
.Assert
.assertNotNull
;
28 import static org
.junit
.Assert
.assertTrue
;
29 import static org
.junit
.Assert
.fail
;
31 import java
.io
.IOException
;
32 import java
.util
.ArrayList
;
33 import java
.util
.Arrays
;
34 import java
.util
.Collection
;
35 import java
.util
.HashMap
;
36 import java
.util
.HashSet
;
37 import java
.util
.LinkedList
;
38 import java
.util
.List
;
40 import java
.util
.NavigableMap
;
42 import java
.util
.concurrent
.Callable
;
43 import java
.util
.concurrent
.CountDownLatch
;
44 import java
.util
.concurrent
.ExecutorService
;
45 import java
.util
.concurrent
.Executors
;
46 import java
.util
.concurrent
.Future
;
47 import java
.util
.concurrent
.ScheduledThreadPoolExecutor
;
48 import java
.util
.concurrent
.SynchronousQueue
;
49 import java
.util
.concurrent
.ThreadPoolExecutor
;
50 import java
.util
.concurrent
.TimeUnit
;
51 import java
.util
.concurrent
.atomic
.AtomicBoolean
;
53 import org
.apache
.commons
.io
.IOUtils
;
54 import org
.apache
.commons
.logging
.Log
;
55 import org
.apache
.commons
.logging
.LogFactory
;
56 import org
.apache
.hadoop
.conf
.Configuration
;
57 import org
.apache
.hadoop
.fs
.FileStatus
;
58 import org
.apache
.hadoop
.fs
.FileSystem
;
59 import org
.apache
.hadoop
.fs
.Path
;
60 import org
.apache
.hadoop
.hbase
.ClusterStatus
;
61 import org
.apache
.hadoop
.hbase
.HBaseTestingUtility
;
62 import org
.apache
.hadoop
.hbase
.HColumnDescriptor
;
63 import org
.apache
.hadoop
.hbase
.HConstants
;
64 import org
.apache
.hadoop
.hbase
.HRegionInfo
;
65 import org
.apache
.hadoop
.hbase
.HRegionLocation
;
66 import org
.apache
.hadoop
.hbase
.HTableDescriptor
;
67 import org
.apache
.hadoop
.hbase
.MiniHBaseCluster
;
68 import org
.apache
.hadoop
.hbase
.ServerName
;
69 import org
.apache
.hadoop
.hbase
.TableName
;
70 import org
.apache
.hadoop
.hbase
.MetaTableAccessor
;
71 import org
.apache
.hadoop
.hbase
.client
.Admin
;
72 import org
.apache
.hadoop
.hbase
.client
.ClusterConnection
;
73 import org
.apache
.hadoop
.hbase
.client
.Connection
;
74 import org
.apache
.hadoop
.hbase
.client
.ConnectionFactory
;
75 import org
.apache
.hadoop
.hbase
.client
.Delete
;
76 import org
.apache
.hadoop
.hbase
.client
.Durability
;
77 import org
.apache
.hadoop
.hbase
.client
.Get
;
78 import org
.apache
.hadoop
.hbase
.client
.HBaseAdmin
;
79 import org
.apache
.hadoop
.hbase
.client
.HConnection
;
80 import org
.apache
.hadoop
.hbase
.client
.HTable
;
81 import org
.apache
.hadoop
.hbase
.client
.Put
;
82 import org
.apache
.hadoop
.hbase
.client
.RegionReplicaUtil
;
83 import org
.apache
.hadoop
.hbase
.client
.Result
;
84 import org
.apache
.hadoop
.hbase
.client
.ResultScanner
;
85 import org
.apache
.hadoop
.hbase
.client
.Scan
;
86 import org
.apache
.hadoop
.hbase
.client
.Table
;
87 import org
.apache
.hadoop
.hbase
.coprocessor
.BaseMasterObserver
;
88 import org
.apache
.hadoop
.hbase
.coprocessor
.CoprocessorHost
;
89 import org
.apache
.hadoop
.hbase
.coprocessor
.MasterCoprocessorEnvironment
;
90 import org
.apache
.hadoop
.hbase
.coprocessor
.ObserverContext
;
91 import org
.apache
.hadoop
.hbase
.io
.hfile
.TestHFile
;
92 import org
.apache
.hadoop
.hbase
.master
.AssignmentManager
;
93 import org
.apache
.hadoop
.hbase
.master
.HMaster
;
94 import org
.apache
.hadoop
.hbase
.master
.RegionState
;
95 import org
.apache
.hadoop
.hbase
.master
.RegionStates
;
96 import org
.apache
.hadoop
.hbase
.master
.TableLockManager
;
97 import org
.apache
.hadoop
.hbase
.master
.TableLockManager
.TableLock
;
98 import org
.apache
.hadoop
.hbase
.protobuf
.ProtobufUtil
;
99 import org
.apache
.hadoop
.hbase
.protobuf
.generated
.AdminProtos
;
100 import org
.apache
.hadoop
.hbase
.regionserver
.HRegion
;
101 import org
.apache
.hadoop
.hbase
.regionserver
.HRegionFileSystem
;
102 import org
.apache
.hadoop
.hbase
.regionserver
.HRegionServer
;
103 import org
.apache
.hadoop
.hbase
.regionserver
.SplitTransactionFactory
;
104 import org
.apache
.hadoop
.hbase
.regionserver
.SplitTransactionImpl
;
105 import org
.apache
.hadoop
.hbase
.regionserver
.TestEndToEndSplitTransaction
;
106 import org
.apache
.hadoop
.hbase
.security
.access
.AccessControlClient
;
107 import org
.apache
.hadoop
.hbase
.testclassification
.LargeTests
;
108 import org
.apache
.hadoop
.hbase
.testclassification
.MiscTests
;
109 import org
.apache
.hadoop
.hbase
.util
.HBaseFsck
.ErrorReporter
;
110 import org
.apache
.hadoop
.hbase
.util
.HBaseFsck
.ErrorReporter
.ERROR_CODE
;
111 import org
.apache
.hadoop
.hbase
.util
.HBaseFsck
.HbckInfo
;
112 import org
.apache
.hadoop
.hbase
.util
.HBaseFsck
.PrintingErrorReporter
;
113 import org
.apache
.hadoop
.hbase
.util
.HBaseFsck
.TableInfo
;
114 import org
.apache
.hadoop
.hbase
.util
.hbck
.HFileCorruptionChecker
;
115 import org
.apache
.hadoop
.hbase
.util
.hbck
.HbckTestingUtil
;
116 import org
.apache
.hadoop
.hbase
.zookeeper
.MetaTableLocator
;
117 import org
.apache
.zookeeper
.KeeperException
;
118 import org
.junit
.AfterClass
;
119 import org
.junit
.Assert
;
120 import org
.junit
.Before
;
121 import org
.junit
.BeforeClass
;
122 import org
.junit
.Ignore
;
123 import org
.junit
.Test
;
124 import org
.junit
.experimental
.categories
.Category
;
125 import org
.junit
.rules
.TestName
;
127 import com
.google
.common
.collect
.Multimap
;
130 * This tests HBaseFsck's ability to detect reasons for inconsistent tables.
132 @Category({MiscTests
.class, LargeTests
.class})
133 public class TestHBaseFsck
{
134 static final int POOL_SIZE
= 7;
135 private static final Log LOG
= LogFactory
.getLog(TestHBaseFsck
.class);
136 private final static HBaseTestingUtility TEST_UTIL
= new HBaseTestingUtility();
137 private final static Configuration conf
= TEST_UTIL
.getConfiguration();
138 private final static String FAM_STR
= "fam";
139 private final static byte[] FAM
= Bytes
.toBytes(FAM_STR
);
140 private final static int REGION_ONLINE_TIMEOUT
= 800;
141 private static RegionStates regionStates
;
142 private static ExecutorService tableExecutorService
;
143 private static ScheduledThreadPoolExecutor hbfsckExecutorService
;
144 private static ClusterConnection connection
;
145 private static Admin admin
;
147 // for the instance, reset every test run
149 private final static byte[][] SPLITS
= new byte[][] { Bytes
.toBytes("A"),
150 Bytes
.toBytes("B"), Bytes
.toBytes("C") };
151 // one row per region.
152 private final static byte[][] ROWKEYS
= new byte[][] {
153 Bytes
.toBytes("00"), Bytes
.toBytes("50"), Bytes
.toBytes("A0"), Bytes
.toBytes("A5"),
154 Bytes
.toBytes("B0"), Bytes
.toBytes("B5"), Bytes
.toBytes("C0"), Bytes
.toBytes("C5") };
157 public static void setUpBeforeClass() throws Exception
{
158 TEST_UTIL
.getConfiguration().set(CoprocessorHost
.MASTER_COPROCESSOR_CONF_KEY
,
159 MasterSyncObserver
.class.getName());
161 conf
.setInt("hbase.regionserver.handler.count", 2);
162 conf
.setInt("hbase.regionserver.metahandler.count", 30);
164 conf
.setInt("hbase.htable.threads.max", POOL_SIZE
);
165 conf
.setInt("hbase.hconnection.threads.max", 2 * POOL_SIZE
);
166 conf
.setInt("hbase.hconnection.threads.core", POOL_SIZE
);
167 conf
.setInt("hbase.hbck.close.timeout", 2 * REGION_ONLINE_TIMEOUT
);
168 conf
.setInt(HConstants
.HBASE_RPC_TIMEOUT_KEY
, 8 * REGION_ONLINE_TIMEOUT
);
169 TEST_UTIL
.startMiniCluster(3);
171 tableExecutorService
= new ThreadPoolExecutor(1, POOL_SIZE
, 60, TimeUnit
.SECONDS
,
172 new SynchronousQueue
<Runnable
>(), Threads
.newDaemonThreadFactory("testhbck"));
174 hbfsckExecutorService
= new ScheduledThreadPoolExecutor(POOL_SIZE
);
176 AssignmentManager assignmentManager
=
177 TEST_UTIL
.getHBaseCluster().getMaster().getAssignmentManager();
178 regionStates
= assignmentManager
.getRegionStates();
180 connection
= (ClusterConnection
) TEST_UTIL
.getConnection();
182 admin
= connection
.getAdmin();
183 admin
.setBalancerRunning(false, true);
185 TEST_UTIL
.waitUntilAllRegionsAssigned(TableName
.META_TABLE_NAME
);
186 TEST_UTIL
.waitUntilAllRegionsAssigned(TableName
.NAMESPACE_TABLE_NAME
);
190 public static void tearDownAfterClass() throws Exception
{
191 tableExecutorService
.shutdown();
192 hbfsckExecutorService
.shutdown();
194 TEST_UTIL
.shutdownMiniCluster();
198 public void setUp() {
199 EnvironmentEdgeManager
.reset();
202 @Test (timeout
=180000)
203 public void testHBaseFsck() throws Exception
{
204 assertNoErrors(doFsck(conf
, false));
205 TableName table
= TableName
.valueOf("tableBadMetaAssign");
206 HTableDescriptor desc
= new HTableDescriptor(table
);
207 HColumnDescriptor hcd
= new HColumnDescriptor(Bytes
.toString(FAM
));
208 desc
.addFamily(hcd
); // If a table has no CF's it doesn't get checked
209 createTable(TEST_UTIL
, desc
, null);
211 // We created 1 table, should be fine
212 assertNoErrors(doFsck(conf
, false));
214 // Now let's mess it up and change the assignment in hbase:meta to
215 // point to a different region server
216 Table meta
= connection
.getTable(TableName
.META_TABLE_NAME
, tableExecutorService
);
217 Scan scan
= new Scan();
218 scan
.setStartRow(Bytes
.toBytes(table
+",,"));
219 ResultScanner scanner
= meta
.getScanner(scan
);
220 HRegionInfo hri
= null;
222 Result res
= scanner
.next();
223 ServerName currServer
=
224 ServerName
.parseFrom(res
.getValue(HConstants
.CATALOG_FAMILY
,
225 HConstants
.SERVER_QUALIFIER
));
226 long startCode
= Bytes
.toLong(res
.getValue(HConstants
.CATALOG_FAMILY
,
227 HConstants
.STARTCODE_QUALIFIER
));
229 for (JVMClusterUtil
.RegionServerThread rs
:
230 TEST_UTIL
.getHBaseCluster().getRegionServerThreads()) {
232 ServerName sn
= rs
.getRegionServer().getServerName();
234 // When we find a diff RS, change the assignment and break
235 if (!currServer
.getHostAndPort().equals(sn
.getHostAndPort()) ||
236 startCode
!= sn
.getStartcode()) {
237 Put put
= new Put(res
.getRow());
238 put
.setDurability(Durability
.SKIP_WAL
);
239 put
.add(HConstants
.CATALOG_FAMILY
, HConstants
.SERVER_QUALIFIER
,
240 Bytes
.toBytes(sn
.getHostAndPort()));
241 put
.add(HConstants
.CATALOG_FAMILY
, HConstants
.STARTCODE_QUALIFIER
,
242 Bytes
.toBytes(sn
.getStartcode()));
244 hri
= MetaTableAccessor
.getHRegionInfo(res
);
249 // Try to fix the data
250 assertErrors(doFsck(conf
, true), new ERROR_CODE
[]{
251 ERROR_CODE
.SERVER_DOES_NOT_MATCH_META
});
253 TEST_UTIL
.getHBaseCluster().getMaster()
254 .getAssignmentManager().waitForAssignment(hri
);
256 // Should be fixed now
257 assertNoErrors(doFsck(conf
, false));
259 // comment needed - what is the purpose of this line
260 Table t
= connection
.getTable(table
, tableExecutorService
);
261 ResultScanner s
= t
.getScanner(new Scan());
269 @Test(timeout
=180000)
270 public void testFixAssignmentsWhenMETAinTransition() throws Exception
{
271 MiniHBaseCluster cluster
= TEST_UTIL
.getHBaseCluster();
272 admin
.closeRegion(cluster
.getServerHoldingMeta(), HRegionInfo
.FIRST_META_REGIONINFO
);
273 regionStates
.regionOffline(HRegionInfo
.FIRST_META_REGIONINFO
);
274 new MetaTableLocator().deleteMetaLocation(cluster
.getMaster().getZooKeeper());
275 assertFalse(regionStates
.isRegionOnline(HRegionInfo
.FIRST_META_REGIONINFO
));
276 HBaseFsck hbck
= doFsck(conf
, true);
277 assertErrors(hbck
, new ERROR_CODE
[] { ERROR_CODE
.UNKNOWN
, ERROR_CODE
.NO_META_REGION
,
278 ERROR_CODE
.NULL_META_REGION
});
279 assertNoErrors(doFsck(conf
, false));
283 * Create a new region in META.
285 private HRegionInfo
createRegion(final HTableDescriptor
286 htd
, byte[] startKey
, byte[] endKey
)
288 Table meta
= connection
.getTable(TableName
.META_TABLE_NAME
, tableExecutorService
);
289 HRegionInfo hri
= new HRegionInfo(htd
.getTableName(), startKey
, endKey
);
290 MetaTableAccessor
.addRegionToMeta(meta
, hri
);
296 * Debugging method to dump the contents of meta.
298 private void dumpMeta(TableName tableName
) throws IOException
{
299 List
<byte[]> metaRows
= TEST_UTIL
.getMetaTableRows(tableName
);
300 for (byte[] row
: metaRows
) {
301 LOG
.info(Bytes
.toString(row
));
306 * This method is used to undeploy a region -- close it and attempt to
307 * remove its state from the Master.
309 private void undeployRegion(Connection conn
, ServerName sn
,
310 HRegionInfo hri
) throws IOException
, InterruptedException
{
312 HBaseFsckRepair
.closeRegionSilentlyAndWait((HConnection
) conn
, sn
, hri
);
313 if (!hri
.isMetaTable()) {
314 admin
.offline(hri
.getRegionName());
316 } catch (IOException ioe
) {
317 LOG
.warn("Got exception when attempting to offline region "
318 + Bytes
.toString(hri
.getRegionName()), ioe
);
322 * Delete a region from assignments, meta, or completely from hdfs.
323 * @param unassign if true unassign region if assigned
324 * @param metaRow if true remove region's row from META
325 * @param hdfs if true remove region's dir in HDFS
327 private void deleteRegion(Configuration conf
, final HTableDescriptor htd
,
328 byte[] startKey
, byte[] endKey
, boolean unassign
, boolean metaRow
,
329 boolean hdfs
) throws IOException
, InterruptedException
{
330 deleteRegion(conf
, htd
, startKey
, endKey
, unassign
, metaRow
, hdfs
, false, HRegionInfo
.DEFAULT_REPLICA_ID
);
334 * Delete a region from assignments, meta, or completely from hdfs.
335 * @param unassign if true unassign region if assigned
336 * @param metaRow if true remove region's row from META
337 * @param hdfs if true remove region's dir in HDFS
338 * @param regionInfoOnly if true remove a region dir's .regioninfo file
339 * @param replicaId replica id
341 private void deleteRegion(Configuration conf
, final HTableDescriptor htd
,
342 byte[] startKey
, byte[] endKey
, boolean unassign
, boolean metaRow
,
343 boolean hdfs
, boolean regionInfoOnly
, int replicaId
)
344 throws IOException
, InterruptedException
{
345 LOG
.info("** Before delete:");
346 dumpMeta(htd
.getTableName());
348 List
<HRegionLocation
> locations
= tbl
.getAllRegionLocations();
349 for (HRegionLocation location
: locations
) {
350 HRegionInfo hri
= location
.getRegionInfo();
351 ServerName hsa
= location
.getServerName();
352 if (Bytes
.compareTo(hri
.getStartKey(), startKey
) == 0
353 && Bytes
.compareTo(hri
.getEndKey(), endKey
) == 0
354 && hri
.getReplicaId() == replicaId
) {
356 LOG
.info("RegionName: " +hri
.getRegionNameAsString());
357 byte[] deleteRow
= hri
.getRegionName();
360 LOG
.info("Undeploying region " + hri
+ " from server " + hsa
);
361 undeployRegion(connection
, hsa
, hri
);
364 if (regionInfoOnly
) {
365 LOG
.info("deleting hdfs .regioninfo data: " + hri
.toString() + hsa
.toString());
366 Path rootDir
= FSUtils
.getRootDir(conf
);
367 FileSystem fs
= rootDir
.getFileSystem(conf
);
368 Path p
= new Path(FSUtils
.getTableDir(rootDir
, htd
.getTableName()),
369 hri
.getEncodedName());
370 Path hriPath
= new Path(p
, HRegionFileSystem
.REGION_INFO_FILE
);
371 fs
.delete(hriPath
, true);
375 LOG
.info("deleting hdfs data: " + hri
.toString() + hsa
.toString());
376 Path rootDir
= FSUtils
.getRootDir(conf
);
377 FileSystem fs
= rootDir
.getFileSystem(conf
);
378 Path p
= new Path(FSUtils
.getTableDir(rootDir
, htd
.getTableName()),
379 hri
.getEncodedName());
380 HBaseFsck
.debugLsr(conf
, p
);
381 boolean success
= fs
.delete(p
, true);
382 LOG
.info("Deleted " + p
+ " sucessfully? " + success
);
383 HBaseFsck
.debugLsr(conf
, p
);
387 try (Table meta
= connection
.getTable(TableName
.META_TABLE_NAME
, tableExecutorService
)) {
388 Delete delete
= new Delete(deleteRow
);
393 LOG
.info(hri
.toString() + hsa
.toString());
396 TEST_UTIL
.getMetaTableRows(htd
.getTableName());
397 LOG
.info("*** After delete:");
398 dumpMeta(htd
.getTableName());
402 * Setup a clean table before we start mucking with it.
404 * It will set tbl which needs to be closed after test
406 * @throws IOException
407 * @throws InterruptedException
408 * @throws KeeperException
410 void setupTable(TableName tablename
) throws Exception
{
411 setupTableWithRegionReplica(tablename
, 1);
415 * Setup a clean table with a certain region_replica count
417 * It will set tbl which needs to be closed after test
420 * @param replicaCount
423 void setupTableWithRegionReplica(TableName tablename
, int replicaCount
) throws Exception
{
424 HTableDescriptor desc
= new HTableDescriptor(tablename
);
425 desc
.setRegionReplication(replicaCount
);
426 HColumnDescriptor hcd
= new HColumnDescriptor(Bytes
.toString(FAM
));
427 desc
.addFamily(hcd
); // If a table has no CF's it doesn't get checked
428 createTable(TEST_UTIL
, desc
, SPLITS
);
430 tbl
= (HTable
) connection
.getTable(tablename
, tableExecutorService
);
431 List
<Put
> puts
= new ArrayList
<Put
>();
432 for (byte[] row
: ROWKEYS
) {
433 Put p
= new Put(row
);
434 p
.add(FAM
, Bytes
.toBytes("val"), row
);
442 * Counts the number of rows to verify data loss or non-dataloss.
444 int countRows() throws IOException
{
446 ResultScanner rs
= tbl
.getScanner(s
);
448 while(rs
.next() !=null) {
455 * delete table in preparation for next test
458 * @throws IOException
460 void cleanupTable(TableName tablename
) throws Exception
{
466 ((ClusterConnection
) connection
).clearRegionCache();
467 deleteTable(TEST_UTIL
, tablename
);
471 * This creates a clean table and confirms that the table is clean.
473 @Test (timeout
=180000)
474 public void testHBaseFsckClean() throws Exception
{
475 assertNoErrors(doFsck(conf
, false));
476 TableName table
= TableName
.valueOf("tableClean");
478 HBaseFsck hbck
= doFsck(conf
, false);
479 assertNoErrors(hbck
);
482 assertEquals(ROWKEYS
.length
, countRows());
484 // We created 1 table, should be fine
485 hbck
= doFsck(conf
, false);
486 assertNoErrors(hbck
);
487 assertEquals(0, hbck
.getOverlapGroups(table
).size());
488 assertEquals(ROWKEYS
.length
, countRows());
495 * Test thread pooling in the case where there are more regions than threads
497 @Test (timeout
=180000)
498 public void testHbckThreadpooling() throws Exception
{
500 TableName
.valueOf("tableDupeStartKey");
502 // Create table with 4 regions
505 // limit number of threads to 1.
506 Configuration newconf
= new Configuration(conf
);
507 newconf
.setInt("hbasefsck.numthreads", 1);
508 assertNoErrors(doFsck(newconf
, false));
510 // We should pass without triggering a RejectedExecutionException
516 @Test (timeout
=180000)
517 public void testHbckFixOrphanTable() throws Exception
{
518 TableName table
= TableName
.valueOf("tableInfo");
519 FileSystem fs
= null;
520 Path tableinfo
= null;
524 Path hbaseTableDir
= FSUtils
.getTableDir(
525 FSUtils
.getRootDir(conf
), table
);
526 fs
= hbaseTableDir
.getFileSystem(conf
);
527 FileStatus status
= FSTableDescriptors
.getTableInfoPath(fs
, hbaseTableDir
);
528 tableinfo
= status
.getPath();
529 fs
.rename(tableinfo
, new Path("/.tableinfo"));
531 //to report error if .tableinfo is missing.
532 HBaseFsck hbck
= doFsck(conf
, false);
533 assertErrors(hbck
, new ERROR_CODE
[] { ERROR_CODE
.NO_TABLEINFO_FILE
});
535 // fix OrphanTable with default .tableinfo (htd not yet cached on master)
536 hbck
= doFsck(conf
, true);
537 assertNoErrors(hbck
);
539 status
= FSTableDescriptors
.getTableInfoPath(fs
, hbaseTableDir
);
540 assertNotNull(status
);
542 HTableDescriptor htd
= admin
.getTableDescriptor(table
);
543 htd
.setValue("NOT_DEFAULT", "true");
544 admin
.disableTable(table
);
545 admin
.modifyTable(table
, htd
);
546 admin
.enableTable(table
);
547 fs
.delete(status
.getPath(), true);
549 // fix OrphanTable with cache
550 htd
= admin
.getTableDescriptor(table
); // warms up cached htd on master
551 hbck
= doFsck(conf
, true);
552 assertNoErrors(hbck
);
553 status
= FSTableDescriptors
.getTableInfoPath(fs
, hbaseTableDir
);
554 assertNotNull(status
);
555 htd
= admin
.getTableDescriptor(table
);
556 assertEquals(htd
.getValue("NOT_DEFAULT"), "true");
558 fs
.rename(new Path("/.tableinfo"), tableinfo
);
564 * This test makes sure that parallel instances of Hbck are disabled.
568 @Test (timeout
=180000)
569 public void testParallelHbck() throws Exception
{
570 final ExecutorService service
;
571 final Future
<HBaseFsck
> hbck1
,hbck2
;
573 class RunHbck
implements Callable
<HBaseFsck
>{
576 public HBaseFsck
call(){
577 Configuration c
= new Configuration(conf
);
578 c
.setInt("hbase.hbck.lockfile.attempts", 1);
580 return doFsck(c
, false);
581 } catch(Exception e
){
582 if (e
.getMessage().contains("Duplicate hbck")) {
586 // If we reach here, then an exception was caught
591 service
= Executors
.newFixedThreadPool(2);
592 hbck1
= service
.submit(new RunHbck());
593 hbck2
= service
.submit(new RunHbck());
595 // wait up to 15 seconds for both hbck calls to finish
596 service
.awaitTermination(15, TimeUnit
.SECONDS
);
597 HBaseFsck h1
= hbck1
.get();
598 HBaseFsck h2
= hbck2
.get();
599 // Make sure only one of the calls was successful
600 assert(h1
== null || h2
== null);
602 assert(h1
.getRetCode() >= 0);
605 assert(h2
.getRetCode() >= 0);
610 * This test makes sure that with 10 retries both parallel instances
611 * of hbck will be completed successfully.
615 @Test (timeout
=180000)
616 public void testParallelWithRetriesHbck() throws Exception
{
617 final ExecutorService service
;
618 final Future
<HBaseFsck
> hbck1
,hbck2
;
620 class RunHbck
implements Callable
<HBaseFsck
>{
623 public HBaseFsck
call() throws Exception
{
624 // Increase retry attempts to make sure the non-active hbck doesn't get starved
625 Configuration c
= new Configuration(conf
);
626 c
.setInt("hbase.hbck.lockfile.attempts", 10);
627 return doFsck(c
, false);
630 service
= Executors
.newFixedThreadPool(2);
631 hbck1
= service
.submit(new RunHbck());
632 hbck2
= service
.submit(new RunHbck());
634 // wait up to 25 seconds for both hbck calls to finish
635 service
.awaitTermination(25, TimeUnit
.SECONDS
);
636 HBaseFsck h1
= hbck1
.get();
637 HBaseFsck h2
= hbck2
.get();
638 // Both should be successful
641 assert(h1
.getRetCode() >= 0);
642 assert(h2
.getRetCode() >= 0);
647 * This creates and fixes a bad table with regions that have a duplicate
650 @Test (timeout
=180000)
651 public void testDupeStartKey() throws Exception
{
653 TableName
.valueOf("tableDupeStartKey");
656 assertNoErrors(doFsck(conf
, false));
657 assertEquals(ROWKEYS
.length
, countRows());
659 // Now let's mess it up, by adding a region with a duplicate startkey
660 HRegionInfo hriDupe
=
661 createRegion(tbl
.getTableDescriptor(), Bytes
.toBytes("A"), Bytes
.toBytes("A2"));
662 TEST_UTIL
.getHBaseCluster().getMaster().assignRegion(hriDupe
);
663 TEST_UTIL
.getHBaseCluster().getMaster().getAssignmentManager()
664 .waitForAssignment(hriDupe
);
665 ServerName server
= regionStates
.getRegionServerOfRegion(hriDupe
);
666 TEST_UTIL
.assertRegionOnServer(hriDupe
, server
, REGION_ONLINE_TIMEOUT
);
668 HBaseFsck hbck
= doFsck(conf
, false);
669 assertErrors(hbck
, new ERROR_CODE
[] { ERROR_CODE
.DUPE_STARTKEYS
,
670 ERROR_CODE
.DUPE_STARTKEYS
});
671 assertEquals(2, hbck
.getOverlapGroups(table
).size());
672 assertEquals(ROWKEYS
.length
, countRows()); // seems like the "bigger" region won.
674 // fix the degenerate region.
677 // check that the degenerate region is gone and no data loss
678 HBaseFsck hbck2
= doFsck(conf
,false);
679 assertNoErrors(hbck2
);
680 assertEquals(0, hbck2
.getOverlapGroups(table
).size());
681 assertEquals(ROWKEYS
.length
, countRows());
688 * This creates a table with region_replica > 1 and verifies hbck runs
691 @Test (timeout
=180000)
692 public void testHbckWithRegionReplica() throws Exception
{
694 TableName
.valueOf("testHbckWithRegionReplica");
696 setupTableWithRegionReplica(table
, 2);
698 assertNoErrors(doFsck(conf
, false));
704 @Test (timeout
=180000)
705 public void testHbckWithFewerReplica() throws Exception
{
707 TableName
.valueOf("testHbckWithFewerReplica");
709 setupTableWithRegionReplica(table
, 2);
711 assertNoErrors(doFsck(conf
, false));
712 assertEquals(ROWKEYS
.length
, countRows());
713 deleteRegion(conf
, tbl
.getTableDescriptor(), Bytes
.toBytes("B"),
714 Bytes
.toBytes("C"), true, false, false, false, 1); // unassign one replica
715 // check that problem exists
716 HBaseFsck hbck
= doFsck(conf
, false);
717 assertErrors(hbck
, new ERROR_CODE
[]{ERROR_CODE
.NOT_DEPLOYED
});
719 hbck
= doFsck(conf
, true);
720 // run hbck again to make sure we don't see any errors
721 hbck
= doFsck(conf
, false);
722 assertErrors(hbck
, new ERROR_CODE
[]{});
728 @Test (timeout
=180000)
729 public void testHbckWithExcessReplica() throws Exception
{
731 TableName
.valueOf("testHbckWithExcessReplica");
733 setupTableWithRegionReplica(table
, 2);
735 assertNoErrors(doFsck(conf
, false));
736 assertEquals(ROWKEYS
.length
, countRows());
737 // the next few lines inject a location in meta for a replica, and then
738 // asks the master to assign the replica (the meta needs to be injected
739 // for the master to treat the request for assignment as valid; the master
740 // checks the region is valid either from its memory or meta)
741 Table meta
= connection
.getTable(TableName
.META_TABLE_NAME
, tableExecutorService
);
742 List
<HRegionInfo
> regions
= admin
.getTableRegions(table
);
743 byte[] startKey
= Bytes
.toBytes("B");
744 byte[] endKey
= Bytes
.toBytes("C");
745 byte[] metaKey
= null;
746 HRegionInfo newHri
= null;
747 for (HRegionInfo h
: regions
) {
748 if (Bytes
.compareTo(h
.getStartKey(), startKey
) == 0 &&
749 Bytes
.compareTo(h
.getEndKey(), endKey
) == 0 &&
750 h
.getReplicaId() == HRegionInfo
.DEFAULT_REPLICA_ID
) {
751 metaKey
= h
.getRegionName();
752 //create a hri with replicaId as 2 (since we already have replicas with replicaid 0 and 1)
753 newHri
= RegionReplicaUtil
.getRegionInfoForReplica(h
, 2);
757 Put put
= new Put(metaKey
);
758 Collection
<ServerName
> var
= admin
.getClusterStatus().getServers();
759 ServerName sn
= var
.toArray(new ServerName
[var
.size()])[0];
760 //add a location with replicaId as 2 (since we already have replicas with replicaid 0 and 1)
761 MetaTableAccessor
.addLocation(put
, sn
, sn
.getStartcode(), 2);
763 // assign the new replica
764 HBaseFsckRepair
.fixUnassigned(admin
, newHri
);
765 HBaseFsckRepair
.waitUntilAssigned(admin
, newHri
);
766 // now reset the meta row to its original value
767 Delete delete
= new Delete(metaKey
);
768 delete
.addColumns(HConstants
.CATALOG_FAMILY
, MetaTableAccessor
.getServerColumn(2));
769 delete
.addColumns(HConstants
.CATALOG_FAMILY
, MetaTableAccessor
.getStartCodeColumn(2));
770 delete
.addColumns(HConstants
.CATALOG_FAMILY
, MetaTableAccessor
.getSeqNumColumn(2));
773 // check that problem exists
774 HBaseFsck hbck
= doFsck(conf
, false);
775 assertErrors(hbck
, new ERROR_CODE
[]{ERROR_CODE
.NOT_IN_META
});
777 hbck
= doFsck(conf
, true);
778 // run hbck again to make sure we don't see any errors
779 hbck
= doFsck(conf
, false);
780 assertErrors(hbck
, new ERROR_CODE
[]{});
786 * Get region info from local cluster.
788 Map
<ServerName
, List
<String
>> getDeployedHRIs(final HBaseAdmin admin
) throws IOException
{
789 ClusterStatus status
= admin
.getClusterStatus();
790 Collection
<ServerName
> regionServers
= status
.getServers();
791 Map
<ServerName
, List
<String
>> mm
=
792 new HashMap
<ServerName
, List
<String
>>();
793 for (ServerName hsi
: regionServers
) {
794 AdminProtos
.AdminService
.BlockingInterface server
= ((HConnection
) connection
).getAdmin(hsi
);
796 // list all online regions from this region server
797 List
<HRegionInfo
> regions
= ProtobufUtil
.getOnlineRegions(server
);
798 List
<String
> regionNames
= new ArrayList
<String
>();
799 for (HRegionInfo hri
: regions
) {
800 regionNames
.add(hri
.getRegionNameAsString());
802 mm
.put(hsi
, regionNames
);
808 * Returns the HSI a region info is on.
810 ServerName
findDeployedHSI(Map
<ServerName
, List
<String
>> mm
, HRegionInfo hri
) {
811 for (Map
.Entry
<ServerName
,List
<String
>> e
: mm
.entrySet()) {
812 if (e
.getValue().contains(hri
.getRegionNameAsString())) {
820 * This creates and fixes a bad table with regions that have a duplicate
823 @Test (timeout
=180000)
824 public void testDupeRegion() throws Exception
{
826 TableName
.valueOf("tableDupeRegion");
829 assertNoErrors(doFsck(conf
, false));
830 assertEquals(ROWKEYS
.length
, countRows());
832 // Now let's mess it up, by adding a region with a duplicate startkey
833 HRegionInfo hriDupe
=
834 createRegion(tbl
.getTableDescriptor(), Bytes
.toBytes("A"), Bytes
.toBytes("B"));
836 TEST_UTIL
.getHBaseCluster().getMaster().assignRegion(hriDupe
);
837 TEST_UTIL
.getHBaseCluster().getMaster().getAssignmentManager()
838 .waitForAssignment(hriDupe
);
839 ServerName server
= regionStates
.getRegionServerOfRegion(hriDupe
);
840 TEST_UTIL
.assertRegionOnServer(hriDupe
, server
, REGION_ONLINE_TIMEOUT
);
842 // Yikes! The assignment manager can't tell the difference between two
843 // different regions with the same start/endkeys since it doesn't
844 // differentiate on ts/regionId! We actually need to recheck
846 while (findDeployedHSI(getDeployedHRIs((HBaseAdmin
) admin
), hriDupe
) == null) {
850 LOG
.debug("Finished assignment of dupe region");
852 // TODO why is dupe region different from dupe start keys?
853 HBaseFsck hbck
= doFsck(conf
, false);
854 assertErrors(hbck
, new ERROR_CODE
[] { ERROR_CODE
.DUPE_STARTKEYS
,
855 ERROR_CODE
.DUPE_STARTKEYS
});
856 assertEquals(2, hbck
.getOverlapGroups(table
).size());
857 assertEquals(ROWKEYS
.length
, countRows()); // seems like the "bigger" region won.
859 // fix the degenerate region.
862 // check that the degenerate region is gone and no data loss
863 HBaseFsck hbck2
= doFsck(conf
,false);
864 assertNoErrors(hbck2
);
865 assertEquals(0, hbck2
.getOverlapGroups(table
).size());
866 assertEquals(ROWKEYS
.length
, countRows());
873 * This creates and fixes a bad table with regions that have startkey == endkey
875 @Test (timeout
=180000)
876 public void testDegenerateRegions() throws Exception
{
877 TableName table
= TableName
.valueOf("tableDegenerateRegions");
880 assertNoErrors(doFsck(conf
,false));
881 assertEquals(ROWKEYS
.length
, countRows());
883 // Now let's mess it up, by adding a degenerate region (startkey == endkey) with a duplicate startkey
884 HRegionInfo hriDupe
=
885 createRegion(tbl
.getTableDescriptor(), Bytes
.toBytes("B"), Bytes
.toBytes("B"));
886 TEST_UTIL
.getHBaseCluster().getMaster().assignRegion(hriDupe
);
887 TEST_UTIL
.getHBaseCluster().getMaster().getAssignmentManager()
888 .waitForAssignment(hriDupe
);
889 ServerName server
= regionStates
.getRegionServerOfRegion(hriDupe
);
890 TEST_UTIL
.assertRegionOnServer(hriDupe
, server
, REGION_ONLINE_TIMEOUT
);
892 HBaseFsck hbck
= doFsck(conf
,false);
893 assertErrors(hbck
, new ERROR_CODE
[] { ERROR_CODE
.DEGENERATE_REGION
, ERROR_CODE
.DUPE_STARTKEYS
,
894 ERROR_CODE
.DUPE_STARTKEYS
});
895 assertEquals(2, hbck
.getOverlapGroups(table
).size());
896 assertEquals(ROWKEYS
.length
, countRows());
898 // fix the degenerate region.
901 // check that the degenerate region is gone and no data loss
902 HBaseFsck hbck2
= doFsck(conf
,false);
903 assertNoErrors(hbck2
);
904 assertEquals(0, hbck2
.getOverlapGroups(table
).size());
905 assertEquals(ROWKEYS
.length
, countRows());
912 * This creates and fixes a bad table where a region is completely contained
915 @Test (timeout
=180000)
916 public void testContainedRegionOverlap() throws Exception
{
// Scenario: an extra region ["A2", "B") is created and assigned. hbck must report a single
// OVERLAP_IN_REGION_CHAIN with getOverlapGroups(table).size() == 2; after the repair step
// hbck must be clean and every row must still be countable.
// NOTE(review): the declaration that receives TableName.valueOf(...), the table setup and
// the repair invocation are not visible in this excerpt.
918 TableName
.valueOf("tableContainedRegionOverlap");
921 assertEquals(ROWKEYS
.length
, countRows());
923 // Mess it up by creating an overlap in the metadata
924 HRegionInfo hriOverlap
=
925 createRegion(tbl
.getTableDescriptor(), Bytes
.toBytes("A2"), Bytes
.toBytes("B"));
926 TEST_UTIL
.getHBaseCluster().getMaster().assignRegion(hriOverlap
);
927 TEST_UTIL
.getHBaseCluster().getMaster().getAssignmentManager()
928 .waitForAssignment(hriOverlap
);
// Make sure the overlapping region is really online before hbck inspects the cluster.
929 ServerName server
= regionStates
.getRegionServerOfRegion(hriOverlap
);
930 TEST_UTIL
.assertRegionOnServer(hriOverlap
, server
, REGION_ONLINE_TIMEOUT
);
932 HBaseFsck hbck
= doFsck(conf
, false);
933 assertErrors(hbck
, new ERROR_CODE
[] {
934 ERROR_CODE
.OVERLAP_IN_REGION_CHAIN
});
935 assertEquals(2, hbck
.getOverlapGroups(table
).size());
936 assertEquals(ROWKEYS
.length
, countRows());
941 // verify that overlaps are fixed
942 HBaseFsck hbck2
= doFsck(conf
,false);
943 assertNoErrors(hbck2
);
944 assertEquals(0, hbck2
.getOverlapGroups(table
).size());
945 assertEquals(ROWKEYS
.length
, countRows());
952 * This creates and fixes a bad table that contains an overlap group of
953 * 3 regions. HBaseFsck.maxMerge is set to 2 to trigger sidelining of an overlapped
954 * region. The meta data is messed around so that closeRegion/offlineRegion
957 @Test (timeout
=180000)
958 public void testSidelineOverlapRegion() throws Exception
{
// Scenario: two extra regions ["A", "AB") and ["AB", "B") are assigned so that hbck sees
// two DUPE_STARTKEYS and one OVERLAP_IN_REGION_CHAIN (overlap multimap size 3). The region
// spanning ["A", "B") is then closed/offlined and hbase:meta is pointed at a different
// region server, after which an HBaseFsck configured with maxMerge == 2 and
// sidelineBigOverlaps == true is prepared. The final checks require a clean hbck and
// FEWER rows than ROWKEYS.length, because one region is sidelined.
// NOTE(review): several statements (table setup, the condition choosing the other region
// server, the meta put, the fsck run itself) are not visible in this excerpt.
960 TableName
.valueOf("testSidelineOverlapRegion");
963 assertEquals(ROWKEYS
.length
, countRows());
965 // Mess it up by creating an overlap
966 MiniHBaseCluster cluster
= TEST_UTIL
.getHBaseCluster();
967 HMaster master
= cluster
.getMaster();
968 HRegionInfo hriOverlap1
=
969 createRegion(tbl
.getTableDescriptor(), Bytes
.toBytes("A"), Bytes
.toBytes("AB"));
970 master
.assignRegion(hriOverlap1
);
971 master
.getAssignmentManager().waitForAssignment(hriOverlap1
);
972 HRegionInfo hriOverlap2
=
973 createRegion(tbl
.getTableDescriptor(), Bytes
.toBytes("AB"), Bytes
.toBytes("B"));
974 master
.assignRegion(hriOverlap2
);
975 master
.getAssignmentManager().waitForAssignment(hriOverlap2
);
977 HBaseFsck hbck
= doFsck(conf
, false);
978 assertErrors(hbck
, new ERROR_CODE
[] {ERROR_CODE
.DUPE_STARTKEYS
,
979 ERROR_CODE
.DUPE_STARTKEYS
, ERROR_CODE
.OVERLAP_IN_REGION_CHAIN
});
980 assertEquals(3, hbck
.getOverlapGroups(table
).size());
981 assertEquals(ROWKEYS
.length
, countRows());
983 // mess around the overlapped regions, to trigger NotServingRegionException
984 Multimap
<byte[], HbckInfo
> overlapGroups
= hbck
.getOverlapGroups(table
);
985 ServerName serverName
= null;
986 byte[] regionName
= null;
// Look for the overlap entry whose key range is exactly ["A", "B").
987 for (HbckInfo hbi
: overlapGroups
.values()) {
988 if ("A".equals(Bytes
.toString(hbi
.getStartKey()))
989 && "B".equals(Bytes
.toString(hbi
.getEndKey()))) {
990 regionName
= hbi
.getRegionName();
992 // get an RS not serving the region to force bad assignment info in to META.
993 int k
= cluster
.getServerWith(regionName
);
// NOTE(review): the bound 3 presumably matches the number of region servers in the mini
// cluster -- confirm against the cluster setup.
994 for (int i
= 0; i
< 3; i
++) {
996 HRegionServer rs
= cluster
.getRegionServer(i
);
997 serverName
= rs
.getServerName();
// Close the region on the server that really hosts it (index k) and mark it offline.
1002 HBaseFsckRepair
.closeRegionSilentlyAndWait((HConnection
) connection
,
1003 cluster
.getRegionServer(k
).getServerName(), hbi
.getHdfsHRI());
1004 admin
.offline(regionName
);
// Both values must have been found by the loop above, otherwise the scenario is invalid.
1009 assertNotNull(regionName
);
1010 assertNotNull(serverName
);
// Build a meta row for the region whose server column names the other region server.
1011 try (Table meta
= connection
.getTable(TableName
.META_TABLE_NAME
, tableExecutorService
)) {
1012 Put put
= new Put(regionName
);
1013 put
.add(HConstants
.CATALOG_FAMILY
, HConstants
.SERVER_QUALIFIER
,
1014 Bytes
.toBytes(serverName
.getHostAndPort()));
// Configure a dedicated fsck instance with every repair option needed for sidelining.
1019 HBaseFsck fsck
= new HBaseFsck(conf
, hbfsckExecutorService
);
1021 HBaseFsck
.setDisplayFullReport(); // i.e. -details
1023 fsck
.setFixAssignments(true);
1024 fsck
.setFixMeta(true);
1025 fsck
.setFixHdfsHoles(true);
1026 fsck
.setFixHdfsOverlaps(true);
1027 fsck
.setFixHdfsOrphans(true);
1028 fsck
.setFixVersionFile(true);
1029 fsck
.setSidelineBigOverlaps(true);
1030 fsck
.setMaxMerge(2);
1034 // verify that overlaps are fixed, and there are less rows
1035 // since one region is sidelined.
1036 HBaseFsck hbck2
= doFsck(conf
,false);
1037 assertNoErrors(hbck2
);
1038 assertEquals(0, hbck2
.getOverlapGroups(table
).size());
1039 assertTrue(ROWKEYS
.length
> countRows());
1041 cleanupTable(table
);
1046 * This creates and fixes a bad table where a region is completely contained
1047 * by another region, and there is a hole (sort of like a bad split)
1049 @Test (timeout
=180000)
1050 public void testOverlapAndOrphan() throws Exception
{
// Scenario: while the table is disabled, deleteRegion is applied to the ["A", "B") region;
// after re-enabling, a smaller region ["A2", "B") is created and assigned. hbck must report
// ORPHAN_HDFS_REGION, NOT_IN_META_OR_DEPLOYED and HOLE_IN_REGION_CHAIN. After the repair
// step hbck must be clean, with no overlap entries and all rows present.
// NOTE(review): the table setup and the repair invocation are not visible in this excerpt.
1052 TableName
.valueOf("tableOverlapAndOrphan");
1055 assertEquals(ROWKEYS
.length
, countRows());
1057 // Mess it up by creating an overlap in the metadata
1058 admin
.disableTable(table
);
1059 deleteRegion(conf
, tbl
.getTableDescriptor(), Bytes
.toBytes("A"),
1060 Bytes
.toBytes("B"), true, true, false, true, HRegionInfo
.DEFAULT_REPLICA_ID
);
1061 admin
.enableTable(table
);
1063 HRegionInfo hriOverlap
=
1064 createRegion(tbl
.getTableDescriptor(), Bytes
.toBytes("A2"), Bytes
.toBytes("B"));
1065 TEST_UTIL
.getHBaseCluster().getMaster().assignRegion(hriOverlap
);
1066 TEST_UTIL
.getHBaseCluster().getMaster().getAssignmentManager()
1067 .waitForAssignment(hriOverlap
);
// Make sure the replacement region is online before hbck inspects the cluster.
1068 ServerName server
= regionStates
.getRegionServerOfRegion(hriOverlap
);
1069 TEST_UTIL
.assertRegionOnServer(hriOverlap
, server
, REGION_ONLINE_TIMEOUT
);
1071 HBaseFsck hbck
= doFsck(conf
, false);
1072 assertErrors(hbck
, new ERROR_CODE
[] {
1073 ERROR_CODE
.ORPHAN_HDFS_REGION
, ERROR_CODE
.NOT_IN_META_OR_DEPLOYED
,
1074 ERROR_CODE
.HOLE_IN_REGION_CHAIN
});
1079 // verify that overlaps are fixed
1080 HBaseFsck hbck2
= doFsck(conf
,false);
1081 assertNoErrors(hbck2
);
1082 assertEquals(0, hbck2
.getOverlapGroups(table
).size());
1083 assertEquals(ROWKEYS
.length
, countRows());
1085 cleanupTable(table
);
1090 * This creates and fixes a bad table where a region overlaps two regions --
1091 * a start key contained in another region and its end key is contained in
1092 * yet another region.
1094 @Test (timeout
=180000)
1095 public void testCoveredStartKey() throws Exception
{
// Scenario: an extra region ["A2", "B2") is created and assigned, so it straddles the
// boundary at "B". hbck must report OVERLAP_IN_REGION_CHAIN twice with an overlap multimap
// of size 3; after the repair step hbck must report no errors and no overlap entries,
// with all rows present.
// NOTE(review): the table setup and the repair invocation are not visible in this excerpt.
1097 TableName
.valueOf("tableCoveredStartKey");
1100 assertEquals(ROWKEYS
.length
, countRows());
1102 // Mess it up by creating an overlap in the metadata
1103 HRegionInfo hriOverlap
=
1104 createRegion(tbl
.getTableDescriptor(), Bytes
.toBytes("A2"), Bytes
.toBytes("B2"));
1105 TEST_UTIL
.getHBaseCluster().getMaster().assignRegion(hriOverlap
);
1106 TEST_UTIL
.getHBaseCluster().getMaster().getAssignmentManager()
1107 .waitForAssignment(hriOverlap
);
// Make sure the overlapping region is online before hbck inspects the cluster.
1108 ServerName server
= regionStates
.getRegionServerOfRegion(hriOverlap
);
1109 TEST_UTIL
.assertRegionOnServer(hriOverlap
, server
, REGION_ONLINE_TIMEOUT
);
1111 HBaseFsck hbck
= doFsck(conf
, false);
1112 assertErrors(hbck
, new ERROR_CODE
[] { ERROR_CODE
.OVERLAP_IN_REGION_CHAIN
,
1113 ERROR_CODE
.OVERLAP_IN_REGION_CHAIN
});
1114 assertEquals(3, hbck
.getOverlapGroups(table
).size());
1115 assertEquals(ROWKEYS
.length
, countRows());
1120 // verify that overlaps are fixed
1121 HBaseFsck hbck2
= doFsck(conf
, false);
// An empty expected-error array is used here where sibling tests call assertNoErrors.
1122 assertErrors(hbck2
, new ERROR_CODE
[0]);
1123 assertEquals(0, hbck2
.getOverlapGroups(table
).size());
1124 assertEquals(ROWKEYS
.length
, countRows());
1126 cleanupTable(table
);
1131 * This creates and fixes a bad table with a missing region -- hole in meta
1132 * and data missing in the fs.
1134 @Test (timeout
=180000)
1135 public void testRegionHole() throws Exception
{
// Scenario: with the table disabled, the ["B", "C") region is removed via
// deleteRegion(..., true, true, true). hbck must report HOLE_IN_REGION_CHAIN and no overlap
// entries. After the repair step hbck must be clean, but two rows are expected to be
// missing (ROWKEYS.length - 2) because the region's data was deleted.
// NOTE(review): the table setup and the repair invocation are not visible in this excerpt.
1137 TableName
.valueOf("tableRegionHole");
1140 assertEquals(ROWKEYS
.length
, countRows());
1142 // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1143 admin
.disableTable(table
);
1144 deleteRegion(conf
, tbl
.getTableDescriptor(), Bytes
.toBytes("B"),
1145 Bytes
.toBytes("C"), true, true, true);
1146 admin
.enableTable(table
);
1148 HBaseFsck hbck
= doFsck(conf
, false);
1149 assertErrors(hbck
, new ERROR_CODE
[] {
1150 ERROR_CODE
.HOLE_IN_REGION_CHAIN
});
1151 // holes are separate from overlap groups
1152 assertEquals(0, hbck
.getOverlapGroups(table
).size());
1157 // check that hole fixed
1158 assertNoErrors(doFsck(conf
,false));
1159 assertEquals(ROWKEYS
.length
- 2 , countRows()); // lost a region so lost a row
1161 cleanupTable(table
);
1166 * This creates and fixes a bad table with a missing region -- hole in meta
1167 * and data present but .regioninfo missing (an orphan hdfs region) in the fs.
1169 @Test (timeout
=180000)
1170 public void testHDFSRegioninfoMissing() throws Exception
{
// Scenario: with the table disabled, deleteRegion(..., true, true, false, true,
// DEFAULT_REPLICA_ID) is applied to the ["B", "C") region. hbck must report
// ORPHAN_HDFS_REGION, NOT_IN_META_OR_DEPLOYED and HOLE_IN_REGION_CHAIN with no overlap
// entries. After the repair step hbck must be clean and no rows may be lost.
// NOTE(review): the table setup and the repair invocation are not visible in this excerpt.
1171 TableName table
= TableName
.valueOf("tableHDFSRegioninfoMissing");
1174 assertEquals(ROWKEYS
.length
, countRows());
1176 // Mess it up by leaving a hole in the meta data
1177 admin
.disableTable(table
);
1178 deleteRegion(conf
, tbl
.getTableDescriptor(), Bytes
.toBytes("B"),
1179 Bytes
.toBytes("C"), true, true, false, true, HRegionInfo
.DEFAULT_REPLICA_ID
);
1180 admin
.enableTable(table
);
1182 HBaseFsck hbck
= doFsck(conf
, false);
1183 assertErrors(hbck
, new ERROR_CODE
[] {
1184 ERROR_CODE
.ORPHAN_HDFS_REGION
,
1185 ERROR_CODE
.NOT_IN_META_OR_DEPLOYED
,
1186 ERROR_CODE
.HOLE_IN_REGION_CHAIN
});
1187 // holes are separate from overlap groups
1188 assertEquals(0, hbck
.getOverlapGroups(table
).size());
1193 // check that hole fixed
1194 assertNoErrors(doFsck(conf
, false));
1195 assertEquals(ROWKEYS
.length
, countRows());
1197 cleanupTable(table
);
1202 * This creates and fixes a bad table with a region that is missing meta and
1203 * not assigned to a region server.
1205 @Test (timeout
=180000)
1206 public void testNotInMetaOrDeployedHole() throws Exception
{
// Scenario: with the table disabled, deleteRegion(..., true, true, false) is applied to
// the ["B", "C") region, leaving its files on the fs. hbck must report
// NOT_IN_META_OR_DEPLOYED and HOLE_IN_REGION_CHAIN, both on the read-only run and on the
// fixing run (doFsck(conf, true)), which still reports the errors it repairs. A final
// read-only run must be clean and no rows may be lost.
// NOTE(review): the table setup is not visible in this excerpt.
1208 TableName
.valueOf("tableNotInMetaOrDeployedHole");
1211 assertEquals(ROWKEYS
.length
, countRows());
1213 // Mess it up by leaving a hole in the meta data
1214 admin
.disableTable(table
);
1215 deleteRegion(conf
, tbl
.getTableDescriptor(), Bytes
.toBytes("B"),
1216 Bytes
.toBytes("C"), true, true, false); // don't rm from fs
1217 admin
.enableTable(table
);
1219 HBaseFsck hbck
= doFsck(conf
, false);
1220 assertErrors(hbck
, new ERROR_CODE
[] {
1221 ERROR_CODE
.NOT_IN_META_OR_DEPLOYED
, ERROR_CODE
.HOLE_IN_REGION_CHAIN
});
1222 // holes are separate from overlap groups
1223 assertEquals(0, hbck
.getOverlapGroups(table
).size());
// Fixing run: the same two errors are expected to be reported while they are repaired.
1226 assertErrors(doFsck(conf
, true) , new ERROR_CODE
[] {
1227 ERROR_CODE
.NOT_IN_META_OR_DEPLOYED
, ERROR_CODE
.HOLE_IN_REGION_CHAIN
});
1229 // check that hole fixed
1230 assertNoErrors(doFsck(conf
,false));
1231 assertEquals(ROWKEYS
.length
, countRows());
1233 cleanupTable(table
);
1237 @Test (timeout
=180000)
1238 public void testCleanUpDaughtersNotInMetaAfterFailedSplit() throws Exception
{
// Scenario: a one-family table is created with five rows (row keys "r0".."r4") and flushed.
// A split transaction at "r3" is run on its first region only up to stepsBeforePONR, the
// regions in transition are forced offline, and the parent region is reassigned to its
// original server. hbck must then report NOT_IN_META_OR_DEPLOYED twice and no overlap
// entries. After the repair run hbck must be clean and all five rows must remain.
// NOTE(review): the statement that writes each Put to the table and the assertErrors(...)
// call wrapping the repair run are not visible in this excerpt.
1239 TableName table
= TableName
.valueOf("testCleanUpDaughtersNotInMetaAfterFailedSplit");
1240 MiniHBaseCluster cluster
= TEST_UTIL
.getHBaseCluster();
1242 HTableDescriptor desc
= new HTableDescriptor(table
);
1243 desc
.addFamily(new HColumnDescriptor(Bytes
.toBytes("f")));
1244 createTable(TEST_UTIL
, desc
, null);
1246 tbl
= (HTable
) connection
.getTable(desc
.getTableName());
// NOTE(review): String.getBytes() without a charset uses the platform default encoding;
// the rest of this method uses Bytes.toBytes(...) for the same purpose.
1247 for (int i
= 0; i
< 5; i
++) {
1248 Put p1
= new Put(("r" + i
).getBytes());
1249 p1
.add(Bytes
.toBytes("f"), "q1".getBytes(), "v".getBytes());
1252 admin
.flush(desc
.getTableName());
1253 List
<HRegion
> regions
= cluster
.getRegions(desc
.getTableName());
1254 int serverWith
= cluster
.getServerWith(regions
.get(0).getRegionInfo().getRegionName());
1255 HRegionServer regionServer
= cluster
.getRegionServer(serverWith
);
// NOTE(review): the result of this second getServerWith call is discarded; it looks
// redundant with the lookup two statements above.
1256 cluster
.getServerWith(regions
.get(0).getRegionInfo().getRegionName());
1257 SplitTransactionImpl st
= (SplitTransactionImpl
)
1258 new SplitTransactionFactory(TEST_UTIL
.getConfiguration())
1259 .create(regions
.get(0), Bytes
.toBytes("r3"));
// Run only the steps before the point of no return, i.e. simulate a split that failed.
1261 st
.stepsBeforePONR(regionServer
, regionServer
, false);
1262 AssignmentManager am
= cluster
.getMaster().getAssignmentManager();
1263 Map
<String
, RegionState
> regionsInTransition
= am
.getRegionStates().getRegionsInTransition();
1264 for (RegionState state
: regionsInTransition
.values()) {
1265 am
.regionOffline(state
.getRegion());
// Put the parent region back on the server that hosted it before the aborted split.
1267 Map
<HRegionInfo
, ServerName
> regionsMap
= new HashMap
<HRegionInfo
, ServerName
>();
1268 regionsMap
.put(regions
.get(0).getRegionInfo(), regionServer
.getServerName());
1269 am
.assign(regionsMap
);
1270 am
.waitForAssignment(regions
.get(0).getRegionInfo());
1271 HBaseFsck hbck
= doFsck(conf
, false);
1272 assertErrors(hbck
, new ERROR_CODE
[] { ERROR_CODE
.NOT_IN_META_OR_DEPLOYED
,
1273 ERROR_CODE
.NOT_IN_META_OR_DEPLOYED
});
1274 // holes are separate from overlap groups
1275 assertEquals(0, hbck
.getOverlapGroups(table
).size());
1279 doFsck(conf
, false, true, false, false, false, false, false, false, false, false, null),
1280 new ERROR_CODE
[] { ERROR_CODE
.NOT_IN_META_OR_DEPLOYED
,
1281 ERROR_CODE
.NOT_IN_META_OR_DEPLOYED
});
1283 // check that hole fixed
1284 assertNoErrors(doFsck(conf
, false));
1285 assertEquals(5, countRows());
1291 cleanupTable(table
);
1296 * This creates and fixes a bad table with a hole in meta.
1298 @Test (timeout
=180000)
1299 public void testNotInMetaHole() throws Exception
{
// Scenario: with the table disabled, deleteRegion(..., false, true, false) is applied to
// the ["B", "C") region, leaving its files on the fs. hbck must report
// NOT_IN_META_OR_DEPLOYED and HOLE_IN_REGION_CHAIN on the read-only run and again on the
// fixing run (doFsck(conf, true)). A final read-only run must be clean with no rows lost.
// NOTE(review): the table setup is not visible in this excerpt.
1301 TableName
.valueOf("tableNotInMetaHole");
1304 assertEquals(ROWKEYS
.length
, countRows());
1306 // Mess it up by leaving a hole in the meta data
1307 admin
.disableTable(table
);
1308 deleteRegion(conf
, tbl
.getTableDescriptor(), Bytes
.toBytes("B"),
1309 Bytes
.toBytes("C"), false, true, false); // don't rm from fs
1310 admin
.enableTable(table
);
1312 HBaseFsck hbck
= doFsck(conf
, false);
1313 assertErrors(hbck
, new ERROR_CODE
[] {
1314 ERROR_CODE
.NOT_IN_META_OR_DEPLOYED
, ERROR_CODE
.HOLE_IN_REGION_CHAIN
});
1315 // holes are separate from overlap groups
1316 assertEquals(0, hbck
.getOverlapGroups(table
).size());
// Fixing run: the same two errors are expected to be reported while they are repaired.
1319 assertErrors(doFsck(conf
, true) , new ERROR_CODE
[] {
1320 ERROR_CODE
.NOT_IN_META_OR_DEPLOYED
, ERROR_CODE
.HOLE_IN_REGION_CHAIN
});
1322 // check that hole fixed
1323 assertNoErrors(doFsck(conf
,false));
1324 assertEquals(ROWKEYS
.length
, countRows());
1326 cleanupTable(table
);
1331 * This creates and fixes a bad table with a region that is in meta but has
1332 * no deployment or data hdfs
1334 @Test (timeout
=180000)
1335 public void testNotInHdfs() throws Exception
{
// Scenario: deleteRegion(..., false, false, true) is applied to the ["B", "C") region,
// which per the inline comment keeps its meta entry. hbck must report NOT_IN_HDFS and no
// overlap entries. After the repair step hbck must be clean and two rows are expected to
// be gone (ROWKEYS.length - 2).
// NOTE(review): the table setup, the flush and the repair invocation are not visible in
// this excerpt.
1337 TableName
.valueOf("tableNotInHdfs");
1340 assertEquals(ROWKEYS
.length
, countRows());
1342 // make sure data in regions, if in wal only there is no data loss
1345 // Mess it up by leaving a hole in the hdfs data
1346 deleteRegion(conf
, tbl
.getTableDescriptor(), Bytes
.toBytes("B"),
1347 Bytes
.toBytes("C"), false, false, true); // don't rm meta
1349 HBaseFsck hbck
= doFsck(conf
, false);
1350 assertErrors(hbck
, new ERROR_CODE
[] {ERROR_CODE
.NOT_IN_HDFS
});
1351 // holes are separate from overlap groups
1352 assertEquals(0, hbck
.getOverlapGroups(table
).size());
1357 // check that hole fixed
1358 assertNoErrors(doFsck(conf
,false));
1359 assertEquals(ROWKEYS
.length
- 2, countRows());
1361 cleanupTable(table
);
1366 * This creates and fixes a bad table with a region that is in meta but has
1367 * no deployment or data hdfs. The table has region_replication set to 2.
1369 @Test (timeout
=180000)
1370 public void testNotInHdfsWithReplicas() throws Exception
{
// Scenario: same hdfs hole as testNotInHdfs, but on a table set up with a region replica
// count of 2. Before the damage, the HRegionInfos whose start key begins with 'B' are
// remembered (oldHris). After the repair they are collected again from meta (newHris), and
// the test checks that every new HRI is online on some region server while none of the
// old HRIs is.
// NOTE(review): this test reuses the table name "tableNotInHdfs" from testNotInHdfs.
// NOTE(review): the declaration/reset of the index `i`, the flush and the repair invocation
// are not visible in this excerpt.
1372 TableName
.valueOf("tableNotInHdfs");
1374 HRegionInfo
[] oldHris
= new HRegionInfo
[2];
1375 setupTableWithRegionReplica(table
, 2);
1376 assertEquals(ROWKEYS
.length
, countRows());
1377 NavigableMap
<HRegionInfo
, ServerName
> map
=
1378 MetaTableAccessor
.allTableRegions(TEST_UTIL
.getConnection(),
1381 // store the HRIs of the regions we will mess up
1382 for (Map
.Entry
<HRegionInfo
, ServerName
> m
: map
.entrySet()) {
1383 if (m
.getKey().getStartKey().length
> 0 &&
1384 m
.getKey().getStartKey()[0] == Bytes
.toBytes("B")[0]) {
1385 LOG
.debug("Initially server hosting " + m
.getKey() + " is " + m
.getValue());
1386 oldHris
[i
++] = m
.getKey();
1389 // make sure data in regions
1392 // Mess it up by leaving a hole in the hdfs data
1393 deleteRegion(conf
, tbl
.getTableDescriptor(), Bytes
.toBytes("B"),
1394 Bytes
.toBytes("C"), false, false, true); // don't rm meta
1396 HBaseFsck hbck
= doFsck(conf
, false);
1397 assertErrors(hbck
, new ERROR_CODE
[] {ERROR_CODE
.NOT_IN_HDFS
});
1402 // check that hole fixed
1403 assertNoErrors(doFsck(conf
,false));
1404 assertEquals(ROWKEYS
.length
- 2, countRows());
1406 // the following code checks whether the old primary/secondary has
1407 // been unassigned and the new primary/secondary has been assigned
1409 HRegionInfo
[] newHris
= new HRegionInfo
[2];
1410 // get all table's regions from meta
1411 map
= MetaTableAccessor
.allTableRegions(TEST_UTIL
.getConnection(), tbl
.getName());
1412 // get the HRIs of the new regions (hbck created new regions for fixing the hdfs mess-up)
1413 for (Map
.Entry
<HRegionInfo
, ServerName
> m
: map
.entrySet()) {
1414 if (m
.getKey().getStartKey().length
> 0 &&
1415 m
.getKey().getStartKey()[0] == Bytes
.toBytes("B")[0]) {
1416 newHris
[i
++] = m
.getKey();
1419 // get all the online regions in the regionservers
1420 Collection
<ServerName
> servers
= admin
.getClusterStatus().getServers();
1421 Set
<HRegionInfo
> onlineRegions
= new HashSet
<HRegionInfo
>();
1422 for (ServerName s
: servers
) {
1423 List
<HRegionInfo
> list
= admin
.getOnlineRegions(s
);
1424 onlineRegions
.addAll(list
);
1426 // the new HRIs must be a subset of the online regions
1427 assertTrue(onlineRegions
.containsAll(Arrays
.asList(newHris
)));
1428 // the old HRIs must not be part of the set (removeAll would return false if
1429 // the set didn't change)
1430 assertFalse(onlineRegions
.removeAll(Arrays
.asList(oldHris
)));
1432 cleanupTable(table
);
1439 * This creates entries in hbase:meta with no hdfs data. This should cleanly
1442 @Test (timeout
=180000)
1443 public void testNoHdfsTable() throws Exception
{
// Scenario: all four regions of the table (["", "A"), ["A", "B"), ["B", "C"), ["C", ""))
// are removed from hdfs with deleteRegion(..., false, false, true), and then the table
// directory itself is deleted. hbck must report NOT_IN_HDFS four times plus
// ORPHAN_TABLE_STATE, with no overlap entries. After the fixing run hbck must be clean and
// the table must no longer exist according to admin.tableExists.
// NOTE(review): the table setup and the flush are not visible in this excerpt.
1444 TableName table
= TableName
.valueOf("NoHdfsTable");
1446 assertEquals(ROWKEYS
.length
, countRows());
1448 // make sure data in regions, if in wal only there is no data loss
1451 // Mess it up by deleting hdfs dirs
1452 deleteRegion(conf
, tbl
.getTableDescriptor(), Bytes
.toBytes(""),
1453 Bytes
.toBytes("A"), false, false, true); // don't rm meta
1454 deleteRegion(conf
, tbl
.getTableDescriptor(), Bytes
.toBytes("A"),
1455 Bytes
.toBytes("B"), false, false, true); // don't rm meta
1456 deleteRegion(conf
, tbl
.getTableDescriptor(), Bytes
.toBytes("B"),
1457 Bytes
.toBytes("C"), false, false, true); // don't rm meta
1458 deleteRegion(conf
, tbl
.getTableDescriptor(), Bytes
.toBytes("C"),
1459 Bytes
.toBytes(""), false, false, true); // don't rm meta
1461 // also remove the table directory in hdfs
1462 deleteTableDir(table
);
1464 HBaseFsck hbck
= doFsck(conf
, false);
1465 assertErrors(hbck
, new ERROR_CODE
[] {ERROR_CODE
.NOT_IN_HDFS
,
1466 ERROR_CODE
.NOT_IN_HDFS
, ERROR_CODE
.NOT_IN_HDFS
,
1467 ERROR_CODE
.NOT_IN_HDFS
, ERROR_CODE
.ORPHAN_TABLE_STATE
, });
1468 // holes are separate from overlap groups
1469 assertEquals(0, hbck
.getOverlapGroups(table
).size());
1472 doFsck(conf
, true); // detect dangling regions and remove those
1474 // check that hole fixed
1475 assertNoErrors(doFsck(conf
,false));
1476 assertFalse("Table " + table
+ " should have been deleted", admin
.tableExists(table
));
1479 public void deleteTableDir(TableName table
) throws IOException
{
// Recursively deletes the given table's directory under the HBase root dir.
// The directory is resolved via FSUtils.getRootDir/getTableDir, listed for debugging with
// HBaseFsck.debugLsr, and then removed with FileSystem.delete(p, true). The boolean result
// of the delete is only logged; it is neither asserted nor returned.
// NOTE(review): the log message below contains the misspelling "sucessfully"; it is a
// runtime string and is left untouched here.
1480 Path rootDir
= FSUtils
.getRootDir(conf
);
1481 FileSystem fs
= rootDir
.getFileSystem(conf
);
1482 Path p
= FSUtils
.getTableDir(rootDir
, table
);
1483 HBaseFsck
.debugLsr(conf
, p
);
1484 boolean success
= fs
.delete(p
, true);
1485 LOG
.info("Deleted " + p
+ " sucessfully? " + success
);
1489 * When the hbase.version file is missing, hbck reports it and fixes the fault.
1491 @Test (timeout
=180000)
1492 public void testNoVersionFile() throws Exception
{
// Scenario: the version file (HConstants.VERSION_FILE_NAME under the HBase root dir) is
// deleted. hbck must report NO_VERSION_FILE, and after the repair step a read-only hbck
// run must be clean.
// NOTE(review): the repair invocation is not visible in this excerpt, and the boolean
// result of fs.delete is not checked.
1493 // delete the hbase.version file
1494 Path rootDir
= FSUtils
.getRootDir(conf
);
1495 FileSystem fs
= rootDir
.getFileSystem(conf
);
1496 Path versionFile
= new Path(rootDir
, HConstants
.VERSION_FILE_NAME
);
1497 fs
.delete(versionFile
, true);
1500 HBaseFsck hbck
= doFsck(conf
, false);
1501 assertErrors(hbck
, new ERROR_CODE
[] { ERROR_CODE
.NO_VERSION_FILE
});
1502 // fix hbase.version missing
1505 // no version file fixed
1506 assertNoErrors(doFsck(conf
, false));
1510 * When the table state is missing from hbase:meta, hbck reports it and fixes the fault.
1512 @Test (timeout
=180000)
1513 public void testNoTableState() throws Exception
{
// Scenario: the table's state is removed with MetaTableAccessor.deleteTableState. hbck must
// report NO_TABLE_STATE; after the repair step hbck must be clean and the table must be
// reported as enabled.
// NOTE(review): the table setup, the flush and the repair invocation are not visible in
// this excerpt.
1514 // delete the table state from hbase:meta
1516 TableName
.valueOf("testNoTableState");
1519 // make sure data in regions, if in wal only there is no data loss
1522 MetaTableAccessor
.deleteTableState(TEST_UTIL
.getConnection(), table
);
1525 HBaseFsck hbck
= doFsck(conf
, false);
1526 assertErrors(hbck
, new ERROR_CODE
[] { ERROR_CODE
.NO_TABLE_STATE
});
1527 // fix table state missing
1530 assertNoErrors(doFsck(conf
, false));
1531 assertTrue(TEST_UTIL
.getHBaseAdmin().isTableEnabled(table
));
1533 cleanupTable(table
);
1538 * The region is not deployed when the table is disabled.
1540 @Test (timeout
=180000)
1541 public void testRegionShouldNotBeDeployed() throws Exception
{
// Scenario: a table is built by hand (descriptor written with FSTableDescriptors, regions
// inserted into meta from SPLIT_KEYS), cycled through disable/enable, and finally left
// disabled. One of its regions is then opened directly on region server 0 and registered
// as online there, bypassing the master. hbck must report SHOULD_NOT_BE_DEPLOYED; after
// the repair step hbck must be clean.
// NOTE(review): the repair invocation is not visible in this excerpt.
1543 TableName
.valueOf("tableRegionShouldNotBeDeployed");
1545 LOG
.info("Starting testRegionShouldNotBeDeployed.");
1546 MiniHBaseCluster cluster
= TEST_UTIL
.getHBaseCluster();
1547 assertTrue(cluster
.waitForActiveAndReadyMaster());
1550 byte[][] SPLIT_KEYS
= new byte[][] { new byte[0], Bytes
.toBytes("aaa"),
1551 Bytes
.toBytes("bbb"), Bytes
.toBytes("ccc"), Bytes
.toBytes("ddd") };
1552 HTableDescriptor htdDisabled
= new HTableDescriptor(table
);
1553 htdDisabled
.addFamily(new HColumnDescriptor(FAM
));
1555 // Write the .tableinfo
1556 FSTableDescriptors fstd
= new FSTableDescriptors(conf
);
1557 fstd
.createTableDescriptor(htdDisabled
);
1558 List
<HRegionInfo
> disabledRegions
=
1559 TEST_UTIL
.createMultiRegionsInMeta(conf
, htdDisabled
, SPLIT_KEYS
);
1561 // Let's just assign everything to first RS
1562 HRegionServer hrs
= cluster
.getRegionServer(0);
1564 // Create region files.
1565 admin
.disableTable(table
);
1566 admin
.enableTable(table
);
1568 // Disable the table and close its regions
1569 admin
.disableTable(table
);
// remove(0) both picks the first region and takes it out of the list.
1570 HRegionInfo region
= disabledRegions
.remove(0);
1571 byte[] regionName
= region
.getRegionName();
1573 // The region should not be assigned currently
1574 assertTrue(cluster
.getServerWith(regionName
) == -1);
1576 // Directly open a region on a region server.
1577 // If going through AM/ZK, the region won't be open.
1578 // Even it is opened, AM will close it which causes
1579 // flakiness of this test.
1580 HRegion r
= HRegion
.openHRegion(
1581 region
, htdDisabled
, hrs
.getWAL(region
), conf
);
1582 hrs
.addToOnlineRegions(r
);
1584 HBaseFsck hbck
= doFsck(conf
, false);
1585 assertErrors(hbck
, new ERROR_CODE
[] { ERROR_CODE
.SHOULD_NOT_BE_DEPLOYED
});
1591 assertNoErrors(doFsck(conf
, false));
// The table is re-enabled before cleanupTable is called.
1593 admin
.enableTable(table
);
1594 cleanupTable(table
);
1599 * This creates two tables and mess both of them and fix them one by one
1601 @Test (timeout
=180000)
1602 public void testFixByTable() throws Exception
{
// Scenario: two tables each lose the hdfs data of their ["B", "C") region. A cluster-wide
// hbck must report NOT_IN_HDFS twice. Repairing only table1 must leave table1 clean while
// table2 still reports NOT_IN_HDFS; repairing table2 afterwards must leave the whole
// cluster clean, with two rows missing from the table counted by countRows().
// NOTE(review): the declarations of table1/table2 and the table setup calls are not
// visible in this excerpt; both deleteRegion calls go through `tbl`, which is presumably
// re-pointed by the setup of each table -- confirm.
1604 TableName
.valueOf("testFixByTable1");
1606 TableName
.valueOf("testFixByTable2");
1609 // make sure data in regions, if in wal only there is no data loss
1610 admin
.flush(table1
);
1611 // Mess them up by leaving a hole in the hdfs data
1612 deleteRegion(conf
, tbl
.getTableDescriptor(), Bytes
.toBytes("B"),
1613 Bytes
.toBytes("C"), false, false, true); // don't rm meta
1616 // make sure data in regions, if in wal only there is no data loss
1617 admin
.flush(table2
);
1618 // Mess them up by leaving a hole in the hdfs data
1619 deleteRegion(conf
, tbl
.getTableDescriptor(), Bytes
.toBytes("B"),
1620 Bytes
.toBytes("C"), false, false, true); // don't rm meta
1622 HBaseFsck hbck
= doFsck(conf
, false);
1623 assertErrors(hbck
, new ERROR_CODE
[] {
1624 ERROR_CODE
.NOT_IN_HDFS
, ERROR_CODE
.NOT_IN_HDFS
});
1626 // fix hole in table 1
1627 doFsck(conf
, true, table1
);
1628 // check that hole in table 1 fixed
1629 assertNoErrors(doFsck(conf
, false, table1
));
1630 // check that hole in table 2 still there
1631 assertErrors(doFsck(conf
, false, table2
),
1632 new ERROR_CODE
[] {ERROR_CODE
.NOT_IN_HDFS
});
1634 // fix hole in table 2
1635 doFsck(conf
, true, table2
);
1636 // check that hole in both tables fixed
1637 assertNoErrors(doFsck(conf
, false));
1638 assertEquals(ROWKEYS
.length
- 2, countRows());
1640 cleanupTable(table1
);
1641 cleanupTable(table2
);
1645 * A split parent in meta, in hdfs, and not deployed
1647 @Test (timeout
=180000)
1648 public void testLingeringSplitParent() throws Exception
{
// Scenario: the region containing "B" is removed from meta and unassigned but kept on
// hdfs. A new meta entry for the same region is then written as an offline split parent
// with daughters ["B", "BM") and ["BM", "C"). hbck must report LINGERING_SPLIT_PARENT and
// HOLE_IN_REGION_CHAIN, and a regular repair run must neither fix this nor ask for a rerun.
// A dedicated HBaseFsck with setFixSplitParents(true) is then expected to request a rerun
// and to leave the meta row without SPLITA/SPLITB cells. Finally hbck must be clean with
// all rows present.
// NOTE(review): the declaration of `meta`, the table setup, the flush, and the calls that
// actually run the fixing HBaseFsck instances are not visible in this excerpt.
1650 TableName
.valueOf("testLingeringSplitParent");
1654 assertEquals(ROWKEYS
.length
, countRows());
1656 // make sure data in regions, if in wal only there is no data loss
// NOTE(review): this uses the String overload of getRegionLocation, whereas the sibling
// split tests pass Bytes.toBytes("B").
1658 HRegionLocation location
= tbl
.getRegionLocation("B");
1660 // Delete one region from meta, but not hdfs, unassign it.
1661 deleteRegion(conf
, tbl
.getTableDescriptor(), Bytes
.toBytes("B"),
1662 Bytes
.toBytes("C"), true, true, false);
1664 // Create a new meta entry to fake it as a split parent.
1665 meta
= connection
.getTable(TableName
.META_TABLE_NAME
, tableExecutorService
);
1666 HRegionInfo hri
= location
.getRegionInfo();
1668 HRegionInfo a
= new HRegionInfo(tbl
.getName(),
1669 Bytes
.toBytes("B"), Bytes
.toBytes("BM"));
1670 HRegionInfo b
= new HRegionInfo(tbl
.getName(),
1671 Bytes
.toBytes("BM"), Bytes
.toBytes("C"));
1673 hri
.setOffline(true);
1676 MetaTableAccessor
.addRegionToMeta(meta
, hri
, a
, b
);
1678 admin
.flush(TableName
.META_TABLE_NAME
);
1680 HBaseFsck hbck
= doFsck(conf
, false);
1681 assertErrors(hbck
, new ERROR_CODE
[] {
1682 ERROR_CODE
.LINGERING_SPLIT_PARENT
, ERROR_CODE
.HOLE_IN_REGION_CHAIN
});
1684 // regular repair cannot fix lingering split parent
1685 hbck
= doFsck(conf
, true);
1686 assertErrors(hbck
, new ERROR_CODE
[] {
1687 ERROR_CODE
.LINGERING_SPLIT_PARENT
, ERROR_CODE
.HOLE_IN_REGION_CHAIN
});
1688 assertFalse(hbck
.shouldRerun());
// A read-only run after the regular repair must still show both errors.
1689 hbck
= doFsck(conf
, false);
1690 assertErrors(hbck
, new ERROR_CODE
[] {
1691 ERROR_CODE
.LINGERING_SPLIT_PARENT
, ERROR_CODE
.HOLE_IN_REGION_CHAIN
});
1693 // fix lingering split parent
1694 hbck
= new HBaseFsck(conf
, hbfsckExecutorService
);
1696 HBaseFsck
.setDisplayFullReport(); // i.e. -details
1698 hbck
.setFixSplitParents(true);
1700 assertTrue(hbck
.shouldRerun());
// The parent's meta row must no longer carry daughter references.
1703 Get get
= new Get(hri
.getRegionName());
1704 Result result
= meta
.get(get
);
1705 assertTrue(result
.getColumnCells(HConstants
.CATALOG_FAMILY
,
1706 HConstants
.SPLITA_QUALIFIER
).isEmpty());
1707 assertTrue(result
.getColumnCells(HConstants
.CATALOG_FAMILY
,
1708 HConstants
.SPLITB_QUALIFIER
).isEmpty());
1709 admin
.flush(TableName
.META_TABLE_NAME
);
1714 // check that all are fixed
1715 assertNoErrors(doFsck(conf
, false));
1716 assertEquals(ROWKEYS
.length
, countRows());
1718 cleanupTable(table
);
1719 IOUtils
.closeQuietly(meta
);
1724 * Tests that LINGERING_SPLIT_PARENT is not erroneously reported for
1725 * valid cases where the daughters are there.
1727 @Test (timeout
=180000)
1728 public void testValidLingeringSplitParent() throws Exception
{
// Scenario: the region containing "B" is split normally at "BM" through the admin API and
// the test blocks until the split is visible. An hbck run with a subset of fix options must
// report no errors (in particular no LINGERING_SPLIT_PARENT). The parent's meta row must
// still exist and parse to an HRegionInfo, no rows may be lost, and the table must have
// one more region than before the split.
// NOTE(review): this test reuses the table name "testLingeringSplitParent" from the
// previous test.
// NOTE(review): the declaration of `meta`, the table setup and the flush are not visible
// in this excerpt.
1730 TableName
.valueOf("testLingeringSplitParent");
1734 assertEquals(ROWKEYS
.length
, countRows());
1736 // make sure data in regions, if in wal only there is no data loss
1738 HRegionLocation location
= tbl
.getRegionLocation(Bytes
.toBytes("B"));
1740 meta
= connection
.getTable(TableName
.META_TABLE_NAME
, tableExecutorService
);
1741 HRegionInfo hri
= location
.getRegionInfo();
1743 // do a regular split
1744 byte[] regionName
= location
.getRegionInfo().getRegionName();
1745 admin
.splitRegion(location
.getRegionInfo().getRegionName(), Bytes
.toBytes("BM"));
1746 TestEndToEndSplitTransaction
.blockUntilRegionSplit(conf
, 60000, regionName
, true);
1748 // TODO: fixHdfsHoles does not work against splits, since the parent dir lingers on
1749 // for some time until children references are deleted. HBCK erroneously sees this as
1750 // overlapping regions
1751 HBaseFsck hbck
= doFsck(conf
, true, true, false, false, false, true, true, true, false, false, null);
1752 assertErrors(hbck
, new ERROR_CODE
[] {}); //no LINGERING_SPLIT_PARENT reported
1754 // assert that the split hbase:meta entry is still there.
1755 Get get
= new Get(hri
.getRegionName());
1756 Result result
= meta
.get(get
);
1757 assertNotNull(result
);
1758 assertNotNull(MetaTableAccessor
.getHRegionInfo(result
));
1760 assertEquals(ROWKEYS
.length
, countRows());
1762 // assert that we still have the split regions
1763 assertEquals(tbl
.getStartKeys().length
, SPLITS
.length
+ 1 + 1); //SPLITS + 1 is # regions pre-split.
1764 assertNoErrors(doFsck(conf
, false));
1766 cleanupTable(table
);
1767 IOUtils
.closeQuietly(meta
);
1772 * Split crashed after write to hbase:meta finished for the parent region, but
1773 * failed to write daughters (pre HBASE-7721 codebase)
1775 @Test(timeout
=75000)
1776 public void testSplitDaughtersNotInMeta() throws Exception
{
// Scenario: the region containing "B" is split normally at "BM". Both daughters are then
// undeployed, deleted from hbase:meta and removed from the master's RegionStates, while
// their hdfs data is kept. hbck is expected to report NOT_IN_META_OR_DEPLOYED twice plus
// HOLE_IN_REGION_CHAIN, on the read-only run and on the fixing run. Afterwards the parent's
// meta row must still exist, no rows may be lost, the table must still have the post-split
// number of regions, and hbck must be clean.
// NOTE(review): the table setup, the flush, the assertErrors(...) opening lines and the
// closing of `meta` are not visible in this excerpt.
1777 TableName table
= TableName
.valueOf("testSplitdaughtersNotInMeta");
1778 Table meta
= connection
.getTable(TableName
.META_TABLE_NAME
, tableExecutorService
);
1781 assertEquals(ROWKEYS
.length
, countRows());
1783 // make sure data in regions, if in wal only there is no data loss
1785 HRegionLocation location
= tbl
.getRegionLocation(Bytes
.toBytes("B"));
1787 HRegionInfo hri
= location
.getRegionInfo();
1789 // do a regular split
1790 byte[] regionName
= location
.getRegionInfo().getRegionName();
1791 admin
.splitRegion(location
.getRegionInfo().getRegionName(), Bytes
.toBytes("BM"));
1792 TestEndToEndSplitTransaction
.blockUntilRegionSplit(conf
, 60000, regionName
, true);
// Read the daughter regions recorded on the parent's meta row.
1794 PairOfSameType
<HRegionInfo
> daughters
=
1795 MetaTableAccessor
.getDaughterRegions(meta
.get(new Get(regionName
)));
1797 // Delete daughter regions from meta, but not hdfs, unassign it.
1798 Map
<HRegionInfo
, ServerName
> hris
= tbl
.getRegionLocations();
1799 undeployRegion(connection
, hris
.get(daughters
.getFirst()), daughters
.getFirst());
1800 undeployRegion(connection
, hris
.get(daughters
.getSecond()), daughters
.getSecond());
1802 List
<Delete
> deletes
= new ArrayList
<>();
1803 deletes
.add(new Delete(daughters
.getFirst().getRegionName()));
1804 deletes
.add(new Delete(daughters
.getSecond().getRegionName()));
1805 meta
.delete(deletes
);
1807 // Remove daughters from regionStates
// NOTE(review): this local is named like the `regionStates` variable that other tests in
// this excerpt use without declaring it locally.
1808 RegionStates regionStates
= TEST_UTIL
.getMiniHBaseCluster().getMaster().
1809 getAssignmentManager().getRegionStates();
1810 regionStates
.deleteRegion(daughters
.getFirst());
1811 regionStates
.deleteRegion(daughters
.getSecond());
1813 HBaseFsck hbck
= doFsck(conf
, false);
1815 new ERROR_CODE
[] { ERROR_CODE
.NOT_IN_META_OR_DEPLOYED
, ERROR_CODE
.NOT_IN_META_OR_DEPLOYED
,
1816 ERROR_CODE
.HOLE_IN_REGION_CHAIN
}); //no LINGERING_SPLIT_PARENT
1818 // now fix it. The fix should not revert the region split, but add daughters to META
1819 hbck
= doFsck(conf
, true, true, false, false, false, false, false, false, false, false, null);
1821 new ERROR_CODE
[] { ERROR_CODE
.NOT_IN_META_OR_DEPLOYED
, ERROR_CODE
.NOT_IN_META_OR_DEPLOYED
,
1822 ERROR_CODE
.HOLE_IN_REGION_CHAIN
});
1824 // assert that the split hbase:meta entry is still there.
1825 Get get
= new Get(hri
.getRegionName());
1826 Result result
= meta
.get(get
);
1827 assertNotNull(result
);
1828 assertNotNull(MetaTableAccessor
.getHRegionInfo(result
));
1830 assertEquals(ROWKEYS
.length
, countRows());
1832 // assert that we still have the split regions
1833 assertEquals(tbl
.getStartKeys().length
, SPLITS
.length
+ 1 + 1); //SPLITS + 1 is # regions pre-split.
1834 assertNoErrors(doFsck(conf
, false)); //should be fixed by now
1837 cleanupTable(table
);
1842 * This creates and fixes a bad table with a missing region which is the 1st region -- hole in
1843 * meta and data missing in the fs.
1845 @Test(timeout
=120000)
1846 public void testMissingFirstRegion() throws Exception
{
1847 TableName table
= TableName
.valueOf("testMissingFirstRegion");
1850 assertEquals(ROWKEYS
.length
, countRows());
1852 // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1853 admin
.disableTable(table
);
1854 deleteRegion(conf
, tbl
.getTableDescriptor(), Bytes
.toBytes(""), Bytes
.toBytes("A"), true,
1856 admin
.enableTable(table
);
1858 HBaseFsck hbck
= doFsck(conf
, false);
1859 assertErrors(hbck
, new ERROR_CODE
[] { ERROR_CODE
.FIRST_REGION_STARTKEY_NOT_EMPTY
});
1862 // check that hole fixed
1863 assertNoErrors(doFsck(conf
, false));
1865 cleanupTable(table
);
1870 * This creates and fixes a bad table where a region is still deployed but its region dir has
1871 * been deleted from the fs.
1873 @Test(timeout
=120000)
1874 public void testRegionDeployedNotInHdfs() throws Exception
{
1876 TableName
.valueOf("testSingleRegionDeployedNotInHdfs");
1881 // Mess it up by deleting region dir
1882 deleteRegion(conf
, tbl
.getTableDescriptor(),
1883 HConstants
.EMPTY_START_ROW
, Bytes
.toBytes("A"), false,
1886 HBaseFsck hbck
= doFsck(conf
, false);
1887 assertErrors(hbck
, new ERROR_CODE
[] { ERROR_CODE
.NOT_IN_HDFS
});
1890 // check that hole fixed
1891 assertNoErrors(doFsck(conf
, false));
1893 cleanupTable(table
);
1898 * This creates and fixes a bad table with missing last region -- hole in meta and data missing in
1901 @Test(timeout
=120000)
1902 public void testMissingLastRegion() throws Exception
{
1904 TableName
.valueOf("testMissingLastRegion");
1907 assertEquals(ROWKEYS
.length
, countRows());
1909 // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1910 admin
.disableTable(table
);
1911 deleteRegion(conf
, tbl
.getTableDescriptor(), Bytes
.toBytes("C"), Bytes
.toBytes(""), true,
1913 admin
.enableTable(table
);
1915 HBaseFsck hbck
= doFsck(conf
, false);
1916 assertErrors(hbck
, new ERROR_CODE
[] { ERROR_CODE
.LAST_REGION_ENDKEY_NOT_EMPTY
});
1919 // check that hole fixed
1920 assertNoErrors(doFsck(conf
, false));
1922 cleanupTable(table
);
1927 * Test -noHdfsChecking option can detect and fix assignments issue.
1929 @Test (timeout
=180000)
1930 public void testFixAssignmentsAndNoHdfsChecking() throws Exception
{
1932 TableName
.valueOf("testFixAssignmentsAndNoHdfsChecking");
1935 assertEquals(ROWKEYS
.length
, countRows());
1937 // Mess it up by closing a region
1938 deleteRegion(conf
, tbl
.getTableDescriptor(), Bytes
.toBytes("A"),
1939 Bytes
.toBytes("B"), true, false, false, false, HRegionInfo
.DEFAULT_REPLICA_ID
);
1941 // verify there is no other errors
1942 HBaseFsck hbck
= doFsck(conf
, false);
1943 assertErrors(hbck
, new ERROR_CODE
[] {
1944 ERROR_CODE
.NOT_DEPLOYED
, ERROR_CODE
.HOLE_IN_REGION_CHAIN
});
1946 // verify that noHdfsChecking report the same errors
1947 HBaseFsck fsck
= new HBaseFsck(conf
, hbfsckExecutorService
);
1949 HBaseFsck
.setDisplayFullReport(); // i.e. -details
1951 fsck
.setCheckHdfs(false);
1953 assertErrors(fsck
, new ERROR_CODE
[] {
1954 ERROR_CODE
.NOT_DEPLOYED
, ERROR_CODE
.HOLE_IN_REGION_CHAIN
});
1957 // verify that fixAssignments works fine with noHdfsChecking
1958 fsck
= new HBaseFsck(conf
, hbfsckExecutorService
);
1960 HBaseFsck
.setDisplayFullReport(); // i.e. -details
1962 fsck
.setCheckHdfs(false);
1963 fsck
.setFixAssignments(true);
1965 assertTrue(fsck
.shouldRerun());
1967 assertNoErrors(fsck
);
1969 assertEquals(ROWKEYS
.length
, countRows());
1973 cleanupTable(table
);
1978 * Test -noHdfsChecking option can detect region is not in meta but deployed.
1979 * However, it can not fix it without checking Hdfs because we need to get
1980 * the region info from Hdfs in this case, then to patch the meta.
1982 @Test (timeout
=180000)
1983 public void testFixMetaNotWorkingWithNoHdfsChecking() throws Exception
{
1985 TableName
.valueOf("testFixMetaNotWorkingWithNoHdfsChecking");
1988 assertEquals(ROWKEYS
.length
, countRows());
1990 // Mess it up by deleting a region from the metadata
1991 deleteRegion(conf
, tbl
.getTableDescriptor(), Bytes
.toBytes("A"),
1992 Bytes
.toBytes("B"), false, true, false, false, HRegionInfo
.DEFAULT_REPLICA_ID
);
1994 // verify there is no other errors
1995 HBaseFsck hbck
= doFsck(conf
, false);
1997 new ERROR_CODE
[] { ERROR_CODE
.NOT_IN_META
, ERROR_CODE
.HOLE_IN_REGION_CHAIN
});
1999 // verify that noHdfsChecking report the same errors
2000 HBaseFsck fsck
= new HBaseFsck(conf
, hbfsckExecutorService
);
2002 HBaseFsck
.setDisplayFullReport(); // i.e. -details
2004 fsck
.setCheckHdfs(false);
2007 new ERROR_CODE
[] { ERROR_CODE
.NOT_IN_META
, ERROR_CODE
.HOLE_IN_REGION_CHAIN
});
2010 // verify that fixMeta doesn't work with noHdfsChecking
2011 fsck
= new HBaseFsck(conf
, hbfsckExecutorService
);
2013 HBaseFsck
.setDisplayFullReport(); // i.e. -details
2015 fsck
.setCheckHdfs(false);
2016 fsck
.setFixAssignments(true);
2017 fsck
.setFixMeta(true);
2019 assertFalse(fsck
.shouldRerun());
2021 new ERROR_CODE
[] { ERROR_CODE
.NOT_IN_META
, ERROR_CODE
.HOLE_IN_REGION_CHAIN
});
2024 // fix the cluster so other tests won't be impacted
2025 fsck
= doFsck(conf
, true);
2026 assertTrue(fsck
.shouldRerun());
2027 fsck
= doFsck(conf
, true);
2028 assertNoErrors(fsck
);
2030 cleanupTable(table
);
2035 * Test -fixHdfsHoles doesn't work with -noHdfsChecking option,
2036 * and -noHdfsChecking can't detect orphan Hdfs region.
2038 @Test (timeout
=180000)
2039 public void testFixHdfsHolesNotWorkingWithNoHdfsChecking() throws Exception
{
2041 TableName
.valueOf("testFixHdfsHolesNotWorkingWithNoHdfsChecking");
2044 assertEquals(ROWKEYS
.length
, countRows());
2046 // Mess it up by creating an overlap in the metadata
2047 admin
.disableTable(table
);
2048 deleteRegion(conf
, tbl
.getTableDescriptor(), Bytes
.toBytes("A"),
2049 Bytes
.toBytes("B"), true, true, false, true, HRegionInfo
.DEFAULT_REPLICA_ID
);
2050 admin
.enableTable(table
);
2052 HRegionInfo hriOverlap
=
2053 createRegion(tbl
.getTableDescriptor(), Bytes
.toBytes("A2"), Bytes
.toBytes("B"));
2054 TEST_UTIL
.getHBaseCluster().getMaster().assignRegion(hriOverlap
);
2055 TEST_UTIL
.getHBaseCluster().getMaster().getAssignmentManager()
2056 .waitForAssignment(hriOverlap
);
2057 ServerName server
= regionStates
.getRegionServerOfRegion(hriOverlap
);
2058 TEST_UTIL
.assertRegionOnServer(hriOverlap
, server
, REGION_ONLINE_TIMEOUT
);
2060 HBaseFsck hbck
= doFsck(conf
, false);
2061 assertErrors(hbck
, new ERROR_CODE
[] {
2062 ERROR_CODE
.ORPHAN_HDFS_REGION
, ERROR_CODE
.NOT_IN_META_OR_DEPLOYED
,
2063 ERROR_CODE
.HOLE_IN_REGION_CHAIN
});
2065 // verify that noHdfsChecking can't detect ORPHAN_HDFS_REGION
2066 HBaseFsck fsck
= new HBaseFsck(conf
, hbfsckExecutorService
);
2068 HBaseFsck
.setDisplayFullReport(); // i.e. -details
2070 fsck
.setCheckHdfs(false);
2072 assertErrors(fsck
, new ERROR_CODE
[] {
2073 ERROR_CODE
.HOLE_IN_REGION_CHAIN
});
2076 // verify that fixHdfsHoles doesn't work with noHdfsChecking
2077 fsck
= new HBaseFsck(conf
, hbfsckExecutorService
);
2079 HBaseFsck
.setDisplayFullReport(); // i.e. -details
2081 fsck
.setCheckHdfs(false);
2082 fsck
.setFixHdfsHoles(true);
2083 fsck
.setFixHdfsOverlaps(true);
2084 fsck
.setFixHdfsOrphans(true);
2086 assertFalse(fsck
.shouldRerun());
2087 assertErrors(fsck
, new ERROR_CODE
[] { ERROR_CODE
.HOLE_IN_REGION_CHAIN
});
2090 if (admin
.isTableDisabled(table
)) {
2091 admin
.enableTable(table
);
2093 cleanupTable(table
);
2098 * We don't have an easy way to verify that a flush completed, so we loop until we find a
2099 * legitimate hfile and return it.
2102 * @return Path of a flushed hfile.
2103 * @throws IOException
2105 Path
getFlushedHFile(FileSystem fs
, TableName table
) throws IOException
{
2106 Path tableDir
= FSUtils
.getTableDir(FSUtils
.getRootDir(conf
), table
);
2107 Path regionDir
= FSUtils
.getRegionDirs(fs
, tableDir
).get(0);
2108 Path famDir
= new Path(regionDir
, FAM_STR
);
2110 // keep doing this until we get a legit hfile
2112 FileStatus
[] hfFss
= fs
.listStatus(famDir
);
2113 if (hfFss
.length
== 0) {
2116 for (FileStatus hfs
: hfFss
) {
2117 if (!hfs
.isDirectory()) {
2118 return hfs
.getPath();
2125 * This creates a table and then corrupts an hfile. Hbck should quarantine the file.
2127 @Test(timeout
=180000)
2128 public void testQuarantineCorruptHFile() throws Exception
{
2129 TableName table
= TableName
.valueOf(name
.getMethodName());
2132 assertEquals(ROWKEYS
.length
, countRows());
2133 admin
.flush(table
); // flush is async.
2135 FileSystem fs
= FileSystem
.get(conf
);
2136 Path hfile
= getFlushedHFile(fs
, table
);
2138 // Mess it up by leaving a hole in the assignment, meta, and hdfs data
2139 admin
.disableTable(table
);
2141 // create new corrupt file called deadbeef (valid hfile name)
2142 Path corrupt
= new Path(hfile
.getParent(), "deadbeef");
2143 TestHFile
.truncateFile(fs
, hfile
, corrupt
);
2144 LOG
.info("Created corrupted file " + corrupt
);
2145 HBaseFsck
.debugLsr(conf
, FSUtils
.getRootDir(conf
));
2147 // we cannot enable here because enable never finished due to the corrupt region.
2148 HBaseFsck res
= HbckTestingUtil
.doHFileQuarantine(conf
, table
);
2149 assertEquals(res
.getRetCode(), 0);
2150 HFileCorruptionChecker hfcc
= res
.getHFilecorruptionChecker();
2151 assertEquals(hfcc
.getHFilesChecked(), 5);
2152 assertEquals(hfcc
.getCorrupted().size(), 1);
2153 assertEquals(hfcc
.getFailures().size(), 0);
2154 assertEquals(hfcc
.getQuarantined().size(), 1);
2155 assertEquals(hfcc
.getMissing().size(), 0);
2157 // Its been fixed, verify that we can enable.
2158 admin
.enableTable(table
);
2160 cleanupTable(table
);
2165 * Test that use this should have a timeout, because this method could potentially wait forever.
2167 private void doQuarantineTest(TableName table
, HBaseFsck hbck
, int check
,
2168 int corrupt
, int fail
, int quar
, int missing
) throws Exception
{
2171 assertEquals(ROWKEYS
.length
, countRows());
2172 admin
.flush(table
); // flush is async.
2174 // Mess it up by leaving a hole in the assignment, meta, and hdfs data
2175 admin
.disableTable(table
);
2177 String
[] args
= {"-sidelineCorruptHFiles", "-repairHoles", "-ignorePreCheckPermission",
2178 table
.getNameAsString()};
2179 HBaseFsck res
= hbck
.exec(hbfsckExecutorService
, args
);
2181 HFileCorruptionChecker hfcc
= res
.getHFilecorruptionChecker();
2182 assertEquals(hfcc
.getHFilesChecked(), check
);
2183 assertEquals(hfcc
.getCorrupted().size(), corrupt
);
2184 assertEquals(hfcc
.getFailures().size(), fail
);
2185 assertEquals(hfcc
.getQuarantined().size(), quar
);
2186 assertEquals(hfcc
.getMissing().size(), missing
);
2188 // its been fixed, verify that we can enable
2189 admin
.enableTableAsync(table
);
2190 while (!admin
.isTableEnabled(table
)) {
2193 } catch (InterruptedException e
) {
2194 e
.printStackTrace();
2195 fail("Interrupted when trying to enable table " + table
);
2199 cleanupTable(table
);
2204 * This creates a table and simulates the race situation where a concurrent compaction or split
2205 * has removed an hfile after the corruption checker learned about it.
2207 @Test(timeout
=180000)
2208 public void testQuarantineMissingHFile() throws Exception
{
2209 TableName table
= TableName
.valueOf(name
.getMethodName());
2211 // inject a fault in the hfcc created.
2212 final FileSystem fs
= FileSystem
.get(conf
);
2213 HBaseFsck hbck
= new HBaseFsck(conf
, hbfsckExecutorService
) {
2215 public HFileCorruptionChecker
createHFileCorruptionChecker(boolean sidelineCorruptHFiles
) throws IOException
{
2216 return new HFileCorruptionChecker(conf
, executor
, sidelineCorruptHFiles
) {
2217 AtomicBoolean attemptedFirstHFile
= new AtomicBoolean(false);
2219 protected void checkHFile(Path p
) throws IOException
{
2220 if (attemptedFirstHFile
.compareAndSet(false, true)) {
2221 assertTrue(fs
.delete(p
, true)); // make sure delete happened.
2223 super.checkHFile(p
);
2228 doQuarantineTest(table
, hbck
, 4, 0, 0, 0, 1); // 4 attempted, but 1 missing.
2233 * This creates a table and simulates the race situation where a concurrent compaction or split
2234 * has removed a colfam dir before the corruption checker got to it.
2236 // Disabled because fails sporadically. Is this test right? Timing-wise, there could be no
2237 // files in a column family on initial creation -- as suggested by Matteo.
2238 @Ignore @Test(timeout
=180000)
2239 public void testQuarantineMissingFamdir() throws Exception
{
2240 TableName table
= TableName
.valueOf(name
.getMethodName());
2241 // inject a fault in the hfcc created.
2242 final FileSystem fs
= FileSystem
.get(conf
);
2243 HBaseFsck hbck
= new HBaseFsck(conf
, hbfsckExecutorService
) {
2245 public HFileCorruptionChecker
createHFileCorruptionChecker(boolean sidelineCorruptHFiles
) throws IOException
{
2246 return new HFileCorruptionChecker(conf
, executor
, sidelineCorruptHFiles
) {
2247 AtomicBoolean attemptedFirstHFile
= new AtomicBoolean(false);
2249 protected void checkColFamDir(Path p
) throws IOException
{
2250 if (attemptedFirstHFile
.compareAndSet(false, true)) {
2251 assertTrue(fs
.delete(p
, true)); // make sure delete happened.
2253 super.checkColFamDir(p
);
2258 doQuarantineTest(table
, hbck
, 3, 0, 0, 0, 1);
2263 * This creates a table and simulates the race situation where a concurrent compaction or split
2264 * has removed a region dir before the corruption checker got to it.
2266 @Test(timeout
=180000)
2267 public void testQuarantineMissingRegionDir() throws Exception
{
2268 TableName table
= TableName
.valueOf(name
.getMethodName());
2269 // inject a fault in the hfcc created.
2270 final FileSystem fs
= FileSystem
.get(conf
);
2271 HBaseFsck hbck
= new HBaseFsck(conf
, hbfsckExecutorService
) {
2273 public HFileCorruptionChecker
createHFileCorruptionChecker(boolean sidelineCorruptHFiles
)
2274 throws IOException
{
2275 return new HFileCorruptionChecker(conf
, executor
, sidelineCorruptHFiles
) {
2276 AtomicBoolean attemptedFirstHFile
= new AtomicBoolean(false);
2278 protected void checkRegionDir(Path p
) throws IOException
{
2279 if (attemptedFirstHFile
.compareAndSet(false, true)) {
2280 assertTrue(fs
.delete(p
, true)); // make sure delete happened.
2282 super.checkRegionDir(p
);
2287 doQuarantineTest(table
, hbck
, 3, 0, 0, 0, 1);
2292 * Test fixing lingering reference file.
2294 @Test (timeout
=180000)
2295 public void testLingeringReferenceFile() throws Exception
{
2297 TableName
.valueOf("testLingeringReferenceFile");
2300 assertEquals(ROWKEYS
.length
, countRows());
2302 // Mess it up by creating a fake reference file
2303 FileSystem fs
= FileSystem
.get(conf
);
2304 Path tableDir
= FSUtils
.getTableDir(FSUtils
.getRootDir(conf
), table
);
2305 Path regionDir
= FSUtils
.getRegionDirs(fs
, tableDir
).get(0);
2306 Path famDir
= new Path(regionDir
, FAM_STR
);
2307 Path fakeReferenceFile
= new Path(famDir
, "fbce357483ceea.12144538");
2308 fs
.create(fakeReferenceFile
);
2310 HBaseFsck hbck
= doFsck(conf
, false);
2311 assertErrors(hbck
, new ERROR_CODE
[] { ERROR_CODE
.LINGERING_REFERENCE_HFILE
});
2312 // fix reference file
2314 // check that reference file fixed
2315 assertNoErrors(doFsck(conf
, false));
2317 cleanupTable(table
);
2322 * Test missing REGIONINFO_QUALIFIER in hbase:meta
2324 @Test (timeout
=180000)
2325 public void testMissingRegionInfoQualifier() throws Exception
{
2326 Connection connection
= ConnectionFactory
.createConnection(conf
);
2327 TableName table
= TableName
.valueOf("testMissingRegionInfoQualifier");
2331 // Mess it up by removing the RegionInfo for one region.
2332 final List
<Delete
> deletes
= new LinkedList
<Delete
>();
2333 Table meta
= connection
.getTable(TableName
.META_TABLE_NAME
, hbfsckExecutorService
);
2334 MetaTableAccessor
.fullScanRegions(connection
, new MetaTableAccessor
.Visitor() {
2337 public boolean visit(Result rowResult
) throws IOException
{
2338 HRegionInfo hri
= MetaTableAccessor
.getHRegionInfo(rowResult
);
2339 if (hri
!= null && !hri
.getTable().isSystemTable()) {
2340 Delete delete
= new Delete(rowResult
.getRow());
2341 delete
.addColumn(HConstants
.CATALOG_FAMILY
, HConstants
.REGIONINFO_QUALIFIER
);
2342 deletes
.add(delete
);
2347 meta
.delete(deletes
);
2349 // Mess it up by creating a fake hbase:meta entry with no associated RegionInfo
2350 meta
.put(new Put(Bytes
.toBytes(table
+ ",,1361911384013.810e28f59a57da91c66")).add(
2351 HConstants
.CATALOG_FAMILY
, HConstants
.SERVER_QUALIFIER
, Bytes
.toBytes("node1:60020")));
2352 meta
.put(new Put(Bytes
.toBytes(table
+ ",,1361911384013.810e28f59a57da91c66")).add(
2353 HConstants
.CATALOG_FAMILY
, HConstants
.STARTCODE_QUALIFIER
, Bytes
.toBytes(1362150791183L)));
2356 HBaseFsck hbck
= doFsck(conf
, false);
2357 assertTrue(hbck
.getErrors().getErrorList().contains(ERROR_CODE
.EMPTY_META_CELL
));
2359 // fix reference file
2360 hbck
= doFsck(conf
, true);
2362 // check that reference file fixed
2363 assertFalse(hbck
.getErrors().getErrorList().contains(ERROR_CODE
.EMPTY_META_CELL
));
2365 cleanupTable(table
);
2371 * Test pluggable error reporter. It can be plugged in
2372 * from system property or configuration.
2374 @Test (timeout
=180000)
2375 public void testErrorReporter() throws Exception
{
2377 MockErrorReporter
.calledCount
= 0;
2378 doFsck(conf
, false);
2379 assertEquals(MockErrorReporter
.calledCount
, 0);
2381 conf
.set("hbasefsck.errorreporter", MockErrorReporter
.class.getName());
2382 doFsck(conf
, false);
2383 assertTrue(MockErrorReporter
.calledCount
> 20);
2385 conf
.set("hbasefsck.errorreporter",
2386 PrintingErrorReporter
.class.getName());
2387 MockErrorReporter
.calledCount
= 0;
2391 static class MockErrorReporter
implements ErrorReporter
{
2392 static int calledCount
= 0;
2395 public void clear() {
2400 public void report(String message
) {
2405 public void reportError(String message
) {
2410 public void reportError(ERROR_CODE errorCode
, String message
) {
2415 public void reportError(ERROR_CODE errorCode
, String message
, TableInfo table
) {
2420 public void reportError(ERROR_CODE errorCode
,
2421 String message
, TableInfo table
, HbckInfo info
) {
2426 public void reportError(ERROR_CODE errorCode
, String message
,
2427 TableInfo table
, HbckInfo info1
, HbckInfo info2
) {
2432 public int summarize() {
2433 return ++calledCount
;
2437 public void detail(String details
) {
2442 public ArrayList
<ERROR_CODE
> getErrorList() {
2444 return new ArrayList
<ERROR_CODE
>();
2448 public void progress() {
2453 public void print(String message
) {
2458 public void resetErrors() {
2463 public boolean tableHasErrors(TableInfo table
) {
2469 @Test(timeout
=60000)
2470 public void testCheckTableLocks() throws Exception
{
2471 IncrementingEnvironmentEdge edge
= new IncrementingEnvironmentEdge(0);
2472 EnvironmentEdgeManager
.injectEdge(edge
);
2474 HBaseFsck hbck
= doFsck(conf
, false);
2475 assertNoErrors(hbck
);
2477 ServerName mockName
= ServerName
.valueOf("localhost", 60000, 1);
2478 final TableName tableName
= TableName
.valueOf("foo");
2481 final TableLockManager tableLockManager
=
2482 TableLockManager
.createTableLockManager(conf
, TEST_UTIL
.getZooKeeperWatcher(), mockName
);
2483 TableLock writeLock
= tableLockManager
.writeLock(tableName
, "testCheckTableLocks");
2484 writeLock
.acquire();
2485 hbck
= doFsck(conf
, false);
2486 assertNoErrors(hbck
); // should not have expired, no problems
2488 edge
.incrementTime(conf
.getLong(TableLockManager
.TABLE_LOCK_EXPIRE_TIMEOUT
,
2489 TableLockManager
.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS
)); // let table lock expire
2491 hbck
= doFsck(conf
, false);
2492 assertErrors(hbck
, new ERROR_CODE
[] {ERROR_CODE
.EXPIRED_TABLE_LOCK
});
2494 final CountDownLatch latch
= new CountDownLatch(1);
2498 TableLock readLock
= tableLockManager
.writeLock(tableName
, "testCheckTableLocks");
2502 } catch (IOException ex
) {
2504 } catch (IllegalStateException ex
) {
2505 return; // expected, since this will be reaped under us.
2507 fail("should not have come here");
2511 latch
.await(); // wait until thread starts
2512 Threads
.sleep(300); // wait some more to ensure writeLock.acquire() is called
2514 hbck
= doFsck(conf
, false);
2515 assertErrors(hbck
, new ERROR_CODE
[] {ERROR_CODE
.EXPIRED_TABLE_LOCK
}); // still one expired, one not-expired
2517 edge
.incrementTime(conf
.getLong(TableLockManager
.TABLE_LOCK_EXPIRE_TIMEOUT
,
2518 TableLockManager
.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS
)); // let table lock expire
2520 hbck
= doFsck(conf
, false);
2521 assertErrors(hbck
, new ERROR_CODE
[] {ERROR_CODE
.EXPIRED_TABLE_LOCK
, ERROR_CODE
.EXPIRED_TABLE_LOCK
}); // both are expired
2523 conf
.setLong(TableLockManager
.TABLE_LOCK_EXPIRE_TIMEOUT
, 1); // reaping from ZKInterProcessWriteLock uses znode cTime,
2524 // which is not injectable through EnvironmentEdge
2526 hbck
= doFsck(conf
, true); // now fix both cases
2528 hbck
= doFsck(conf
, false);
2529 assertNoErrors(hbck
);
2531 // ensure that locks are deleted
2532 writeLock
= tableLockManager
.writeLock(tableName
, "should acquire without blocking");
2533 writeLock
.acquire(); // this should not block.
2534 writeLock
.release(); // release for clean state
2535 tableLockManager
.tableDeleted(tableName
);
2538 @Test (timeout
=180000)
2539 public void testMetaOffline() throws Exception
{
2541 HBaseFsck hbck
= doFsck(conf
, false);
2542 assertNoErrors(hbck
);
2543 deleteMetaRegion(conf
, true, false, false);
2544 hbck
= doFsck(conf
, false);
2545 // ERROR_CODE.UNKNOWN is coming because we reportError with a message for the hbase:meta
2546 // inconsistency and whether we will be fixing it or not.
2547 assertErrors(hbck
, new ERROR_CODE
[] { ERROR_CODE
.NO_META_REGION
, ERROR_CODE
.UNKNOWN
});
2548 hbck
= doFsck(conf
, true);
2549 assertErrors(hbck
, new ERROR_CODE
[] { ERROR_CODE
.NO_META_REGION
, ERROR_CODE
.UNKNOWN
});
2550 hbck
= doFsck(conf
, false);
2551 assertNoErrors(hbck
);
2554 private void deleteMetaRegion(Configuration conf
, boolean unassign
, boolean hdfs
,
2555 boolean regionInfoOnly
) throws IOException
, InterruptedException
{
2556 HRegionLocation metaLocation
= connection
.getRegionLocator(TableName
.META_TABLE_NAME
)
2557 .getRegionLocation(HConstants
.EMPTY_START_ROW
);
2558 ServerName hsa
= metaLocation
.getServerName();
2559 HRegionInfo hri
= metaLocation
.getRegionInfo();
2561 LOG
.info("Undeploying meta region " + hri
+ " from server " + hsa
);
2562 try (Connection unmanagedConnection
= ConnectionFactory
.createConnection(conf
)) {
2563 undeployRegion(unmanagedConnection
, hsa
, hri
);
2567 if (regionInfoOnly
) {
2568 LOG
.info("deleting hdfs .regioninfo data: " + hri
.toString() + hsa
.toString());
2569 Path rootDir
= FSUtils
.getRootDir(conf
);
2570 FileSystem fs
= rootDir
.getFileSystem(conf
);
2571 Path p
= new Path(rootDir
+ "/" + TableName
.META_TABLE_NAME
.getNameAsString(),
2572 hri
.getEncodedName());
2573 Path hriPath
= new Path(p
, HRegionFileSystem
.REGION_INFO_FILE
);
2574 fs
.delete(hriPath
, true);
2578 LOG
.info("deleting hdfs data: " + hri
.toString() + hsa
.toString());
2579 Path rootDir
= FSUtils
.getRootDir(conf
);
2580 FileSystem fs
= rootDir
.getFileSystem(conf
);
2581 Path p
= new Path(rootDir
+ "/" + TableName
.META_TABLE_NAME
.getNameAsString(),
2582 hri
.getEncodedName());
2583 HBaseFsck
.debugLsr(conf
, p
);
2584 boolean success
= fs
.delete(p
, true);
2585 LOG
.info("Deleted " + p
+ " sucessfully? " + success
);
2586 HBaseFsck
.debugLsr(conf
, p
);
2590 @Test (timeout
=180000)
2591 public void testTableWithNoRegions() throws Exception
{
2592 // We might end up with empty regions in a table
2593 // see also testNoHdfsTable()
2595 TableName
.valueOf(name
.getMethodName());
2597 // create table with one region
2598 HTableDescriptor desc
= new HTableDescriptor(table
);
2599 HColumnDescriptor hcd
= new HColumnDescriptor(Bytes
.toString(FAM
));
2600 desc
.addFamily(hcd
); // If a table has no CF's it doesn't get checked
2601 createTable(TEST_UTIL
, desc
, null);
2602 tbl
= (HTable
) connection
.getTable(table
, tableExecutorService
);
2604 // Mess it up by leaving a hole in the assignment, meta, and hdfs data
2605 deleteRegion(conf
, tbl
.getTableDescriptor(), HConstants
.EMPTY_START_ROW
,
2606 HConstants
.EMPTY_END_ROW
, false, false, true);
2608 HBaseFsck hbck
= doFsck(conf
, false);
2609 assertErrors(hbck
, new ERROR_CODE
[] { ERROR_CODE
.NOT_IN_HDFS
});
2616 // check that hole fixed
2617 assertNoErrors(doFsck(conf
, false));
2619 cleanupTable(table
);
2624 @Test (timeout
=180000)
2625 public void testHbckAfterRegionMerge() throws Exception
{
2626 TableName table
= TableName
.valueOf("testMergeRegionFilesInHdfs");
2629 // disable CatalogJanitor
2630 TEST_UTIL
.getHBaseCluster().getMaster().setCatalogJanitorEnabled(false);
2632 assertEquals(ROWKEYS
.length
, countRows());
2634 // make sure data in regions, if in wal only there is no data loss
2636 HRegionInfo region1
= tbl
.getRegionLocation(Bytes
.toBytes("A")).getRegionInfo();
2637 HRegionInfo region2
= tbl
.getRegionLocation(Bytes
.toBytes("B")).getRegionInfo();
2639 int regionCountBeforeMerge
= tbl
.getRegionLocations().size();
2641 assertNotEquals(region1
, region2
);
2643 // do a region merge
2644 admin
.mergeRegions(region1
.getEncodedNameAsBytes(),
2645 region2
.getEncodedNameAsBytes(), false);
2647 // wait until region merged
2648 long timeout
= System
.currentTimeMillis() + 30 * 1000;
2650 if (tbl
.getRegionLocations().size() < regionCountBeforeMerge
) {
2652 } else if (System
.currentTimeMillis() > timeout
) {
2653 fail("Time out waiting on region " + region1
.getEncodedName()
2654 + " and " + region2
.getEncodedName() + " be merged");
2659 assertEquals(ROWKEYS
.length
, countRows());
2661 HBaseFsck hbck
= doFsck(conf
, false);
2662 assertNoErrors(hbck
); // no errors
2665 TEST_UTIL
.getHBaseCluster().getMaster().setCatalogJanitorEnabled(true);
2666 cleanupTable(table
);
2667 IOUtils
.closeQuietly(meta
);
2671 @Test (timeout
= 180000)
2672 public void testRegionBoundariesCheck() throws Exception
{
2673 HBaseFsck hbck
= doFsck(conf
, false);
2674 assertNoErrors(hbck
); // no errors
2676 hbck
.connect(); // need connection to have access to META
2677 hbck
.checkRegionBoundaries();
2678 } catch (IllegalArgumentException e
) {
2679 if (e
.getMessage().endsWith("not a valid DFS filename.")) {
2680 fail("Table directory path is not valid." + e
.getMessage());
2688 public TestName name
= new TestName();
2690 @Test (timeout
=180000)
2691 public void testReadOnlyProperty() throws Exception
{
2692 HBaseFsck hbck
= doFsck(conf
, false);
2693 Assert
.assertEquals("shouldIgnorePreCheckPermission", true,
2694 hbck
.shouldIgnorePreCheckPermission());
2696 hbck
= doFsck(conf
, true);
2697 Assert
.assertEquals("shouldIgnorePreCheckPermission", false,
2698 hbck
.shouldIgnorePreCheckPermission());
2700 hbck
= doFsck(conf
, true);
2701 hbck
.setIgnorePreCheckPermission(true);
2702 Assert
.assertEquals("shouldIgnorePreCheckPermission", true,
2703 hbck
.shouldIgnorePreCheckPermission());
2706 public static class MasterSyncObserver
extends BaseMasterObserver
{
2707 volatile CountDownLatch tableCreationLatch
= null;
2708 volatile CountDownLatch tableDeletionLatch
= null;
2711 public void postCreateTableHandler(final ObserverContext
<MasterCoprocessorEnvironment
> ctx
,
2712 HTableDescriptor desc
, HRegionInfo
[] regions
) throws IOException
{
2713 // the AccessController test, some times calls only and directly the postCreateTableHandler()
2714 if (tableCreationLatch
!= null) {
2715 tableCreationLatch
.countDown();
2720 public void postDeleteTableHandler(final ObserverContext
<MasterCoprocessorEnvironment
> ctx
,
2721 TableName tableName
)
2722 throws IOException
{
2723 // the AccessController test, some times calls only and directly the postDeleteTableHandler()
2724 if (tableDeletionLatch
!= null) {
2725 tableDeletionLatch
.countDown();
2730 public static void createTable(HBaseTestingUtility testUtil
, HTableDescriptor htd
,
2731 byte [][] splitKeys
) throws Exception
{
2732 // NOTE: We need a latch because admin is not sync,
2733 // so the postOp coprocessor method may be called after the admin operation returned.
2734 MasterSyncObserver observer
= (MasterSyncObserver
)testUtil
.getHBaseCluster().getMaster()
2735 .getMasterCoprocessorHost().findCoprocessor(MasterSyncObserver
.class.getName());
2736 observer
.tableCreationLatch
= new CountDownLatch(1);
2737 if (splitKeys
!= null) {
2738 admin
.createTable(htd
, splitKeys
);
2740 admin
.createTable(htd
);
2742 observer
.tableCreationLatch
.await();
2743 observer
.tableCreationLatch
= null;
2744 testUtil
.waitUntilAllRegionsAssigned(htd
.getTableName());
2747 public static void deleteTable(HBaseTestingUtility testUtil
, TableName tableName
)
2749 // NOTE: We need a latch because admin is not sync,
2750 // so the postOp coprocessor method may be called after the admin operation returned.
2751 MasterSyncObserver observer
= (MasterSyncObserver
)testUtil
.getHBaseCluster().getMaster()
2752 .getMasterCoprocessorHost().findCoprocessor(MasterSyncObserver
.class.getName());
2753 observer
.tableDeletionLatch
= new CountDownLatch(1);
2755 admin
.disableTable(tableName
);
2756 } catch (Exception e
) {
2757 LOG
.debug("Table: " + tableName
+ " already disabled, so just deleting it.");
2759 admin
.deleteTable(tableName
);
2760 observer
.tableDeletionLatch
.await();
2761 observer
.tableDeletionLatch
= null;