3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
19 package org
.apache
.hadoop
.hbase
.util
;
21 import java
.io
.IOException
;
22 import java
.util
.ArrayList
;
23 import java
.util
.Collection
;
24 import java
.util
.EnumSet
;
25 import java
.util
.HashMap
;
26 import java
.util
.List
;
28 import java
.util
.UUID
;
29 import java
.util
.concurrent
.CountDownLatch
;
30 import java
.util
.concurrent
.ExecutorService
;
31 import java
.util
.concurrent
.ScheduledThreadPoolExecutor
;
33 import org
.apache
.commons
.logging
.Log
;
34 import org
.apache
.commons
.logging
.LogFactory
;
35 import org
.apache
.hadoop
.conf
.Configuration
;
36 import org
.apache
.hadoop
.fs
.FileStatus
;
37 import org
.apache
.hadoop
.fs
.FileSystem
;
38 import org
.apache
.hadoop
.fs
.Path
;
39 import org
.apache
.hadoop
.hbase
.ClusterStatus
;
40 import org
.apache
.hadoop
.hbase
.ClusterStatus
.Option
;
41 import org
.apache
.hadoop
.hbase
.HBaseTestingUtility
;
42 import org
.apache
.hadoop
.hbase
.HColumnDescriptor
;
43 import org
.apache
.hadoop
.hbase
.HConstants
;
44 import org
.apache
.hadoop
.hbase
.HRegionInfo
;
45 import org
.apache
.hadoop
.hbase
.HRegionLocation
;
46 import org
.apache
.hadoop
.hbase
.HTableDescriptor
;
47 import org
.apache
.hadoop
.hbase
.MetaTableAccessor
;
48 import org
.apache
.hadoop
.hbase
.ServerName
;
49 import org
.apache
.hadoop
.hbase
.TableName
;
50 import org
.apache
.hadoop
.hbase
.client
.Admin
;
51 import org
.apache
.hadoop
.hbase
.client
.ClusterConnection
;
52 import org
.apache
.hadoop
.hbase
.client
.Connection
;
53 import org
.apache
.hadoop
.hbase
.client
.ConnectionFactory
;
54 import org
.apache
.hadoop
.hbase
.client
.Delete
;
55 import org
.apache
.hadoop
.hbase
.client
.Put
;
56 import org
.apache
.hadoop
.hbase
.client
.RegionInfo
;
57 import org
.apache
.hadoop
.hbase
.client
.RegionLocator
;
58 import org
.apache
.hadoop
.hbase
.client
.Scan
;
59 import org
.apache
.hadoop
.hbase
.client
.Table
;
60 import org
.apache
.hadoop
.hbase
.client
.TableDescriptor
;
61 import org
.apache
.hadoop
.hbase
.coprocessor
.MasterCoprocessorEnvironment
;
62 import org
.apache
.hadoop
.hbase
.coprocessor
.MasterObserver
;
63 import org
.apache
.hadoop
.hbase
.coprocessor
.ObserverContext
;
64 import org
.apache
.hadoop
.hbase
.master
.assignment
.AssignmentManager
;
65 import org
.apache
.hadoop
.hbase
.master
.assignment
.RegionStates
;
66 import org
.apache
.hadoop
.hbase
.mob
.MobFileName
;
67 import org
.apache
.hadoop
.hbase
.mob
.MobUtils
;
68 import org
.apache
.hadoop
.hbase
.regionserver
.HRegionFileSystem
;
69 import org
.apache
.hadoop
.hbase
.shaded
.protobuf
.ProtobufUtil
;
70 import org
.apache
.hadoop
.hbase
.shaded
.protobuf
.generated
.AdminProtos
;
71 import org
.apache
.hadoop
.hbase
.util
.HBaseFsck
.ErrorReporter
;
72 import org
.apache
.hadoop
.hbase
.util
.HBaseFsck
.HbckInfo
;
73 import org
.apache
.hadoop
.hbase
.util
.HBaseFsck
.TableInfo
;
74 import org
.apache
.hadoop
.hbase
.util
.hbck
.HFileCorruptionChecker
;
75 import org
.apache
.zookeeper
.KeeperException
;
76 import org
.junit
.rules
.TestName
;
78 import static org
.junit
.Assert
.assertEquals
;
79 import static org
.junit
.Assert
.fail
;
/**
 * This is the base class for the tests of HBaseFsck's ability to detect the reasons for
 * inconsistent tables.
 *
 * Actual tests are in:
 * TestHBaseFsckReplicas
 */
90 public class BaseTestHBaseFsck
{
91 static final int POOL_SIZE
= 7;
92 protected static final Log LOG
= LogFactory
.getLog(BaseTestHBaseFsck
.class);
93 protected final static HBaseTestingUtility TEST_UTIL
= new HBaseTestingUtility();
94 protected final static Configuration conf
= TEST_UTIL
.getConfiguration();
95 protected final static String FAM_STR
= "fam";
96 protected final static byte[] FAM
= Bytes
.toBytes(FAM_STR
);
97 protected final static int REGION_ONLINE_TIMEOUT
= 800;
98 protected static AssignmentManager assignmentManager
;
99 protected static RegionStates regionStates
;
100 protected static ExecutorService tableExecutorService
;
101 protected static ScheduledThreadPoolExecutor hbfsckExecutorService
;
102 protected static ClusterConnection connection
;
103 protected static Admin admin
;
105 // for the instance, reset every test run
107 protected final static byte[][] SPLITS
= new byte[][] { Bytes
.toBytes("A"),
108 Bytes
.toBytes("B"), Bytes
.toBytes("C") };
109 // one row per region.
110 protected final static byte[][] ROWKEYS
= new byte[][] {
111 Bytes
.toBytes("00"), Bytes
.toBytes("50"), Bytes
.toBytes("A0"), Bytes
.toBytes("A5"),
112 Bytes
.toBytes("B0"), Bytes
.toBytes("B5"), Bytes
.toBytes("C0"), Bytes
.toBytes("C5") };
116 * Create a new region in META.
118 protected HRegionInfo
createRegion(final HTableDescriptor
119 htd
, byte[] startKey
, byte[] endKey
)
121 Table meta
= connection
.getTable(TableName
.META_TABLE_NAME
, tableExecutorService
);
122 HRegionInfo hri
= new HRegionInfo(htd
.getTableName(), startKey
, endKey
);
123 MetaTableAccessor
.addRegionToMeta(meta
, hri
);
129 * Debugging method to dump the contents of meta.
131 protected void dumpMeta(TableName tableName
) throws IOException
{
132 List
<byte[]> metaRows
= TEST_UTIL
.getMetaTableRows(tableName
);
133 for (byte[] row
: metaRows
) {
134 LOG
.info(Bytes
.toString(row
));
139 * This method is used to undeploy a region -- close it and attempt to
140 * remove its state from the Master.
142 protected void undeployRegion(Connection conn
, ServerName sn
,
143 HRegionInfo hri
) throws IOException
, InterruptedException
{
145 HBaseFsckRepair
.closeRegionSilentlyAndWait(conn
, sn
, hri
);
146 if (!hri
.isMetaTable()) {
147 admin
.offline(hri
.getRegionName());
149 } catch (IOException ioe
) {
150 LOG
.warn("Got exception when attempting to offline region "
151 + Bytes
.toString(hri
.getRegionName()), ioe
);
155 * Delete a region from assignments, meta, or completely from hdfs.
156 * @param unassign if true unassign region if assigned
157 * @param metaRow if true remove region's row from META
158 * @param hdfs if true remove region's dir in HDFS
160 protected void deleteRegion(Configuration conf
, final HTableDescriptor htd
,
161 byte[] startKey
, byte[] endKey
, boolean unassign
, boolean metaRow
,
162 boolean hdfs
) throws IOException
, InterruptedException
{
163 deleteRegion(conf
, htd
, startKey
, endKey
, unassign
, metaRow
, hdfs
, false,
164 HRegionInfo
.DEFAULT_REPLICA_ID
);
168 * Delete a region from assignments, meta, or completely from hdfs.
169 * @param unassign if true unassign region if assigned
170 * @param metaRow if true remove region's row from META
171 * @param hdfs if true remove region's dir in HDFS
172 * @param regionInfoOnly if true remove a region dir's .regioninfo file
173 * @param replicaId replica id
175 protected void deleteRegion(Configuration conf
, final HTableDescriptor htd
,
176 byte[] startKey
, byte[] endKey
, boolean unassign
, boolean metaRow
,
177 boolean hdfs
, boolean regionInfoOnly
, int replicaId
)
178 throws IOException
, InterruptedException
{
179 LOG
.info("** Before delete:");
180 dumpMeta(htd
.getTableName());
182 List
<HRegionLocation
> locations
;
183 try(RegionLocator rl
= connection
.getRegionLocator(tbl
.getName())) {
184 locations
= rl
.getAllRegionLocations();
187 for (HRegionLocation location
: locations
) {
188 HRegionInfo hri
= location
.getRegionInfo();
189 ServerName hsa
= location
.getServerName();
190 if (Bytes
.compareTo(hri
.getStartKey(), startKey
) == 0
191 && Bytes
.compareTo(hri
.getEndKey(), endKey
) == 0
192 && hri
.getReplicaId() == replicaId
) {
194 LOG
.info("RegionName: " +hri
.getRegionNameAsString());
195 byte[] deleteRow
= hri
.getRegionName();
198 LOG
.info("Undeploying region " + hri
+ " from server " + hsa
);
199 undeployRegion(connection
, hsa
, hri
);
202 if (regionInfoOnly
) {
203 LOG
.info("deleting hdfs .regioninfo data: " + hri
.toString() + hsa
.toString());
204 Path rootDir
= FSUtils
.getRootDir(conf
);
205 FileSystem fs
= rootDir
.getFileSystem(conf
);
206 Path p
= new Path(FSUtils
.getTableDir(rootDir
, htd
.getTableName()),
207 hri
.getEncodedName());
208 Path hriPath
= new Path(p
, HRegionFileSystem
.REGION_INFO_FILE
);
209 fs
.delete(hriPath
, true);
213 LOG
.info("deleting hdfs data: " + hri
.toString() + hsa
.toString());
214 Path rootDir
= FSUtils
.getRootDir(conf
);
215 FileSystem fs
= rootDir
.getFileSystem(conf
);
216 Path p
= new Path(FSUtils
.getTableDir(rootDir
, htd
.getTableName()),
217 hri
.getEncodedName());
218 HBaseFsck
.debugLsr(conf
, p
);
219 boolean success
= fs
.delete(p
, true);
220 LOG
.info("Deleted " + p
+ " sucessfully? " + success
);
221 HBaseFsck
.debugLsr(conf
, p
);
225 try (Table meta
= connection
.getTable(TableName
.META_TABLE_NAME
, tableExecutorService
)) {
226 Delete delete
= new Delete(deleteRow
);
231 LOG
.info(hri
.toString() + hsa
.toString());
234 TEST_UTIL
.getMetaTableRows(htd
.getTableName());
235 LOG
.info("*** After delete:");
236 dumpMeta(htd
.getTableName());
240 * Setup a clean table before we start mucking with it.
242 * It will set tbl which needs to be closed after test
244 * @throws IOException
245 * @throws InterruptedException
246 * @throws KeeperException
248 void setupTable(TableName tablename
) throws Exception
{
249 setupTableWithRegionReplica(tablename
, 1);
253 * Setup a clean table with a certain region_replica count
255 * It will set tbl which needs to be closed after test
259 void setupTableWithRegionReplica(TableName tablename
, int replicaCount
) throws Exception
{
260 HTableDescriptor desc
= new HTableDescriptor(tablename
);
261 desc
.setRegionReplication(replicaCount
);
262 HColumnDescriptor hcd
= new HColumnDescriptor(Bytes
.toString(FAM
));
263 desc
.addFamily(hcd
); // If a table has no CF's it doesn't get checked
264 createTable(TEST_UTIL
, desc
, SPLITS
);
266 tbl
= connection
.getTable(tablename
, tableExecutorService
);
267 List
<Put
> puts
= new ArrayList
<>(ROWKEYS
.length
);
268 for (byte[] row
: ROWKEYS
) {
269 Put p
= new Put(row
);
270 p
.addColumn(FAM
, Bytes
.toBytes("val"), row
);
277 * Setup a clean table with a mob-enabled column.
279 * @param tablename The name of a table to be created.
282 void setupMobTable(TableName tablename
) throws Exception
{
283 HTableDescriptor desc
= new HTableDescriptor(tablename
);
284 HColumnDescriptor hcd
= new HColumnDescriptor(Bytes
.toString(FAM
));
285 hcd
.setMobEnabled(true);
286 hcd
.setMobThreshold(0);
287 desc
.addFamily(hcd
); // If a table has no CF's it doesn't get checked
288 createTable(TEST_UTIL
, desc
, SPLITS
);
290 tbl
= connection
.getTable(tablename
, tableExecutorService
);
291 List
<Put
> puts
= new ArrayList
<>(ROWKEYS
.length
);
292 for (byte[] row
: ROWKEYS
) {
293 Put p
= new Put(row
);
294 p
.addColumn(FAM
, Bytes
.toBytes("val"), row
);
301 * Counts the number of rows to verify data loss or non-dataloss.
303 int countRows() throws IOException
{
304 return TEST_UTIL
.countRows(tbl
);
308 * Counts the number of rows to verify data loss or non-dataloss.
310 int countRows(byte[] start
, byte[] end
) throws IOException
{
311 return TEST_UTIL
.countRows(tbl
, new Scan(start
, end
));
315 * delete table in preparation for next test
318 * @throws IOException
320 void cleanupTable(TableName tablename
) throws Exception
{
326 ((ClusterConnection
) connection
).clearRegionCache();
327 deleteTable(TEST_UTIL
, tablename
);
331 * Get region info from local cluster.
333 Map
<ServerName
, List
<String
>> getDeployedHRIs(final Admin admin
) throws IOException
{
334 ClusterStatus status
= admin
.getClusterStatus(EnumSet
.of(Option
.LIVE_SERVERS
));
335 Collection
<ServerName
> regionServers
= status
.getServers();
336 Map
<ServerName
, List
<String
>> mm
= new HashMap
<>();
337 for (ServerName hsi
: regionServers
) {
338 AdminProtos
.AdminService
.BlockingInterface server
= connection
.getAdmin(hsi
);
340 // list all online regions from this region server
341 List
<HRegionInfo
> regions
= ProtobufUtil
.getOnlineRegions(server
);
342 List
<String
> regionNames
= new ArrayList
<>(regions
.size());
343 for (HRegionInfo hri
: regions
) {
344 regionNames
.add(hri
.getRegionNameAsString());
346 mm
.put(hsi
, regionNames
);
352 * Returns the HSI a region info is on.
354 ServerName
findDeployedHSI(Map
<ServerName
, List
<String
>> mm
, HRegionInfo hri
) {
355 for (Map
.Entry
<ServerName
,List
<String
>> e
: mm
.entrySet()) {
356 if (e
.getValue().contains(hri
.getRegionNameAsString())) {
363 public void deleteTableDir(TableName table
) throws IOException
{
364 Path rootDir
= FSUtils
.getRootDir(conf
);
365 FileSystem fs
= rootDir
.getFileSystem(conf
);
366 Path p
= FSUtils
.getTableDir(rootDir
, table
);
367 HBaseFsck
.debugLsr(conf
, p
);
368 boolean success
= fs
.delete(p
, true);
369 LOG
.info("Deleted " + p
+ " sucessfully? " + success
);
373 * We don't have an easy way to verify that a flush completed, so we loop until we find a
374 * legitimate hfile and return it.
377 * @return Path of a flushed hfile.
378 * @throws IOException
380 Path
getFlushedHFile(FileSystem fs
, TableName table
) throws IOException
{
381 Path tableDir
= FSUtils
.getTableDir(FSUtils
.getRootDir(conf
), table
);
382 Path regionDir
= FSUtils
.getRegionDirs(fs
, tableDir
).get(0);
383 Path famDir
= new Path(regionDir
, FAM_STR
);
385 // keep doing this until we get a legit hfile
387 FileStatus
[] hfFss
= fs
.listStatus(famDir
);
388 if (hfFss
.length
== 0) {
391 for (FileStatus hfs
: hfFss
) {
392 if (!hfs
.isDirectory()) {
393 return hfs
.getPath();
400 * Gets flushed mob files.
401 * @param fs The current file system.
402 * @param table The current table name.
403 * @return Path of a flushed hfile.
404 * @throws IOException
406 Path
getFlushedMobFile(FileSystem fs
, TableName table
) throws IOException
{
407 Path famDir
= MobUtils
.getMobFamilyPath(conf
, table
, FAM_STR
);
409 // keep doing this until we get a legit hfile
411 FileStatus
[] hfFss
= fs
.listStatus(famDir
);
412 if (hfFss
.length
== 0) {
415 for (FileStatus hfs
: hfFss
) {
416 if (!hfs
.isDirectory()) {
417 return hfs
.getPath();
424 * Creates a new mob file name by the old one.
425 * @param oldFileName The old mob file name.
426 * @return The new mob file name.
428 String
createMobFileName(String oldFileName
) {
429 MobFileName mobFileName
= MobFileName
.create(oldFileName
);
430 String startKey
= mobFileName
.getStartKey();
431 String date
= mobFileName
.getDate();
432 return MobFileName
.create(startKey
, date
, UUID
.randomUUID().toString().replaceAll("-", ""))
440 * Test that use this should have a timeout, because this method could potentially wait forever.
442 protected void doQuarantineTest(TableName table
, HBaseFsck hbck
, int check
,
443 int corrupt
, int fail
, int quar
, int missing
) throws Exception
{
446 assertEquals(ROWKEYS
.length
, countRows());
447 admin
.flush(table
); // flush is async.
449 // Mess it up by leaving a hole in the assignment, meta, and hdfs data
450 admin
.disableTable(table
);
452 String
[] args
= {"-sidelineCorruptHFiles", "-repairHoles", "-ignorePreCheckPermission",
453 table
.getNameAsString()};
454 HBaseFsck res
= hbck
.exec(hbfsckExecutorService
, args
);
456 HFileCorruptionChecker hfcc
= res
.getHFilecorruptionChecker();
457 assertEquals(hfcc
.getHFilesChecked(), check
);
458 assertEquals(hfcc
.getCorrupted().size(), corrupt
);
459 assertEquals(hfcc
.getFailures().size(), fail
);
460 assertEquals(hfcc
.getQuarantined().size(), quar
);
461 assertEquals(hfcc
.getMissing().size(), missing
);
463 // its been fixed, verify that we can enable
464 admin
.enableTableAsync(table
);
465 while (!admin
.isTableEnabled(table
)) {
468 } catch (InterruptedException e
) {
470 fail("Interrupted when trying to enable table " + table
);
479 static class MockErrorReporter
implements ErrorReporter
{
480 static int calledCount
= 0;
483 public void clear() {
488 public void report(String message
) {
493 public void reportError(String message
) {
498 public void reportError(ERROR_CODE errorCode
, String message
) {
503 public void reportError(ERROR_CODE errorCode
, String message
, TableInfo table
) {
508 public void reportError(ERROR_CODE errorCode
,
509 String message
, TableInfo table
, HbckInfo info
) {
514 public void reportError(ERROR_CODE errorCode
, String message
,
515 TableInfo table
, HbckInfo info1
, HbckInfo info2
) {
520 public int summarize() {
521 return ++calledCount
;
525 public void detail(String details
) {
530 public ArrayList
<ERROR_CODE
> getErrorList() {
532 return new ArrayList
<>();
536 public void progress() {
541 public void print(String message
) {
546 public void resetErrors() {
551 public boolean tableHasErrors(TableInfo table
) {
558 protected void deleteMetaRegion(Configuration conf
, boolean unassign
, boolean hdfs
,
559 boolean regionInfoOnly
) throws IOException
, InterruptedException
{
560 HRegionLocation metaLocation
= connection
.getRegionLocator(TableName
.META_TABLE_NAME
)
561 .getRegionLocation(HConstants
.EMPTY_START_ROW
);
562 ServerName hsa
= metaLocation
.getServerName();
563 HRegionInfo hri
= metaLocation
.getRegionInfo();
565 LOG
.info("Undeploying meta region " + hri
+ " from server " + hsa
);
566 try (Connection unmanagedConnection
= ConnectionFactory
.createConnection(conf
)) {
567 undeployRegion(unmanagedConnection
, hsa
, hri
);
571 if (regionInfoOnly
) {
572 LOG
.info("deleting hdfs .regioninfo data: " + hri
.toString() + hsa
.toString());
573 Path rootDir
= FSUtils
.getRootDir(conf
);
574 FileSystem fs
= rootDir
.getFileSystem(conf
);
575 Path p
= new Path(rootDir
+ "/" + TableName
.META_TABLE_NAME
.getNameAsString(),
576 hri
.getEncodedName());
577 Path hriPath
= new Path(p
, HRegionFileSystem
.REGION_INFO_FILE
);
578 fs
.delete(hriPath
, true);
582 LOG
.info("deleting hdfs data: " + hri
.toString() + hsa
.toString());
583 Path rootDir
= FSUtils
.getRootDir(conf
);
584 FileSystem fs
= rootDir
.getFileSystem(conf
);
585 Path p
= new Path(rootDir
+ "/" + TableName
.META_TABLE_NAME
.getNameAsString(),
586 hri
.getEncodedName());
587 HBaseFsck
.debugLsr(conf
, p
);
588 boolean success
= fs
.delete(p
, true);
589 LOG
.info("Deleted " + p
+ " sucessfully? " + success
);
590 HBaseFsck
.debugLsr(conf
, p
);
// JUnit TestName instance for subclasses. NOTE(review): presumably registered as a JUnit
// rule so it reflects the running test method; the annotation is not visible here, confirm.
public TestName name = new TestName();
597 public static class MasterSyncObserver
implements MasterObserver
{
598 volatile CountDownLatch tableCreationLatch
= null;
599 volatile CountDownLatch tableDeletionLatch
= null;
602 public void postCompletedCreateTableAction(
603 final ObserverContext
<MasterCoprocessorEnvironment
> ctx
,
604 final TableDescriptor desc
,
605 final RegionInfo
[] regions
) throws IOException
{
606 // the AccessController test, some times calls only and directly the
607 // postCompletedCreateTableAction()
608 if (tableCreationLatch
!= null) {
609 tableCreationLatch
.countDown();
614 public void postCompletedDeleteTableAction(
615 final ObserverContext
<MasterCoprocessorEnvironment
> ctx
,
616 final TableName tableName
) throws IOException
{
617 // the AccessController test, some times calls only and directly the
618 // postCompletedDeleteTableAction()
619 if (tableDeletionLatch
!= null) {
620 tableDeletionLatch
.countDown();
625 public static void createTable(HBaseTestingUtility testUtil
, HTableDescriptor htd
,
626 byte [][] splitKeys
) throws Exception
{
627 // NOTE: We need a latch because admin is not sync,
628 // so the postOp coprocessor method may be called after the admin operation returned.
629 MasterSyncObserver observer
= (MasterSyncObserver
)testUtil
.getHBaseCluster().getMaster()
630 .getMasterCoprocessorHost().findCoprocessor(MasterSyncObserver
.class.getName());
631 observer
.tableCreationLatch
= new CountDownLatch(1);
632 if (splitKeys
!= null) {
633 admin
.createTable(htd
, splitKeys
);
635 admin
.createTable(htd
);
637 observer
.tableCreationLatch
.await();
638 observer
.tableCreationLatch
= null;
639 testUtil
.waitUntilAllRegionsAssigned(htd
.getTableName());
642 public static void deleteTable(HBaseTestingUtility testUtil
, TableName tableName
)
644 // NOTE: We need a latch because admin is not sync,
645 // so the postOp coprocessor method may be called after the admin operation returned.
646 MasterSyncObserver observer
= (MasterSyncObserver
)testUtil
.getHBaseCluster().getMaster()
647 .getMasterCoprocessorHost().findCoprocessor(MasterSyncObserver
.class.getName());
648 observer
.tableDeletionLatch
= new CountDownLatch(1);
650 admin
.disableTable(tableName
);
651 } catch (Exception e
) {
652 LOG
.debug("Table: " + tableName
+ " already disabled, so just deleting it.");
654 admin
.deleteTable(tableName
);
655 observer
.tableDeletionLatch
.await();
656 observer
.tableDeletionLatch
= null;