3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
19 package org
.apache
.hadoop
.hbase
.master
;
21 import java
.io
.IOException
;
22 import java
.util
.ArrayList
;
23 import java
.util
.Collections
;
24 import java
.util
.HashSet
;
25 import java
.util
.List
;
27 import java
.util
.concurrent
.locks
.Lock
;
28 import java
.util
.concurrent
.locks
.ReentrantLock
;
29 import java
.util
.stream
.Collectors
;
30 import java
.util
.stream
.Stream
;
31 import org
.apache
.hadoop
.conf
.Configuration
;
32 import org
.apache
.hadoop
.fs
.FileStatus
;
33 import org
.apache
.hadoop
.fs
.FileSystem
;
34 import org
.apache
.hadoop
.fs
.Path
;
35 import org
.apache
.hadoop
.fs
.PathFilter
;
36 import org
.apache
.hadoop
.hbase
.HConstants
;
37 import org
.apache
.hadoop
.hbase
.ServerName
;
38 import org
.apache
.hadoop
.hbase
.regionserver
.wal
.AbstractFSWAL
;
39 import org
.apache
.hadoop
.hbase
.util
.CommonFSUtils
;
40 import org
.apache
.hadoop
.hbase
.util
.EnvironmentEdgeManager
;
41 import org
.apache
.hadoop
.hbase
.util
.FSUtils
;
42 import org
.apache
.hadoop
.hbase
.wal
.AbstractFSWALProvider
;
43 import org
.apache
.hadoop
.hbase
.wal
.WALSplitter
;
44 import org
.apache
.yetus
.audience
.InterfaceAudience
;
45 import org
.slf4j
.Logger
;
46 import org
.slf4j
.LoggerFactory
;
47 import org
.apache
.hbase
.thirdparty
.com
.google
.common
.annotations
.VisibleForTesting
;
50 * This class abstracts a bunch of operations the HMaster needs
51 * when splitting log files e.g. finding log files, dirs etc.
53 @InterfaceAudience.Private
54 public class MasterWalManager
{
55 private static final Logger LOG
= LoggerFactory
.getLogger(MasterWalManager
.class);
57 final static PathFilter META_FILTER
= new PathFilter() {
59 public boolean accept(Path p
) {
60 return AbstractFSWALProvider
.isMetaFile(p
);
65 public final static PathFilter NON_META_FILTER
= new PathFilter() {
67 public boolean accept(Path p
) {
68 return !AbstractFSWALProvider
.isMetaFile(p
);
73 // TODO: Rename it, since those metrics are split-manager related
74 private final MetricsMasterFileSystem metricsMasterFilesystem
= new MetricsMasterFileSystem();
76 // Keep around for convenience.
77 private final MasterServices services
;
78 private final Configuration conf
;
79 private final FileSystem fs
;
81 // The Path to the old logs dir
82 private final Path oldLogDir
;
83 private final Path rootDir
;
85 // create the split log lock
86 private final Lock splitLogLock
= new ReentrantLock();
87 private final SplitLogManager splitLogManager
;
89 // Is the fileystem ok?
90 private volatile boolean fsOk
= true;
/**
 * Creates a manager using the configuration, WAL file system and WAL root directory exposed
 * by {@code services}, delegating to the four-argument constructor.
 * @param services master services; also supplies the master file system
 * @throws IOException if construction fails
 */
public MasterWalManager(MasterServices services) throws IOException {
  this(
    services.getConfiguration(),
    services.getMasterFileSystem().getWALFileSystem(),
    services.getMasterFileSystem().getWALRootDir(),
    services);
}
// Main constructor: stores the root dir and the master services, creates the SplitLogManager
// from (services, conf), and derives the old-logs dir as rootDir/HREGION_OLDLOGDIR_NAME.
// NOTE(review): the lines between the parameter list and the first visible assignment are not
// present in this extract. The final fields 'fs' and 'conf' are not assigned by any visible
// statement, and the opening brace / any throws clause is missing — confirm against the
// complete file before editing.
97 public MasterWalManager(Configuration conf
, FileSystem fs
, Path rootDir
, MasterServices services
)
101 this.rootDir
= rootDir
;
102 this.services
= services
;
103 this.splitLogManager
= new SplitLogManager(services
, conf
);
// Old logs live directly under the root dir.
105 this.oldLogDir
= new Path(rootDir
, HConstants
.HREGION_OLDLOGDIR_NAME
);
// NOTE(review): the signature of the method that encloses these statements is not visible in
// this extract (presumably a stop/shutdown method — confirm).
// Stops the SplitLogManager, guarding against a null reference.
109 if (splitLogManager
!= null) {
110 splitLogManager
.stop();
115 SplitLogManager
getSplitLogManager() {
116 return this.splitLogManager
;
120 * Get the directory where old logs go
123 Path
getOldLogDir() {
124 return this.oldLogDir
;
// Public accessor returning a FileSystem.
// NOTE(review): the method body is not visible in this extract; presumably it returns the
// 'fs' field — confirm against the complete file.
127 public FileSystem
getFileSystem() {
132 * Checks to see if the file system is still accessible.
133 * If not, sets closed
134 * @return false if file system is not available
// NOTE(review): parts of this method are not visible in this extract — whatever precedes the
// availability checks (including the start of the try block), anything that follows the abort
// call, and the return statement. The comments below describe only the visible statements.
136 private boolean checkFileSystem() {
// Probe the file system and the DFS safe-mode state; an IOException is handled below.
139 FSUtils
.checkFileSystemAvailable(this.fs
);
140 FSUtils
.checkDfsSafeMode(this.conf
);
141 } catch (IOException e
) {
// A failed probe asks the master services to abort, passing the exception as the cause.
142 services
.abort("Shutting down HBase cluster: file system not available", e
);
150 * Get Servernames which are currently splitting; paths have a '-splitting' suffix.
152 * @throws IOException IOException
154 public Set
<ServerName
> getSplittingServersFromWALDir() throws IOException
{
155 return getServerNamesFromWALDirPath(
156 p
-> p
.getName().endsWith(AbstractFSWALProvider
.SPLITTING_EXT
));
160 * Get Servernames that COULD BE 'alive'; excludes those that have a '-splitting' suffix as these
161 * are already being split -- they cannot be 'alive'.
163 * @throws IOException IOException
165 public Set
<ServerName
> getLiveServersFromWALDir() throws IOException
{
166 return getServerNamesFromWALDirPath(
167 p
-> !p
.getName().endsWith(AbstractFSWALProvider
.SPLITTING_EXT
));
171 * @return listing of ServerNames found by parsing WAL directory paths in FS.
173 public Set
<ServerName
> getServerNamesFromWALDirPath(final PathFilter filter
) throws IOException
{
174 FileStatus
[] walDirForServerNames
= getWALDirPaths(filter
);
175 return Stream
.of(walDirForServerNames
).map(s
-> {
176 ServerName serverName
= AbstractFSWALProvider
.getServerNameFromWALDirectoryName(s
.getPath());
177 if (serverName
== null) {
178 LOG
.warn("Log folder {} doesn't look like its name includes a " +
179 "region server name; leaving in place. If you see later errors about missing " +
180 "write ahead logs they may be saved in this location.", s
.getPath());
184 }).filter(s
-> s
!= null).collect(Collectors
.toSet());
188 * @return List of all RegionServer WAL dirs; i.e. this.rootDir/HConstants.HREGION_LOGDIR_NAME.
190 public FileStatus
[] getWALDirPaths(final PathFilter filter
) throws IOException
{
191 Path walDirPath
= new Path(CommonFSUtils
.getWALRootDir(conf
), HConstants
.HREGION_LOGDIR_NAME
);
192 FileStatus
[] walDirForServerNames
= FSUtils
.listStatus(
193 CommonFSUtils
.getWALFileSystem(conf
), walDirPath
, filter
);
194 return walDirForServerNames
== null?
new FileStatus
[0]: walDirForServerNames
;
198 * Inspect the log directory to find dead servers which need recovery work
199 * @return A set of ServerNames which aren't running but still have WAL files left in file system
200 * @deprecated With proc-v2, we can record the crash server with procedure store, so do not need
201 * to scan the wal directory to find out the splitting wal directory any more. Leave
202 * it here only because {@code RecoverMetaProcedure}(which is also deprecated) uses
// Scans the per-server folders under <WAL root>/HREGION_LOGDIR_NAME and collects the servers
// whose folder is non-empty and whose name is not among the currently online servers.
// NOTE(review): this extract is missing several lines of the method (the loop header, the
// 'try' opener, the statements that follow some of the log calls, the argument to
// getServerNameFromWALDirectoryName and the final return). The comments below describe only
// what the visible statements do.
206 public Set
<ServerName
> getFailedServersFromLogFolders() throws IOException
{
// Retrying is enabled only while "hbase.hlog.split.skip.errors" is false.
207 boolean retrySplitting
= !conf
.getBoolean("hbase.hlog.split.skip.errors",
208 WALSplitter
.SPLIT_SKIP_ERRORS_DEFAULT
);
210 Set
<ServerName
> serverNames
= new HashSet
<>();
211 Path logsDirPath
= new Path(CommonFSUtils
.getWALRootDir(conf
), HConstants
.HREGION_LOGDIR_NAME
);
// A stopped master is logged here; what happens next is not visible in this extract.
214 if (services
.isStopped()) {
215 LOG
.warn("Master stopped while trying to get failed servers.");
// No logs dir at all: return the (still empty) result.
219 if (!this.fs
.exists(logsDirPath
)) return serverNames
;
220 FileStatus
[] logFolders
= FSUtils
.listStatus(this.fs
, logsDirPath
, null);
221 // Get online servers after getting log folders to avoid log folder deletion of newly
222 // checked in region servers . see HBASE-5916
223 Set
<ServerName
> onlineServers
= services
.getServerManager().getOnlineServers().keySet();
225 if (logFolders
== null || logFolders
.length
== 0) {
226 LOG
.debug("No log files to split, proceeding...");
// Classify each server folder: unparsable name, not online (recorded), or online.
229 for (FileStatus status
: logFolders
) {
230 FileStatus
[] curLogFiles
= FSUtils
.listStatus(this.fs
, status
.getPath(), null);
231 if (curLogFiles
== null || curLogFiles
.length
== 0) {
232 // Empty log folder. No recovery needed
235 final ServerName serverName
= AbstractFSWALProvider
.getServerNameFromWALDirectoryName(
237 if (null == serverName
) {
238 LOG
.warn("Log folder " + status
.getPath() + " doesn't look like its name includes a " +
239 "region server name; leaving in place. If you see later errors about missing " +
240 "write ahead logs they may be saved in this location.");
241 } else if (!onlineServers
.contains(serverName
)) {
242 LOG
.info("Log folder " + status
.getPath() + " doesn't belong "
243 + "to a known region server, splitting");
244 serverNames
.add(serverName
);
246 LOG
.info("Log folder " + status
.getPath() + " belongs to an existing region server");
// The scan finished without an IOException, so no further retry is requested.
249 retrySplitting
= false;
250 } catch (IOException ioe
) {
251 LOG
.warn("Failed getting failed servers to be recovered.", ioe
);
// An unusable file system halts the JVM with exit status 1.
252 if (!checkFileSystem()) {
253 LOG
.warn("Bad Filesystem, exiting");
254 Runtime
.getRuntime().halt(1);
// Wait before retrying; the interval is read from
// "hbase.hlog.split.failure.retry.interval" with a default of 30 * 1000.
257 if (retrySplitting
) {
258 Thread
.sleep(conf
.getInt("hbase.hlog.split.failure.retry.interval", 30 * 1000));
260 } catch (InterruptedException e
) {
// Interrupted while waiting: restore the interrupt flag, stop retrying and halt the JVM.
261 LOG
.warn("Interrupted, aborting since cannot return w/o splitting");
262 Thread
.currentThread().interrupt();
263 retrySplitting
= false;
264 Runtime
.getRuntime().halt(1);
267 } while (retrySplitting
);
272 public void splitLog(final ServerName serverName
) throws IOException
{
273 splitLog(Collections
.<ServerName
>singleton(serverName
));
277 * Specialized method to handle the splitting for meta WAL
278 * @param serverName logs belonging to this server will be split
280 public void splitMetaLog(final ServerName serverName
) throws IOException
{
281 splitMetaLog(Collections
.<ServerName
>singleton(serverName
));
285 * Specialized method to handle the splitting for meta WAL
286 * @param serverNames logs belonging to these servers will be split
288 public void splitMetaLog(final Set
<ServerName
> serverNames
) throws IOException
{
289 splitLog(serverNames
, META_FILTER
);
// For each given server, renames its WAL directory to the SPLITTING_EXT-suffixed name (when
// the un-suffixed directory exists) and collects the suffixed directory paths.
// NOTE(review): this extract is missing lines of the method — the try/finally skeleton, any
// statements between the visible ones (e.g. after the rename check, after the "does not
// exist" log, after the abort call) and the final return. The comments below describe only
// what the visible statements do.
292 @edu.umd
.cs
.findbugs
.annotations
.SuppressWarnings(value
="UL_UNRELEASED_LOCK", justification
=
293 "We only release this lock when we set it. Updates to code that uses it should verify use " +
294 "of the guard boolean.")
295 List
<Path
> getLogDirs(final Set
<ServerName
> serverNames
) throws IOException
{
296 List
<Path
> logDirs
= new ArrayList
<>();
// Records whether this call took splitLogLock, so that only the taker releases it.
297 boolean needReleaseLock
= false;
298 if (!this.services
.isInitialized()) {
299 // during master initialization, we could have multiple places splitting a same wal
300 // XXX: Does this still exist after we move to proc-v2?
301 this.splitLogLock
.lock();
302 needReleaseLock
= true;
305 for (ServerName serverName
: serverNames
) {
306 Path logDir
= new Path(this.rootDir
,
307 AbstractFSWALProvider
.getWALDirectoryName(serverName
.toString()));
308 Path splitDir
= logDir
.suffix(AbstractFSWALProvider
.SPLITTING_EXT
);
309 // Rename the directory so a rogue RS doesn't create more WALs
310 if (fs
.exists(logDir
)) {
// A rename that reports failure is surfaced as an IOException.
311 if (!this.fs
.rename(logDir
, splitDir
)) {
312 throw new IOException("Failed fs.rename for log split: " + logDir
);
315 LOG
.debug("Renamed region directory: " + splitDir
);
// Neither the plain nor the suffixed directory exists for this server.
316 } else if (!fs
.exists(splitDir
)) {
317 LOG
.info("Log dir for server " + serverName
+ " does not exist");
320 logDirs
.add(splitDir
);
322 } catch (IOException ioe
) {
// Ask the master to abort when the failure coincides with an unavailable file system.
323 if (!checkFileSystem()) {
324 this.services
.abort("Aborting due to filesystem unavailable", ioe
);
// Release the lock only if this call acquired it.
328 if (needReleaseLock
) {
329 this.splitLogLock
.unlock();
335 public void splitLog(final Set
<ServerName
> serverNames
) throws IOException
{
336 splitLog(serverNames
, NON_META_FILTER
);
340 * This method is the base split method that splits WAL files matching a filter. Callers should
341 * pass the appropriate filter for meta and non-meta WALs.
342 * @param serverNames logs belonging to these servers will be split; this will rename the log
343 * directory out from under a soft-failed server
345 public void splitLog(final Set
<ServerName
> serverNames
, PathFilter filter
) throws IOException
{
346 long splitTime
= 0, splitLogSize
= 0;
347 List
<Path
> logDirs
= getLogDirs(serverNames
);
349 splitLogManager
.handleDeadWorkers(serverNames
);
350 splitTime
= EnvironmentEdgeManager
.currentTime();
351 splitLogSize
= splitLogManager
.splitLogDistributed(serverNames
, logDirs
, filter
);
352 splitTime
= EnvironmentEdgeManager
.currentTime() - splitTime
;
354 if (this.metricsMasterFilesystem
!= null) {
355 if (filter
== META_FILTER
) {
356 this.metricsMasterFilesystem
.addMetaWALSplit(splitTime
, splitLogSize
);
358 this.metricsMasterFilesystem
.addSplit(splitTime
, splitLogSize
);
364 * For meta region open and closed normally on a server, it may leave some meta
365 * WAL in the server's wal dir. Since meta region is no longer on this server,
366 * the SCP won't split those meta wals, just leaving them there. So deleting
367 * the wal dir will fail since the dir is not empty. Actually we can safely archive those
368 * meta logs, so this method archives the meta logs and then deletes the dir.
369 * @param serverName the server to archive meta log
// Moves the meta WAL files found in the server's SPLITTING_EXT-suffixed WAL directory to an
// archive path derived from oldLogDir, then tries to delete that directory. Failures are
// logged and not propagated: the method declares no checked exception and the visible catch
// only logs.
// NOTE(review): this extract is missing lines of the method — the 'try' opener, the remaining
// argument(s) of getWALArchivePath, and the branch keyword between the two log calls after
// the rename. The comments below describe only what the visible statements do.
371 public void archiveMetaLog(final ServerName serverName
) {
373 Path logDir
= new Path(this.rootDir
,
374 AbstractFSWALProvider
.getWALDirectoryName(serverName
.toString()));
375 Path splitDir
= logDir
.suffix(AbstractFSWALProvider
.SPLITTING_EXT
);
// Only the suffixed directory is considered here.
376 if (fs
.exists(splitDir
)) {
377 FileStatus
[] logfiles
= FSUtils
.listStatus(fs
, splitDir
, META_FILTER
);
378 if (logfiles
!= null) {
379 for (FileStatus status
: logfiles
) {
// Sub-directories are skipped; only plain files are moved.
380 if (!status
.isDir()) {
381 Path newPath
= AbstractFSWAL
.getWALArchivePath(this.oldLogDir
,
383 if (!FSUtils
.renameAndSetModifyTime(fs
, status
.getPath(), newPath
)) {
384 LOG
.warn("Unable to move " + status
.getPath() + " to " + newPath
);
386 LOG
.debug("Archived meta log " + status
.getPath() + " to " + newPath
);
// Non-recursive delete; a false result is logged and otherwise ignored.
391 if (!fs
.delete(splitDir
, false)) {
392 LOG
.warn("Unable to delete log dir. Ignoring. " + splitDir
);
395 } catch (IOException ie
) {
396 LOG
.warn("Failed archiving meta log for server " + serverName
, ie
);