/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL;
import org.apache.hadoop.hbase.util.CommonFSUtils;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
import org.apache.hadoop.hbase.wal.WALSplitter;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

48 * This class abstracts a bunch of operations the HMaster needs
49 * when splitting log files e.g. finding log files, dirs etc.
51 @InterfaceAudience.Private
52 public class MasterWalManager
{
53 private static final Logger LOG
= LoggerFactory
.getLogger(MasterWalManager
.class);
56 * Filter *in* WAL files that are for the hbase:meta Region.
58 final static PathFilter META_FILTER
= new PathFilter() {
60 public boolean accept(Path p
) {
61 return AbstractFSWALProvider
.isMetaFile(p
);
66 * Filter *out* WAL files that are for the hbase:meta Region; i.e. return user-space WALs only.
68 public final static PathFilter NON_META_FILTER
= new PathFilter() {
70 public boolean accept(Path p
) {
71 return !AbstractFSWALProvider
.isMetaFile(p
);
76 // TODO: Rename it, since those metrics are split-manager related
77 private final MetricsMasterFileSystem metricsMasterFilesystem
= new MetricsMasterFileSystem();
79 // Keep around for convenience.
80 private final MasterServices services
;
81 private final Configuration conf
;
82 private final FileSystem fs
;
84 // The Path to the old logs dir
85 private final Path oldLogDir
;
87 private final Path rootDir
;
89 // create the split log lock
90 private final Lock splitLogLock
= new ReentrantLock();
93 * Superceded by {@link SplitWALManager}; i.e. procedure-based WAL splitting rather than
94 * 'classic' zk-coordinated WAL splitting.
95 * @deprecated since 2.3.0 and 3.0.0 to be removed in 4.0.0; replaced by {@link SplitWALManager}.
96 * @see SplitWALManager
99 private final SplitLogManager splitLogManager
;
101 // Is the fileystem ok?
102 private volatile boolean fsOk
= true;
104 public MasterWalManager(MasterServices services
) throws IOException
{
105 this(services
.getConfiguration(), services
.getMasterFileSystem().getWALFileSystem(),
106 services
.getMasterFileSystem().getWALRootDir(), services
);
109 public MasterWalManager(Configuration conf
, FileSystem fs
, Path rootDir
, MasterServices services
)
113 this.rootDir
= rootDir
;
114 this.services
= services
;
115 this.splitLogManager
= new SplitLogManager(services
, conf
);
116 this.oldLogDir
= new Path(rootDir
, HConstants
.HREGION_OLDLOGDIR_NAME
);
120 if (splitLogManager
!= null) {
121 splitLogManager
.stop();
125 SplitLogManager
getSplitLogManager() {
126 return this.splitLogManager
;
130 * Get the directory where old logs go
133 Path
getOldLogDir() {
134 return this.oldLogDir
;
137 public FileSystem
getFileSystem() {
142 * Checks to see if the file system is still accessible.
143 * If not, sets closed
144 * @return false if file system is not available
146 private boolean checkFileSystem() {
149 FSUtils
.checkFileSystemAvailable(this.fs
);
150 FSUtils
.checkDfsSafeMode(this.conf
);
151 } catch (IOException e
) {
152 services
.abort("Shutting down HBase cluster: file system not available", e
);
160 * Get Servernames which are currently splitting; paths have a '-splitting' suffix.
162 public Set
<ServerName
> getSplittingServersFromWALDir() throws IOException
{
163 return getServerNamesFromWALDirPath(
164 p
-> p
.getName().endsWith(AbstractFSWALProvider
.SPLITTING_EXT
));
168 * Get Servernames that COULD BE 'alive'; excludes those that have a '-splitting' suffix as these
169 * are already being split -- they cannot be 'alive'.
171 public Set
<ServerName
> getLiveServersFromWALDir() throws IOException
{
172 return getServerNamesFromWALDirPath(
173 p
-> !p
.getName().endsWith(AbstractFSWALProvider
.SPLITTING_EXT
));
177 * @return listing of ServerNames found by parsing WAL directory paths in FS.
179 public Set
<ServerName
> getServerNamesFromWALDirPath(final PathFilter filter
) throws IOException
{
180 FileStatus
[] walDirForServerNames
= getWALDirPaths(filter
);
181 return Stream
.of(walDirForServerNames
).map(s
-> {
182 ServerName serverName
= AbstractFSWALProvider
.getServerNameFromWALDirectoryName(s
.getPath());
183 if (serverName
== null) {
184 LOG
.warn("Log folder {} doesn't look like its name includes a " +
185 "region server name; leaving in place. If you see later errors about missing " +
186 "write ahead logs they may be saved in this location.", s
.getPath());
190 }).filter(s
-> s
!= null).collect(Collectors
.toSet());
194 * @return List of all RegionServer WAL dirs; i.e. this.rootDir/HConstants.HREGION_LOGDIR_NAME.
196 public FileStatus
[] getWALDirPaths(final PathFilter filter
) throws IOException
{
197 Path walDirPath
= new Path(CommonFSUtils
.getWALRootDir(conf
), HConstants
.HREGION_LOGDIR_NAME
);
198 FileStatus
[] walDirForServerNames
= CommonFSUtils
.listStatus(fs
, walDirPath
, filter
);
199 return walDirForServerNames
== null?
new FileStatus
[0]: walDirForServerNames
;
203 * Inspect the log directory to find dead servers which need recovery work
204 * @return A set of ServerNames which aren't running but still have WAL files left in file system
205 * @deprecated With proc-v2, we can record the crash server with procedure store, so do not need
206 * to scan the wal directory to find out the splitting wal directory any more. Leave
207 * it here only because {@code RecoverMetaProcedure}(which is also deprecated) uses
211 public Set
<ServerName
> getFailedServersFromLogFolders() throws IOException
{
212 boolean retrySplitting
= !conf
.getBoolean(WALSplitter
.SPLIT_SKIP_ERRORS_KEY
,
213 WALSplitter
.SPLIT_SKIP_ERRORS_DEFAULT
);
215 Set
<ServerName
> serverNames
= new HashSet
<>();
216 Path logsDirPath
= new Path(CommonFSUtils
.getWALRootDir(conf
), HConstants
.HREGION_LOGDIR_NAME
);
219 if (services
.isStopped()) {
220 LOG
.warn("Master stopped while trying to get failed servers.");
224 if (!this.fs
.exists(logsDirPath
)) return serverNames
;
225 FileStatus
[] logFolders
= CommonFSUtils
.listStatus(this.fs
, logsDirPath
, null);
226 // Get online servers after getting log folders to avoid log folder deletion of newly
227 // checked in region servers . see HBASE-5916
228 Set
<ServerName
> onlineServers
= services
.getServerManager().getOnlineServers().keySet();
230 if (logFolders
== null || logFolders
.length
== 0) {
231 LOG
.debug("No log files to split, proceeding...");
234 for (FileStatus status
: logFolders
) {
235 FileStatus
[] curLogFiles
= CommonFSUtils
.listStatus(this.fs
, status
.getPath(), null);
236 if (curLogFiles
== null || curLogFiles
.length
== 0) {
237 // Empty log folder. No recovery needed
240 final ServerName serverName
= AbstractFSWALProvider
.getServerNameFromWALDirectoryName(
242 if (null == serverName
) {
243 LOG
.warn("Log folder " + status
.getPath() + " doesn't look like its name includes a " +
244 "region server name; leaving in place. If you see later errors about missing " +
245 "write ahead logs they may be saved in this location.");
246 } else if (!onlineServers
.contains(serverName
)) {
247 LOG
.info("Log folder " + status
.getPath() + " doesn't belong "
248 + "to a known region server, splitting");
249 serverNames
.add(serverName
);
251 LOG
.info("Log folder " + status
.getPath() + " belongs to an existing region server");
254 retrySplitting
= false;
255 } catch (IOException ioe
) {
256 LOG
.warn("Failed getting failed servers to be recovered.", ioe
);
257 if (!checkFileSystem()) {
258 LOG
.warn("Bad Filesystem, exiting");
259 Runtime
.getRuntime().halt(1);
262 if (retrySplitting
) {
263 Thread
.sleep(conf
.getInt("hbase.hlog.split.failure.retry.interval", 30 * 1000));
265 } catch (InterruptedException e
) {
266 LOG
.warn("Interrupted, aborting since cannot return w/o splitting");
267 Thread
.currentThread().interrupt();
268 retrySplitting
= false;
269 Runtime
.getRuntime().halt(1);
272 } while (retrySplitting
);
277 public void splitLog(final ServerName serverName
) throws IOException
{
278 splitLog(Collections
.<ServerName
>singleton(serverName
));
282 * Specialized method to handle the splitting for meta WAL
283 * @param serverName logs belonging to this server will be split
285 public void splitMetaLog(final ServerName serverName
) throws IOException
{
286 splitMetaLog(Collections
.<ServerName
>singleton(serverName
));
290 * Specialized method to handle the splitting for meta WAL
291 * @param serverNames logs belonging to these servers will be split
293 public void splitMetaLog(final Set
<ServerName
> serverNames
) throws IOException
{
294 splitLog(serverNames
, META_FILTER
);
297 @edu.umd
.cs
.findbugs
.annotations
.SuppressWarnings(value
="UL_UNRELEASED_LOCK", justification
=
298 "We only release this lock when we set it. Updates to code that uses it should verify use " +
299 "of the guard boolean.")
300 List
<Path
> getLogDirs(final Set
<ServerName
> serverNames
) throws IOException
{
301 List
<Path
> logDirs
= new ArrayList
<>();
302 boolean needReleaseLock
= false;
303 if (!this.services
.isInitialized()) {
304 // during master initialization, we could have multiple places splitting a same wal
305 // XXX: Does this still exist after we move to proc-v2?
306 this.splitLogLock
.lock();
307 needReleaseLock
= true;
310 for (ServerName serverName
: serverNames
) {
311 Path logDir
= new Path(this.rootDir
,
312 AbstractFSWALProvider
.getWALDirectoryName(serverName
.toString()));
313 Path splitDir
= logDir
.suffix(AbstractFSWALProvider
.SPLITTING_EXT
);
314 // Rename the directory so a rogue RS doesn't create more WALs
315 if (fs
.exists(logDir
)) {
316 if (!this.fs
.rename(logDir
, splitDir
)) {
317 throw new IOException("Failed fs.rename for log split: " + logDir
);
320 LOG
.debug("Renamed region directory: " + splitDir
);
321 } else if (!fs
.exists(splitDir
)) {
322 LOG
.info("Log dir for server " + serverName
+ " does not exist");
325 logDirs
.add(splitDir
);
327 } catch (IOException ioe
) {
328 if (!checkFileSystem()) {
329 this.services
.abort("Aborting due to filesystem unavailable", ioe
);
333 if (needReleaseLock
) {
334 this.splitLogLock
.unlock();
340 public void splitLog(final Set
<ServerName
> serverNames
) throws IOException
{
341 splitLog(serverNames
, NON_META_FILTER
);
345 * This method is the base split method that splits WAL files matching a filter. Callers should
346 * pass the appropriate filter for meta and non-meta WALs.
347 * @param serverNames logs belonging to these servers will be split; this will rename the log
348 * directory out from under a soft-failed server
350 public void splitLog(final Set
<ServerName
> serverNames
, PathFilter filter
) throws IOException
{
351 long splitTime
= 0, splitLogSize
= 0;
352 List
<Path
> logDirs
= getLogDirs(serverNames
);
354 splitLogManager
.handleDeadWorkers(serverNames
);
355 splitTime
= EnvironmentEdgeManager
.currentTime();
356 splitLogSize
= splitLogManager
.splitLogDistributed(serverNames
, logDirs
, filter
);
357 splitTime
= EnvironmentEdgeManager
.currentTime() - splitTime
;
359 if (this.metricsMasterFilesystem
!= null) {
360 if (filter
== META_FILTER
) {
361 this.metricsMasterFilesystem
.addMetaWALSplit(splitTime
, splitLogSize
);
363 this.metricsMasterFilesystem
.addSplit(splitTime
, splitLogSize
);
369 * The hbase:meta region may OPEN and CLOSE without issue on a server and then move elsewhere.
370 * On CLOSE, the WAL for the hbase:meta table may not be archived yet (The WAL is only needed if
371 * hbase:meta did not close cleanaly). Since meta region is no long on this server,
372 * the ServerCrashProcedure won't split these leftover hbase:meta WALs, just leaving them in
373 * the WAL splitting dir. If we try to delete the WAL splitting for the server, it fail since
374 * the dir is not totally empty. We can safely archive these hbase:meta log; then the
375 * WAL dir can be deleted.
376 * @param serverName the server to archive meta log
378 public void archiveMetaLog(final ServerName serverName
) {
380 Path logDir
= new Path(this.rootDir
,
381 AbstractFSWALProvider
.getWALDirectoryName(serverName
.toString()));
382 Path splitDir
= logDir
.suffix(AbstractFSWALProvider
.SPLITTING_EXT
);
383 if (fs
.exists(splitDir
)) {
384 FileStatus
[] logfiles
= CommonFSUtils
.listStatus(fs
, splitDir
, META_FILTER
);
385 if (logfiles
!= null) {
386 for (FileStatus status
: logfiles
) {
387 if (!status
.isDir()) {
388 Path newPath
= AbstractFSWAL
.getWALArchivePath(this.oldLogDir
,
390 if (!CommonFSUtils
.renameAndSetModifyTime(fs
, status
.getPath(), newPath
)) {
391 LOG
.warn("Unable to move " + status
.getPath() + " to " + newPath
);
393 LOG
.debug("Archived meta log " + status
.getPath() + " to " + newPath
);
398 if (!fs
.delete(splitDir
, false)) {
399 LOG
.warn("Unable to delete log dir. Ignoring. " + splitDir
);
402 } catch (IOException ie
) {
403 LOG
.warn("Failed archiving meta log for server " + serverName
, ie
);