HBASE-21843 RegionGroupingProvider breaks the meta wal file name pattern which may...
[hbase.git] / hbase-server / src / main / java / org / apache / hadoop / hbase / master / MasterWalManager.java
blob8fdcb17cf5fadfafdc540ff24bec86e3b0c2c3ef
1 /**
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
19 package org.apache.hadoop.hbase.master;
21 import java.io.IOException;
22 import java.util.ArrayList;
23 import java.util.Collections;
24 import java.util.HashSet;
25 import java.util.List;
26 import java.util.Set;
27 import java.util.concurrent.locks.Lock;
28 import java.util.concurrent.locks.ReentrantLock;
29 import java.util.stream.Collectors;
30 import java.util.stream.Stream;
31 import org.apache.hadoop.conf.Configuration;
32 import org.apache.hadoop.fs.FileStatus;
33 import org.apache.hadoop.fs.FileSystem;
34 import org.apache.hadoop.fs.Path;
35 import org.apache.hadoop.fs.PathFilter;
36 import org.apache.hadoop.hbase.HConstants;
37 import org.apache.hadoop.hbase.ServerName;
38 import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL;
39 import org.apache.hadoop.hbase.util.CommonFSUtils;
40 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
41 import org.apache.hadoop.hbase.util.FSUtils;
42 import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
43 import org.apache.hadoop.hbase.wal.WALSplitter;
44 import org.apache.yetus.audience.InterfaceAudience;
45 import org.slf4j.Logger;
46 import org.slf4j.LoggerFactory;
47 import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
/**
 * This class abstracts a bunch of operations the HMaster needs
 * when splitting log files, e.g. finding log files, directories, etc.
 */
53 @InterfaceAudience.Private
54 public class MasterWalManager {
55 private static final Logger LOG = LoggerFactory.getLogger(MasterWalManager.class);
57 final static PathFilter META_FILTER = new PathFilter() {
58 @Override
59 public boolean accept(Path p) {
60 return AbstractFSWALProvider.isMetaFile(p);
64 @VisibleForTesting
65 public final static PathFilter NON_META_FILTER = new PathFilter() {
66 @Override
67 public boolean accept(Path p) {
68 return !AbstractFSWALProvider.isMetaFile(p);
72 // metrics for master
73 // TODO: Rename it, since those metrics are split-manager related
74 private final MetricsMasterFileSystem metricsMasterFilesystem = new MetricsMasterFileSystem();
76 // Keep around for convenience.
77 private final MasterServices services;
78 private final Configuration conf;
79 private final FileSystem fs;
81 // The Path to the old logs dir
82 private final Path oldLogDir;
83 private final Path rootDir;
85 // create the split log lock
86 private final Lock splitLogLock = new ReentrantLock();
87 private final SplitLogManager splitLogManager;
89 // Is the fileystem ok?
90 private volatile boolean fsOk = true;
/**
 * Builds a manager from the configuration, WAL filesystem and WAL root directory that the
 * given master services expose.
 * @param services the master services to read the configuration and WAL locations from
 * @throws IOException if the delegated constructor fails
 */
public MasterWalManager(MasterServices services) throws IOException {
  this(
    services.getConfiguration(),
    services.getMasterFileSystem().getWALFileSystem(),
    services.getMasterFileSystem().getWALRootDir(),
    services);
}

/**
 * Builds a manager over an explicitly supplied filesystem and root directory.
 * @param conf configuration used for lookups and handed to the {@code SplitLogManager}
 * @param fs filesystem on which WAL directories are inspected, renamed and deleted
 * @param rootDir directory under which per-server WAL directories and the old-logs directory live
 * @param services master services used for abort/stop/initialization checks
 * @throws IOException declared for callers; may be raised while creating the SplitLogManager
 */
public MasterWalManager(Configuration conf, FileSystem fs, Path rootDir, MasterServices services)
    throws IOException {
  this.services = services;
  this.conf = conf;
  this.fs = fs;
  this.rootDir = rootDir;
  // Keep the original relative order: the split manager is created before the old-log path.
  this.splitLogManager = new SplitLogManager(services, conf);
  this.oldLogDir = new Path(rootDir, HConstants.HREGION_OLDLOGDIR_NAME);
}
108 public void stop() {
109 if (splitLogManager != null) {
110 splitLogManager.stop();
114 @VisibleForTesting
115 SplitLogManager getSplitLogManager() {
116 return this.splitLogManager;
120 * Get the directory where old logs go
121 * @return the dir
123 Path getOldLogDir() {
124 return this.oldLogDir;
127 public FileSystem getFileSystem() {
128 return this.fs;
132 * Checks to see if the file system is still accessible.
133 * If not, sets closed
134 * @return false if file system is not available
136 private boolean checkFileSystem() {
137 if (this.fsOk) {
138 try {
139 FSUtils.checkFileSystemAvailable(this.fs);
140 FSUtils.checkDfsSafeMode(this.conf);
141 } catch (IOException e) {
142 services.abort("Shutting down HBase cluster: file system not available", e);
143 this.fsOk = false;
146 return this.fsOk;
150 * Get Servernames which are currently splitting; paths have a '-splitting' suffix.
151 * @return ServerName
152 * @throws IOException IOException
154 public Set<ServerName> getSplittingServersFromWALDir() throws IOException {
155 return getServerNamesFromWALDirPath(
156 p -> p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT));
160 * Get Servernames that COULD BE 'alive'; excludes those that have a '-splitting' suffix as these
161 * are already being split -- they cannot be 'alive'.
162 * @return ServerName
163 * @throws IOException IOException
165 public Set<ServerName> getLiveServersFromWALDir() throws IOException {
166 return getServerNamesFromWALDirPath(
167 p -> !p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT));
171 * @return listing of ServerNames found by parsing WAL directory paths in FS.
173 public Set<ServerName> getServerNamesFromWALDirPath(final PathFilter filter) throws IOException {
174 FileStatus[] walDirForServerNames = getWALDirPaths(filter);
175 return Stream.of(walDirForServerNames).map(s -> {
176 ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName(s.getPath());
177 if (serverName == null) {
178 LOG.warn("Log folder {} doesn't look like its name includes a " +
179 "region server name; leaving in place. If you see later errors about missing " +
180 "write ahead logs they may be saved in this location.", s.getPath());
181 return null;
183 return serverName;
184 }).filter(s -> s != null).collect(Collectors.toSet());
188 * @return List of all RegionServer WAL dirs; i.e. this.rootDir/HConstants.HREGION_LOGDIR_NAME.
190 public FileStatus[] getWALDirPaths(final PathFilter filter) throws IOException {
191 Path walDirPath = new Path(CommonFSUtils.getWALRootDir(conf), HConstants.HREGION_LOGDIR_NAME);
192 FileStatus[] walDirForServerNames = FSUtils.listStatus(
193 CommonFSUtils.getWALFileSystem(conf), walDirPath, filter);
194 return walDirForServerNames == null? new FileStatus[0]: walDirForServerNames;
198 * Inspect the log directory to find dead servers which need recovery work
199 * @return A set of ServerNames which aren't running but still have WAL files left in file system
200 * @deprecated With proc-v2, we can record the crash server with procedure store, so do not need
201 * to scan the wal directory to find out the splitting wal directory any more. Leave
202 * it here only because {@code RecoverMetaProcedure}(which is also deprecated) uses
203 * it.
205 @Deprecated
206 public Set<ServerName> getFailedServersFromLogFolders() throws IOException {
207 boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors",
208 WALSplitter.SPLIT_SKIP_ERRORS_DEFAULT);
210 Set<ServerName> serverNames = new HashSet<>();
211 Path logsDirPath = new Path(CommonFSUtils.getWALRootDir(conf), HConstants.HREGION_LOGDIR_NAME);
213 do {
214 if (services.isStopped()) {
215 LOG.warn("Master stopped while trying to get failed servers.");
216 break;
218 try {
219 if (!this.fs.exists(logsDirPath)) return serverNames;
220 FileStatus[] logFolders = FSUtils.listStatus(this.fs, logsDirPath, null);
221 // Get online servers after getting log folders to avoid log folder deletion of newly
222 // checked in region servers . see HBASE-5916
223 Set<ServerName> onlineServers = services.getServerManager().getOnlineServers().keySet();
225 if (logFolders == null || logFolders.length == 0) {
226 LOG.debug("No log files to split, proceeding...");
227 return serverNames;
229 for (FileStatus status : logFolders) {
230 FileStatus[] curLogFiles = FSUtils.listStatus(this.fs, status.getPath(), null);
231 if (curLogFiles == null || curLogFiles.length == 0) {
232 // Empty log folder. No recovery needed
233 continue;
235 final ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName(
236 status.getPath());
237 if (null == serverName) {
238 LOG.warn("Log folder " + status.getPath() + " doesn't look like its name includes a " +
239 "region server name; leaving in place. If you see later errors about missing " +
240 "write ahead logs they may be saved in this location.");
241 } else if (!onlineServers.contains(serverName)) {
242 LOG.info("Log folder " + status.getPath() + " doesn't belong "
243 + "to a known region server, splitting");
244 serverNames.add(serverName);
245 } else {
246 LOG.info("Log folder " + status.getPath() + " belongs to an existing region server");
249 retrySplitting = false;
250 } catch (IOException ioe) {
251 LOG.warn("Failed getting failed servers to be recovered.", ioe);
252 if (!checkFileSystem()) {
253 LOG.warn("Bad Filesystem, exiting");
254 Runtime.getRuntime().halt(1);
256 try {
257 if (retrySplitting) {
258 Thread.sleep(conf.getInt("hbase.hlog.split.failure.retry.interval", 30 * 1000));
260 } catch (InterruptedException e) {
261 LOG.warn("Interrupted, aborting since cannot return w/o splitting");
262 Thread.currentThread().interrupt();
263 retrySplitting = false;
264 Runtime.getRuntime().halt(1);
267 } while (retrySplitting);
269 return serverNames;
272 public void splitLog(final ServerName serverName) throws IOException {
273 splitLog(Collections.<ServerName>singleton(serverName));
277 * Specialized method to handle the splitting for meta WAL
278 * @param serverName logs belonging to this server will be split
280 public void splitMetaLog(final ServerName serverName) throws IOException {
281 splitMetaLog(Collections.<ServerName>singleton(serverName));
285 * Specialized method to handle the splitting for meta WAL
286 * @param serverNames logs belonging to these servers will be split
288 public void splitMetaLog(final Set<ServerName> serverNames) throws IOException {
289 splitLog(serverNames, META_FILTER);
292 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UL_UNRELEASED_LOCK", justification=
293 "We only release this lock when we set it. Updates to code that uses it should verify use " +
294 "of the guard boolean.")
295 List<Path> getLogDirs(final Set<ServerName> serverNames) throws IOException {
296 List<Path> logDirs = new ArrayList<>();
297 boolean needReleaseLock = false;
298 if (!this.services.isInitialized()) {
299 // during master initialization, we could have multiple places splitting a same wal
300 // XXX: Does this still exist after we move to proc-v2?
301 this.splitLogLock.lock();
302 needReleaseLock = true;
304 try {
305 for (ServerName serverName : serverNames) {
306 Path logDir = new Path(this.rootDir,
307 AbstractFSWALProvider.getWALDirectoryName(serverName.toString()));
308 Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
309 // Rename the directory so a rogue RS doesn't create more WALs
310 if (fs.exists(logDir)) {
311 if (!this.fs.rename(logDir, splitDir)) {
312 throw new IOException("Failed fs.rename for log split: " + logDir);
314 logDir = splitDir;
315 LOG.debug("Renamed region directory: " + splitDir);
316 } else if (!fs.exists(splitDir)) {
317 LOG.info("Log dir for server " + serverName + " does not exist");
318 continue;
320 logDirs.add(splitDir);
322 } catch (IOException ioe) {
323 if (!checkFileSystem()) {
324 this.services.abort("Aborting due to filesystem unavailable", ioe);
325 throw ioe;
327 } finally {
328 if (needReleaseLock) {
329 this.splitLogLock.unlock();
332 return logDirs;
335 public void splitLog(final Set<ServerName> serverNames) throws IOException {
336 splitLog(serverNames, NON_META_FILTER);
340 * This method is the base split method that splits WAL files matching a filter. Callers should
341 * pass the appropriate filter for meta and non-meta WALs.
342 * @param serverNames logs belonging to these servers will be split; this will rename the log
343 * directory out from under a soft-failed server
345 public void splitLog(final Set<ServerName> serverNames, PathFilter filter) throws IOException {
346 long splitTime = 0, splitLogSize = 0;
347 List<Path> logDirs = getLogDirs(serverNames);
349 splitLogManager.handleDeadWorkers(serverNames);
350 splitTime = EnvironmentEdgeManager.currentTime();
351 splitLogSize = splitLogManager.splitLogDistributed(serverNames, logDirs, filter);
352 splitTime = EnvironmentEdgeManager.currentTime() - splitTime;
354 if (this.metricsMasterFilesystem != null) {
355 if (filter == META_FILTER) {
356 this.metricsMasterFilesystem.addMetaWALSplit(splitTime, splitLogSize);
357 } else {
358 this.metricsMasterFilesystem.addSplit(splitTime, splitLogSize);
364 * For meta region open and closed normally on a server, it may leave some meta
365 * WAL in the server's wal dir. Since meta region is no long on this server,
366 * The SCP won't split those meta wals, just leaving them there. So deleting
367 * the wal dir will fail since the dir is not empty. Actually We can safely achive those
368 * meta log and Archiving the meta log and delete the dir.
369 * @param serverName the server to archive meta log
371 public void archiveMetaLog(final ServerName serverName) {
372 try {
373 Path logDir = new Path(this.rootDir,
374 AbstractFSWALProvider.getWALDirectoryName(serverName.toString()));
375 Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
376 if (fs.exists(splitDir)) {
377 FileStatus[] logfiles = FSUtils.listStatus(fs, splitDir, META_FILTER);
378 if (logfiles != null) {
379 for (FileStatus status : logfiles) {
380 if (!status.isDir()) {
381 Path newPath = AbstractFSWAL.getWALArchivePath(this.oldLogDir,
382 status.getPath());
383 if (!FSUtils.renameAndSetModifyTime(fs, status.getPath(), newPath)) {
384 LOG.warn("Unable to move " + status.getPath() + " to " + newPath);
385 } else {
386 LOG.debug("Archived meta log " + status.getPath() + " to " + newPath);
391 if (!fs.delete(splitDir, false)) {
392 LOG.warn("Unable to delete log dir. Ignoring. " + splitDir);
395 } catch (IOException ie) {
396 LOG.warn("Failed archiving meta log for server " + serverName, ie);