HBASE-26921 Rewrite the counting cells part in TestMultiVersions (#4316)
[hbase.git] / hbase-server / src / main / java / org / apache / hadoop / hbase / master / MasterWalManager.java
blobcd92a63a179d5620d4a83625840370648fa78560
1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
18 package org.apache.hadoop.hbase.master;
20 import java.io.IOException;
21 import java.util.ArrayList;
22 import java.util.Collections;
23 import java.util.HashSet;
24 import java.util.List;
25 import java.util.Set;
26 import java.util.concurrent.locks.Lock;
27 import java.util.concurrent.locks.ReentrantLock;
28 import java.util.stream.Collectors;
29 import java.util.stream.Stream;
30 import org.apache.hadoop.conf.Configuration;
31 import org.apache.hadoop.fs.FileStatus;
32 import org.apache.hadoop.fs.FileSystem;
33 import org.apache.hadoop.fs.Path;
34 import org.apache.hadoop.fs.PathFilter;
35 import org.apache.hadoop.hbase.HConstants;
36 import org.apache.hadoop.hbase.ServerName;
37 import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL;
38 import org.apache.hadoop.hbase.util.CommonFSUtils;
39 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
40 import org.apache.hadoop.hbase.util.FSUtils;
41 import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
42 import org.apache.hadoop.hbase.wal.WALSplitter;
43 import org.apache.yetus.audience.InterfaceAudience;
44 import org.slf4j.Logger;
45 import org.slf4j.LoggerFactory;
47 /**
48 * This class abstracts a bunch of operations the HMaster needs
49 * when splitting log files e.g. finding log files, dirs etc.
51 @InterfaceAudience.Private
52 public class MasterWalManager {
53 private static final Logger LOG = LoggerFactory.getLogger(MasterWalManager.class);
55 /**
56 * Filter *in* WAL files that are for the hbase:meta Region.
58 final static PathFilter META_FILTER = new PathFilter() {
59 @Override
60 public boolean accept(Path p) {
61 return AbstractFSWALProvider.isMetaFile(p);
65 /**
66 * Filter *out* WAL files that are for the hbase:meta Region; i.e. return user-space WALs only.
68 public final static PathFilter NON_META_FILTER = new PathFilter() {
69 @Override
70 public boolean accept(Path p) {
71 return !AbstractFSWALProvider.isMetaFile(p);
75 // metrics for master
76 // TODO: Rename it, since those metrics are split-manager related
77 private final MetricsMasterFileSystem metricsMasterFilesystem = new MetricsMasterFileSystem();
79 // Keep around for convenience.
80 private final MasterServices services;
81 private final Configuration conf;
82 private final FileSystem fs;
84 // The Path to the old logs dir
85 private final Path oldLogDir;
87 private final Path rootDir;
89 // create the split log lock
90 private final Lock splitLogLock = new ReentrantLock();
92 /**
93 * Superceded by {@link SplitWALManager}; i.e. procedure-based WAL splitting rather than
94 * 'classic' zk-coordinated WAL splitting.
95 * @deprecated since 2.3.0 and 3.0.0 to be removed in 4.0.0; replaced by {@link SplitWALManager}.
96 * @see SplitWALManager
98 @Deprecated
99 private final SplitLogManager splitLogManager;
101 // Is the fileystem ok?
102 private volatile boolean fsOk = true;
/**
 * Builds a manager from the configuration, WAL file system and WAL root directory exposed by
 * the given master services.
 * @param services master services supplying the configuration and the master file system
 * @throws IOException if the delegated constructor fails
 */
public MasterWalManager(MasterServices services) throws IOException {
  this(
    services.getConfiguration(),
    services.getMasterFileSystem().getWALFileSystem(),
    services.getMasterFileSystem().getWALRootDir(),
    services);
}

/**
 * Builds a manager from explicitly supplied collaborators.
 * @param conf configuration to read settings from
 * @param fs file system the WAL directories live on
 * @param rootDir directory that the per-server WAL dirs and the old-logs dir are resolved against
 * @param services master services used for abort/stop/initialization checks
 * @throws IOException if creating the {@link SplitLogManager} fails
 */
public MasterWalManager(Configuration conf, FileSystem fs, Path rootDir, MasterServices services)
    throws IOException {
  this.services = services;
  this.conf = conf;
  this.fs = fs;
  this.rootDir = rootDir;
  // Keep the original creation order: split log manager first, then the old-logs path.
  this.splitLogManager = new SplitLogManager(services, conf);
  this.oldLogDir = new Path(rootDir, HConstants.HREGION_OLDLOGDIR_NAME);
}
119 public void stop() {
120 if (splitLogManager != null) {
121 splitLogManager.stop();
125 SplitLogManager getSplitLogManager() {
126 return this.splitLogManager;
130 * Get the directory where old logs go
131 * @return the dir
133 Path getOldLogDir() {
134 return this.oldLogDir;
137 public FileSystem getFileSystem() {
138 return this.fs;
142 * Checks to see if the file system is still accessible.
143 * If not, sets closed
144 * @return false if file system is not available
146 private boolean checkFileSystem() {
147 if (this.fsOk) {
148 try {
149 FSUtils.checkFileSystemAvailable(this.fs);
150 FSUtils.checkDfsSafeMode(this.conf);
151 } catch (IOException e) {
152 services.abort("Shutting down HBase cluster: file system not available", e);
153 this.fsOk = false;
156 return this.fsOk;
160 * Get Servernames which are currently splitting; paths have a '-splitting' suffix.
162 public Set<ServerName> getSplittingServersFromWALDir() throws IOException {
163 return getServerNamesFromWALDirPath(
164 p -> p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT));
168 * Get Servernames that COULD BE 'alive'; excludes those that have a '-splitting' suffix as these
169 * are already being split -- they cannot be 'alive'.
171 public Set<ServerName> getLiveServersFromWALDir() throws IOException {
172 return getServerNamesFromWALDirPath(
173 p -> !p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT));
177 * @return listing of ServerNames found by parsing WAL directory paths in FS.
179 public Set<ServerName> getServerNamesFromWALDirPath(final PathFilter filter) throws IOException {
180 FileStatus[] walDirForServerNames = getWALDirPaths(filter);
181 return Stream.of(walDirForServerNames).map(s -> {
182 ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName(s.getPath());
183 if (serverName == null) {
184 LOG.warn("Log folder {} doesn't look like its name includes a " +
185 "region server name; leaving in place. If you see later errors about missing " +
186 "write ahead logs they may be saved in this location.", s.getPath());
187 return null;
189 return serverName;
190 }).filter(s -> s != null).collect(Collectors.toSet());
194 * @return List of all RegionServer WAL dirs; i.e. this.rootDir/HConstants.HREGION_LOGDIR_NAME.
196 public FileStatus[] getWALDirPaths(final PathFilter filter) throws IOException {
197 Path walDirPath = new Path(CommonFSUtils.getWALRootDir(conf), HConstants.HREGION_LOGDIR_NAME);
198 FileStatus[] walDirForServerNames = CommonFSUtils.listStatus(fs, walDirPath, filter);
199 return walDirForServerNames == null? new FileStatus[0]: walDirForServerNames;
203 * Inspect the log directory to find dead servers which need recovery work
204 * @return A set of ServerNames which aren't running but still have WAL files left in file system
205 * @deprecated With proc-v2, we can record the crash server with procedure store, so do not need
206 * to scan the wal directory to find out the splitting wal directory any more. Leave
207 * it here only because {@code RecoverMetaProcedure}(which is also deprecated) uses
208 * it.
210 @Deprecated
211 public Set<ServerName> getFailedServersFromLogFolders() throws IOException {
212 boolean retrySplitting = !conf.getBoolean(WALSplitter.SPLIT_SKIP_ERRORS_KEY,
213 WALSplitter.SPLIT_SKIP_ERRORS_DEFAULT);
215 Set<ServerName> serverNames = new HashSet<>();
216 Path logsDirPath = new Path(CommonFSUtils.getWALRootDir(conf), HConstants.HREGION_LOGDIR_NAME);
218 do {
219 if (services.isStopped()) {
220 LOG.warn("Master stopped while trying to get failed servers.");
221 break;
223 try {
224 if (!this.fs.exists(logsDirPath)) return serverNames;
225 FileStatus[] logFolders = CommonFSUtils.listStatus(this.fs, logsDirPath, null);
226 // Get online servers after getting log folders to avoid log folder deletion of newly
227 // checked in region servers . see HBASE-5916
228 Set<ServerName> onlineServers = services.getServerManager().getOnlineServers().keySet();
230 if (logFolders == null || logFolders.length == 0) {
231 LOG.debug("No log files to split, proceeding...");
232 return serverNames;
234 for (FileStatus status : logFolders) {
235 FileStatus[] curLogFiles = CommonFSUtils.listStatus(this.fs, status.getPath(), null);
236 if (curLogFiles == null || curLogFiles.length == 0) {
237 // Empty log folder. No recovery needed
238 continue;
240 final ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName(
241 status.getPath());
242 if (null == serverName) {
243 LOG.warn("Log folder " + status.getPath() + " doesn't look like its name includes a " +
244 "region server name; leaving in place. If you see later errors about missing " +
245 "write ahead logs they may be saved in this location.");
246 } else if (!onlineServers.contains(serverName)) {
247 LOG.info("Log folder " + status.getPath() + " doesn't belong "
248 + "to a known region server, splitting");
249 serverNames.add(serverName);
250 } else {
251 LOG.info("Log folder " + status.getPath() + " belongs to an existing region server");
254 retrySplitting = false;
255 } catch (IOException ioe) {
256 LOG.warn("Failed getting failed servers to be recovered.", ioe);
257 if (!checkFileSystem()) {
258 LOG.warn("Bad Filesystem, exiting");
259 Runtime.getRuntime().halt(1);
261 try {
262 if (retrySplitting) {
263 Thread.sleep(conf.getInt("hbase.hlog.split.failure.retry.interval", 30 * 1000));
265 } catch (InterruptedException e) {
266 LOG.warn("Interrupted, aborting since cannot return w/o splitting");
267 Thread.currentThread().interrupt();
268 retrySplitting = false;
269 Runtime.getRuntime().halt(1);
272 } while (retrySplitting);
274 return serverNames;
277 public void splitLog(final ServerName serverName) throws IOException {
278 splitLog(Collections.<ServerName>singleton(serverName));
282 * Specialized method to handle the splitting for meta WAL
283 * @param serverName logs belonging to this server will be split
285 public void splitMetaLog(final ServerName serverName) throws IOException {
286 splitMetaLog(Collections.<ServerName>singleton(serverName));
290 * Specialized method to handle the splitting for meta WAL
291 * @param serverNames logs belonging to these servers will be split
293 public void splitMetaLog(final Set<ServerName> serverNames) throws IOException {
294 splitLog(serverNames, META_FILTER);
297 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UL_UNRELEASED_LOCK", justification=
298 "We only release this lock when we set it. Updates to code that uses it should verify use " +
299 "of the guard boolean.")
300 List<Path> getLogDirs(final Set<ServerName> serverNames) throws IOException {
301 List<Path> logDirs = new ArrayList<>();
302 boolean needReleaseLock = false;
303 if (!this.services.isInitialized()) {
304 // during master initialization, we could have multiple places splitting a same wal
305 // XXX: Does this still exist after we move to proc-v2?
306 this.splitLogLock.lock();
307 needReleaseLock = true;
309 try {
310 for (ServerName serverName : serverNames) {
311 Path logDir = new Path(this.rootDir,
312 AbstractFSWALProvider.getWALDirectoryName(serverName.toString()));
313 Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
314 // Rename the directory so a rogue RS doesn't create more WALs
315 if (fs.exists(logDir)) {
316 if (!this.fs.rename(logDir, splitDir)) {
317 throw new IOException("Failed fs.rename for log split: " + logDir);
319 logDir = splitDir;
320 LOG.debug("Renamed region directory: " + splitDir);
321 } else if (!fs.exists(splitDir)) {
322 LOG.info("Log dir for server " + serverName + " does not exist");
323 continue;
325 logDirs.add(splitDir);
327 } catch (IOException ioe) {
328 if (!checkFileSystem()) {
329 this.services.abort("Aborting due to filesystem unavailable", ioe);
330 throw ioe;
332 } finally {
333 if (needReleaseLock) {
334 this.splitLogLock.unlock();
337 return logDirs;
340 public void splitLog(final Set<ServerName> serverNames) throws IOException {
341 splitLog(serverNames, NON_META_FILTER);
345 * This method is the base split method that splits WAL files matching a filter. Callers should
346 * pass the appropriate filter for meta and non-meta WALs.
347 * @param serverNames logs belonging to these servers will be split; this will rename the log
348 * directory out from under a soft-failed server
350 public void splitLog(final Set<ServerName> serverNames, PathFilter filter) throws IOException {
351 long splitTime = 0, splitLogSize = 0;
352 List<Path> logDirs = getLogDirs(serverNames);
354 splitLogManager.handleDeadWorkers(serverNames);
355 splitTime = EnvironmentEdgeManager.currentTime();
356 splitLogSize = splitLogManager.splitLogDistributed(serverNames, logDirs, filter);
357 splitTime = EnvironmentEdgeManager.currentTime() - splitTime;
359 if (this.metricsMasterFilesystem != null) {
360 if (filter == META_FILTER) {
361 this.metricsMasterFilesystem.addMetaWALSplit(splitTime, splitLogSize);
362 } else {
363 this.metricsMasterFilesystem.addSplit(splitTime, splitLogSize);
369 * The hbase:meta region may OPEN and CLOSE without issue on a server and then move elsewhere.
370 * On CLOSE, the WAL for the hbase:meta table may not be archived yet (The WAL is only needed if
371 * hbase:meta did not close cleanaly). Since meta region is no long on this server,
372 * the ServerCrashProcedure won't split these leftover hbase:meta WALs, just leaving them in
373 * the WAL splitting dir. If we try to delete the WAL splitting for the server, it fail since
374 * the dir is not totally empty. We can safely archive these hbase:meta log; then the
375 * WAL dir can be deleted.
376 * @param serverName the server to archive meta log
378 public void archiveMetaLog(final ServerName serverName) {
379 try {
380 Path logDir = new Path(this.rootDir,
381 AbstractFSWALProvider.getWALDirectoryName(serverName.toString()));
382 Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
383 if (fs.exists(splitDir)) {
384 FileStatus[] logfiles = CommonFSUtils.listStatus(fs, splitDir, META_FILTER);
385 if (logfiles != null) {
386 for (FileStatus status : logfiles) {
387 if (!status.isDir()) {
388 Path newPath = AbstractFSWAL.getWALArchivePath(this.oldLogDir,
389 status.getPath());
390 if (!CommonFSUtils.renameAndSetModifyTime(fs, status.getPath(), newPath)) {
391 LOG.warn("Unable to move " + status.getPath() + " to " + newPath);
392 } else {
393 LOG.debug("Archived meta log " + status.getPath() + " to " + newPath);
398 if (!fs.delete(splitDir, false)) {
399 LOG.warn("Unable to delete log dir. Ignoring. " + splitDir);
402 } catch (IOException ie) {
403 LOG.warn("Failed archiving meta log for server " + serverName, ie);