doc: Fix section of functions age(xid) and mxid_age(xid)
[pgsql.git] / src / common / file_utils.c
blob398fe1c334daeee78b7ed69f3110bbc1f98321ac
/*-------------------------------------------------------------------------
 *
 * File-processing utility routines.
 *
 * Assorted utility functions to work on files.
 *
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/common/file_utils.c
 *
 *-------------------------------------------------------------------------
 */
16 #ifndef FRONTEND
17 #include "postgres.h"
18 #else
19 #include "postgres_fe.h"
20 #endif
22 #include <dirent.h>
23 #include <fcntl.h>
24 #include <sys/stat.h>
25 #include <unistd.h>
27 #include "common/file_utils.h"
28 #ifdef FRONTEND
29 #include "common/logging.h"
30 #endif
31 #include "common/relpath.h"
32 #include "port/pg_iovec.h"
34 #ifdef FRONTEND
/*
 * PG_FLUSH_DATA_WORKS is defined when either sync_file_range() or
 * posix_fadvise() with POSIX_FADV_DONTNEED is available; it gates the
 * pre_sync_fname() helper and the calls that use it.
 */
36 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
37 #if defined(HAVE_SYNC_FILE_RANGE)
38 #define PG_FLUSH_DATA_WORKS 1
39 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
40 #define PG_FLUSH_DATA_WORKS 1
41 #endif
/*
 * Server version number from which the WAL directory is named pg_wal rather
 * than pg_xlog; sync_pgdata() compares its serverVersion argument against it.
 */
44 * pg_xlog has been renamed to pg_wal in version 10.
46 #define MINIMUM_VERSION_FOR_PG_WAL 100000
/*
 * Prototypes for file-local helpers defined further down.  pre_sync_fname()
 * exists only when PG_FLUSH_DATA_WORKS is defined.
 */
48 #ifdef PG_FLUSH_DATA_WORKS
49 static int pre_sync_fname(const char *fname, bool isdir);
50 #endif
51 static void walkdir(const char *path,
52 int (*action) (const char *fname, bool isdir),
53 bool process_symlinks);
55 #ifdef HAVE_SYNCFS
/*
 * do_syncfs -- ask the kernel to flush the file system containing "path"
 *
 * A failure to open the path is logged and otherwise ignored.  A failure of
 * syncfs() itself is logged and then terminates the program.
 */
static void
do_syncfs(const char *path)
{
	int			fsfd = open(path, O_RDONLY, 0);

	if (fsfd >= 0)
	{
		if (syncfs(fsfd) < 0)
		{
			/* Cannot vouch for durability any more: give up entirely. */
			pg_log_error("could not synchronize file system for file \"%s\": %m", path);
			(void) close(fsfd);
			exit(EXIT_FAILURE);
		}

		(void) close(fsfd);
	}
	else
		pg_log_error("could not open file \"%s\": %m", path);
}
85 #endif /* HAVE_SYNCFS */
88 * Synchronize PGDATA and all its contents.
90 * We sync regular files and directories wherever they are, but we follow
91 * symlinks only for pg_wal (or pg_xlog) and immediately under pg_tblspc.
92 * Other symlinks are presumed to point at files we're not responsible for
93 * syncing, and might not have privileges to write at all.
95 * serverVersion indicates the version of the server to be sync'd.
97 void
98 sync_pgdata(const char *pg_data,
99 int serverVersion,
100 DataDirSyncMethod sync_method)
102 bool xlog_is_symlink;
103 char pg_wal[MAXPGPATH];
104 char pg_tblspc[MAXPGPATH];
106 /* handle renaming of pg_xlog to pg_wal in post-10 clusters */
107 snprintf(pg_wal, MAXPGPATH, "%s/%s", pg_data,
108 serverVersion < MINIMUM_VERSION_FOR_PG_WAL ? "pg_xlog" : "pg_wal");
109 snprintf(pg_tblspc, MAXPGPATH, "%s/%s", pg_data, PG_TBLSPC_DIR);
112 * If pg_wal is a symlink, we'll need to recurse into it separately,
113 * because the first walkdir below will ignore it.
115 xlog_is_symlink = false;
118 struct stat st;
120 if (lstat(pg_wal, &st) < 0)
121 pg_log_error("could not stat file \"%s\": %m", pg_wal);
122 else if (S_ISLNK(st.st_mode))
123 xlog_is_symlink = true;
126 switch (sync_method)
128 case DATA_DIR_SYNC_METHOD_SYNCFS:
130 #ifndef HAVE_SYNCFS
131 pg_log_error("this build does not support sync method \"%s\"",
132 "syncfs");
133 exit(EXIT_FAILURE);
134 #else
135 DIR *dir;
136 struct dirent *de;
139 * On Linux, we don't have to open every single file one by
140 * one. We can use syncfs() to sync whole filesystems. We
141 * only expect filesystem boundaries to exist where we
142 * tolerate symlinks, namely pg_wal and the tablespaces, so we
143 * call syncfs() for each of those directories.
146 /* Sync the top level pgdata directory. */
147 do_syncfs(pg_data);
149 /* If any tablespaces are configured, sync each of those. */
150 dir = opendir(pg_tblspc);
151 if (dir == NULL)
152 pg_log_error("could not open directory \"%s\": %m",
153 pg_tblspc);
154 else
156 while (errno = 0, (de = readdir(dir)) != NULL)
158 char subpath[MAXPGPATH * 2];
160 if (strcmp(de->d_name, ".") == 0 ||
161 strcmp(de->d_name, "..") == 0)
162 continue;
164 snprintf(subpath, sizeof(subpath), "%s/%s",
165 pg_tblspc, de->d_name);
166 do_syncfs(subpath);
169 if (errno)
170 pg_log_error("could not read directory \"%s\": %m",
171 pg_tblspc);
173 (void) closedir(dir);
176 /* If pg_wal is a symlink, process that too. */
177 if (xlog_is_symlink)
178 do_syncfs(pg_wal);
179 #endif /* HAVE_SYNCFS */
181 break;
183 case DATA_DIR_SYNC_METHOD_FSYNC:
186 * If possible, hint to the kernel that we're soon going to
187 * fsync the data directory and its contents.
189 #ifdef PG_FLUSH_DATA_WORKS
190 walkdir(pg_data, pre_sync_fname, false);
191 if (xlog_is_symlink)
192 walkdir(pg_wal, pre_sync_fname, false);
193 walkdir(pg_tblspc, pre_sync_fname, true);
194 #endif
197 * Now we do the fsync()s in the same order.
199 * The main call ignores symlinks, so in addition to specially
200 * processing pg_wal if it's a symlink, pg_tblspc has to be
201 * visited separately with process_symlinks = true. Note that
202 * if there are any plain directories in pg_tblspc, they'll
203 * get fsync'd twice. That's not an expected case so we don't
204 * worry about optimizing it.
206 walkdir(pg_data, fsync_fname, false);
207 if (xlog_is_symlink)
208 walkdir(pg_wal, fsync_fname, false);
209 walkdir(pg_tblspc, fsync_fname, true);
211 break;
216 * Synchronize the given directory and all its contents.
218 * This is a convenient wrapper on top of walkdir() and do_syncfs().
220 void
221 sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method)
223 switch (sync_method)
225 case DATA_DIR_SYNC_METHOD_SYNCFS:
227 #ifndef HAVE_SYNCFS
228 pg_log_error("this build does not support sync method \"%s\"",
229 "syncfs");
230 exit(EXIT_FAILURE);
231 #else
233 * On Linux, we don't have to open every single file one by
234 * one. We can use syncfs() to sync the whole filesystem.
236 do_syncfs(dir);
237 #endif /* HAVE_SYNCFS */
239 break;
241 case DATA_DIR_SYNC_METHOD_FSYNC:
244 * If possible, hint to the kernel that we're soon going to
245 * fsync the data directory and its contents.
247 #ifdef PG_FLUSH_DATA_WORKS
248 walkdir(dir, pre_sync_fname, false);
249 #endif
251 walkdir(dir, fsync_fname, false);
253 break;
258 * walkdir: recursively walk a directory, applying the action to each
259 * regular file and directory (including the named directory itself).
261 * If process_symlinks is true, the action and recursion are also applied
262 * to regular files and directories that are pointed to by symlinks in the
263 * given directory; otherwise symlinks are ignored. Symlinks are always
264 * ignored in subdirectories, ie we intentionally don't pass down the
265 * process_symlinks flag to recursive calls.
267 * Errors are reported but not considered fatal.
269 * See also walkdir in fd.c, which is a backend version of this logic.
271 static void
272 walkdir(const char *path,
273 int (*action) (const char *fname, bool isdir),
274 bool process_symlinks)
276 DIR *dir;
277 struct dirent *de;
279 dir = opendir(path);
280 if (dir == NULL)
282 pg_log_error("could not open directory \"%s\": %m", path);
283 return;
286 while (errno = 0, (de = readdir(dir)) != NULL)
288 char subpath[MAXPGPATH * 2];
290 if (strcmp(de->d_name, ".") == 0 ||
291 strcmp(de->d_name, "..") == 0)
292 continue;
294 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
296 switch (get_dirent_type(subpath, de, process_symlinks, PG_LOG_ERROR))
298 case PGFILETYPE_REG:
299 (*action) (subpath, false);
300 break;
301 case PGFILETYPE_DIR:
302 walkdir(subpath, action, false);
303 break;
304 default:
307 * Errors are already reported directly by get_dirent_type(),
308 * and any remaining symlinks and unknown file types are
309 * ignored.
311 break;
315 if (errno)
316 pg_log_error("could not read directory \"%s\": %m", path);
318 (void) closedir(dir);
321 * It's important to fsync the destination directory itself as individual
322 * file fsyncs don't guarantee that the directory entry for the file is
323 * synced. Recent versions of ext4 have made the window much wider but
324 * it's been an issue for ext3 and other filesystems in the past.
326 (*action) (path, true);
/*
 * Tell the OS that "fname" is about to be fsync()'d so it can start
 * writing the file out.
 *
 * Files that cannot be opened because they are unreadable are skipped
 * silently; any other open failure is reported but is not fatal.
 *
 * Returns 0 on success or when the file was skipped, -1 after a reported
 * open failure.
 */
#ifdef PG_FLUSH_DATA_WORKS

static int
pre_sync_fname(const char *fname, bool isdir)
{
	int			hintfd = open(fname, O_RDONLY | PG_BINARY, 0);

	if (hintfd < 0)
	{
		bool		ignorable = (errno == EACCES || (isdir && errno == EISDIR));

		if (!ignorable)
		{
			pg_log_error("could not open file \"%s\": %m", fname);
			return -1;
		}
		return 0;
	}

	/*
	 * Mirror what pg_flush_data() does in the backend: sync_file_range() is
	 * preferred, with posix_fadvise() as the fallback.  Failures are not
	 * checked because this is merely a hint.
	 */
#if defined(HAVE_SYNC_FILE_RANGE)
	(void) sync_file_range(hintfd, 0, 0, SYNC_FILE_RANGE_WRITE);
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
	(void) posix_fadvise(hintfd, 0, 0, POSIX_FADV_DONTNEED);
#else
#error PG_FLUSH_DATA_WORKS should not have been defined
#endif

	(void) close(hintfd);
	return 0;
}

#endif							/* PG_FLUSH_DATA_WORKS */
372 * fsync_fname -- Try to fsync a file or directory
374 * Ignores errors trying to open unreadable files, or trying to fsync
375 * directories on systems where that isn't allowed/required. All other errors
376 * are fatal.
379 fsync_fname(const char *fname, bool isdir)
381 int fd;
382 int flags;
383 int returncode;
386 * Some OSs require directories to be opened read-only whereas other
387 * systems don't allow us to fsync files opened read-only; so we need both
388 * cases here. Using O_RDWR will cause us to fail to fsync files that are
389 * not writable by our userid, but we assume that's OK.
391 flags = PG_BINARY;
392 if (!isdir)
393 flags |= O_RDWR;
394 else
395 flags |= O_RDONLY;
398 * Open the file, silently ignoring errors about unreadable files (or
399 * unsupported operations, e.g. opening a directory under Windows), and
400 * logging others.
402 fd = open(fname, flags, 0);
403 if (fd < 0)
405 if (errno == EACCES || (isdir && errno == EISDIR))
406 return 0;
407 pg_log_error("could not open file \"%s\": %m", fname);
408 return -1;
411 returncode = fsync(fd);
414 * Some OSes don't allow us to fsync directories at all, so we can ignore
415 * those errors. Anything else needs to be reported.
417 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
419 pg_log_error("could not fsync file \"%s\": %m", fname);
420 (void) close(fd);
421 exit(EXIT_FAILURE);
424 (void) close(fd);
425 return 0;
429 * fsync_parent_path -- fsync the parent path of a file or directory
431 * This is aimed at making file operations persistent on disk in case of
432 * an OS crash or power failure.
435 fsync_parent_path(const char *fname)
437 char parentpath[MAXPGPATH];
439 strlcpy(parentpath, fname, MAXPGPATH);
440 get_parent_directory(parentpath);
443 * get_parent_directory() returns an empty string if the input argument is
444 * just a file name (see comments in path.c), so handle that as being the
445 * current directory.
447 if (strlen(parentpath) == 0)
448 strlcpy(parentpath, ".", MAXPGPATH);
450 if (fsync_fname(parentpath, true) != 0)
451 return -1;
453 return 0;
457 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
459 * Wrapper around rename, similar to the backend version.
462 durable_rename(const char *oldfile, const char *newfile)
464 int fd;
467 * First fsync the old and target path (if it exists), to ensure that they
468 * are properly persistent on disk. Syncing the target file is not
469 * strictly necessary, but it makes it easier to reason about crashes;
470 * because it's then guaranteed that either source or target file exists
471 * after a crash.
473 if (fsync_fname(oldfile, false) != 0)
474 return -1;
476 fd = open(newfile, PG_BINARY | O_RDWR, 0);
477 if (fd < 0)
479 if (errno != ENOENT)
481 pg_log_error("could not open file \"%s\": %m", newfile);
482 return -1;
485 else
487 if (fsync(fd) != 0)
489 pg_log_error("could not fsync file \"%s\": %m", newfile);
490 close(fd);
491 exit(EXIT_FAILURE);
493 close(fd);
496 /* Time to do the real deal... */
497 if (rename(oldfile, newfile) != 0)
499 pg_log_error("could not rename file \"%s\" to \"%s\": %m",
500 oldfile, newfile);
501 return -1;
505 * To guarantee renaming the file is persistent, fsync the file with its
506 * new name, and its containing directory.
508 if (fsync_fname(newfile, false) != 0)
509 return -1;
511 if (fsync_parent_path(newfile) != 0)
512 return -1;
514 return 0;
517 #endif /* FRONTEND */
520 * Return the type of a directory entry.
522 * In frontend code, elevel should be a level from logging.h; in backend code
523 * it should be a level from elog.h.
525 PGFileType
526 get_dirent_type(const char *path,
527 const struct dirent *de,
528 bool look_through_symlinks,
529 int elevel)
531 PGFileType result;
534 * Some systems tell us the type directly in the dirent struct, but that's
535 * a BSD and Linux extension not required by POSIX. Even when the
536 * interface is present, sometimes the type is unknown, depending on the
537 * filesystem.
539 #if defined(DT_REG) && defined(DT_DIR) && defined(DT_LNK)
540 if (de->d_type == DT_REG)
541 result = PGFILETYPE_REG;
542 else if (de->d_type == DT_DIR)
543 result = PGFILETYPE_DIR;
544 else if (de->d_type == DT_LNK && !look_through_symlinks)
545 result = PGFILETYPE_LNK;
546 else
547 result = PGFILETYPE_UNKNOWN;
548 #else
549 result = PGFILETYPE_UNKNOWN;
550 #endif
552 if (result == PGFILETYPE_UNKNOWN)
554 struct stat fst;
555 int sret;
558 if (look_through_symlinks)
559 sret = stat(path, &fst);
560 else
561 sret = lstat(path, &fst);
563 if (sret < 0)
565 result = PGFILETYPE_ERROR;
566 #ifdef FRONTEND
567 pg_log_generic(elevel, PG_LOG_PRIMARY, "could not stat file \"%s\": %m", path);
568 #else
569 ereport(elevel,
570 (errcode_for_file_access(),
571 errmsg("could not stat file \"%s\": %m", path)));
572 #endif
574 else if (S_ISREG(fst.st_mode))
575 result = PGFILETYPE_REG;
576 else if (S_ISDIR(fst.st_mode))
577 result = PGFILETYPE_DIR;
578 else if (S_ISLNK(fst.st_mode))
579 result = PGFILETYPE_LNK;
582 return result;
/*
 * Work out what is still outstanding after a vectored read or write that
 * may have been only partially completed.
 *
 * Everything in 'source' past the first 'transferred' bytes is copied to
 * 'destination', and the number of iovecs copied is returned.  'source' and
 * 'destination' may be the same array, in which case the adjustment happens
 * in place.  A return value of zero means the transfer is complete (for
 * callers that have no cheaper way of knowing).
 */
int
compute_remaining_iovec(struct iovec *destination,
						const struct iovec *source,
						int iovcnt,
						size_t transferred)
{
	const struct iovec *cursor = source;
	int			left = iovcnt;
	size_t		skip = transferred;

	Assert(iovcnt > 0);

	/* Step over every iovec that was transferred in full. */
	while (cursor->iov_len <= skip)
	{
		skip -= cursor->iov_len;
		cursor++;

		/* Nothing left at all? */
		if (--left == 0)
		{
			/*
			 * The kernel is not expected to transfer more than it was asked
			 * to; otherwise something is out of sync.
			 */
			Assert(skip == 0);
			return 0;
		}
	}

	/* Slide the unfinished iovecs to the start of the output array. */
	if (cursor != destination)
		memmove(destination, cursor, sizeof(*cursor) * left);

	/* The first one may be partly done: trim the part already handled. */
	Assert(destination->iov_len > skip);
	destination->iov_base = (char *) destination->iov_base + skip;
	destination->iov_len -= skip;

	return left;
}
632 * pg_pwritev_with_retry
634 * Convenience wrapper for pg_pwritev() that retries on partial write. If an
635 * error is returned, it is unspecified how much has been written.
637 ssize_t
638 pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
640 struct iovec iov_copy[PG_IOV_MAX];
641 ssize_t sum = 0;
642 ssize_t part;
644 /* We'd better have space to make a copy, in case we need to retry. */
645 if (iovcnt > PG_IOV_MAX)
647 errno = EINVAL;
648 return -1;
653 /* Write as much as we can. */
654 part = pg_pwritev(fd, iov, iovcnt, offset);
655 if (part < 0)
656 return -1;
658 #ifdef SIMULATE_SHORT_WRITE
659 part = Min(part, 4096);
660 #endif
662 /* Count our progress. */
663 sum += part;
664 offset += part;
667 * See what is left. On the first loop we used the caller's array,
668 * but in later loops we'll use our local copy that we are allowed to
669 * mutate.
671 iovcnt = compute_remaining_iovec(iov_copy, iov, iovcnt, part);
672 iov = iov_copy;
673 } while (iovcnt > 0);
675 return sum;
679 * pg_pwrite_zeros
681 * Writes zeros to file worth "size" bytes at "offset" (from the start of the
682 * file), using vectored I/O.
684 * Returns the total amount of data written. On failure, a negative value
685 * is returned with errno set.
687 ssize_t
688 pg_pwrite_zeros(int fd, size_t size, off_t offset)
690 static const PGIOAlignedBlock zbuffer = {{0}}; /* worth BLCKSZ */
691 void *zerobuf_addr = unconstify(PGIOAlignedBlock *, &zbuffer)->data;
692 struct iovec iov[PG_IOV_MAX];
693 size_t remaining_size = size;
694 ssize_t total_written = 0;
696 /* Loop, writing as many blocks as we can for each system call. */
697 while (remaining_size > 0)
699 int iovcnt = 0;
700 ssize_t written;
702 for (; iovcnt < PG_IOV_MAX && remaining_size > 0; iovcnt++)
704 size_t this_iov_size;
706 iov[iovcnt].iov_base = zerobuf_addr;
708 if (remaining_size < BLCKSZ)
709 this_iov_size = remaining_size;
710 else
711 this_iov_size = BLCKSZ;
713 iov[iovcnt].iov_len = this_iov_size;
714 remaining_size -= this_iov_size;
717 written = pg_pwritev_with_retry(fd, iov, iovcnt, offset);
719 if (written < 0)
720 return written;
722 offset += written;
723 total_written += written;
726 Assert(total_written == size);
728 return total_written;