Simplify and merge unwanted-module drop logic in AdjustUpgrade.pm.
[pgsql.git] / src / common / file_utils.c
blob6bac537a1e55c4492784f4194c01a1871bf49b28
/*-------------------------------------------------------------------------
 *
 * File-processing utility routines.
 *
 * Assorted utility functions to work on files.
 *
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/common/file_utils.c
 *
 *-------------------------------------------------------------------------
 */
16 #ifndef FRONTEND
17 #include "postgres.h"
18 #else
19 #include "postgres_fe.h"
20 #endif
22 #include <dirent.h>
23 #include <fcntl.h>
24 #include <sys/stat.h>
25 #include <unistd.h>
27 #include "common/file_utils.h"
28 #ifdef FRONTEND
29 #include "common/logging.h"
30 #endif
31 #include "port/pg_iovec.h"
33 #ifdef FRONTEND
/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
#if defined(HAVE_SYNC_FILE_RANGE)
#define PG_FLUSH_DATA_WORKS 1
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
#define PG_FLUSH_DATA_WORKS 1
#endif

/*
 * pg_xlog has been renamed to pg_wal in version 10.
 */
#define MINIMUM_VERSION_FOR_PG_WAL  100000

/*
 * Forward declarations of the file-local helpers.  pre_sync_fname() only
 * exists when one of the flush-hint implementations above is available.
 */
#ifdef PG_FLUSH_DATA_WORKS
static int  pre_sync_fname(const char *fname, bool isdir);
#endif
static void walkdir(const char *path,
                    int (*action) (const char *fname, bool isdir),
                    bool process_symlinks);
#ifdef HAVE_SYNCFS

/*
 * do_syncfs -- Try to syncfs a file system
 *
 * 'path' names any file or directory on the file system to be synchronized;
 * it is opened read-only just to obtain a descriptor for syncfs().
 *
 * Reports errors trying to open the path.  syncfs() errors are fatal.
 */
static void
do_syncfs(const char *path)
{
    int         fd;

    fd = open(path, O_RDONLY, 0);

    if (fd < 0)
    {
        /* Not being able to open the path is reported but tolerated. */
        pg_log_error("could not open file \"%s\": %m", path);
        return;
    }

    if (syncfs(fd) < 0)
    {
        /* Log first, so that close() cannot disturb errno for %m. */
        pg_log_error("could not synchronize file system for file \"%s\": %m", path);
        (void) close(fd);
        exit(EXIT_FAILURE);
    }

    (void) close(fd);
}

#endif                          /* HAVE_SYNCFS */
87 * Synchronize PGDATA and all its contents.
89 * We sync regular files and directories wherever they are, but we follow
90 * symlinks only for pg_wal (or pg_xlog) and immediately under pg_tblspc.
91 * Other symlinks are presumed to point at files we're not responsible for
92 * syncing, and might not have privileges to write at all.
94 * serverVersion indicates the version of the server to be sync'd.
96 void
97 sync_pgdata(const char *pg_data,
98 int serverVersion,
99 DataDirSyncMethod sync_method)
101 bool xlog_is_symlink;
102 char pg_wal[MAXPGPATH];
103 char pg_tblspc[MAXPGPATH];
105 /* handle renaming of pg_xlog to pg_wal in post-10 clusters */
106 snprintf(pg_wal, MAXPGPATH, "%s/%s", pg_data,
107 serverVersion < MINIMUM_VERSION_FOR_PG_WAL ? "pg_xlog" : "pg_wal");
108 snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc", pg_data);
111 * If pg_wal is a symlink, we'll need to recurse into it separately,
112 * because the first walkdir below will ignore it.
114 xlog_is_symlink = false;
117 struct stat st;
119 if (lstat(pg_wal, &st) < 0)
120 pg_log_error("could not stat file \"%s\": %m", pg_wal);
121 else if (S_ISLNK(st.st_mode))
122 xlog_is_symlink = true;
125 switch (sync_method)
127 case DATA_DIR_SYNC_METHOD_SYNCFS:
129 #ifndef HAVE_SYNCFS
130 pg_log_error("this build does not support sync method \"%s\"",
131 "syncfs");
132 exit(EXIT_FAILURE);
133 #else
134 DIR *dir;
135 struct dirent *de;
138 * On Linux, we don't have to open every single file one by
139 * one. We can use syncfs() to sync whole filesystems. We
140 * only expect filesystem boundaries to exist where we
141 * tolerate symlinks, namely pg_wal and the tablespaces, so we
142 * call syncfs() for each of those directories.
145 /* Sync the top level pgdata directory. */
146 do_syncfs(pg_data);
148 /* If any tablespaces are configured, sync each of those. */
149 dir = opendir(pg_tblspc);
150 if (dir == NULL)
151 pg_log_error("could not open directory \"%s\": %m",
152 pg_tblspc);
153 else
155 while (errno = 0, (de = readdir(dir)) != NULL)
157 char subpath[MAXPGPATH * 2];
159 if (strcmp(de->d_name, ".") == 0 ||
160 strcmp(de->d_name, "..") == 0)
161 continue;
163 snprintf(subpath, sizeof(subpath), "%s/%s",
164 pg_tblspc, de->d_name);
165 do_syncfs(subpath);
168 if (errno)
169 pg_log_error("could not read directory \"%s\": %m",
170 pg_tblspc);
172 (void) closedir(dir);
175 /* If pg_wal is a symlink, process that too. */
176 if (xlog_is_symlink)
177 do_syncfs(pg_wal);
178 #endif /* HAVE_SYNCFS */
180 break;
182 case DATA_DIR_SYNC_METHOD_FSYNC:
185 * If possible, hint to the kernel that we're soon going to
186 * fsync the data directory and its contents.
188 #ifdef PG_FLUSH_DATA_WORKS
189 walkdir(pg_data, pre_sync_fname, false);
190 if (xlog_is_symlink)
191 walkdir(pg_wal, pre_sync_fname, false);
192 walkdir(pg_tblspc, pre_sync_fname, true);
193 #endif
196 * Now we do the fsync()s in the same order.
198 * The main call ignores symlinks, so in addition to specially
199 * processing pg_wal if it's a symlink, pg_tblspc has to be
200 * visited separately with process_symlinks = true. Note that
201 * if there are any plain directories in pg_tblspc, they'll
202 * get fsync'd twice. That's not an expected case so we don't
203 * worry about optimizing it.
205 walkdir(pg_data, fsync_fname, false);
206 if (xlog_is_symlink)
207 walkdir(pg_wal, fsync_fname, false);
208 walkdir(pg_tblspc, fsync_fname, true);
210 break;
215 * Synchronize the given directory and all its contents.
217 * This is a convenient wrapper on top of walkdir() and do_syncfs().
219 void
220 sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method)
222 switch (sync_method)
224 case DATA_DIR_SYNC_METHOD_SYNCFS:
226 #ifndef HAVE_SYNCFS
227 pg_log_error("this build does not support sync method \"%s\"",
228 "syncfs");
229 exit(EXIT_FAILURE);
230 #else
232 * On Linux, we don't have to open every single file one by
233 * one. We can use syncfs() to sync the whole filesystem.
235 do_syncfs(dir);
236 #endif /* HAVE_SYNCFS */
238 break;
240 case DATA_DIR_SYNC_METHOD_FSYNC:
243 * If possible, hint to the kernel that we're soon going to
244 * fsync the data directory and its contents.
246 #ifdef PG_FLUSH_DATA_WORKS
247 walkdir(dir, pre_sync_fname, false);
248 #endif
250 walkdir(dir, fsync_fname, false);
252 break;
257 * walkdir: recursively walk a directory, applying the action to each
258 * regular file and directory (including the named directory itself).
260 * If process_symlinks is true, the action and recursion are also applied
261 * to regular files and directories that are pointed to by symlinks in the
262 * given directory; otherwise symlinks are ignored. Symlinks are always
263 * ignored in subdirectories, ie we intentionally don't pass down the
264 * process_symlinks flag to recursive calls.
266 * Errors are reported but not considered fatal.
268 * See also walkdir in fd.c, which is a backend version of this logic.
270 static void
271 walkdir(const char *path,
272 int (*action) (const char *fname, bool isdir),
273 bool process_symlinks)
275 DIR *dir;
276 struct dirent *de;
278 dir = opendir(path);
279 if (dir == NULL)
281 pg_log_error("could not open directory \"%s\": %m", path);
282 return;
285 while (errno = 0, (de = readdir(dir)) != NULL)
287 char subpath[MAXPGPATH * 2];
289 if (strcmp(de->d_name, ".") == 0 ||
290 strcmp(de->d_name, "..") == 0)
291 continue;
293 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
295 switch (get_dirent_type(subpath, de, process_symlinks, PG_LOG_ERROR))
297 case PGFILETYPE_REG:
298 (*action) (subpath, false);
299 break;
300 case PGFILETYPE_DIR:
301 walkdir(subpath, action, false);
302 break;
303 default:
306 * Errors are already reported directly by get_dirent_type(),
307 * and any remaining symlinks and unknown file types are
308 * ignored.
310 break;
314 if (errno)
315 pg_log_error("could not read directory \"%s\": %m", path);
317 (void) closedir(dir);
320 * It's important to fsync the destination directory itself as individual
321 * file fsyncs don't guarantee that the directory entry for the file is
322 * synced. Recent versions of ext4 have made the window much wider but
323 * it's been an issue for ext3 and other filesystems in the past.
325 (*action) (path, true);
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * Ignores errors trying to open unreadable files, and reports other errors
 * non-fatally.
 *
 * Returns 0 if the hint was issued or the open failure was ignorable, and -1
 * if the file could not be opened for any other reason.
 */
#ifdef PG_FLUSH_DATA_WORKS

static int
pre_sync_fname(const char *fname, bool isdir)
{
    int         fd;

    fd = open(fname, O_RDONLY | PG_BINARY, 0);

    if (fd < 0)
    {
        if (errno == EACCES || (isdir && errno == EISDIR))
            return 0;
        pg_log_error("could not open file \"%s\": %m", fname);
        return -1;
    }

    /*
     * We do what pg_flush_data() would do in the backend: prefer to use
     * sync_file_range, but fall back to posix_fadvise.  We ignore errors
     * because this is only a hint.
     */
#if defined(HAVE_SYNC_FILE_RANGE)
    (void) sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
    (void) posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
#else
#error PG_FLUSH_DATA_WORKS should not have been defined
#endif

    (void) close(fd);
    return 0;
}

#endif                          /* PG_FLUSH_DATA_WORKS */
371 * fsync_fname -- Try to fsync a file or directory
373 * Ignores errors trying to open unreadable files, or trying to fsync
374 * directories on systems where that isn't allowed/required. All other errors
375 * are fatal.
378 fsync_fname(const char *fname, bool isdir)
380 int fd;
381 int flags;
382 int returncode;
385 * Some OSs require directories to be opened read-only whereas other
386 * systems don't allow us to fsync files opened read-only; so we need both
387 * cases here. Using O_RDWR will cause us to fail to fsync files that are
388 * not writable by our userid, but we assume that's OK.
390 flags = PG_BINARY;
391 if (!isdir)
392 flags |= O_RDWR;
393 else
394 flags |= O_RDONLY;
397 * Open the file, silently ignoring errors about unreadable files (or
398 * unsupported operations, e.g. opening a directory under Windows), and
399 * logging others.
401 fd = open(fname, flags, 0);
402 if (fd < 0)
404 if (errno == EACCES || (isdir && errno == EISDIR))
405 return 0;
406 pg_log_error("could not open file \"%s\": %m", fname);
407 return -1;
410 returncode = fsync(fd);
413 * Some OSes don't allow us to fsync directories at all, so we can ignore
414 * those errors. Anything else needs to be reported.
416 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
418 pg_log_error("could not fsync file \"%s\": %m", fname);
419 (void) close(fd);
420 exit(EXIT_FAILURE);
423 (void) close(fd);
424 return 0;
428 * fsync_parent_path -- fsync the parent path of a file or directory
430 * This is aimed at making file operations persistent on disk in case of
431 * an OS crash or power failure.
434 fsync_parent_path(const char *fname)
436 char parentpath[MAXPGPATH];
438 strlcpy(parentpath, fname, MAXPGPATH);
439 get_parent_directory(parentpath);
442 * get_parent_directory() returns an empty string if the input argument is
443 * just a file name (see comments in path.c), so handle that as being the
444 * current directory.
446 if (strlen(parentpath) == 0)
447 strlcpy(parentpath, ".", MAXPGPATH);
449 if (fsync_fname(parentpath, true) != 0)
450 return -1;
452 return 0;
456 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
458 * Wrapper around rename, similar to the backend version.
461 durable_rename(const char *oldfile, const char *newfile)
463 int fd;
466 * First fsync the old and target path (if it exists), to ensure that they
467 * are properly persistent on disk. Syncing the target file is not
468 * strictly necessary, but it makes it easier to reason about crashes;
469 * because it's then guaranteed that either source or target file exists
470 * after a crash.
472 if (fsync_fname(oldfile, false) != 0)
473 return -1;
475 fd = open(newfile, PG_BINARY | O_RDWR, 0);
476 if (fd < 0)
478 if (errno != ENOENT)
480 pg_log_error("could not open file \"%s\": %m", newfile);
481 return -1;
484 else
486 if (fsync(fd) != 0)
488 pg_log_error("could not fsync file \"%s\": %m", newfile);
489 close(fd);
490 exit(EXIT_FAILURE);
492 close(fd);
495 /* Time to do the real deal... */
496 if (rename(oldfile, newfile) != 0)
498 pg_log_error("could not rename file \"%s\" to \"%s\": %m",
499 oldfile, newfile);
500 return -1;
504 * To guarantee renaming the file is persistent, fsync the file with its
505 * new name, and its containing directory.
507 if (fsync_fname(newfile, false) != 0)
508 return -1;
510 if (fsync_parent_path(newfile) != 0)
511 return -1;
513 return 0;
516 #endif /* FRONTEND */
519 * Return the type of a directory entry.
521 * In frontend code, elevel should be a level from logging.h; in backend code
522 * it should be a level from elog.h.
524 PGFileType
525 get_dirent_type(const char *path,
526 const struct dirent *de,
527 bool look_through_symlinks,
528 int elevel)
530 PGFileType result;
533 * Some systems tell us the type directly in the dirent struct, but that's
534 * a BSD and Linux extension not required by POSIX. Even when the
535 * interface is present, sometimes the type is unknown, depending on the
536 * filesystem.
538 #if defined(DT_REG) && defined(DT_DIR) && defined(DT_LNK)
539 if (de->d_type == DT_REG)
540 result = PGFILETYPE_REG;
541 else if (de->d_type == DT_DIR)
542 result = PGFILETYPE_DIR;
543 else if (de->d_type == DT_LNK && !look_through_symlinks)
544 result = PGFILETYPE_LNK;
545 else
546 result = PGFILETYPE_UNKNOWN;
547 #else
548 result = PGFILETYPE_UNKNOWN;
549 #endif
551 if (result == PGFILETYPE_UNKNOWN)
553 struct stat fst;
554 int sret;
557 if (look_through_symlinks)
558 sret = stat(path, &fst);
559 else
560 sret = lstat(path, &fst);
562 if (sret < 0)
564 result = PGFILETYPE_ERROR;
565 #ifdef FRONTEND
566 pg_log_generic(elevel, PG_LOG_PRIMARY, "could not stat file \"%s\": %m", path);
567 #else
568 ereport(elevel,
569 (errcode_for_file_access(),
570 errmsg("could not stat file \"%s\": %m", path)));
571 #endif
573 else if (S_ISREG(fst.st_mode))
574 result = PGFILETYPE_REG;
575 else if (S_ISDIR(fst.st_mode))
576 result = PGFILETYPE_DIR;
577 else if (S_ISLNK(fst.st_mode))
578 result = PGFILETYPE_LNK;
581 return result;
/*
 * Compute what remains to be done after a possibly partial vectored read or
 * write.  The part of 'source' beginning after 'transferred' bytes is copied
 * to 'destination', and its length is returned.  'source' and 'destination'
 * may point to the same array, for in-place adjustment.  A return value of
 * zero indicates completion (for callers without a cheaper way to know that).
 *
 * 'iovcnt' is the number of entries in 'source' and must be positive.
 */
int
compute_remaining_iovec(struct iovec *destination,
                        const struct iovec *source,
                        int iovcnt,
                        size_t transferred)
{
    Assert(iovcnt > 0);

    /* Skip wholly transferred iovecs. */
    while (source->iov_len <= transferred)
    {
        transferred -= source->iov_len;
        source++;
        iovcnt--;

        /* All iovecs transferred? */
        if (iovcnt == 0)
        {
            /*
             * We don't expect the kernel to transfer more than we asked it
             * to, or something is out of sync.
             */
            Assert(transferred == 0);
            return 0;
        }
    }

    /* Copy the remaining iovecs to the front of the array. */
    if (source != destination)
        memmove(destination, source, sizeof(*source) * iovcnt);

    /* Adjust leading iovec, which may have been partially transferred. */
    Assert(destination->iov_len > transferred);
    destination->iov_base = (char *) destination->iov_base + transferred;
    destination->iov_len -= transferred;

    return iovcnt;
}
631 * pg_pwritev_with_retry
633 * Convenience wrapper for pg_pwritev() that retries on partial write. If an
634 * error is returned, it is unspecified how much has been written.
636 ssize_t
637 pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
639 struct iovec iov_copy[PG_IOV_MAX];
640 ssize_t sum = 0;
641 ssize_t part;
643 /* We'd better have space to make a copy, in case we need to retry. */
644 if (iovcnt > PG_IOV_MAX)
646 errno = EINVAL;
647 return -1;
652 /* Write as much as we can. */
653 part = pg_pwritev(fd, iov, iovcnt, offset);
654 if (part < 0)
655 return -1;
657 #ifdef SIMULATE_SHORT_WRITE
658 part = Min(part, 4096);
659 #endif
661 /* Count our progress. */
662 sum += part;
663 offset += part;
666 * See what is left. On the first loop we used the caller's array,
667 * but in later loops we'll use our local copy that we are allowed to
668 * mutate.
670 iovcnt = compute_remaining_iovec(iov_copy, iov, iovcnt, part);
671 iov = iov_copy;
672 } while (iovcnt > 0);
674 return sum;
678 * pg_pwrite_zeros
680 * Writes zeros to file worth "size" bytes at "offset" (from the start of the
681 * file), using vectored I/O.
683 * Returns the total amount of data written. On failure, a negative value
684 * is returned with errno set.
686 ssize_t
687 pg_pwrite_zeros(int fd, size_t size, off_t offset)
689 static const PGIOAlignedBlock zbuffer = {{0}}; /* worth BLCKSZ */
690 void *zerobuf_addr = unconstify(PGIOAlignedBlock *, &zbuffer)->data;
691 struct iovec iov[PG_IOV_MAX];
692 size_t remaining_size = size;
693 ssize_t total_written = 0;
695 /* Loop, writing as many blocks as we can for each system call. */
696 while (remaining_size > 0)
698 int iovcnt = 0;
699 ssize_t written;
701 for (; iovcnt < PG_IOV_MAX && remaining_size > 0; iovcnt++)
703 size_t this_iov_size;
705 iov[iovcnt].iov_base = zerobuf_addr;
707 if (remaining_size < BLCKSZ)
708 this_iov_size = remaining_size;
709 else
710 this_iov_size = BLCKSZ;
712 iov[iovcnt].iov_len = this_iov_size;
713 remaining_size -= this_iov_size;
716 written = pg_pwritev_with_retry(fd, iov, iovcnt, offset);
718 if (written < 0)
719 return written;
721 offset += written;
722 total_written += written;
725 Assert(total_written == size);
727 return total_written;