Harmonize more parameter names in bulk.
[pgsql.git] / src / backend / storage / file / fd.c
blob20c3741aa1ee2861552282d2d491fe24e1d4dd1a
1 /*-------------------------------------------------------------------------
3 * fd.c
4 * Virtual file descriptor code.
6 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
9 * IDENTIFICATION
10 * src/backend/storage/file/fd.c
12 * NOTES:
14 * This code manages a cache of 'virtual' file descriptors (VFDs).
15 * The server opens many file descriptors for a variety of reasons,
16 * including base tables, scratch files (e.g., sort and hash spool
17 * files), and random calls to C library routines like system(3); it
18 * is quite easy to exceed system limits on the number of open files a
19 * single process can have. (This is around 1024 on many modern
20 * operating systems, but may be lower on others.)
22 * VFDs are managed as an LRU pool, with actual OS file descriptors
23 * being opened and closed as needed. Obviously, if a routine is
24 * opened using these interfaces, all subsequent operations must also
25 * be through these interfaces (the File type is not a real file
26 * descriptor).
28 * For this scheme to work, most (if not all) routines throughout the
29 * server should use these interfaces instead of calling the C library
30 * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 * may find ourselves short of real file descriptors anyway.
33 * INTERFACE ROUTINES
35 * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 * A File opened with OpenTemporaryFile is automatically deleted when the
37 * File is closed, either explicitly or implicitly at end of transaction or
38 * process exit. PathNameOpenFile is intended for files that are held open
39 * for a long time, like relation files. It is the caller's responsibility
40 * to close them, there is no automatic mechanism in fd.c for that.
42 * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43 * temporary files that have names so that they can be shared between
44 * backends. Such files are automatically closed and count against the
45 * temporary file limit of the backend that creates them, but unlike anonymous
46 * files they are not automatically deleted. See sharedfileset.c for a shared
47 * ownership mechanism that provides automatic cleanup for shared files when
48 * the last of a group of backends detaches.
50 * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51 * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52 * They behave like the corresponding native functions, except that the handle
53 * is registered with the current subtransaction, and will be automatically
54 * closed at abort. These are intended mainly for short operations like
55 * reading a configuration file; there is a limit on the number of files that
56 * can be opened using these functions at any one time.
58 * Finally, BasicOpenFile is just a thin wrapper around open() that can
59 * release file descriptors in use by the virtual file descriptors if
60 * necessary. There is no automatic cleanup of file descriptors returned by
61 * BasicOpenFile, it is solely the caller's responsibility to close the file
62 * descriptor by calling close(2).
64 * If a non-virtual file descriptor needs to be held open for any length of
65 * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66 * (and eventually ReleaseExternalFD), so that we can take it into account
67 * while deciding how many VFDs can be open. This applies to FDs obtained
68 * with BasicOpenFile as well as those obtained without use of any fd.c API.
70 *-------------------------------------------------------------------------
73 #include "postgres.h"
75 #include <dirent.h>
76 #include <sys/file.h>
77 #include <sys/param.h>
78 #include <sys/resource.h> /* for getrlimit */
79 #include <sys/stat.h>
80 #include <sys/types.h>
81 #ifndef WIN32
82 #include <sys/mman.h>
83 #endif
84 #include <limits.h>
85 #include <unistd.h>
86 #include <fcntl.h>
88 #include "access/xact.h"
89 #include "access/xlog.h"
90 #include "catalog/pg_tablespace.h"
91 #include "common/file_perm.h"
92 #include "common/file_utils.h"
93 #include "common/pg_prng.h"
94 #include "miscadmin.h"
95 #include "pgstat.h"
96 #include "port/pg_iovec.h"
97 #include "portability/mem.h"
98 #include "postmaster/startup.h"
99 #include "storage/fd.h"
100 #include "storage/ipc.h"
101 #include "utils/guc.h"
102 #include "utils/resowner_private.h"
104 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
105 #if defined(HAVE_SYNC_FILE_RANGE)
106 #define PG_FLUSH_DATA_WORKS 1
107 #elif !defined(WIN32) && defined(MS_ASYNC)
108 #define PG_FLUSH_DATA_WORKS 1
109 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
110 #define PG_FLUSH_DATA_WORKS 1
111 #endif
114 * We must leave some file descriptors free for system(), the dynamic loader,
115 * and other code that tries to open files without consulting fd.c. This
116 * is the number left free. (While we try fairly hard to prevent EMFILE
117 * errors, there's never any guarantee that we won't get ENFILE due to
118 * other processes chewing up FDs. So it's a bad idea to try to open files
119 * without consulting fd.c. Nonetheless we cannot control all code.)
121 * Because this is just a fixed setting, we are effectively assuming that
122 * no such code will leave FDs open over the long term; otherwise the slop
123 * is likely to be insufficient. Note in particular that we expect that
124 * loading a shared library does not result in any permanent increase in
125 * the number of open files. (This appears to be true on most if not
126 * all platforms as of Feb 2004.)
128 #define NUM_RESERVED_FDS 10
131 * If we have fewer than this many usable FDs after allowing for the reserved
132 * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
133 * much less than that. Note that this value ensures numExternalFDs can be
134 * at least 16; as of this writing, the contrib/postgres_fdw regression tests
135 * will not pass unless that can grow to at least 14.)
137 #define FD_MINFREE 48
140 * A number of platforms allow individual processes to open many more files
141 * than they can really support when *many* processes do the same thing.
142 * This GUC parameter lets the DBA limit max_safe_fds to something less than
143 * what the postmaster's initial probe suggests will work.
145 int max_files_per_process = 1000;
148 * Maximum number of file descriptors to open for operations that fd.c knows
149 * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
150 * to a conservative value, and remains that way indefinitely in bootstrap or
151 * standalone-backend cases. In normal postmaster operation, the postmaster
152 * calls set_max_safe_fds() late in initialization to update the value, and
153 * that value is then inherited by forked subprocesses.
155 * Note: the value of max_files_per_process is taken into account while
156 * setting this variable, and so need not be tested separately.
158 int max_safe_fds = FD_MINFREE; /* default if not changed */
160 /* Whether it is safe to continue running after fsync() fails. */
161 bool data_sync_retry = false;
163 /* How SyncDataDirectory() should do its job. */
164 int recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC;
166 /* Debugging.... */
168 #ifdef FDDEBUG
169 #define DO_DB(A) \
170 do { \
171 int _do_db_save_errno = errno; \
172 A; \
173 errno = _do_db_save_errno; \
174 } while (0)
175 #else
176 #define DO_DB(A) \
177 ((void) 0)
178 #endif
180 #define VFD_CLOSED (-1)
182 #define FileIsValid(file) \
183 ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
185 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
187 /* these are the assigned bits in fdstate below: */
188 #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
189 #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
190 #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
192 typedef struct vfd
194 int fd; /* current FD, or VFD_CLOSED if none */
195 unsigned short fdstate; /* bitflags for VFD's state */
196 ResourceOwner resowner; /* owner, for automatic cleanup */
197 File nextFree; /* link to next free VFD, if in freelist */
198 File lruMoreRecently; /* doubly linked recency-of-use list */
199 File lruLessRecently;
200 off_t fileSize; /* current size of file (0 if not temporary) */
201 char *fileName; /* name of file, or NULL for unused VFD */
202 /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
203 int fileFlags; /* open(2) flags for (re)opening the file */
204 mode_t fileMode; /* mode to pass to open(2) */
205 } Vfd;
208 * Virtual File Descriptor array pointer and size. This grows as
209 * needed. 'File' values are indexes into this array.
210 * Note that VfdCache[0] is not a usable VFD, just a list header.
212 static Vfd *VfdCache;
213 static Size SizeVfdCache = 0;
216 * Number of file descriptors known to be in use by VFD entries.
218 static int nfile = 0;
221 * Flag to tell whether it's worth scanning VfdCache looking for temp files
222 * to close
224 static bool have_xact_temporary_files = false;
227 * Tracks the total size of all temporary files. Note: when temp_file_limit
228 * is being enforced, this cannot overflow since the limit cannot be more
229 * than INT_MAX kilobytes. When not enforcing, it could theoretically
230 * overflow, but we don't care.
232 static uint64 temporary_files_size = 0;
234 /* Temporary file access initialized and not yet shut down? */
235 #ifdef USE_ASSERT_CHECKING
236 static bool temporary_files_allowed = false;
237 #endif
240 * List of OS handles opened with AllocateFile, AllocateDir and
241 * OpenTransientFile.
243 typedef enum
245 AllocateDescFile,
246 AllocateDescPipe,
247 AllocateDescDir,
248 AllocateDescRawFD
249 } AllocateDescKind;
251 typedef struct
253 AllocateDescKind kind;
254 SubTransactionId create_subid;
255 union
257 FILE *file;
258 DIR *dir;
259 int fd;
260 } desc;
261 } AllocateDesc;
263 static int numAllocatedDescs = 0;
264 static int maxAllocatedDescs = 0;
265 static AllocateDesc *allocatedDescs = NULL;
268 * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
270 static int numExternalFDs = 0;
273 * Number of temporary files opened during the current session;
274 * this is used in generation of tempfile names.
276 static long tempFileCounter = 0;
279 * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
280 * indicating that the current database's default tablespace should be used.)
281 * When numTempTableSpaces is -1, this has not been set in the current
282 * transaction.
284 static Oid *tempTableSpaces = NULL;
285 static int numTempTableSpaces = -1;
286 static int nextTempTableSpace = 0;
289 /*--------------------
291 * Private Routines
293 * Delete - delete a file from the Lru ring
294 * LruDelete - remove a file from the Lru ring and close its FD
295 * Insert - put a file at the front of the Lru ring
296 * LruInsert - put a file at the front of the Lru ring and open it
297 * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
298 * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
299 * AllocateVfd - grab a free (or new) file record (from VfdCache)
300 * FreeVfd - free a file record
302 * The Least Recently Used ring is a doubly linked list that begins and
303 * ends on element zero. Element zero is special -- it doesn't represent
304 * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
305 * anchor that shows us the beginning/end of the ring.
306 * Only VFD elements that are currently really open (have an FD assigned) are
307 * in the Lru ring. Elements that are "virtually" open can be recognized
308 * by having a non-null fileName field.
310 * example:
312 * /--less----\ /---------\
313 * v \ v \
314 * #0 --more---> LeastRecentlyUsed --more-\ \
315 * ^\ | |
316 * \\less--> MostRecentlyUsedFile <---/ |
317 * \more---/ \--less--/
319 *--------------------
321 static void Delete(File file);
322 static void LruDelete(File file);
323 static void Insert(File file);
324 static int LruInsert(File file);
325 static bool ReleaseLruFile(void);
326 static void ReleaseLruFiles(void);
327 static File AllocateVfd(void);
328 static void FreeVfd(File file);
330 static int FileAccess(File file);
331 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
332 static bool reserveAllocatedDesc(void);
333 static int FreeDesc(AllocateDesc *desc);
335 static void BeforeShmemExit_Files(int code, Datum arg);
336 static void CleanupTempFiles(bool isCommit, bool isProcExit);
337 static void RemovePgTempRelationFiles(const char *tsdirname);
338 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
340 static void walkdir(const char *path,
341 void (*action) (const char *fname, bool isdir, int elevel),
342 bool process_symlinks,
343 int elevel);
344 #ifdef PG_FLUSH_DATA_WORKS
345 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
346 #endif
347 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
348 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
350 static int fsync_parent_path(const char *fname, int elevel);
/*
 * pg_fsync --- fsync a descriptor, with or without writethrough
 *
 * Dispatches to pg_fsync_writethrough() when sync_method asks for it (and the
 * platform distinguishes it from plain fsync), otherwise to
 * pg_fsync_no_writethrough().  Returns whatever the chosen routine returns.
 */
int
pg_fsync(int fd)
{
#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
	struct stat statbuf;

	/*
	 * fsync() implementations differ in which access modes they require of
	 * the descriptor, and the requirement depends on whether it refers to a
	 * directory.  Anything that may eventually reach fsync() should thus be
	 * opened in a way that is acceptable everywhere, even if the current
	 * platform happens to be lenient.
	 *
	 * Hence: regular files must have been opened for writing (O_RDWR or
	 * O_WRONLY), directories must not (O_RDONLY).
	 *
	 * fstat() failures are ignored; the fsync() below gets to report them.
	 * The check is made here so that it also runs when fsync is disabled.
	 */
	if (fstat(fd, &statbuf) == 0)
	{
		int			desc_flags = fcntl(fd, F_GETFL);

		/*
		 * O_RDONLY is historically 0, so for directories we can only verify
		 * that neither write flag is set.
		 */
		if (S_ISDIR(statbuf.st_mode))
			Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
		else
			Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
	}
	errno = 0;
#endif

	/* The sync_method test is compiled only where it can matter */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
#endif

	return pg_fsync_no_writethrough(fd);
}
408 * pg_fsync_no_writethrough --- same as fsync except does nothing if
409 * enableFsync is off
412 pg_fsync_no_writethrough(int fd)
414 if (enableFsync)
415 return fsync(fd);
416 else
417 return 0;
421 * pg_fsync_writethrough
424 pg_fsync_writethrough(int fd)
426 if (enableFsync)
428 #ifdef WIN32
429 return _commit(fd);
430 #elif defined(F_FULLFSYNC)
431 return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
432 #else
433 errno = ENOSYS;
434 return -1;
435 #endif
437 else
438 return 0;
442 * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
445 pg_fdatasync(int fd)
447 if (enableFsync)
448 return fdatasync(fd);
449 else
450 return 0;
454 * pg_flush_data --- advise OS that the described dirty data should be flushed
456 * offset of 0 with nbytes 0 means that the entire file should be flushed
458 void
459 pg_flush_data(int fd, off_t offset, off_t nbytes)
462 * Right now file flushing is primarily used to avoid making later
463 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
464 * if fsyncs are disabled - that's a decision we might want to make
465 * configurable at some point.
467 if (!enableFsync)
468 return;
471 * We compile all alternatives that are supported on the current platform,
472 * to find portability problems more easily.
474 #if defined(HAVE_SYNC_FILE_RANGE)
476 int rc;
477 static bool not_implemented_by_kernel = false;
479 if (not_implemented_by_kernel)
480 return;
483 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
484 * tells the OS that writeback for the specified blocks should be
485 * started, but that we don't want to wait for completion. Note that
486 * this call might block if too much dirty data exists in the range.
487 * This is the preferable method on OSs supporting it, as it works
488 * reliably when available (contrast to msync()) and doesn't flush out
489 * clean data (like FADV_DONTNEED).
491 rc = sync_file_range(fd, offset, nbytes,
492 SYNC_FILE_RANGE_WRITE);
493 if (rc != 0)
495 int elevel;
498 * For systems that don't have an implementation of
499 * sync_file_range() such as Windows WSL, generate only one
500 * warning and then suppress all further attempts by this process.
502 if (errno == ENOSYS)
504 elevel = WARNING;
505 not_implemented_by_kernel = true;
507 else
508 elevel = data_sync_elevel(WARNING);
510 ereport(elevel,
511 (errcode_for_file_access(),
512 errmsg("could not flush dirty data: %m")));
515 return;
517 #endif
518 #if !defined(WIN32) && defined(MS_ASYNC)
520 void *p;
521 static int pagesize = 0;
524 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
525 * writeback. On linux it only does so if MS_SYNC is specified, but
526 * then it does the writeback synchronously. Luckily all common linux
527 * systems have sync_file_range(). This is preferable over
528 * FADV_DONTNEED because it doesn't flush out clean data.
530 * We map the file (mmap()), tell the kernel to sync back the contents
531 * (msync()), and then remove the mapping again (munmap()).
534 /* mmap() needs actual length if we want to map whole file */
535 if (offset == 0 && nbytes == 0)
537 nbytes = lseek(fd, 0, SEEK_END);
538 if (nbytes < 0)
540 ereport(WARNING,
541 (errcode_for_file_access(),
542 errmsg("could not determine dirty data size: %m")));
543 return;
548 * Some platforms reject partial-page mmap() attempts. To deal with
549 * that, just truncate the request to a page boundary. If any extra
550 * bytes don't get flushed, well, it's only a hint anyway.
553 /* fetch pagesize only once */
554 if (pagesize == 0)
555 pagesize = sysconf(_SC_PAGESIZE);
557 /* align length to pagesize, dropping any fractional page */
558 if (pagesize > 0)
559 nbytes = (nbytes / pagesize) * pagesize;
561 /* fractional-page request is a no-op */
562 if (nbytes <= 0)
563 return;
566 * mmap could well fail, particularly on 32-bit platforms where there
567 * may simply not be enough address space. If so, silently fall
568 * through to the next implementation.
570 if (nbytes <= (off_t) SSIZE_MAX)
571 p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
572 else
573 p = MAP_FAILED;
575 if (p != MAP_FAILED)
577 int rc;
579 rc = msync(p, (size_t) nbytes, MS_ASYNC);
580 if (rc != 0)
582 ereport(data_sync_elevel(WARNING),
583 (errcode_for_file_access(),
584 errmsg("could not flush dirty data: %m")));
585 /* NB: need to fall through to munmap()! */
588 rc = munmap(p, (size_t) nbytes);
589 if (rc != 0)
591 /* FATAL error because mapping would remain */
592 ereport(FATAL,
593 (errcode_for_file_access(),
594 errmsg("could not munmap() while flushing data: %m")));
597 return;
600 #endif
601 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
603 int rc;
606 * Signal the kernel that the passed in range should not be cached
607 * anymore. This has the, desired, side effect of writing out dirty
608 * data, and the, undesired, side effect of likely discarding useful
609 * clean cached blocks. For the latter reason this is the least
610 * preferable method.
613 rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
615 if (rc != 0)
617 /* don't error out, this is just a performance optimization */
618 ereport(WARNING,
619 (errcode_for_file_access(),
620 errmsg("could not flush dirty data: %m")));
623 return;
625 #endif
/*
 * Truncate a file to a given length by name.
 *
 * path is the file to truncate, length the size it should have afterwards.
 * Returns 0 on success, or -1 with errno set on failure.
 */
int
pg_truncate(const char *path, off_t length)
{
#ifdef WIN32
	int			save_errno;
	int			ret;
	int			fd;

	/* No truncate-by-name here, so go through a transient descriptor */
	fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
	if (fd >= 0)
	{
		/* honor the requested length rather than always cutting to zero */
		ret = ftruncate(fd, length);

		/* keep ftruncate()'s errno across the close */
		save_errno = errno;
		CloseTransientFile(fd);
		errno = save_errno;
	}
	else
		ret = -1;

	return ret;
#else
	return truncate(path, length);
#endif
}
657 * fsync_fname -- fsync a file or directory, handling errors properly
659 * Try to fsync a file or directory. When doing the latter, ignore errors that
660 * indicate the OS just doesn't allow/require fsyncing directories.
662 void
663 fsync_fname(const char *fname, bool isdir)
665 fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
669 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
671 * This routine ensures that, after returning, the effect of renaming file
672 * persists in case of a crash. A crash while this routine is running will
673 * leave you with either the pre-existing or the moved file in place of the
674 * new file; no mixed state or truncated files are possible.
676 * It does so by using fsync on the old filename and the possibly existing
677 * target filename before the rename, and the target file and directory after.
679 * Note that rename() cannot be used across arbitrary directories, as they
680 * might not be on the same filesystem. Therefore this routine does not
681 * support renaming across directories.
683 * Log errors with the caller specified severity.
685 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
686 * valid upon return.
689 durable_rename(const char *oldfile, const char *newfile, int elevel)
691 int fd;
694 * First fsync the old and target path (if it exists), to ensure that they
695 * are properly persistent on disk. Syncing the target file is not
696 * strictly necessary, but it makes it easier to reason about crashes;
697 * because it's then guaranteed that either source or target file exists
698 * after a crash.
700 if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
701 return -1;
703 fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
704 if (fd < 0)
706 if (errno != ENOENT)
708 ereport(elevel,
709 (errcode_for_file_access(),
710 errmsg("could not open file \"%s\": %m", newfile)));
711 return -1;
714 else
716 if (pg_fsync(fd) != 0)
718 int save_errno;
720 /* close file upon error, might not be in transaction context */
721 save_errno = errno;
722 CloseTransientFile(fd);
723 errno = save_errno;
725 ereport(elevel,
726 (errcode_for_file_access(),
727 errmsg("could not fsync file \"%s\": %m", newfile)));
728 return -1;
731 if (CloseTransientFile(fd) != 0)
733 ereport(elevel,
734 (errcode_for_file_access(),
735 errmsg("could not close file \"%s\": %m", newfile)));
736 return -1;
740 /* Time to do the real deal... */
741 if (rename(oldfile, newfile) < 0)
743 ereport(elevel,
744 (errcode_for_file_access(),
745 errmsg("could not rename file \"%s\" to \"%s\": %m",
746 oldfile, newfile)));
747 return -1;
751 * To guarantee renaming the file is persistent, fsync the file with its
752 * new name, and its containing directory.
754 if (fsync_fname_ext(newfile, false, false, elevel) != 0)
755 return -1;
757 if (fsync_parent_path(newfile, elevel) != 0)
758 return -1;
760 return 0;
/*
 * durable_unlink -- remove a file such that the removal is durable
 *
 * After a successful return the removal survives a crash, and a crash while
 * this runs does not leave a mixed state.  This is done by fsyncing the
 * file's parent directory once the file has been unlinked.
 *
 * Errors are logged at the severity given by the caller.
 *
 * Returns 0 on success, -1 otherwise.  errno is not valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	int			rc = unlink(fname);

	if (rc < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/* Persist the removal by syncing the containing directory */
	return (fsync_parent_path(fname, elevel) != 0) ? -1 : 0;
}
801 * InitFileAccess --- initialize this module during backend startup
803 * This is called during either normal or standalone backend start.
804 * It is *not* called in the postmaster.
806 * Note that this does not initialize temporary file access, that is
807 * separately initialized via InitTemporaryFileAccess().
809 void
810 InitFileAccess(void)
812 Assert(SizeVfdCache == 0); /* call me only once */
814 /* initialize cache header entry */
815 VfdCache = (Vfd *) malloc(sizeof(Vfd));
816 if (VfdCache == NULL)
817 ereport(FATAL,
818 (errcode(ERRCODE_OUT_OF_MEMORY),
819 errmsg("out of memory")));
821 MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
822 VfdCache->fd = VFD_CLOSED;
824 SizeVfdCache = 1;
828 * InitTemporaryFileAccess --- initialize temporary file access during startup
830 * This is called during either normal or standalone backend start.
831 * It is *not* called in the postmaster.
833 * This is separate from InitFileAccess() because temporary file cleanup can
834 * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
835 * our reporting has to happen before that. Low level file access should be
836 * available for longer, hence the separate initialization / shutdown of
837 * temporary file handling.
839 void
840 InitTemporaryFileAccess(void)
842 Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
843 Assert(!temporary_files_allowed); /* call me only once */
846 * Register before-shmem-exit hook to ensure temp files are dropped while
847 * we can still report stats.
849 before_shmem_exit(BeforeShmemExit_Files, 0);
851 #ifdef USE_ASSERT_CHECKING
852 temporary_files_allowed = true;
853 #endif
857 * count_usable_fds --- count how many FDs the system will let us open,
858 * and estimate how many are already open.
860 * We stop counting if usable_fds reaches max_to_probe. Note: a small
861 * value of max_to_probe might result in an underestimate of already_open;
862 * we must fill in any "gaps" in the set of used FDs before the calculation
863 * of already_open will give the right answer. In practice, max_to_probe
864 * of a couple of dozen should be enough to ensure good results.
866 * We assume stderr (FD 2) is available for dup'ing. While the calling
867 * script could theoretically close that, it would be a really bad idea,
868 * since then one risks loss of error messages from, e.g., libc.
870 static void
871 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
873 int *fd;
874 int size;
875 int used = 0;
876 int highestfd = 0;
877 int j;
879 #ifdef HAVE_GETRLIMIT
880 struct rlimit rlim;
881 int getrlimit_status;
882 #endif
884 size = 1024;
885 fd = (int *) palloc(size * sizeof(int));
887 #ifdef HAVE_GETRLIMIT
888 getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
889 if (getrlimit_status != 0)
890 ereport(WARNING, (errmsg("getrlimit failed: %m")));
891 #endif /* HAVE_GETRLIMIT */
893 /* dup until failure or probe limit reached */
894 for (;;)
896 int thisfd;
898 #ifdef HAVE_GETRLIMIT
901 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
902 * some platforms
904 if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
905 break;
906 #endif
908 thisfd = dup(2);
909 if (thisfd < 0)
911 /* Expect EMFILE or ENFILE, else it's fishy */
912 if (errno != EMFILE && errno != ENFILE)
913 elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
914 break;
917 if (used >= size)
919 size *= 2;
920 fd = (int *) repalloc(fd, size * sizeof(int));
922 fd[used++] = thisfd;
924 if (highestfd < thisfd)
925 highestfd = thisfd;
927 if (used >= max_to_probe)
928 break;
931 /* release the files we opened */
932 for (j = 0; j < used; j++)
933 close(fd[j]);
935 pfree(fd);
938 * Return results. usable_fds is just the number of successful dups. We
939 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
940 * number) and so already_open is highestfd+1 - usable_fds.
942 *usable_fds = used;
943 *already_open = highestfd + 1 - used;
947 * set_max_safe_fds
948 * Determine number of file descriptors that fd.c is allowed to use
950 void
951 set_max_safe_fds(void)
953 int usable_fds;
954 int already_open;
956 /*----------
957 * We want to set max_safe_fds to
958 * MIN(usable_fds, max_files_per_process - already_open)
959 * less the slop factor for files that are opened without consulting
960 * fd.c. This ensures that we won't exceed either max_files_per_process
961 * or the experimentally-determined EMFILE limit.
962 *----------
964 count_usable_fds(max_files_per_process,
965 &usable_fds, &already_open);
967 max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
970 * Take off the FDs reserved for system() etc.
972 max_safe_fds -= NUM_RESERVED_FDS;
975 * Make sure we still have enough to get by.
977 if (max_safe_fds < FD_MINFREE)
978 ereport(FATAL,
979 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
980 errmsg("insufficient file descriptors available to start server process"),
981 errdetail("System allows %d, we need at least %d.",
982 max_safe_fds + NUM_RESERVED_FDS,
983 FD_MINFREE + NUM_RESERVED_FDS)));
985 elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
986 max_safe_fds, usable_fds, already_open);
990 * Open a file with BasicOpenFilePerm() and pass default file mode for the
991 * fileMode parameter.
994 BasicOpenFile(const char *fileName, int fileFlags)
996 return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1000 * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1002 * This is exported for use by places that really want a plain kernel FD,
1003 * but need to be proof against running out of FDs. Once an FD has been
1004 * successfully returned, it is the caller's responsibility to ensure that
1005 * it will not be leaked on ereport()! Most users should *not* call this
1006 * routine directly, but instead use the VFD abstraction level, which
1007 * provides protection against descriptor leaks as well as management of
1008 * files that need to be open for more than a short period of time.
1010 * Ideally this should be the *only* direct call of open() in the backend.
1011 * In practice, the postmaster calls open() directly, and there are some
1012 * direct open() calls done early in backend startup. Those are OK since
1013 * this module wouldn't have any open files to close at that point anyway.
1016 BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1018 int fd;
1020 tryAgain:
1021 #ifdef PG_O_DIRECT_USE_F_NOCACHE
1024 * The value we defined to stand in for O_DIRECT when simulating it with
1025 * F_NOCACHE had better not collide with any of the standard flags.
1027 StaticAssertStmt((PG_O_DIRECT &
1028 (O_APPEND |
1029 O_CREAT |
1030 O_EXCL |
1031 O_RDWR |
1032 O_RDONLY |
1033 O_SYNC |
1034 O_TRUNC |
1035 O_WRONLY)) == 0,
1036 "PG_O_DIRECT value collides with standard flag");
1037 #if defined(O_CLOEXEC)
1038 StaticAssertStmt((PG_O_DIRECT & O_CLOEXEC) == 0,
1039 "PG_O_DIRECT value collides with O_CLOEXEC");
1040 #endif
1041 #if defined(O_DSYNC)
1042 StaticAssertStmt((PG_O_DIRECT & O_DSYNC) == 0,
1043 "PG_O_DIRECT value collides with O_DSYNC");
1044 #endif
1046 fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1047 #else
1048 fd = open(fileName, fileFlags, fileMode);
1049 #endif
1051 if (fd >= 0)
1053 #ifdef PG_O_DIRECT_USE_F_NOCACHE
1054 if (fileFlags & PG_O_DIRECT)
1056 if (fcntl(fd, F_NOCACHE, 1) < 0)
1058 int save_errno = errno;
1060 close(fd);
1061 errno = save_errno;
1062 return -1;
1065 #endif
1067 return fd; /* success! */
1070 if (errno == EMFILE || errno == ENFILE)
1072 int save_errno = errno;
1074 ereport(LOG,
1075 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1076 errmsg("out of file descriptors: %m; release and retry")));
1077 errno = 0;
1078 if (ReleaseLruFile())
1079 goto tryAgain;
1080 errno = save_errno;
1083 return -1; /* failure */
1087 * AcquireExternalFD - attempt to reserve an external file descriptor
1089 * This should be used by callers that need to hold a file descriptor open
1090 * over more than a short interval, but cannot use any of the other facilities
1091 * provided by this module.
1093 * The difference between this and the underlying ReserveExternalFD function
1094 * is that this will report failure (by setting errno and returning false)
1095 * if "too many" external FDs are already reserved. This should be used in
1096 * any code where the total number of FDs to be reserved is not predictable
1097 * and small.
1099 bool
1100 AcquireExternalFD(void)
1103 * We don't want more than max_safe_fds / 3 FDs to be consumed for
1104 * "external" FDs.
1106 if (numExternalFDs < max_safe_fds / 3)
1108 ReserveExternalFD();
1109 return true;
1111 errno = EMFILE;
1112 return false;
1116 * ReserveExternalFD - report external consumption of a file descriptor
1118 * This should be used by callers that need to hold a file descriptor open
1119 * over more than a short interval, but cannot use any of the other facilities
1120 * provided by this module. This just tracks the use of the FD and closes
1121 * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1123 * Call this directly only in code where failure to reserve the FD would be
1124 * fatal; for example, the WAL-writing code does so, since the alternative is
1125 * session failure. Also, it's very unwise to do so in code that could
1126 * consume more than one FD per process.
1128 * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1129 * available, it doesn't matter too much whether this is called before or
1130 * after actually opening the FD; but doing so beforehand reduces the risk of
1131 * an EMFILE failure if not everybody played nice. In any case, it's solely
1132 * caller's responsibility to keep the external-FD count in sync with reality.
1134 void
1135 ReserveExternalFD(void)
1138 * Release VFDs if needed to stay safe. Because we do this before
1139 * incrementing numExternalFDs, the final state will be as desired, i.e.,
1140 * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1142 ReleaseLruFiles();
1144 numExternalFDs++;
1148 * ReleaseExternalFD - report release of an external file descriptor
1150 * This is guaranteed not to change errno, so it can be used in failure paths.
1152 void
1153 ReleaseExternalFD(void)
1155 Assert(numExternalFDs > 0);
1156 numExternalFDs--;
#if defined(FDDEBUG)

/*
 * _dump_lru --- debugging aid (FDDEBUG builds only)
 *
 * Walks the LRU ring starting from the entry VfdCache[0] names as most
 * recently used, following lruLessRecently links until the walk returns to
 * slot 0, and emits the visited slot numbers as a single LOG line.
 */
static void
_dump_lru(void)
{
	char		buf[2048];
	int			cur = VfdCache[0].lruLessRecently;
	size_t		used;

	snprintf(buf, sizeof(buf), "LRU: MOST %d ", cur);
	while (cur != 0)
	{
		cur = VfdCache[cur].lruLessRecently;
		used = strlen(buf);
		snprintf(buf + used, sizeof(buf) - used, "%d ", cur);
	}
	used = strlen(buf);
	snprintf(buf + used, sizeof(buf) - used, "LEAST");
	elog(LOG, "%s", buf);
}
#endif							/* FDDEBUG */
1181 static void
1182 Delete(File file)
1184 Vfd *vfdP;
1186 Assert(file != 0);
1188 DO_DB(elog(LOG, "Delete %d (%s)",
1189 file, VfdCache[file].fileName));
1190 DO_DB(_dump_lru());
1192 vfdP = &VfdCache[file];
1194 VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1195 VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1197 DO_DB(_dump_lru());
1200 static void
1201 LruDelete(File file)
1203 Vfd *vfdP;
1205 Assert(file != 0);
1207 DO_DB(elog(LOG, "LruDelete %d (%s)",
1208 file, VfdCache[file].fileName));
1210 vfdP = &VfdCache[file];
1213 * Close the file. We aren't expecting this to fail; if it does, better
1214 * to leak the FD than to mess up our internal state.
1216 if (close(vfdP->fd) != 0)
1217 elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1218 "could not close file \"%s\": %m", vfdP->fileName);
1219 vfdP->fd = VFD_CLOSED;
1220 --nfile;
1222 /* delete the vfd record from the LRU ring */
1223 Delete(file);
1226 static void
1227 Insert(File file)
1229 Vfd *vfdP;
1231 Assert(file != 0);
1233 DO_DB(elog(LOG, "Insert %d (%s)",
1234 file, VfdCache[file].fileName));
1235 DO_DB(_dump_lru());
1237 vfdP = &VfdCache[file];
1239 vfdP->lruMoreRecently = 0;
1240 vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1241 VfdCache[0].lruLessRecently = file;
1242 VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1244 DO_DB(_dump_lru());
1247 /* returns 0 on success, -1 on re-open failure (with errno set) */
1248 static int
1249 LruInsert(File file)
1251 Vfd *vfdP;
1253 Assert(file != 0);
1255 DO_DB(elog(LOG, "LruInsert %d (%s)",
1256 file, VfdCache[file].fileName));
1258 vfdP = &VfdCache[file];
1260 if (FileIsNotOpen(file))
1262 /* Close excess kernel FDs. */
1263 ReleaseLruFiles();
1266 * The open could still fail for lack of file descriptors, eg due to
1267 * overall system file table being full. So, be prepared to release
1268 * another FD if necessary...
1270 vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1271 vfdP->fileMode);
1272 if (vfdP->fd < 0)
1274 DO_DB(elog(LOG, "re-open failed: %m"));
1275 return -1;
1277 else
1279 ++nfile;
1284 * put it at the head of the Lru ring
1287 Insert(file);
1289 return 0;
1293 * Release one kernel FD by closing the least-recently-used VFD.
1295 static bool
1296 ReleaseLruFile(void)
1298 DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1300 if (nfile > 0)
1303 * There are opened files and so there should be at least one used vfd
1304 * in the ring.
1306 Assert(VfdCache[0].lruMoreRecently != 0);
1307 LruDelete(VfdCache[0].lruMoreRecently);
1308 return true; /* freed a file */
1310 return false; /* no files available to free */
1314 * Release kernel FDs as needed to get under the max_safe_fds limit.
1315 * After calling this, it's OK to try to open another file.
1317 static void
1318 ReleaseLruFiles(void)
1320 while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
1322 if (!ReleaseLruFile())
1323 break;
1327 static File
1328 AllocateVfd(void)
1330 Index i;
1331 File file;
1333 DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1335 Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1337 if (VfdCache[0].nextFree == 0)
1340 * The free list is empty so it is time to increase the size of the
1341 * array. We choose to double it each time this happens. However,
1342 * there's not much point in starting *real* small.
1344 Size newCacheSize = SizeVfdCache * 2;
1345 Vfd *newVfdCache;
1347 if (newCacheSize < 32)
1348 newCacheSize = 32;
1351 * Be careful not to clobber VfdCache ptr if realloc fails.
1353 newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1354 if (newVfdCache == NULL)
1355 ereport(ERROR,
1356 (errcode(ERRCODE_OUT_OF_MEMORY),
1357 errmsg("out of memory")));
1358 VfdCache = newVfdCache;
1361 * Initialize the new entries and link them into the free list.
1363 for (i = SizeVfdCache; i < newCacheSize; i++)
1365 MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1366 VfdCache[i].nextFree = i + 1;
1367 VfdCache[i].fd = VFD_CLOSED;
1369 VfdCache[newCacheSize - 1].nextFree = 0;
1370 VfdCache[0].nextFree = SizeVfdCache;
1373 * Record the new size
1375 SizeVfdCache = newCacheSize;
1378 file = VfdCache[0].nextFree;
1380 VfdCache[0].nextFree = VfdCache[file].nextFree;
1382 return file;
1385 static void
1386 FreeVfd(File file)
1388 Vfd *vfdP = &VfdCache[file];
1390 DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1391 file, vfdP->fileName ? vfdP->fileName : ""));
1393 if (vfdP->fileName != NULL)
1395 free(vfdP->fileName);
1396 vfdP->fileName = NULL;
1398 vfdP->fdstate = 0x0;
1400 vfdP->nextFree = VfdCache[0].nextFree;
1401 VfdCache[0].nextFree = file;
1404 /* returns 0 on success, -1 on re-open failure (with errno set) */
1405 static int
1406 FileAccess(File file)
1408 int returnValue;
1410 DO_DB(elog(LOG, "FileAccess %d (%s)",
1411 file, VfdCache[file].fileName));
1414 * Is the file open? If not, open it and put it at the head of the LRU
1415 * ring (possibly closing the least recently used file to get an FD).
1418 if (FileIsNotOpen(file))
1420 returnValue = LruInsert(file);
1421 if (returnValue != 0)
1422 return returnValue;
1424 else if (VfdCache[0].lruLessRecently != file)
1427 * We now know that the file is open and that it is not the last one
1428 * accessed, so we need to move it to the head of the Lru ring.
1431 Delete(file);
1432 Insert(file);
1435 return 0;
1439 * Called whenever a temporary file is deleted to report its size.
1441 static void
1442 ReportTemporaryFileUsage(const char *path, off_t size)
1444 pgstat_report_tempfile(size);
1446 if (log_temp_files >= 0)
1448 if ((size / 1024) >= log_temp_files)
1449 ereport(LOG,
1450 (errmsg("temporary file: path \"%s\", size %lu",
1451 path, (unsigned long) size)));
1456 * Called to register a temporary file for automatic close.
1457 * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1458 * before the file was opened.
1460 static void
1461 RegisterTemporaryFile(File file)
1463 ResourceOwnerRememberFile(CurrentResourceOwner, file);
1464 VfdCache[file].resowner = CurrentResourceOwner;
1466 /* Backup mechanism for closing at end of xact. */
1467 VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1468 have_xact_temporary_files = true;
/*
 * Called when we get a shared invalidation message on some relation.
 *
 * Closes the kernel FD behind the VFD if it currently has one; the VFD
 * itself stays valid.  Compiled only when NOT_USED is defined.
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	if (FileIsNotOpen(file))
		return;

	LruDelete(file);
}
#endif
1485 * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1486 * fileMode parameter.
1488 File
1489 PathNameOpenFile(const char *fileName, int fileFlags)
1491 return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1495 * open a file in an arbitrary directory
1497 * NB: if the passed pathname is relative (which it usually is),
1498 * it will be interpreted relative to the process' working directory
1499 * (which should always be $PGDATA when this code is running).
1501 File
1502 PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1504 char *fnamecopy;
1505 File file;
1506 Vfd *vfdP;
1508 DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1509 fileName, fileFlags, fileMode));
1512 * We need a malloc'd copy of the file name; fail cleanly if no room.
1514 fnamecopy = strdup(fileName);
1515 if (fnamecopy == NULL)
1516 ereport(ERROR,
1517 (errcode(ERRCODE_OUT_OF_MEMORY),
1518 errmsg("out of memory")));
1520 file = AllocateVfd();
1521 vfdP = &VfdCache[file];
1523 /* Close excess kernel FDs. */
1524 ReleaseLruFiles();
1526 vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1528 if (vfdP->fd < 0)
1530 int save_errno = errno;
1532 FreeVfd(file);
1533 free(fnamecopy);
1534 errno = save_errno;
1535 return -1;
1537 ++nfile;
1538 DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1539 vfdP->fd));
1541 vfdP->fileName = fnamecopy;
1542 /* Saved flags are adjusted to be OK for re-opening file */
1543 vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1544 vfdP->fileMode = fileMode;
1545 vfdP->fileSize = 0;
1546 vfdP->fdstate = 0x0;
1547 vfdP->resowner = NULL;
1549 Insert(file);
1551 return file;
1555 * Create directory 'directory'. If necessary, create 'basedir', which must
1556 * be the directory above it. This is designed for creating the top-level
1557 * temporary directory on demand before creating a directory underneath it.
1558 * Do nothing if the directory already exists.
1560 * Directories created within the top-level temporary directory should begin
1561 * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1562 * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1563 * that do not need any particular prefix.
1565 void
1566 PathNameCreateTemporaryDir(const char *basedir, const char *directory)
1568 if (MakePGDirectory(directory) < 0)
1570 if (errno == EEXIST)
1571 return;
1574 * Failed. Try to create basedir first in case it's missing. Tolerate
1575 * EEXIST to close a race against another process following the same
1576 * algorithm.
1578 if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1579 ereport(ERROR,
1580 (errcode_for_file_access(),
1581 errmsg("cannot create temporary directory \"%s\": %m",
1582 basedir)));
1584 /* Try again. */
1585 if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1586 ereport(ERROR,
1587 (errcode_for_file_access(),
1588 errmsg("cannot create temporary subdirectory \"%s\": %m",
1589 directory)));
1594 * Delete a directory and everything in it, if it exists.
1596 void
1597 PathNameDeleteTemporaryDir(const char *dirname)
1599 struct stat statbuf;
1601 /* Silently ignore missing directory. */
1602 if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1603 return;
1606 * Currently, walkdir doesn't offer a way for our passed in function to
1607 * maintain state. Perhaps it should, so that we could tell the caller
1608 * whether this operation succeeded or failed. Since this operation is
1609 * used in a cleanup path, we wouldn't actually behave differently: we'll
1610 * just log failures.
1612 walkdir(dirname, unlink_if_exists_fname, false, LOG);
1616 * Open a temporary file that will disappear when we close it.
1618 * This routine takes care of generating an appropriate tempfile name.
1619 * There's no need to pass in fileFlags or fileMode either, since only
1620 * one setting makes any sense for a temp file.
1622 * Unless interXact is true, the file is remembered by CurrentResourceOwner
1623 * to ensure it's closed and deleted when it's no longer needed, typically at
1624 * the end-of-transaction. In most cases, you don't want temporary files to
1625 * outlive the transaction that created them, so this should be false -- but
1626 * if you need "somewhat" temporary storage, this might be useful. In either
1627 * case, the file is removed when the File is explicitly closed.
1629 File
1630 OpenTemporaryFile(bool interXact)
1632 File file = 0;
1634 Assert(temporary_files_allowed); /* check temp file access is up */
1637 * Make sure the current resource owner has space for this File before we
1638 * open it, if we'll be registering it below.
1640 if (!interXact)
1641 ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1644 * If some temp tablespace(s) have been given to us, try to use the next
1645 * one. If a given tablespace can't be found, we silently fall back to
1646 * the database's default tablespace.
1648 * BUT: if the temp file is slated to outlive the current transaction,
1649 * force it into the database's default tablespace, so that it will not
1650 * pose a threat to possible tablespace drop attempts.
1652 if (numTempTableSpaces > 0 && !interXact)
1654 Oid tblspcOid = GetNextTempTableSpace();
1656 if (OidIsValid(tblspcOid))
1657 file = OpenTemporaryFileInTablespace(tblspcOid, false);
1661 * If not, or if tablespace is bad, create in database's default
1662 * tablespace. MyDatabaseTableSpace should normally be set before we get
1663 * here, but just in case it isn't, fall back to pg_default tablespace.
1665 if (file <= 0)
1666 file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1667 MyDatabaseTableSpace :
1668 DEFAULTTABLESPACE_OID,
1669 true);
1671 /* Mark it for deletion at close and temporary file size limit */
1672 VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1674 /* Register it with the current resource owner */
1675 if (!interXact)
1676 RegisterTemporaryFile(file);
1678 return file;
1682 * Return the path of the temp directory in a given tablespace.
1684 void
1685 TempTablespacePath(char *path, Oid tablespace)
1688 * Identify the tempfile directory for this tablespace.
1690 * If someone tries to specify pg_global, use pg_default instead.
1692 if (tablespace == InvalidOid ||
1693 tablespace == DEFAULTTABLESPACE_OID ||
1694 tablespace == GLOBALTABLESPACE_OID)
1695 snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1696 else
1698 /* All other tablespaces are accessed via symlinks */
1699 snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1700 tablespace, TABLESPACE_VERSION_DIRECTORY,
1701 PG_TEMP_FILES_DIR);
1706 * Open a temporary file in a specific tablespace.
1707 * Subroutine for OpenTemporaryFile, which see for details.
1709 static File
1710 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1712 char tempdirpath[MAXPGPATH];
1713 char tempfilepath[MAXPGPATH];
1714 File file;
1716 TempTablespacePath(tempdirpath, tblspcOid);
1719 * Generate a tempfile name that should be unique within the current
1720 * database instance.
1722 snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1723 tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1726 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1727 * temp file that can be reused.
1729 file = PathNameOpenFile(tempfilepath,
1730 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1731 if (file <= 0)
1734 * We might need to create the tablespace's tempfile directory, if no
1735 * one has yet done so.
1737 * Don't check for an error from MakePGDirectory; it could fail if
1738 * someone else just did the same thing. If it doesn't work then
1739 * we'll bomb out on the second create attempt, instead.
1741 (void) MakePGDirectory(tempdirpath);
1743 file = PathNameOpenFile(tempfilepath,
1744 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1745 if (file <= 0 && rejectError)
1746 elog(ERROR, "could not create temporary file \"%s\": %m",
1747 tempfilepath);
1750 return file;
1755 * Create a new file. The directory containing it must already exist. Files
1756 * created this way are subject to temp_file_limit and are automatically
1757 * closed at end of transaction, but are not automatically deleted on close
1758 * because they are intended to be shared between cooperating backends.
1760 * If the file is inside the top-level temporary directory, its name should
1761 * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1762 * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1763 * inside a directory created with PathNameCreateTemporaryDir(), in which case
1764 * the prefix isn't needed.
1766 File
1767 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1769 File file;
1771 Assert(temporary_files_allowed); /* check temp file access is up */
1773 ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1776 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1777 * temp file that can be reused.
1779 file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1780 if (file <= 0)
1782 if (error_on_failure)
1783 ereport(ERROR,
1784 (errcode_for_file_access(),
1785 errmsg("could not create temporary file \"%s\": %m",
1786 path)));
1787 else
1788 return file;
1791 /* Mark it for temp_file_limit accounting. */
1792 VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1794 /* Register it for automatic close. */
1795 RegisterTemporaryFile(file);
1797 return file;
1801 * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1802 * another backend. Files opened this way don't count against the
1803 * temp_file_limit of the caller, are automatically closed at the end of the
1804 * transaction but are not deleted on close.
1806 File
1807 PathNameOpenTemporaryFile(const char *path, int mode)
1809 File file;
1811 Assert(temporary_files_allowed); /* check temp file access is up */
1813 ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1815 file = PathNameOpenFile(path, mode | PG_BINARY);
1817 /* If no such file, then we don't raise an error. */
1818 if (file <= 0 && errno != ENOENT)
1819 ereport(ERROR,
1820 (errcode_for_file_access(),
1821 errmsg("could not open temporary file \"%s\": %m",
1822 path)));
1824 if (file > 0)
1826 /* Register it for automatic close. */
1827 RegisterTemporaryFile(file);
1830 return file;
1834 * Delete a file by pathname. Return true if the file existed, false if
1835 * didn't.
1837 bool
1838 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1840 struct stat filestats;
1841 int stat_errno;
1843 /* Get the final size for pgstat reporting. */
1844 if (stat(path, &filestats) != 0)
1845 stat_errno = errno;
1846 else
1847 stat_errno = 0;
1850 * Unlike FileClose's automatic file deletion code, we tolerate
1851 * non-existence to support BufFileDeleteFileSet which doesn't know how
1852 * many segments it has to delete until it runs out.
1854 if (stat_errno == ENOENT)
1855 return false;
1857 if (unlink(path) < 0)
1859 if (errno != ENOENT)
1860 ereport(error_on_failure ? ERROR : LOG,
1861 (errcode_for_file_access(),
1862 errmsg("could not unlink temporary file \"%s\": %m",
1863 path)));
1864 return false;
1867 if (stat_errno == 0)
1868 ReportTemporaryFileUsage(path, filestats.st_size);
1869 else
1871 errno = stat_errno;
1872 ereport(LOG,
1873 (errcode_for_file_access(),
1874 errmsg("could not stat file \"%s\": %m", path)));
1877 return true;
1881 * close a file when done with it
1883 void
1884 FileClose(File file)
1886 Vfd *vfdP;
1888 Assert(FileIsValid(file));
1890 DO_DB(elog(LOG, "FileClose: %d (%s)",
1891 file, VfdCache[file].fileName));
1893 vfdP = &VfdCache[file];
1895 if (!FileIsNotOpen(file))
1897 /* close the file */
1898 if (close(vfdP->fd) != 0)
1901 * We may need to panic on failure to close non-temporary files;
1902 * see LruDelete.
1904 elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1905 "could not close file \"%s\": %m", vfdP->fileName);
1908 --nfile;
1909 vfdP->fd = VFD_CLOSED;
1911 /* remove the file from the lru ring */
1912 Delete(file);
1915 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1917 /* Subtract its size from current usage (do first in case of error) */
1918 temporary_files_size -= vfdP->fileSize;
1919 vfdP->fileSize = 0;
1923 * Delete the file if it was temporary, and make a log entry if wanted
1925 if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
1927 struct stat filestats;
1928 int stat_errno;
1931 * If we get an error, as could happen within the ereport/elog calls,
1932 * we'll come right back here during transaction abort. Reset the
1933 * flag to ensure that we can't get into an infinite loop. This code
1934 * is arranged to ensure that the worst-case consequence is failing to
1935 * emit log message(s), not failing to attempt the unlink.
1937 vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
1940 /* first try the stat() */
1941 if (stat(vfdP->fileName, &filestats))
1942 stat_errno = errno;
1943 else
1944 stat_errno = 0;
1946 /* in any case do the unlink */
1947 if (unlink(vfdP->fileName))
1948 ereport(LOG,
1949 (errcode_for_file_access(),
1950 errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
1952 /* and last report the stat results */
1953 if (stat_errno == 0)
1954 ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
1955 else
1957 errno = stat_errno;
1958 ereport(LOG,
1959 (errcode_for_file_access(),
1960 errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
1964 /* Unregister it from the resource owner */
1965 if (vfdP->resowner)
1966 ResourceOwnerForgetFile(vfdP->resowner, file);
1969 * Return the Vfd slot to the free list
1971 FreeVfd(file);
1975 * FilePrefetch - initiate asynchronous read of a given range of the file.
1977 * Currently the only implementation of this function is using posix_fadvise
1978 * which is the simplest standardized interface that accomplishes this.
1979 * We could add an implementation using libaio in the future; but note that
1980 * this API is inappropriate for libaio, which wants to have a buffer provided
1981 * to read into.
1984 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1986 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1987 int returnCode;
1989 Assert(FileIsValid(file));
1991 DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1992 file, VfdCache[file].fileName,
1993 (int64) offset, amount));
1995 returnCode = FileAccess(file);
1996 if (returnCode < 0)
1997 return returnCode;
1999 pgstat_report_wait_start(wait_event_info);
2000 returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2001 POSIX_FADV_WILLNEED);
2002 pgstat_report_wait_end();
2004 return returnCode;
2005 #else
2006 Assert(FileIsValid(file));
2007 return 0;
2008 #endif
2011 void
2012 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
2014 int returnCode;
2016 Assert(FileIsValid(file));
2018 DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2019 file, VfdCache[file].fileName,
2020 (int64) offset, (int64) nbytes));
2022 if (nbytes <= 0)
2023 return;
2025 returnCode = FileAccess(file);
2026 if (returnCode < 0)
2027 return;
2029 pgstat_report_wait_start(wait_event_info);
2030 pg_flush_data(VfdCache[file].fd, offset, nbytes);
2031 pgstat_report_wait_end();
2035 FileRead(File file, char *buffer, int amount, off_t offset,
2036 uint32 wait_event_info)
2038 int returnCode;
2039 Vfd *vfdP;
2041 Assert(FileIsValid(file));
2043 DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
2044 file, VfdCache[file].fileName,
2045 (int64) offset,
2046 amount, buffer));
2048 returnCode = FileAccess(file);
2049 if (returnCode < 0)
2050 return returnCode;
2052 vfdP = &VfdCache[file];
2054 retry:
2055 pgstat_report_wait_start(wait_event_info);
2056 returnCode = pread(vfdP->fd, buffer, amount, offset);
2057 pgstat_report_wait_end();
2059 if (returnCode < 0)
2062 * Windows may run out of kernel buffers and return "Insufficient
2063 * system resources" error. Wait a bit and retry to solve it.
2065 * It is rumored that EINTR is also possible on some Unix filesystems,
2066 * in which case immediate retry is indicated.
2068 #ifdef WIN32
2069 DWORD error = GetLastError();
2071 switch (error)
2073 case ERROR_NO_SYSTEM_RESOURCES:
2074 pg_usleep(1000L);
2075 errno = EINTR;
2076 break;
2077 default:
2078 _dosmaperr(error);
2079 break;
2081 #endif
2082 /* OK to retry if interrupted */
2083 if (errno == EINTR)
2084 goto retry;
2087 return returnCode;
2091 FileWrite(File file, char *buffer, int amount, off_t offset,
2092 uint32 wait_event_info)
2094 int returnCode;
2095 Vfd *vfdP;
2097 Assert(FileIsValid(file));
2099 DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
2100 file, VfdCache[file].fileName,
2101 (int64) offset,
2102 amount, buffer));
2104 returnCode = FileAccess(file);
2105 if (returnCode < 0)
2106 return returnCode;
2108 vfdP = &VfdCache[file];
2111 * If enforcing temp_file_limit and it's a temp file, check to see if the
2112 * write would overrun temp_file_limit, and throw error if so. Note: it's
2113 * really a modularity violation to throw error here; we should set errno
2114 * and return -1. However, there's no way to report a suitable error
2115 * message if we do that. All current callers would just throw error
2116 * immediately anyway, so this is safe at present.
2118 if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2120 off_t past_write = offset + amount;
2122 if (past_write > vfdP->fileSize)
2124 uint64 newTotal = temporary_files_size;
2126 newTotal += past_write - vfdP->fileSize;
2127 if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2128 ereport(ERROR,
2129 (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2130 errmsg("temporary file size exceeds temp_file_limit (%dkB)",
2131 temp_file_limit)));
2135 retry:
2136 errno = 0;
2137 pgstat_report_wait_start(wait_event_info);
2138 returnCode = pwrite(VfdCache[file].fd, buffer, amount, offset);
2139 pgstat_report_wait_end();
2141 /* if write didn't set errno, assume problem is no disk space */
2142 if (returnCode != amount && errno == 0)
2143 errno = ENOSPC;
2145 if (returnCode >= 0)
2148 * Maintain fileSize and temporary_files_size if it's a temp file.
2150 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2152 off_t past_write = offset + amount;
2154 if (past_write > vfdP->fileSize)
2156 temporary_files_size += past_write - vfdP->fileSize;
2157 vfdP->fileSize = past_write;
2161 else
2164 * See comments in FileRead()
2166 #ifdef WIN32
2167 DWORD error = GetLastError();
2169 switch (error)
2171 case ERROR_NO_SYSTEM_RESOURCES:
2172 pg_usleep(1000L);
2173 errno = EINTR;
2174 break;
2175 default:
2176 _dosmaperr(error);
2177 break;
2179 #endif
2180 /* OK to retry if interrupted */
2181 if (errno == EINTR)
2182 goto retry;
2185 return returnCode;
2189 FileSync(File file, uint32 wait_event_info)
2191 int returnCode;
2193 Assert(FileIsValid(file));
2195 DO_DB(elog(LOG, "FileSync: %d (%s)",
2196 file, VfdCache[file].fileName));
2198 returnCode = FileAccess(file);
2199 if (returnCode < 0)
2200 return returnCode;
2202 pgstat_report_wait_start(wait_event_info);
2203 returnCode = pg_fsync(VfdCache[file].fd);
2204 pgstat_report_wait_end();
2206 return returnCode;
2209 off_t
2210 FileSize(File file)
2212 Assert(FileIsValid(file));
2214 DO_DB(elog(LOG, "FileSize %d (%s)",
2215 file, VfdCache[file].fileName));
2217 if (FileIsNotOpen(file))
2219 if (FileAccess(file) < 0)
2220 return (off_t) -1;
2223 return lseek(VfdCache[file].fd, 0, SEEK_END);
2227 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2229 int returnCode;
2231 Assert(FileIsValid(file));
2233 DO_DB(elog(LOG, "FileTruncate %d (%s)",
2234 file, VfdCache[file].fileName));
2236 returnCode = FileAccess(file);
2237 if (returnCode < 0)
2238 return returnCode;
2240 pgstat_report_wait_start(wait_event_info);
2241 returnCode = ftruncate(VfdCache[file].fd, offset);
2242 pgstat_report_wait_end();
2244 if (returnCode == 0 && VfdCache[file].fileSize > offset)
2246 /* adjust our state for truncation of a temp file */
2247 Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2248 temporary_files_size -= VfdCache[file].fileSize - offset;
2249 VfdCache[file].fileSize = offset;
2252 return returnCode;
2256 * Return the pathname associated with an open file.
2258 * The returned string points to an internal buffer, which is valid until
2259 * the file is closed.
2261 char *
2262 FilePathName(File file)
2264 Assert(FileIsValid(file));
2266 return VfdCache[file].fileName;
2270 * Return the raw file descriptor of an opened file.
2272 * The returned file descriptor will be valid until the file is closed, but
2273 * there are a lot of things that can make that happen. So the caller should
2274 * be careful not to do much of anything else before it finishes using the
2275 * returned file descriptor.
2278 FileGetRawDesc(File file)
2280 Assert(FileIsValid(file));
2281 return VfdCache[file].fd;
2285 * FileGetRawFlags - returns the file flags on open(2)
2288 FileGetRawFlags(File file)
2290 Assert(FileIsValid(file));
2291 return VfdCache[file].fileFlags;
2295 * FileGetRawMode - returns the mode bitmask passed to open(2)
2297 mode_t
2298 FileGetRawMode(File file)
2300 Assert(FileIsValid(file));
2301 return VfdCache[file].fileMode;
2305 * Make room for another allocatedDescs[] array entry if needed and possible.
2306 * Returns true if an array element is available.
2308 static bool
2309 reserveAllocatedDesc(void)
2311 AllocateDesc *newDescs;
2312 int newMax;
2314 /* Quick out if array already has a free slot. */
2315 if (numAllocatedDescs < maxAllocatedDescs)
2316 return true;
2319 * If the array hasn't yet been created in the current process, initialize
2320 * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2321 * we will ever need, anyway. We don't want to look at max_safe_fds
2322 * immediately because set_max_safe_fds() may not have run yet.
2324 if (allocatedDescs == NULL)
2326 newMax = FD_MINFREE / 3;
2327 newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2328 /* Out of memory already? Treat as fatal error. */
2329 if (newDescs == NULL)
2330 ereport(ERROR,
2331 (errcode(ERRCODE_OUT_OF_MEMORY),
2332 errmsg("out of memory")));
2333 allocatedDescs = newDescs;
2334 maxAllocatedDescs = newMax;
2335 return true;
2339 * Consider enlarging the array beyond the initial allocation used above.
2340 * By the time this happens, max_safe_fds should be known accurately.
2342 * We mustn't let allocated descriptors hog all the available FDs, and in
2343 * practice we'd better leave a reasonable number of FDs for VFD use. So
2344 * set the maximum to max_safe_fds / 3. (This should certainly be at
2345 * least as large as the initial size, FD_MINFREE / 3, so we aren't
2346 * tightening the restriction here.) Recall that "external" FDs are
2347 * allowed to consume another third of max_safe_fds.
2349 newMax = max_safe_fds / 3;
2350 if (newMax > maxAllocatedDescs)
2352 newDescs = (AllocateDesc *) realloc(allocatedDescs,
2353 newMax * sizeof(AllocateDesc));
2354 /* Treat out-of-memory as a non-fatal error. */
2355 if (newDescs == NULL)
2356 return false;
2357 allocatedDescs = newDescs;
2358 maxAllocatedDescs = newMax;
2359 return true;
2362 /* Can't enlarge allocatedDescs[] any more. */
2363 return false;
2367 * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2368 * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2369 * necessary to open the file. When done, call FreeFile rather than fclose.
2371 * Note that files that will be open for any significant length of time
2372 * should NOT be handled this way, since they cannot share kernel file
2373 * descriptors with other files; there is grave risk of running out of FDs
2374 * if anyone locks down too many FDs. Most callers of this routine are
2375 * simply reading a config file that they will read and close immediately.
2377 * fd.c will automatically close all files opened with AllocateFile at
2378 * transaction commit or abort; this prevents FD leakage if a routine
2379 * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2381 * Ideally this should be the *only* direct call of fopen() in the backend.
2383 FILE *
2384 AllocateFile(const char *name, const char *mode)
2386 FILE *file;
2388 DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2389 numAllocatedDescs, name));
2391 /* Can we allocate another non-virtual FD? */
2392 if (!reserveAllocatedDesc())
2393 ereport(ERROR,
2394 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2395 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2396 maxAllocatedDescs, name)));
2398 /* Close excess kernel FDs. */
2399 ReleaseLruFiles();
2401 TryAgain:
2402 if ((file = fopen(name, mode)) != NULL)
2404 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2406 desc->kind = AllocateDescFile;
2407 desc->desc.file = file;
2408 desc->create_subid = GetCurrentSubTransactionId();
2409 numAllocatedDescs++;
2410 return desc->desc.file;
2413 if (errno == EMFILE || errno == ENFILE)
2415 int save_errno = errno;
2417 ereport(LOG,
2418 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2419 errmsg("out of file descriptors: %m; release and retry")));
2420 errno = 0;
2421 if (ReleaseLruFile())
2422 goto TryAgain;
2423 errno = save_errno;
2426 return NULL;
2430 * Open a file with OpenTransientFilePerm() and pass default file mode for
2431 * the fileMode parameter.
2434 OpenTransientFile(const char *fileName, int fileFlags)
2436 return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2440 * Like AllocateFile, but returns an unbuffered fd like open(2)
2443 OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2445 int fd;
2447 DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2448 numAllocatedDescs, fileName));
2450 /* Can we allocate another non-virtual FD? */
2451 if (!reserveAllocatedDesc())
2452 ereport(ERROR,
2453 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2454 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2455 maxAllocatedDescs, fileName)));
2457 /* Close excess kernel FDs. */
2458 ReleaseLruFiles();
2460 fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2462 if (fd >= 0)
2464 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2466 desc->kind = AllocateDescRawFD;
2467 desc->desc.fd = fd;
2468 desc->create_subid = GetCurrentSubTransactionId();
2469 numAllocatedDescs++;
2471 return fd;
2474 return -1; /* failure */
2478 * Routines that want to initiate a pipe stream should use OpenPipeStream
2479 * rather than plain popen(). This lets fd.c deal with freeing FDs if
2480 * necessary. When done, call ClosePipeStream rather than pclose.
2482 * This function also ensures that the popen'd program is run with default
2483 * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2484 * uses. This ensures desirable response to, eg, closing a read pipe early.
2486 FILE *
2487 OpenPipeStream(const char *command, const char *mode)
2489 FILE *file;
2490 int save_errno;
2492 DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2493 numAllocatedDescs, command));
2495 /* Can we allocate another non-virtual FD? */
2496 if (!reserveAllocatedDesc())
2497 ereport(ERROR,
2498 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2499 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2500 maxAllocatedDescs, command)));
2502 /* Close excess kernel FDs. */
2503 ReleaseLruFiles();
2505 TryAgain:
2506 fflush(NULL);
2507 pqsignal(SIGPIPE, SIG_DFL);
2508 errno = 0;
2509 file = popen(command, mode);
2510 save_errno = errno;
2511 pqsignal(SIGPIPE, SIG_IGN);
2512 errno = save_errno;
2513 if (file != NULL)
2515 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2517 desc->kind = AllocateDescPipe;
2518 desc->desc.file = file;
2519 desc->create_subid = GetCurrentSubTransactionId();
2520 numAllocatedDescs++;
2521 return desc->desc.file;
2524 if (errno == EMFILE || errno == ENFILE)
2526 ereport(LOG,
2527 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2528 errmsg("out of file descriptors: %m; release and retry")));
2529 if (ReleaseLruFile())
2530 goto TryAgain;
2531 errno = save_errno;
2534 return NULL;
2538 * Free an AllocateDesc of any type.
2540 * The argument *must* point into the allocatedDescs[] array.
2542 static int
2543 FreeDesc(AllocateDesc *desc)
2545 int result;
2547 /* Close the underlying object */
2548 switch (desc->kind)
2550 case AllocateDescFile:
2551 result = fclose(desc->desc.file);
2552 break;
2553 case AllocateDescPipe:
2554 result = pclose(desc->desc.file);
2555 break;
2556 case AllocateDescDir:
2557 result = closedir(desc->desc.dir);
2558 break;
2559 case AllocateDescRawFD:
2560 result = close(desc->desc.fd);
2561 break;
2562 default:
2563 elog(ERROR, "AllocateDesc kind not recognized");
2564 result = 0; /* keep compiler quiet */
2565 break;
2568 /* Compact storage in the allocatedDescs array */
2569 numAllocatedDescs--;
2570 *desc = allocatedDescs[numAllocatedDescs];
2572 return result;
2576 * Close a file returned by AllocateFile.
2578 * Note we do not check fclose's return value --- it is up to the caller
2579 * to handle close errors.
2582 FreeFile(FILE *file)
2584 int i;
2586 DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2588 /* Remove file from list of allocated files, if it's present */
2589 for (i = numAllocatedDescs; --i >= 0;)
2591 AllocateDesc *desc = &allocatedDescs[i];
2593 if (desc->kind == AllocateDescFile && desc->desc.file == file)
2594 return FreeDesc(desc);
2597 /* Only get here if someone passes us a file not in allocatedDescs */
2598 elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2600 return fclose(file);
2604 * Close a file returned by OpenTransientFile.
2606 * Note we do not check close's return value --- it is up to the caller
2607 * to handle close errors.
2610 CloseTransientFile(int fd)
2612 int i;
2614 DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2616 /* Remove fd from list of allocated files, if it's present */
2617 for (i = numAllocatedDescs; --i >= 0;)
2619 AllocateDesc *desc = &allocatedDescs[i];
2621 if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2622 return FreeDesc(desc);
2625 /* Only get here if someone passes us a file not in allocatedDescs */
2626 elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2628 return close(fd);
2632 * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2633 * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2634 * necessary to open the directory, and with closing it after an elog.
2635 * When done, call FreeDir rather than closedir.
2637 * Returns NULL, with errno set, on failure. Note that failure detection
2638 * is commonly left to the following call of ReadDir or ReadDirExtended;
2639 * see the comments for ReadDir.
2641 * Ideally this should be the *only* direct call of opendir() in the backend.
2643 DIR *
2644 AllocateDir(const char *dirname)
2646 DIR *dir;
2648 DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2649 numAllocatedDescs, dirname));
2651 /* Can we allocate another non-virtual FD? */
2652 if (!reserveAllocatedDesc())
2653 ereport(ERROR,
2654 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2655 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2656 maxAllocatedDescs, dirname)));
2658 /* Close excess kernel FDs. */
2659 ReleaseLruFiles();
2661 TryAgain:
2662 if ((dir = opendir(dirname)) != NULL)
2664 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2666 desc->kind = AllocateDescDir;
2667 desc->desc.dir = dir;
2668 desc->create_subid = GetCurrentSubTransactionId();
2669 numAllocatedDescs++;
2670 return desc->desc.dir;
2673 if (errno == EMFILE || errno == ENFILE)
2675 int save_errno = errno;
2677 ereport(LOG,
2678 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2679 errmsg("out of file descriptors: %m; release and retry")));
2680 errno = 0;
2681 if (ReleaseLruFile())
2682 goto TryAgain;
2683 errno = save_errno;
2686 return NULL;
2690 * Read a directory opened with AllocateDir, ereport'ing any error.
2692 * This is easier to use than raw readdir() since it takes care of some
2693 * otherwise rather tedious and error-prone manipulation of errno. Also,
2694 * if you are happy with a generic error message for AllocateDir failure,
2695 * you can just do
2697 * dir = AllocateDir(path);
2698 * while ((dirent = ReadDir(dir, path)) != NULL)
2699 * process dirent;
2700 * FreeDir(dir);
2702 * since a NULL dir parameter is taken as indicating AllocateDir failed.
2703 * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2704 * use this shortcut.)
2706 * The pathname passed to AllocateDir must be passed to this routine too,
2707 * but it is only used for error reporting.
2709 struct dirent *
2710 ReadDir(DIR *dir, const char *dirname)
2712 return ReadDirExtended(dir, dirname, ERROR);
2716 * Alternate version of ReadDir that allows caller to specify the elevel
2717 * for any error report (whether it's reporting an initial failure of
2718 * AllocateDir or a subsequent directory read failure).
2720 * If elevel < ERROR, returns NULL after any error. With the normal coding
2721 * pattern, this will result in falling out of the loop immediately as
2722 * though the directory contained no (more) entries.
2724 struct dirent *
2725 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2727 struct dirent *dent;
2729 /* Give a generic message for AllocateDir failure, if caller didn't */
2730 if (dir == NULL)
2732 ereport(elevel,
2733 (errcode_for_file_access(),
2734 errmsg("could not open directory \"%s\": %m",
2735 dirname)));
2736 return NULL;
2739 errno = 0;
2740 if ((dent = readdir(dir)) != NULL)
2741 return dent;
2743 if (errno)
2744 ereport(elevel,
2745 (errcode_for_file_access(),
2746 errmsg("could not read directory \"%s\": %m",
2747 dirname)));
2748 return NULL;
2752 * Close a directory opened with AllocateDir.
2754 * Returns closedir's return value (with errno set if it's not 0).
2755 * Note we do not check the return value --- it is up to the caller
2756 * to handle close errors if wanted.
2758 * Does nothing if dir == NULL; we assume that directory open failure was
2759 * already reported if desired.
2762 FreeDir(DIR *dir)
2764 int i;
2766 /* Nothing to do if AllocateDir failed */
2767 if (dir == NULL)
2768 return 0;
2770 DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2772 /* Remove dir from list of allocated dirs, if it's present */
2773 for (i = numAllocatedDescs; --i >= 0;)
2775 AllocateDesc *desc = &allocatedDescs[i];
2777 if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2778 return FreeDesc(desc);
2781 /* Only get here if someone passes us a dir not in allocatedDescs */
2782 elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2784 return closedir(dir);
2789 * Close a pipe stream returned by OpenPipeStream.
2792 ClosePipeStream(FILE *file)
2794 int i;
2796 DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2798 /* Remove file from list of allocated files, if it's present */
2799 for (i = numAllocatedDescs; --i >= 0;)
2801 AllocateDesc *desc = &allocatedDescs[i];
2803 if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2804 return FreeDesc(desc);
2807 /* Only get here if someone passes us a file not in allocatedDescs */
2808 elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2810 return pclose(file);
2814 * closeAllVfds
2816 * Force all VFDs into the physically-closed state, so that the fewest
2817 * possible number of kernel file descriptors are in use. There is no
2818 * change in the logical state of the VFDs.
2820 void
2821 closeAllVfds(void)
2823 Index i;
2825 if (SizeVfdCache > 0)
2827 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2828 for (i = 1; i < SizeVfdCache; i++)
2830 if (!FileIsNotOpen(i))
2831 LruDelete(i);
2838 * SetTempTablespaces
2840 * Define a list (actually an array) of OIDs of tablespaces to use for
2841 * temporary files. This list will be used until end of transaction,
2842 * unless this function is called again before then. It is caller's
2843 * responsibility that the passed-in array has adequate lifespan (typically
2844 * it'd be allocated in TopTransactionContext).
2846 * Some entries of the array may be InvalidOid, indicating that the current
2847 * database's default tablespace should be used.
2849 void
2850 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2852 Assert(numSpaces >= 0);
2853 tempTableSpaces = tableSpaces;
2854 numTempTableSpaces = numSpaces;
2857 * Select a random starting point in the list. This is to minimize
2858 * conflicts between backends that are most likely sharing the same list
2859 * of temp tablespaces. Note that if we create multiple temp files in the
2860 * same transaction, we'll advance circularly through the list --- this
2861 * ensures that large temporary sort files are nicely spread across all
2862 * available tablespaces.
2864 if (numSpaces > 1)
2865 nextTempTableSpace = pg_prng_uint64_range(&pg_global_prng_state,
2866 0, numSpaces - 1);
2867 else
2868 nextTempTableSpace = 0;
2872 * TempTablespacesAreSet
2874 * Returns true if SetTempTablespaces has been called in current transaction.
2875 * (This is just so that tablespaces.c doesn't need its own per-transaction
2876 * state.)
2878 bool
2879 TempTablespacesAreSet(void)
2881 return (numTempTableSpaces >= 0);
2885 * GetTempTablespaces
2887 * Populate an array with the OIDs of the tablespaces that should be used for
2888 * temporary files. (Some entries may be InvalidOid, indicating that the
2889 * current database's default tablespace should be used.) At most numSpaces
2890 * entries will be filled.
2891 * Returns the number of OIDs that were copied into the output array.
2894 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2896 int i;
2898 Assert(TempTablespacesAreSet());
2899 for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2900 tableSpaces[i] = tempTableSpaces[i];
2902 return i;
2906 * GetNextTempTableSpace
2908 * Select the next temp tablespace to use. A result of InvalidOid means
2909 * to use the current database's default tablespace.
2912 GetNextTempTableSpace(void)
2914 if (numTempTableSpaces > 0)
2916 /* Advance nextTempTableSpace counter with wraparound */
2917 if (++nextTempTableSpace >= numTempTableSpaces)
2918 nextTempTableSpace = 0;
2919 return tempTableSpaces[nextTempTableSpace];
2921 return InvalidOid;
2926 * AtEOSubXact_Files
2928 * Take care of subtransaction commit/abort. At abort, we close temp files
2929 * that the subtransaction may have opened. At commit, we reassign the
2930 * files that were opened to the parent subtransaction.
2932 void
2933 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2934 SubTransactionId parentSubid)
2936 Index i;
2938 for (i = 0; i < numAllocatedDescs; i++)
2940 if (allocatedDescs[i].create_subid == mySubid)
2942 if (isCommit)
2943 allocatedDescs[i].create_subid = parentSubid;
2944 else
2946 /* have to recheck the item after FreeDesc (ugly) */
2947 FreeDesc(&allocatedDescs[i--]);
2954 * AtEOXact_Files
2956 * This routine is called during transaction commit or abort. All still-open
2957 * per-transaction temporary file VFDs are closed, which also causes the
2958 * underlying files to be deleted (although they should've been closed already
2959 * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
2960 * closed. We also forget any transaction-local temp tablespace list.
2962 * The isCommit flag is used only to decide whether to emit warnings about
2963 * unclosed files.
2965 void
2966 AtEOXact_Files(bool isCommit)
2968 CleanupTempFiles(isCommit, false);
2969 tempTableSpaces = NULL;
2970 numTempTableSpaces = -1;
2974 * BeforeShmemExit_Files
2976 * before_shmem_access hook to clean up temp files during backend shutdown.
2977 * Here, we want to clean up *all* temp files including interXact ones.
2979 static void
2980 BeforeShmemExit_Files(int code, Datum arg)
2982 CleanupTempFiles(false, true);
2984 /* prevent further temp files from being created */
2985 #ifdef USE_ASSERT_CHECKING
2986 temporary_files_allowed = false;
2987 #endif
2991 * Close temporary files and delete their underlying files.
2993 * isCommit: if true, this is normal transaction commit, and we don't
2994 * expect any remaining files; warn if there are some.
2996 * isProcExit: if true, this is being called as the backend process is
2997 * exiting. If that's the case, we should remove all temporary files; if
2998 * that's not the case, we are being called for transaction commit/abort
2999 * and should only remove transaction-local temp files. In either case,
3000 * also clean up "allocated" stdio files, dirs and fds.
3002 static void
3003 CleanupTempFiles(bool isCommit, bool isProcExit)
3005 Index i;
3008 * Careful here: at proc_exit we need extra cleanup, not just
3009 * xact_temporary files.
3011 if (isProcExit || have_xact_temporary_files)
3013 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3014 for (i = 1; i < SizeVfdCache; i++)
3016 unsigned short fdstate = VfdCache[i].fdstate;
3018 if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3019 VfdCache[i].fileName != NULL)
3022 * If we're in the process of exiting a backend process, close
3023 * all temporary files. Otherwise, only close temporary files
3024 * local to the current transaction. They should be closed by
3025 * the ResourceOwner mechanism already, so this is just a
3026 * debugging cross-check.
3028 if (isProcExit)
3029 FileClose(i);
3030 else if (fdstate & FD_CLOSE_AT_EOXACT)
3032 elog(WARNING,
3033 "temporary file %s not closed at end-of-transaction",
3034 VfdCache[i].fileName);
3035 FileClose(i);
3040 have_xact_temporary_files = false;
3043 /* Complain if any allocated files remain open at commit. */
3044 if (isCommit && numAllocatedDescs > 0)
3045 elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3046 numAllocatedDescs);
3048 /* Clean up "allocated" stdio files, dirs and fds. */
3049 while (numAllocatedDescs > 0)
3050 FreeDesc(&allocatedDescs[0]);
3055 * Remove temporary and temporary relation files left over from a prior
3056 * postmaster session
3058 * This should be called during postmaster startup. It will forcibly
3059 * remove any leftover files created by OpenTemporaryFile and any leftover
3060 * temporary relation files created by mdcreate.
3062 * During post-backend-crash restart cycle, this routine is called when
3063 * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3064 * queries are using temp files could result in useless storage usage that can
3065 * only be reclaimed by a service restart. The argument against enabling it is
3066 * that someone might want to examine the temporary files for debugging
3067 * purposes. This does however mean that OpenTemporaryFile had better allow for
3068 * collision with an existing temp file name.
3070 * NOTE: this function and its subroutines generally report syscall failures
3071 * with ereport(LOG) and keep going. Removing temp files is not so critical
3072 * that we should fail to start the database when we can't do it.
3074 void
3075 RemovePgTempFiles(void)
3077 char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3078 DIR *spc_dir;
3079 struct dirent *spc_de;
3082 * First process temp files in pg_default ($PGDATA/base)
3084 snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3085 RemovePgTempFilesInDir(temp_path, true, false);
3086 RemovePgTempRelationFiles("base");
3089 * Cycle through temp directories for all non-default tablespaces.
3091 spc_dir = AllocateDir("pg_tblspc");
3093 while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
3095 if (strcmp(spc_de->d_name, ".") == 0 ||
3096 strcmp(spc_de->d_name, "..") == 0)
3097 continue;
3099 snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
3100 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
3101 RemovePgTempFilesInDir(temp_path, true, false);
3103 snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
3104 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
3105 RemovePgTempRelationFiles(temp_path);
3108 FreeDir(spc_dir);
3111 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3112 * DataDir as well. However, that is *not* cleaned here because doing so
3113 * would create a race condition. It's done separately, earlier in
3114 * postmaster startup.
3119 * Process one pgsql_tmp directory for RemovePgTempFiles.
3121 * If missing_ok is true, it's all right for the named directory to not exist.
3122 * Any other problem results in a LOG message. (missing_ok should be true at
3123 * the top level, since pgsql_tmp directories are not created until needed.)
3125 * At the top level, this should be called with unlink_all = false, so that
3126 * only files matching the temporary name prefix will be unlinked. When
3127 * recursing it will be called with unlink_all = true to unlink everything
3128 * under a top-level temporary directory.
3130 * (These two flags could be replaced by one, but it seems clearer to keep
3131 * them separate.)
3133 void
3134 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3136 DIR *temp_dir;
3137 struct dirent *temp_de;
3138 char rm_path[MAXPGPATH * 2];
3140 temp_dir = AllocateDir(tmpdirname);
3142 if (temp_dir == NULL && errno == ENOENT && missing_ok)
3143 return;
3145 while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3147 if (strcmp(temp_de->d_name, ".") == 0 ||
3148 strcmp(temp_de->d_name, "..") == 0)
3149 continue;
3151 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3152 tmpdirname, temp_de->d_name);
3154 if (unlink_all ||
3155 strncmp(temp_de->d_name,
3156 PG_TEMP_FILE_PREFIX,
3157 strlen(PG_TEMP_FILE_PREFIX)) == 0)
3159 PGFileType type = get_dirent_type(rm_path, temp_de, false, LOG);
3161 if (type == PGFILETYPE_ERROR)
3162 continue;
3163 else if (type == PGFILETYPE_DIR)
3165 /* recursively remove contents, then directory itself */
3166 RemovePgTempFilesInDir(rm_path, false, true);
3168 if (rmdir(rm_path) < 0)
3169 ereport(LOG,
3170 (errcode_for_file_access(),
3171 errmsg("could not remove directory \"%s\": %m",
3172 rm_path)));
3174 else
3176 if (unlink(rm_path) < 0)
3177 ereport(LOG,
3178 (errcode_for_file_access(),
3179 errmsg("could not remove file \"%s\": %m",
3180 rm_path)));
3183 else
3184 ereport(LOG,
3185 (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3186 rm_path)));
3189 FreeDir(temp_dir);
3192 /* Process one tablespace directory, look for per-DB subdirectories */
3193 static void
3194 RemovePgTempRelationFiles(const char *tsdirname)
3196 DIR *ts_dir;
3197 struct dirent *de;
3198 char dbspace_path[MAXPGPATH * 2];
3200 ts_dir = AllocateDir(tsdirname);
3202 while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3205 * We're only interested in the per-database directories, which have
3206 * numeric names. Note that this code will also (properly) ignore "."
3207 * and "..".
3209 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3210 continue;
3212 snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3213 tsdirname, de->d_name);
3214 RemovePgTempRelationFilesInDbspace(dbspace_path);
3217 FreeDir(ts_dir);
3220 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3221 static void
3222 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3224 DIR *dbspace_dir;
3225 struct dirent *de;
3226 char rm_path[MAXPGPATH * 2];
3228 dbspace_dir = AllocateDir(dbspacedirname);
3230 while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3232 if (!looks_like_temp_rel_name(de->d_name))
3233 continue;
3235 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3236 dbspacedirname, de->d_name);
3238 if (unlink(rm_path) < 0)
3239 ereport(LOG,
3240 (errcode_for_file_access(),
3241 errmsg("could not remove file \"%s\": %m",
3242 rm_path)));
3245 FreeDir(dbspace_dir);
/*
 * looks_like_temp_rel_name --- does the file name have the shape of a
 * temporary relation file?
 *
 * Accepted forms are t<digits>_<digits> and t<digits>_<digits>_<forkname>,
 * either of which may be followed by .<digits> (a segment suffix).
 */
bool
looks_like_temp_rel_name(const char *name)
{
	const char *cur = name;
	const char *run_start;

	/* Must start with "t". */
	if (*cur != 't')
		return false;
	cur++;

	/* Then a non-empty run of digits, ended by an underscore. */
	run_start = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == run_start || *cur != '_')
		return false;
	cur++;

	/* Then a second non-empty run of digits. */
	run_start = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == run_start)
		return false;

	/* Optionally _forkname, optionally .segment, possibly both. */
	if (*cur == '_')
	{
		int			forklen = forkname_chars(cur + 1, NULL);

		if (forklen <= 0)
			return false;
		cur += forklen + 1;		/* the underscore plus the fork name */
	}
	if (*cur == '.')
	{
		cur++;
		run_start = cur;
		while (isdigit((unsigned char) *cur))
			cur++;
		/* A dot must be followed by at least one digit. */
		if (cur == run_start)
			return false;
	}

	/* Anything left over means the name does not match. */
	return *cur == '\0';
}
#ifdef HAVE_SYNCFS
/*
 * do_syncfs --- call syncfs() on the file system containing "path".
 *
 * Startup progress is reported first.  Failure to open the path or to sync
 * is logged at LOG level and otherwise ignored.
 */
static void
do_syncfs(const char *path)
{
	int			fd;

	ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
							 path);

	fd = OpenTransientFile(path, O_RDONLY);
	if (fd >= 0)
	{
		if (syncfs(fd) < 0)
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not synchronize file system for file \"%s\": %m", path)));
		CloseTransientFile(fd);
	}
	else
	{
		/* Without a descriptor there is nothing to sync or to close. */
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));
	}
}
#endif
3323 * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3324 * all potential filesystem, depending on recovery_init_sync_method setting.
3326 * We fsync regular files and directories wherever they are, but we
3327 * follow symlinks only for pg_wal and immediately under pg_tblspc.
3328 * Other symlinks are presumed to point at files we're not responsible
3329 * for fsyncing, and might not have privileges to write at all.
3331 * Errors are logged but not considered fatal; that's because this is used
3332 * only during database startup, to deal with the possibility that there are
3333 * issued-but-unsynced writes pending against the data directory. We want to
3334 * ensure that such writes reach disk before anything that's done in the new
3335 * run. However, aborting on error would result in failure to start for
3336 * harmless cases such as read-only files in the data directory, and that's
3337 * not good either.
3339 * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3340 * rewriting all changes again during recovery.
3342 * Note we assume we're chdir'd into PGDATA to begin with.
3344 void
3345 SyncDataDirectory(void)
3347 bool xlog_is_symlink;
3349 /* We can skip this whole thing if fsync is disabled. */
3350 if (!enableFsync)
3351 return;
3354 * If pg_wal is a symlink, we'll need to recurse into it separately,
3355 * because the first walkdir below will ignore it.
3357 xlog_is_symlink = false;
3360 struct stat st;
3362 if (lstat("pg_wal", &st) < 0)
3363 ereport(LOG,
3364 (errcode_for_file_access(),
3365 errmsg("could not stat file \"%s\": %m",
3366 "pg_wal")));
3367 else if (S_ISLNK(st.st_mode))
3368 xlog_is_symlink = true;
3371 #ifdef HAVE_SYNCFS
3372 if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_SYNCFS)
3374 DIR *dir;
3375 struct dirent *de;
3378 * On Linux, we don't have to open every single file one by one. We
3379 * can use syncfs() to sync whole filesystems. We only expect
3380 * filesystem boundaries to exist where we tolerate symlinks, namely
3381 * pg_wal and the tablespaces, so we call syncfs() for each of those
3382 * directories.
3385 /* Prepare to report progress syncing the data directory via syncfs. */
3386 begin_startup_progress_phase();
3388 /* Sync the top level pgdata directory. */
3389 do_syncfs(".");
3390 /* If any tablespaces are configured, sync each of those. */
3391 dir = AllocateDir("pg_tblspc");
3392 while ((de = ReadDirExtended(dir, "pg_tblspc", LOG)))
3394 char path[MAXPGPATH];
3396 if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3397 continue;
3399 snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
3400 do_syncfs(path);
3402 FreeDir(dir);
3403 /* If pg_wal is a symlink, process that too. */
3404 if (xlog_is_symlink)
3405 do_syncfs("pg_wal");
3406 return;
3408 #endif /* !HAVE_SYNCFS */
3410 #ifdef PG_FLUSH_DATA_WORKS
3411 /* Prepare to report progress of the pre-fsync phase. */
3412 begin_startup_progress_phase();
3415 * If possible, hint to the kernel that we're soon going to fsync the data
3416 * directory and its contents. Errors in this step are even less
3417 * interesting than normal, so log them only at DEBUG1.
3419 walkdir(".", pre_sync_fname, false, DEBUG1);
3420 if (xlog_is_symlink)
3421 walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3422 walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3423 #endif
3425 /* Prepare to report progress syncing the data directory via fsync. */
3426 begin_startup_progress_phase();
3429 * Now we do the fsync()s in the same order.
3431 * The main call ignores symlinks, so in addition to specially processing
3432 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3433 * process_symlinks = true. Note that if there are any plain directories
3434 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3435 * so we don't worry about optimizing it.
3437 walkdir(".", datadir_fsync_fname, false, LOG);
3438 if (xlog_is_symlink)
3439 walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3440 walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3444 * walkdir: recursively walk a directory, applying the action to each
3445 * regular file and directory (including the named directory itself).
3447 * If process_symlinks is true, the action and recursion are also applied
3448 * to regular files and directories that are pointed to by symlinks in the
3449 * given directory; otherwise symlinks are ignored. Symlinks are always
3450 * ignored in subdirectories, ie we intentionally don't pass down the
3451 * process_symlinks flag to recursive calls.
3453 * Errors are reported at level elevel, which might be ERROR or less.
3455 * See also walkdir in file_utils.c, which is a frontend version of this
3456 * logic.
3458 static void
3459 walkdir(const char *path,
3460 void (*action) (const char *fname, bool isdir, int elevel),
3461 bool process_symlinks,
3462 int elevel)
3464 DIR *dir;
3465 struct dirent *de;
3467 dir = AllocateDir(path);
3469 while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3471 char subpath[MAXPGPATH * 2];
3473 CHECK_FOR_INTERRUPTS();
3475 if (strcmp(de->d_name, ".") == 0 ||
3476 strcmp(de->d_name, "..") == 0)
3477 continue;
3479 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3481 switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3483 case PGFILETYPE_REG:
3484 (*action) (subpath, false, elevel);
3485 break;
3486 case PGFILETYPE_DIR:
3487 walkdir(subpath, action, false, elevel);
3488 break;
3489 default:
3492 * Errors are already reported directly by get_dirent_type(),
3493 * and any remaining symlinks and unknown file types are
3494 * ignored.
3496 break;
3500 FreeDir(dir); /* we ignore any error here */
3503 * It's important to fsync the destination directory itself as individual
3504 * file fsyncs don't guarantee that the directory entry for the file is
3505 * synced. However, skip this if AllocateDir failed; the action function
3506 * might not be robust against that.
3508 if (dir)
3509 (*action) (path, true, elevel);
/*
 * Give the OS advance notice that this file is about to be fsync()'d.
 *
 * A file that cannot be opened because it is unreadable is skipped without
 * complaint; other errors are logged at the level supplied by the caller.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Flushing a directory would likely just fail, so don't try. */
	if (isdir)
		return;

	ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);
	if (fd < 0)
	{
		/* Permission failures are silent; anything else gets reported. */
		if (errno != EACCES)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * This is only a hint, so it is fine that pg_flush_data() ignores
	 * errors.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
/*
 * Action callback for fsyncing the data directory: report startup progress
 * for the given path, then fsync it.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	/*
	 * Errors about unreadable files should be ignored silently, so tell
	 * fsync_fname_ext() to skip permission failures.  Its result is not
	 * needed here.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
/*
 * Action callback that removes the named file or directory.
 *
 * Directories are removed with rmdir(); a directory that is already gone
 * (ENOENT) is not an error, and any other failure is reported at elevel.
 * Plain files are handed to PathNameDeleteTemporaryFile(), and elevel is
 * not consulted on that path.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (!isdir)
	{
		/* Go through PathNameDeleteTemporaryFile so the filesize is reported */
		PathNameDeleteTemporaryFile(fname, false);
		return;
	}

	if (rmdir(fname) == 0 || errno == ENOENT)
		return;

	ereport(elevel,
			(errcode_for_file_access(),
			 errmsg("could not remove directory \"%s\": %m", fname)));
}
3590 * fsync_fname_ext -- Try to fsync a file or directory
3592 * If ignore_perm is true, ignore errors upon trying to open unreadable
3593 * files. Logs other errors at a caller-specified level.
3595 * Returns 0 if the operation succeeded, -1 otherwise.
3598 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3600 int fd;
3601 int flags;
3602 int returncode;
3605 * Some OSs require directories to be opened read-only whereas other
3606 * systems don't allow us to fsync files opened read-only; so we need both
3607 * cases here. Using O_RDWR will cause us to fail to fsync files that are
3608 * not writable by our userid, but we assume that's OK.
3610 flags = PG_BINARY;
3611 if (!isdir)
3612 flags |= O_RDWR;
3613 else
3614 flags |= O_RDONLY;
3616 fd = OpenTransientFile(fname, flags);
3619 * Some OSs don't allow us to open directories at all (Windows returns
3620 * EACCES), just ignore the error in that case. If desired also silently
3621 * ignoring errors about unreadable files. Log others.
3623 if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3624 return 0;
3625 else if (fd < 0 && ignore_perm && errno == EACCES)
3626 return 0;
3627 else if (fd < 0)
3629 ereport(elevel,
3630 (errcode_for_file_access(),
3631 errmsg("could not open file \"%s\": %m", fname)));
3632 return -1;
3635 returncode = pg_fsync(fd);
3638 * Some OSes don't allow us to fsync directories at all, so we can ignore
3639 * those errors. Anything else needs to be logged.
3641 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3643 int save_errno;
3645 /* close file upon error, might not be in transaction context */
3646 save_errno = errno;
3647 (void) CloseTransientFile(fd);
3648 errno = save_errno;
3650 ereport(elevel,
3651 (errcode_for_file_access(),
3652 errmsg("could not fsync file \"%s\": %m", fname)));
3653 return -1;
3656 if (CloseTransientFile(fd) != 0)
3658 ereport(elevel,
3659 (errcode_for_file_access(),
3660 errmsg("could not close file \"%s\": %m", fname)));
3661 return -1;
3664 return 0;
3668 * fsync_parent_path -- fsync the parent path of a file or directory
3670 * This is aimed at making file operations persistent on disk in case of
3671 * an OS crash or power failure.
3673 static int
3674 fsync_parent_path(const char *fname, int elevel)
3676 char parentpath[MAXPGPATH];
3678 strlcpy(parentpath, fname, MAXPGPATH);
3679 get_parent_directory(parentpath);
3682 * get_parent_directory() returns an empty string if the input argument is
3683 * just a file name (see comments in path.c), so handle that as being the
3684 * current directory.
3686 if (strlen(parentpath) == 0)
3687 strlcpy(parentpath, ".", MAXPGPATH);
3689 if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3690 return -1;
3692 return 0;
3696 * Create a PostgreSQL data sub-directory
3698 * The data directory itself, and most of its sub-directories, are created at
3699 * initdb time, but we do have some occasions when we create directories in
3700 * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3701 * make sure that those directories are created consistently. Today, that means
3702 * making sure that the created directory has the correct permissions, which is
3703 * what pg_dir_create_mode tracks for us.
3705 * Note that we also set the umask() based on what we understand the correct
3706 * permissions to be (see file_perm.c).
3708 * For permissions other than the default, mkdir() can be used directly, but
3709 * be sure to consider carefully such cases -- a sub-directory with incorrect
3710 * permissions in a PostgreSQL data directory could cause backups and other
3711 * processes to fail.
3714 MakePGDirectory(const char *directoryName)
3716 return mkdir(directoryName, pg_dir_create_mode);
3720 * Return the passed-in error level, or PANIC if data_sync_retry is off.
3722 * Failure to fsync any data file is cause for immediate panic, unless
3723 * data_sync_retry is enabled. Data may have been written to the operating
3724 * system and removed from our buffer pool already, and if we are running on
3725 * an operating system that forgets dirty data on write-back failure, there
3726 * may be only one copy of the data remaining: in the WAL. A later attempt to
3727 * fsync again might falsely report success. Therefore we must not allow any
3728 * further checkpoints to be attempted. data_sync_retry can in theory be
3729 * enabled on systems known not to drop dirty buffered data on write-back
3730 * failure (with the likely outcome that checkpoints will continue to fail
3731 * until the underlying problem is fixed).
3733 * Any code that reports a failure from fsync() or related functions should
3734 * filter the error level with this function.
3737 data_sync_elevel(int elevel)
3739 return data_sync_retry ? elevel : PANIC;
3743 * A convenience wrapper for pwritev() that retries on partial write. If an
3744 * error is returned, it is unspecified how much has been written.
3746 ssize_t
3747 pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
3749 struct iovec iov_copy[PG_IOV_MAX];
3750 ssize_t sum = 0;
3751 ssize_t part;
3753 /* We'd better have space to make a copy, in case we need to retry. */
3754 if (iovcnt > PG_IOV_MAX)
3756 errno = EINVAL;
3757 return -1;
3760 for (;;)
3762 /* Write as much as we can. */
3763 part = pwritev(fd, iov, iovcnt, offset);
3764 if (part < 0)
3765 return -1;
3767 #ifdef SIMULATE_SHORT_WRITE
3768 part = Min(part, 4096);
3769 #endif
3771 /* Count our progress. */
3772 sum += part;
3773 offset += part;
3775 /* Step over iovecs that are done. */
3776 while (iovcnt > 0 && iov->iov_len <= part)
3778 part -= iov->iov_len;
3779 ++iov;
3780 --iovcnt;
3783 /* Are they all done? */
3784 if (iovcnt == 0)
3786 /* We don't expect the kernel to write more than requested. */
3787 Assert(part == 0);
3788 break;
3792 * Move whatever's left to the front of our mutable copy and adjust
3793 * the leading iovec.
3795 Assert(iovcnt > 0);
3796 memmove(iov_copy, iov, sizeof(*iov) * iovcnt);
3797 Assert(iov->iov_len > part);
3798 iov_copy[0].iov_base = (char *) iov_copy[0].iov_base + part;
3799 iov_copy[0].iov_len -= part;
3800 iov = iov_copy;
3803 return sum;