1 /*-------------------------------------------------------------------------
4 * This code manages relations that reside on magnetic disk.
6 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
13 *-------------------------------------------------------------------------
21 #include "catalog/catalog.h"
22 #include "miscadmin.h"
23 #include "postmaster/bgwriter.h"
24 #include "storage/fd.h"
25 #include "storage/bufmgr.h"
26 #include "storage/relfilenode.h"
27 #include "storage/smgr.h"
28 #include "utils/hsearch.h"
29 #include "utils/memutils.h"
33 /* interval for calling AbsorbFsyncRequests in mdsync */
34 #define FSYNCS_PER_ABSORB 10
36 /* special values for the segno arg to RememberFsyncRequest */
37 #define FORGET_RELATION_FSYNC (InvalidBlockNumber)
38 #define FORGET_DATABASE_FSYNC (InvalidBlockNumber-1)
39 #define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
42 * On Windows, we have to interpret EACCES as possibly meaning the same as
43 * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
44 * that's what you get. Ugh. This code is designed so that we don't
45 * actually believe these cases are okay without further evidence (namely,
46 * a pending fsync request getting revoked ... see mdsync).
/*
 * On non-Windows platforms an fsync/open target that has been unlinked shows
 * up only as ENOENT; on Windows an unlinked-but-not-yet-gone file can also
 * report EACCES (see comment above).  The platform guard restores the two
 * mutually-exclusive definitions, which otherwise conflict.
 */
#ifndef WIN32
#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT)
#else
#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT || (err) == EACCES)
#endif
55 * The magnetic disk storage manager keeps track of open file
56 * descriptors in its own descriptor pool. This is done to make it
57 * easier to support relations that are larger than the operating
58 * system's file size limit (often 2GBytes). In order to do that,
59 * we break relations up into "segment" files that are each shorter than
60 * the OS file size limit. The segment size is set by the RELSEG_SIZE
61 * configuration constant in pg_config.h.
63 * On disk, a relation must consist of consecutively numbered segment
64 * files in the pattern
65 * -- Zero or more full segments of exactly RELSEG_SIZE blocks each
66 * -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
67 * -- Optionally, any number of inactive segments of size 0 blocks.
68 * The full and partial segments are collectively the "active" segments.
69 * Inactive segments are those that once contained data but are currently
70 * not needed because of an mdtruncate() operation. The reason for leaving
71 * them present at size zero, rather than unlinking them, is that other
72 * backends and/or the bgwriter might be holding open file references to
73 * such segments. If the relation expands again after mdtruncate(), such
74 * that a deactivated segment becomes active again, it is important that
75 * such file references still be valid --- else data might get written
76 * out to an unlinked old copy of a segment file that will eventually
79 * The file descriptor pointer (md_fd field) stored in the SMgrRelation
80 * cache is, therefore, just the head of a list of MdfdVec objects, one
81 * per segment. But note the md_fd pointer can be NULL, indicating
84 * Also note that mdfd_chain == NULL does not necessarily mean the relation
85 * doesn't have another segment after this one; we may just not have
86 * opened the next segment yet. (We could not have "all segments are
87 * in the chain" as an invariant anyway, since another backend could
88 * extend the relation when we weren't looking.) We do not make chain
89 * entries for inactive segments, however; as soon as we find a partial
90 * segment, we assume that any subsequent segments are inactive.
92 * All MdfdVec objects are palloc'd in the MdCxt memory context.
95 typedef struct _MdfdVec
97 File mdfd_vfd
; /* fd number in fd.c's pool */
98 BlockNumber mdfd_segno
; /* segment number, from 0 */
99 struct _MdfdVec
*mdfd_chain
; /* next segment, or NULL */
102 static MemoryContext MdCxt
; /* context for all md.c allocations */
106 * In some contexts (currently, standalone backends and the bgwriter process)
107 * we keep track of pending fsync operations: we need to remember all relation
108 * segments that have been written since the last checkpoint, so that we can
109 * fsync them down to disk before completing the next checkpoint. This hash
110 * table remembers the pending operations. We use a hash table mostly as
111 * a convenient way of eliminating duplicate requests.
113 * We use a similar mechanism to remember no-longer-needed files that can
114 * be deleted after the next checkpoint, but we use a linked list instead of
115 * a hash table, because we don't expect there to be any duplicate requests.
117 * (Regular backends do not track pending operations locally, but forward
118 * them to the bgwriter.)
122 RelFileNode rnode
; /* the targeted relation */
124 BlockNumber segno
; /* which segment */
125 } PendingOperationTag
;
127 typedef uint16 CycleCtr
; /* can be any convenient integer size */
131 PendingOperationTag tag
; /* hash table key (must be first!) */
132 bool canceled
; /* T => request canceled, not yet removed */
133 CycleCtr cycle_ctr
; /* mdsync_cycle_ctr when request was made */
134 } PendingOperationEntry
;
138 RelFileNode rnode
; /* the dead relation to delete */
139 CycleCtr cycle_ctr
; /* mdckpt_cycle_ctr when request was made */
140 } PendingUnlinkEntry
;
142 static HTAB
*pendingOpsTable
= NULL
;
143 static List
*pendingUnlinks
= NIL
;
145 static CycleCtr mdsync_cycle_ctr
= 0;
146 static CycleCtr mdckpt_cycle_ctr
= 0;
typedef enum					/* behavior for mdopen & _mdfd_getseg */
{
	EXTENSION_FAIL,				/* ereport if segment not present */
	EXTENSION_RETURN_NULL,		/* return NULL if not present */
	EXTENSION_CREATE			/* create new segments as needed */
} ExtensionBehavior;
157 static MdfdVec
*mdopen(SMgrRelation reln
, ForkNumber forknum
,
158 ExtensionBehavior behavior
);
159 static void register_dirty_segment(SMgrRelation reln
, ForkNumber forknum
,
161 static void register_unlink(RelFileNode rnode
);
162 static MdfdVec
*_fdvec_alloc(void);
163 static MdfdVec
*_mdfd_openseg(SMgrRelation reln
, ForkNumber forkno
,
164 BlockNumber segno
, int oflags
);
165 static MdfdVec
*_mdfd_getseg(SMgrRelation reln
, ForkNumber forkno
,
166 BlockNumber blkno
, bool isTemp
, ExtensionBehavior behavior
);
167 static BlockNumber
_mdnblocks(SMgrRelation reln
, ForkNumber forknum
,
172 * mdinit() -- Initialize private state for magnetic disk storage manager.
177 MdCxt
= AllocSetContextCreate(TopMemoryContext
,
179 ALLOCSET_DEFAULT_MINSIZE
,
180 ALLOCSET_DEFAULT_INITSIZE
,
181 ALLOCSET_DEFAULT_MAXSIZE
);
184 * Create pending-operations hashtable if we need it. Currently, we need
185 * it if we are standalone (not under a postmaster) OR if we are a
186 * bootstrap-mode subprocess of a postmaster (that is, a startup or
189 if (!IsUnderPostmaster
|| IsBootstrapProcessingMode())
193 MemSet(&hash_ctl
, 0, sizeof(hash_ctl
));
194 hash_ctl
.keysize
= sizeof(PendingOperationTag
);
195 hash_ctl
.entrysize
= sizeof(PendingOperationEntry
);
196 hash_ctl
.hash
= tag_hash
;
197 hash_ctl
.hcxt
= MdCxt
;
198 pendingOpsTable
= hash_create("Pending Ops Table",
201 HASH_ELEM
| HASH_FUNCTION
| HASH_CONTEXT
);
202 pendingUnlinks
= NIL
;
207 * In archive recovery, we rely on bgwriter to do fsyncs, but we will have
208 * already created the pendingOpsTable during initialization of the startup
209 * process. Calling this function drops the local pendingOpsTable so that
210 * subsequent requests will be forwarded to bgwriter.
213 SetForwardFsyncRequests(void)
215 /* Perform any pending ops we may have queued up */
218 pendingOpsTable
= NULL
;
222 * mdexists() -- Does the physical file exist?
224 * Note: this will return true for lingering files, with pending deletions
227 mdexists(SMgrRelation reln
, ForkNumber forkNum
)
230 * Close it first, to ensure that we notice if the fork has been unlinked
231 * since we opened it.
233 mdclose(reln
, forkNum
);
235 return (mdopen(reln
, forkNum
, EXTENSION_RETURN_NULL
) != NULL
);
239 * mdcreate() -- Create a new relation on magnetic disk.
241 * If isRedo is true, it's okay for the relation to exist already.
244 mdcreate(SMgrRelation reln
, ForkNumber forkNum
, bool isRedo
)
249 if (isRedo
&& reln
->md_fd
[forkNum
] != NULL
)
250 return; /* created and opened already... */
252 Assert(reln
->md_fd
[forkNum
] == NULL
);
254 path
= relpath(reln
->smgr_rnode
, forkNum
);
256 fd
= PathNameOpenFile(path
, O_RDWR
| O_CREAT
| O_EXCL
| PG_BINARY
, 0600);
260 int save_errno
= errno
;
263 * During bootstrap, there are cases where a system relation will be
264 * accessed (by internal backend processes) before the bootstrap
265 * script nominally creates it. Therefore, allow the file to exist
266 * already, even if isRedo is not set. (See also mdopen)
268 if (isRedo
|| IsBootstrapProcessingMode())
269 fd
= PathNameOpenFile(path
, O_RDWR
| PG_BINARY
, 0600);
272 /* be sure to report the error reported by create, not open */
275 (errcode_for_file_access(),
276 errmsg("could not create relation %s: %m", path
)));
282 reln
->md_fd
[forkNum
] = _fdvec_alloc();
284 reln
->md_fd
[forkNum
]->mdfd_vfd
= fd
;
285 reln
->md_fd
[forkNum
]->mdfd_segno
= 0;
286 reln
->md_fd
[forkNum
]->mdfd_chain
= NULL
;
290 * mdunlink() -- Unlink a relation.
292 * Note that we're passed a RelFileNode --- by the time this is called,
293 * there won't be an SMgrRelation hashtable entry anymore.
295 * Actually, we don't unlink the first segment file of the relation, but
296 * just truncate it to zero length, and record a request to unlink it after
297 * the next checkpoint. Additional segments can be unlinked immediately,
298 * however. Leaving the empty file in place prevents that relfilenode
299 * number from being reused. The scenario this protects us from is:
300 * 1. We delete a relation (and commit, and actually remove its file).
301 * 2. We create a new relation, which by chance gets the same relfilenode as
302 * the just-deleted one (OIDs must've wrapped around for that to happen).
303 * 3. We crash before another checkpoint occurs.
304 * During replay, we would delete the file and then recreate it, which is fine
305 * if the contents of the file were repopulated by subsequent WAL entries.
306 * But if we didn't WAL-log insertions, but instead relied on fsyncing the
307 * file after populating it (as for instance CLUSTER and CREATE INDEX do),
308 * the contents of the file would be lost forever. By leaving the empty file
309 * until after the next checkpoint, we prevent reassignment of the relfilenode
310 * number until it's safe, because relfilenode assignment skips over any
313 * If isRedo is true, it's okay for the relation to be already gone.
314 * Also, we should remove the file immediately instead of queuing a request
315 * for later, since during redo there's no possibility of creating a
316 * conflicting relation.
318 * Note: any failure should be reported as WARNING not ERROR, because
319 * we are usually not in a transaction anymore when this is called.
322 mdunlink(RelFileNode rnode
, ForkNumber forkNum
, bool isRedo
)
328 * We have to clean out any pending fsync requests for the doomed
329 * relation, else the next mdsync() will fail.
331 ForgetRelationFsyncRequests(rnode
, forkNum
);
333 path
= relpath(rnode
, forkNum
);
336 * Delete or truncate the first segment.
338 if (isRedo
|| forkNum
!= MAIN_FORKNUM
)
342 /* truncate(2) would be easier here, but Windows hasn't got it */
345 fd
= BasicOpenFile(path
, O_RDWR
| PG_BINARY
, 0);
350 ret
= ftruncate(fd
, 0);
360 if (!isRedo
|| errno
!= ENOENT
)
362 (errcode_for_file_access(),
363 errmsg("could not remove relation %s: %m", path
)));
367 * Delete any additional segments.
371 char *segpath
= (char *) palloc(strlen(path
) + 12);
375 * Note that because we loop until getting ENOENT, we will correctly
376 * remove all inactive segments as well as active ones.
378 for (segno
= 1;; segno
++)
380 sprintf(segpath
, "%s.%u", path
, segno
);
381 if (unlink(segpath
) < 0)
383 /* ENOENT is expected after the last segment... */
386 (errcode_for_file_access(),
387 errmsg("could not remove segment %u of relation %s: %m",
397 /* Register request to unlink first segment later */
398 if (!isRedo
&& forkNum
== MAIN_FORKNUM
)
399 register_unlink(rnode
);
403 * mdextend() -- Add a block to the specified relation.
405 * The semantics are nearly the same as mdwrite(): write at the
406 * specified position. However, this is to be used for the case of
407 * extending a relation (i.e., blocknum is at or beyond the current
408 * EOF). Note that we assume writing a block beyond current EOF
409 * causes intervening file space to become filled with zeroes.
412 mdextend(SMgrRelation reln
, ForkNumber forknum
, BlockNumber blocknum
,
413 char *buffer
, bool isTemp
)
419 /* This assert is too expensive to have on normally ... */
420 #ifdef CHECK_WRITE_VS_EXTEND
421 Assert(blocknum
>= mdnblocks(reln
, forknum
));
425 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
426 * more --- we mustn't create a block whose number actually is
427 * InvalidBlockNumber.
429 if (blocknum
== InvalidBlockNumber
)
431 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED
),
432 errmsg("cannot extend relation %s beyond %u blocks",
433 relpath(reln
->smgr_rnode
, forknum
),
434 InvalidBlockNumber
)));
436 v
= _mdfd_getseg(reln
, forknum
, blocknum
, isTemp
, EXTENSION_CREATE
);
438 seekpos
= (off_t
) BLCKSZ
*(blocknum
% ((BlockNumber
) RELSEG_SIZE
));
440 Assert(seekpos
< (off_t
) BLCKSZ
* RELSEG_SIZE
);
443 * Note: because caller usually obtained blocknum by calling mdnblocks,
444 * which did a seek(SEEK_END), this seek is often redundant and will be
445 * optimized away by fd.c. It's not redundant, however, if there is a
446 * partial page at the end of the file. In that case we want to try to
447 * overwrite the partial page with a full page. It's also not redundant
448 * if bufmgr.c had to dump another buffer of the same file to make room
449 * for the new page's buffer.
451 if (FileSeek(v
->mdfd_vfd
, seekpos
, SEEK_SET
) != seekpos
)
453 (errcode_for_file_access(),
454 errmsg("could not seek to block %u of relation %s: %m",
456 relpath(reln
->smgr_rnode
, forknum
))));
458 if ((nbytes
= FileWrite(v
->mdfd_vfd
, buffer
, BLCKSZ
)) != BLCKSZ
)
462 (errcode_for_file_access(),
463 errmsg("could not extend relation %s: %m",
464 relpath(reln
->smgr_rnode
, forknum
)),
465 errhint("Check free disk space.")));
466 /* short write: complain appropriately */
468 (errcode(ERRCODE_DISK_FULL
),
469 errmsg("could not extend relation %s: wrote only %d of %d bytes at block %u",
470 relpath(reln
->smgr_rnode
, forknum
),
471 nbytes
, BLCKSZ
, blocknum
),
472 errhint("Check free disk space.")));
476 register_dirty_segment(reln
, forknum
, v
);
478 Assert(_mdnblocks(reln
, forknum
, v
) <= ((BlockNumber
) RELSEG_SIZE
));
482 * mdopen() -- Open the specified relation.
484 * Note we only open the first segment, when there are multiple segments.
486 * If first segment is not present, either ereport or return NULL according
487 * to "behavior". We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
488 * EXTENSION_CREATE means it's OK to extend an existing relation, not to
489 * invent one out of whole cloth.
492 mdopen(SMgrRelation reln
, ForkNumber forknum
, ExtensionBehavior behavior
)
498 /* No work if already open */
499 if (reln
->md_fd
[forknum
])
500 return reln
->md_fd
[forknum
];
502 path
= relpath(reln
->smgr_rnode
, forknum
);
504 fd
= PathNameOpenFile(path
, O_RDWR
| PG_BINARY
, 0600);
509 * During bootstrap, there are cases where a system relation will be
510 * accessed (by internal backend processes) before the bootstrap
511 * script nominally creates it. Therefore, accept mdopen() as a
512 * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
514 if (IsBootstrapProcessingMode())
515 fd
= PathNameOpenFile(path
, O_RDWR
| O_CREAT
| O_EXCL
| PG_BINARY
, 0600);
518 if (behavior
== EXTENSION_RETURN_NULL
&&
519 FILE_POSSIBLY_DELETED(errno
))
525 (errcode_for_file_access(),
526 errmsg("could not open relation %s: %m", path
)));
532 reln
->md_fd
[forknum
] = mdfd
= _fdvec_alloc();
535 mdfd
->mdfd_segno
= 0;
536 mdfd
->mdfd_chain
= NULL
;
537 Assert(_mdnblocks(reln
, forknum
, mdfd
) <= ((BlockNumber
) RELSEG_SIZE
));
543 * mdclose() -- Close the specified relation, if it isn't closed already.
546 mdclose(SMgrRelation reln
, ForkNumber forknum
)
548 MdfdVec
*v
= reln
->md_fd
[forknum
];
550 /* No work if already closed */
554 reln
->md_fd
[forknum
] = NULL
; /* prevent dangling pointer after error */
560 /* if not closed already */
561 if (v
->mdfd_vfd
>= 0)
562 FileClose(v
->mdfd_vfd
);
563 /* Now free vector */
570 * mdprefetch() -- Initiate asynchronous read of the specified block of a relation
573 mdprefetch(SMgrRelation reln
, ForkNumber forknum
, BlockNumber blocknum
)
579 v
= _mdfd_getseg(reln
, forknum
, blocknum
, false, EXTENSION_FAIL
);
581 seekpos
= (off_t
) BLCKSZ
*(blocknum
% ((BlockNumber
) RELSEG_SIZE
));
583 Assert(seekpos
< (off_t
) BLCKSZ
* RELSEG_SIZE
);
585 (void) FilePrefetch(v
->mdfd_vfd
, seekpos
, BLCKSZ
);
586 #endif /* USE_PREFETCH */
591 * mdread() -- Read the specified block from a relation.
594 mdread(SMgrRelation reln
, ForkNumber forknum
, BlockNumber blocknum
,
601 TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum
, blocknum
,
602 reln
->smgr_rnode
.spcNode
,
603 reln
->smgr_rnode
.dbNode
,
604 reln
->smgr_rnode
.relNode
);
606 v
= _mdfd_getseg(reln
, forknum
, blocknum
, false, EXTENSION_FAIL
);
608 seekpos
= (off_t
) BLCKSZ
*(blocknum
% ((BlockNumber
) RELSEG_SIZE
));
610 Assert(seekpos
< (off_t
) BLCKSZ
* RELSEG_SIZE
);
612 if (FileSeek(v
->mdfd_vfd
, seekpos
, SEEK_SET
) != seekpos
)
614 (errcode_for_file_access(),
615 errmsg("could not seek to block %u of relation %s: %m",
616 blocknum
, relpath(reln
->smgr_rnode
, forknum
))));
618 nbytes
= FileRead(v
->mdfd_vfd
, buffer
, BLCKSZ
);
620 TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum
, blocknum
,
621 reln
->smgr_rnode
.spcNode
,
622 reln
->smgr_rnode
.dbNode
,
623 reln
->smgr_rnode
.relNode
,
627 if (nbytes
!= BLCKSZ
)
631 (errcode_for_file_access(),
632 errmsg("could not read block %u of relation %s: %m",
633 blocknum
, relpath(reln
->smgr_rnode
, forknum
))));
636 * Short read: we are at or past EOF, or we read a partial block at
637 * EOF. Normally this is an error; upper levels should never try to
638 * read a nonexistent block. However, if zero_damaged_pages is ON or
639 * we are InRecovery, we should instead return zeroes without
640 * complaining. This allows, for example, the case of trying to
641 * update a block that was later truncated away.
643 if (zero_damaged_pages
|| InRecovery
)
644 MemSet(buffer
, 0, BLCKSZ
);
647 (errcode(ERRCODE_DATA_CORRUPTED
),
648 errmsg("could not read block %u of relation %s: read only %d of %d bytes",
649 blocknum
, relpath(reln
->smgr_rnode
, forknum
),
655 * mdwrite() -- Write the supplied block at the appropriate location.
657 * This is to be used only for updating already-existing blocks of a
658 * relation (ie, those before the current EOF). To extend a relation,
662 mdwrite(SMgrRelation reln
, ForkNumber forknum
, BlockNumber blocknum
,
663 char *buffer
, bool isTemp
)
669 /* This assert is too expensive to have on normally ... */
670 #ifdef CHECK_WRITE_VS_EXTEND
671 Assert(blocknum
< mdnblocks(reln
, forknum
));
674 TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum
, blocknum
,
675 reln
->smgr_rnode
.spcNode
,
676 reln
->smgr_rnode
.dbNode
,
677 reln
->smgr_rnode
.relNode
);
679 v
= _mdfd_getseg(reln
, forknum
, blocknum
, isTemp
, EXTENSION_FAIL
);
681 seekpos
= (off_t
) BLCKSZ
*(blocknum
% ((BlockNumber
) RELSEG_SIZE
));
683 Assert(seekpos
< (off_t
) BLCKSZ
* RELSEG_SIZE
);
685 if (FileSeek(v
->mdfd_vfd
, seekpos
, SEEK_SET
) != seekpos
)
687 (errcode_for_file_access(),
688 errmsg("could not seek to block %u of relation %s: %m",
689 blocknum
, relpath(reln
->smgr_rnode
, forknum
))));
691 nbytes
= FileWrite(v
->mdfd_vfd
, buffer
, BLCKSZ
);
693 TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum
, blocknum
,
694 reln
->smgr_rnode
.spcNode
,
695 reln
->smgr_rnode
.dbNode
,
696 reln
->smgr_rnode
.relNode
,
700 if (nbytes
!= BLCKSZ
)
704 (errcode_for_file_access(),
705 errmsg("could not write block %u of relation %s: %m",
706 blocknum
, relpath(reln
->smgr_rnode
, forknum
))));
707 /* short write: complain appropriately */
709 (errcode(ERRCODE_DISK_FULL
),
710 errmsg("could not write block %u of relation %s: wrote only %d of %d bytes",
712 relpath(reln
->smgr_rnode
, forknum
),
714 errhint("Check free disk space.")));
718 register_dirty_segment(reln
, forknum
, v
);
722 * mdnblocks() -- Get the number of blocks stored in a relation.
724 * Important side effect: all active segments of the relation are opened
725 * and added to the mdfd_chain list. If this routine has not been
726 * called, then only segments up to the last one actually touched
727 * are present in the chain.
730 mdnblocks(SMgrRelation reln
, ForkNumber forknum
)
732 MdfdVec
*v
= mdopen(reln
, forknum
, EXTENSION_FAIL
);
734 BlockNumber segno
= 0;
737 * Skip through any segments that aren't the last one, to avoid redundant
738 * seeks on them. We have previously verified that these segments are
739 * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
741 * NOTE: this assumption could only be wrong if another backend has
742 * truncated the relation. We rely on higher code levels to handle that
743 * scenario by closing and re-opening the md fd, which is handled via
744 * relcache flush. (Since the bgwriter doesn't participate in relcache
745 * flush, it could have segment chain entries for inactive segments;
746 * that's OK because the bgwriter never needs to compute relation size.)
748 while (v
->mdfd_chain
!= NULL
)
756 nblocks
= _mdnblocks(reln
, forknum
, v
);
757 if (nblocks
> ((BlockNumber
) RELSEG_SIZE
))
758 elog(FATAL
, "segment too big");
759 if (nblocks
< ((BlockNumber
) RELSEG_SIZE
))
760 return (segno
* ((BlockNumber
) RELSEG_SIZE
)) + nblocks
;
763 * If segment is exactly RELSEG_SIZE, advance to next one.
767 if (v
->mdfd_chain
== NULL
)
770 * Because we pass O_CREAT, we will create the next segment (with
771 * zero length) immediately, if the last segment is of length
772 * RELSEG_SIZE. While perhaps not strictly necessary, this keeps
775 v
->mdfd_chain
= _mdfd_openseg(reln
, forknum
, segno
, O_CREAT
);
776 if (v
->mdfd_chain
== NULL
)
778 (errcode_for_file_access(),
779 errmsg("could not open segment %u of relation %s: %m",
781 relpath(reln
->smgr_rnode
, forknum
))));
789 * mdtruncate() -- Truncate relation to specified number of blocks.
792 mdtruncate(SMgrRelation reln
, ForkNumber forknum
, BlockNumber nblocks
,
797 BlockNumber priorblocks
;
800 * NOTE: mdnblocks makes sure we have opened all active segments, so that
801 * truncation loop will get them all!
803 curnblk
= mdnblocks(reln
, forknum
);
804 if (nblocks
> curnblk
)
806 /* Bogus request ... but no complaint if InRecovery */
810 (errmsg("could not truncate relation %s to %u blocks: it's only %u blocks now",
811 relpath(reln
->smgr_rnode
, forknum
),
814 if (nblocks
== curnblk
)
815 return; /* no work */
817 v
= mdopen(reln
, forknum
, EXTENSION_FAIL
);
824 if (priorblocks
> nblocks
)
827 * This segment is no longer active (and has already been unlinked
828 * from the mdfd_chain). We truncate the file, but do not delete
829 * it, for reasons explained in the header comments.
831 if (FileTruncate(v
->mdfd_vfd
, 0) < 0)
833 (errcode_for_file_access(),
834 errmsg("could not truncate relation %s to %u blocks: %m",
835 relpath(reln
->smgr_rnode
, forknum
),
838 register_dirty_segment(reln
, forknum
, v
);
840 Assert(ov
!= reln
->md_fd
[forknum
]); /* we never drop the 1st
844 else if (priorblocks
+ ((BlockNumber
) RELSEG_SIZE
) > nblocks
)
847 * This is the last segment we want to keep. Truncate the file to
848 * the right length, and clear chain link that points to any
849 * remaining segments (which we shall zap). NOTE: if nblocks is
850 * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
851 * segment to 0 length but keep it. This adheres to the invariant
852 * given in the header comments.
854 BlockNumber lastsegblocks
= nblocks
- priorblocks
;
856 if (FileTruncate(v
->mdfd_vfd
, (off_t
) lastsegblocks
* BLCKSZ
) < 0)
858 (errcode_for_file_access(),
859 errmsg("could not truncate relation %s to %u blocks: %m",
860 relpath(reln
->smgr_rnode
, forknum
),
863 register_dirty_segment(reln
, forknum
, v
);
865 ov
->mdfd_chain
= NULL
;
870 * We still need this segment and 0 or more blocks beyond it, so
871 * nothing to do here.
875 priorblocks
+= RELSEG_SIZE
;
880 * mdimmedsync() -- Immediately sync a relation to stable storage.
882 * Note that only writes already issued are synced; this routine knows
883 * nothing of dirty buffers that may exist inside the buffer manager.
886 mdimmedsync(SMgrRelation reln
, ForkNumber forknum
)
892 * NOTE: mdnblocks makes sure we have opened all active segments, so that
893 * fsync loop will get them all!
895 curnblk
= mdnblocks(reln
, forknum
);
897 v
= mdopen(reln
, forknum
, EXTENSION_FAIL
);
901 if (FileSync(v
->mdfd_vfd
) < 0)
903 (errcode_for_file_access(),
904 errmsg("could not fsync segment %u of relation %s: %m",
906 relpath(reln
->smgr_rnode
, forknum
))));
912 * mdsync() -- Sync previous writes to stable storage.
917 static bool mdsync_in_progress
= false;
919 HASH_SEQ_STATUS hstat
;
920 PendingOperationEntry
*entry
;
924 * This is only called during checkpoints, and checkpoints should only
925 * occur in processes that have created a pendingOpsTable.
927 if (!pendingOpsTable
)
928 elog(ERROR
, "cannot sync without a pendingOpsTable");
931 * If we are in the bgwriter, the sync had better include all fsync
932 * requests that were queued by backends up to this point. The tightest
933 * race condition that could occur is that a buffer that must be written
934 * and fsync'd for the checkpoint could have been dumped by a backend just
935 * before it was visited by BufferSync(). We know the backend will have
936 * queued an fsync request before clearing the buffer's dirtybit, so we
937 * are safe as long as we do an Absorb after completing BufferSync().
939 AbsorbFsyncRequests();
942 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
943 * checkpoint), we want to ignore fsync requests that are entered into the
944 * hashtable after this point --- they should be processed next time,
945 * instead. We use mdsync_cycle_ctr to tell old entries apart from new
946 * ones: new ones will have cycle_ctr equal to the incremented value of
949 * In normal circumstances, all entries present in the table at this point
950 * will have cycle_ctr exactly equal to the current (about to be old)
951 * value of mdsync_cycle_ctr. However, if we fail partway through the
952 * fsync'ing loop, then older values of cycle_ctr might remain when we
953 * come back here to try again. Repeated checkpoint failures would
954 * eventually wrap the counter around to the point where an old entry
955 * might appear new, causing us to skip it, possibly allowing a checkpoint
956 * to succeed that should not have. To forestall wraparound, any time the
957 * previous mdsync() failed to complete, run through the table and
958 * forcibly set cycle_ctr = mdsync_cycle_ctr.
960 * Think not to merge this loop with the main loop, as the problem is
961 * exactly that that loop may fail before having visited all the entries.
962 * From a performance point of view it doesn't matter anyway, as this path
963 * will never be taken in a system that's functioning normally.
965 if (mdsync_in_progress
)
967 /* prior try failed, so update any stale cycle_ctr values */
968 hash_seq_init(&hstat
, pendingOpsTable
);
969 while ((entry
= (PendingOperationEntry
*) hash_seq_search(&hstat
)) != NULL
)
971 entry
->cycle_ctr
= mdsync_cycle_ctr
;
975 /* Advance counter so that new hashtable entries are distinguishable */
978 /* Set flag to detect failure if we don't reach the end of the loop */
979 mdsync_in_progress
= true;
981 /* Now scan the hashtable for fsync requests to process */
982 absorb_counter
= FSYNCS_PER_ABSORB
;
983 hash_seq_init(&hstat
, pendingOpsTable
);
984 while ((entry
= (PendingOperationEntry
*) hash_seq_search(&hstat
)) != NULL
)
987 * If the entry is new then don't process it this time. Note that
988 * "continue" bypasses the hash-remove call at the bottom of the loop.
990 if (entry
->cycle_ctr
== mdsync_cycle_ctr
)
993 /* Else assert we haven't missed it */
994 Assert((CycleCtr
) (entry
->cycle_ctr
+ 1) == mdsync_cycle_ctr
);
997 * If fsync is off then we don't have to bother opening the file at
998 * all. (We delay checking until this point so that changing fsync on
999 * the fly behaves sensibly.) Also, if the entry is marked canceled,
1000 * fall through to delete it.
1002 if (enableFsync
&& !entry
->canceled
)
1007 * If in bgwriter, we want to absorb pending requests every so
1008 * often to prevent overflow of the fsync request queue. It is
1009 * unspecified whether newly-added entries will be visited by
1010 * hash_seq_search, but we don't care since we don't need to
1011 * process them anyway.
1013 if (--absorb_counter
<= 0)
1015 AbsorbFsyncRequests();
1016 absorb_counter
= FSYNCS_PER_ABSORB
;
1020 * The fsync table could contain requests to fsync segments that
1021 * have been deleted (unlinked) by the time we get to them. Rather
1022 * than just hoping an ENOENT (or EACCES on Windows) error can be
1023 * ignored, what we do on error is absorb pending requests and
1024 * then retry. Since mdunlink() queues a "revoke" message before
1025 * actually unlinking, the fsync request is guaranteed to be
1026 * marked canceled after the absorb if it really was this case.
1027 * DROP DATABASE likewise has to tell us to forget fsync requests
1028 * before it starts deletions.
1030 for (failures
= 0;; failures
++) /* loop exits at "break" */
1037 * Find or create an smgr hash entry for this relation. This
1038 * may seem a bit unclean -- md calling smgr? But it's really
1039 * the best solution. It ensures that the open file reference
1040 * isn't permanently leaked if we get an error here. (You may
1041 * say "but an unreferenced SMgrRelation is still a leak!" Not
1042 * really, because the only case in which a checkpoint is done
1043 * by a process that isn't about to shut down is in the
1044 * bgwriter, and it will periodically do smgrcloseall(). This
1045 * fact justifies our not closing the reln in the success path
1046 * either, which is a good thing since in non-bgwriter cases
1047 * we couldn't safely do that.) Furthermore, in many cases
1048 * the relation will have been dirtied through this same smgr
1049 * relation, and so we can save a file open/close cycle.
1051 reln
= smgropen(entry
->tag
.rnode
);
1054 * It is possible that the relation has been dropped or
1055 * truncated since the fsync request was entered. Therefore,
1056 * allow ENOENT, but only if we didn't fail already on this
1057 * file. This applies both during _mdfd_getseg() and during
1058 * FileSync, since fd.c might have closed the file behind our
1061 seg
= _mdfd_getseg(reln
, entry
->tag
.forknum
,
1062 entry
->tag
.segno
* ((BlockNumber
) RELSEG_SIZE
),
1063 false, EXTENSION_RETURN_NULL
);
1065 FileSync(seg
->mdfd_vfd
) >= 0)
1066 break; /* success; break out of retry loop */
1069 * XXX is there any point in allowing more than one retry?
1070 * Don't see one at the moment, but easy to change the test
1073 path
= relpath(entry
->tag
.rnode
, entry
->tag
.forknum
);
1074 if (!FILE_POSSIBLY_DELETED(errno
) ||
1077 (errcode_for_file_access(),
1078 errmsg("could not fsync segment %u of relation %s: %m",
1079 entry
->tag
.segno
, path
)));
1082 (errcode_for_file_access(),
1083 errmsg("could not fsync segment %u of relation %s but retrying: %m",
1084 entry
->tag
.segno
, path
)));
1088 * Absorb incoming requests and check to see if canceled.
1090 AbsorbFsyncRequests();
1091 absorb_counter
= FSYNCS_PER_ABSORB
; /* might as well... */
1093 if (entry
->canceled
)
1095 } /* end retry loop */
1099 * If we get here, either we fsync'd successfully, or we don't have to
1100 * because enableFsync is off, or the entry is (now) marked canceled.
1101 * Okay to delete it.
1103 if (hash_search(pendingOpsTable
, &entry
->tag
,
1104 HASH_REMOVE
, NULL
) == NULL
)
1105 elog(ERROR
, "pendingOpsTable corrupted");
1106 } /* end loop over hashtable entries */
1108 /* Flag successful completion of mdsync */
1109 mdsync_in_progress
= false;
1113 * mdpreckpt() -- Do pre-checkpoint work
1115 * To distinguish unlink requests that arrived before this checkpoint
1116 * started from those that arrived during the checkpoint, we use a cycle
1117 * counter similar to the one we use for fsync requests. That cycle
1118 * counter is incremented here.
1120 * This must be called *before* the checkpoint REDO point is determined.
1121 * That ensures that we won't delete files too soon.
1123 * Note that we can't do anything here that depends on the assumption
1124 * that the checkpoint will be completed.
1132 * In case the prior checkpoint wasn't completed, stamp all entries in the
1133 * list with the current cycle counter. Anything that's in the list at
1134 * the start of checkpoint can surely be deleted after the checkpoint is
1135 * finished, regardless of when the request was made.
1137 foreach(cell
, pendingUnlinks
)
1139 PendingUnlinkEntry
*entry
= (PendingUnlinkEntry
*) lfirst(cell
);
1141 entry
->cycle_ctr
= mdckpt_cycle_ctr
;
1145 * Any unlink requests arriving after this point will be assigned the next
1146 * cycle counter, and won't be unlinked until next checkpoint.
1152 * mdpostckpt() -- Do post-checkpoint work
1154 * Remove any lingering files that can now be safely removed.
1159 while (pendingUnlinks
!= NIL
)
1161 PendingUnlinkEntry
*entry
= (PendingUnlinkEntry
*) linitial(pendingUnlinks
);
1165 * New entries are appended to the end, so if the entry is new we've
1166 * reached the end of old entries.
1168 if (entry
->cycle_ctr
== mdckpt_cycle_ctr
)
1171 /* Else assert we haven't missed it */
1172 Assert((CycleCtr
) (entry
->cycle_ctr
+ 1) == mdckpt_cycle_ctr
);
1174 /* Unlink the file */
1175 path
= relpath(entry
->rnode
, MAIN_FORKNUM
);
1176 if (unlink(path
) < 0)
1179 * There's a race condition, when the database is dropped at the
1180 * same time that we process the pending unlink requests. If the
1181 * DROP DATABASE deletes the file before we do, we will get ENOENT
1182 * here. rmtree() also has to ignore ENOENT errors, to deal with
1183 * the possibility that we delete the file first.
1185 if (errno
!= ENOENT
)
1187 (errcode_for_file_access(),
1188 errmsg("could not remove relation %s: %m", path
)));
1192 pendingUnlinks
= list_delete_first(pendingUnlinks
);
1198 * register_dirty_segment() -- Mark a relation segment as needing fsync
1200 * If there is a local pending-ops table, just make an entry in it for
1201 * mdsync to process later. Otherwise, try to pass off the fsync request
1202 * to the background writer process. If that fails, just do the fsync
1203 * locally before returning (we expect this will not happen often enough
1204 * to be a performance problem).
1207 register_dirty_segment(SMgrRelation reln
, ForkNumber forknum
, MdfdVec
*seg
)
1209 if (pendingOpsTable
)
1211 /* push it into local pending-ops table */
1212 RememberFsyncRequest(reln
->smgr_rnode
, forknum
, seg
->mdfd_segno
);
1216 if (ForwardFsyncRequest(reln
->smgr_rnode
, forknum
, seg
->mdfd_segno
))
1217 return; /* passed it off successfully */
1219 if (FileSync(seg
->mdfd_vfd
) < 0)
1221 (errcode_for_file_access(),
1222 errmsg("could not fsync segment %u of relation %s: %m",
1224 relpath(reln
->smgr_rnode
, forknum
))));
1229 * register_unlink() -- Schedule a file to be deleted after next checkpoint
1231 * As with register_dirty_segment, this could involve either a local or
1232 * a remote pending-ops table.
1235 register_unlink(RelFileNode rnode
)
1237 if (pendingOpsTable
)
1239 /* push it into local pending-ops table */
1240 RememberFsyncRequest(rnode
, MAIN_FORKNUM
, UNLINK_RELATION_REQUEST
);
1245 * Notify the bgwriter about it. If we fail to queue the request
1246 * message, we have to sleep and try again, because we can't simply
1247 * delete the file now. Ugly, but hopefully won't happen often.
1249 * XXX should we just leave the file orphaned instead?
1251 Assert(IsUnderPostmaster
);
1252 while (!ForwardFsyncRequest(rnode
, MAIN_FORKNUM
,
1253 UNLINK_RELATION_REQUEST
))
1254 pg_usleep(10000L); /* 10 msec seems a good number */
1259 * RememberFsyncRequest() -- callback from bgwriter side of fsync request
1261 * We stuff most fsync requests into the local hash table for execution
1262 * during the bgwriter's next checkpoint. UNLINK requests go into a
1263 * separate linked list, however, because they get processed separately.
1265 * The range of possible segment numbers is way less than the range of
1266 * BlockNumber, so we can reserve high values of segno for special purposes.
1268 * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
1269 * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
1270 * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
1273 * (Handling the FORGET_* requests is a tad slow because the hash table has
1274 * to be searched linearly, but it doesn't seem worth rethinking the table
1275 * structure for them.)
1278 RememberFsyncRequest(RelFileNode rnode
, ForkNumber forknum
, BlockNumber segno
)
1280 Assert(pendingOpsTable
);
1282 if (segno
== FORGET_RELATION_FSYNC
)
1284 /* Remove any pending requests for the entire relation */
1285 HASH_SEQ_STATUS hstat
;
1286 PendingOperationEntry
*entry
;
1288 hash_seq_init(&hstat
, pendingOpsTable
);
1289 while ((entry
= (PendingOperationEntry
*) hash_seq_search(&hstat
)) != NULL
)
1291 if (RelFileNodeEquals(entry
->tag
.rnode
, rnode
) &&
1292 entry
->tag
.forknum
== forknum
)
1294 /* Okay, cancel this entry */
1295 entry
->canceled
= true;
1299 else if (segno
== FORGET_DATABASE_FSYNC
)
1301 /* Remove any pending requests for the entire database */
1302 HASH_SEQ_STATUS hstat
;
1303 PendingOperationEntry
*entry
;
1308 /* Remove fsync requests */
1309 hash_seq_init(&hstat
, pendingOpsTable
);
1310 while ((entry
= (PendingOperationEntry
*) hash_seq_search(&hstat
)) != NULL
)
1312 if (entry
->tag
.rnode
.dbNode
== rnode
.dbNode
)
1314 /* Okay, cancel this entry */
1315 entry
->canceled
= true;
1319 /* Remove unlink requests */
1321 for (cell
= list_head(pendingUnlinks
); cell
; cell
= next
)
1323 PendingUnlinkEntry
*entry
= (PendingUnlinkEntry
*) lfirst(cell
);
1326 if (entry
->rnode
.dbNode
== rnode
.dbNode
)
1328 pendingUnlinks
= list_delete_cell(pendingUnlinks
, cell
, prev
);
1335 else if (segno
== UNLINK_RELATION_REQUEST
)
1337 /* Unlink request: put it in the linked list */
1338 MemoryContext oldcxt
= MemoryContextSwitchTo(MdCxt
);
1339 PendingUnlinkEntry
*entry
;
1341 entry
= palloc(sizeof(PendingUnlinkEntry
));
1342 entry
->rnode
= rnode
;
1343 entry
->cycle_ctr
= mdckpt_cycle_ctr
;
1345 pendingUnlinks
= lappend(pendingUnlinks
, entry
);
1347 MemoryContextSwitchTo(oldcxt
);
1351 /* Normal case: enter a request to fsync this segment */
1352 PendingOperationTag key
;
1353 PendingOperationEntry
*entry
;
1356 /* ensure any pad bytes in the hash key are zeroed */
1357 MemSet(&key
, 0, sizeof(key
));
1359 key
.forknum
= forknum
;
1362 entry
= (PendingOperationEntry
*) hash_search(pendingOpsTable
,
1366 /* if new or previously canceled entry, initialize it */
1367 if (!found
|| entry
->canceled
)
1369 entry
->canceled
= false;
1370 entry
->cycle_ctr
= mdsync_cycle_ctr
;
1374 * NB: it's intentional that we don't change cycle_ctr if the entry
1375 * already exists. The fsync request must be treated as old, even
1376 * though the new request will be satisfied too by any subsequent
1379 * However, if the entry is present but is marked canceled, we should
1380 * act just as though it wasn't there. The only case where this could
1381 * happen would be if a file had been deleted, we received but did not
1382 * yet act on the cancel request, and the same relfilenode was then
1383 * assigned to a new file. We mustn't lose the new request, but it
1384 * should be considered new not old.
1390 * ForgetRelationFsyncRequests -- forget any fsyncs for a rel
1393 ForgetRelationFsyncRequests(RelFileNode rnode
, ForkNumber forknum
)
1395 if (pendingOpsTable
)
1397 /* standalone backend or startup process: fsync state is local */
1398 RememberFsyncRequest(rnode
, forknum
, FORGET_RELATION_FSYNC
);
1400 else if (IsUnderPostmaster
)
1403 * Notify the bgwriter about it. If we fail to queue the revoke
1404 * message, we have to sleep and try again ... ugly, but hopefully
1405 * won't happen often.
1407 * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an
1408 * error would leave the no-longer-used file still present on disk,
1409 * which would be bad, so I'm inclined to assume that the bgwriter
1410 * will always empty the queue soon.
1412 while (!ForwardFsyncRequest(rnode
, forknum
, FORGET_RELATION_FSYNC
))
1413 pg_usleep(10000L); /* 10 msec seems a good number */
1416 * Note we don't wait for the bgwriter to actually absorb the revoke
1417 * message; see mdsync() for the implications.
1423 * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
1426 ForgetDatabaseFsyncRequests(Oid dbid
)
1430 rnode
.dbNode
= dbid
;
1434 if (pendingOpsTable
)
1436 /* standalone backend or startup process: fsync state is local */
1437 RememberFsyncRequest(rnode
, InvalidForkNumber
, FORGET_DATABASE_FSYNC
);
1439 else if (IsUnderPostmaster
)
1441 /* see notes in ForgetRelationFsyncRequests */
1442 while (!ForwardFsyncRequest(rnode
, InvalidForkNumber
,
1443 FORGET_DATABASE_FSYNC
))
1444 pg_usleep(10000L); /* 10 msec seems a good number */
1450 * _fdvec_alloc() -- Make a MdfdVec object.
1455 return (MdfdVec
*) MemoryContextAlloc(MdCxt
, sizeof(MdfdVec
));
1459 * Open the specified segment of the relation,
1460 * and make a MdfdVec object for it. Returns NULL on failure.
1463 _mdfd_openseg(SMgrRelation reln
, ForkNumber forknum
, BlockNumber segno
,
1471 path
= relpath(reln
->smgr_rnode
, forknum
);
1475 /* be sure we have enough space for the '.segno' */
1476 fullpath
= (char *) palloc(strlen(path
) + 12);
1477 sprintf(fullpath
, "%s.%u", path
, segno
);
1484 fd
= PathNameOpenFile(fullpath
, O_RDWR
| PG_BINARY
| oflags
, 0600);
1491 /* allocate an mdfdvec entry for it */
1494 /* fill the entry */
1496 v
->mdfd_segno
= segno
;
1497 v
->mdfd_chain
= NULL
;
1498 Assert(_mdnblocks(reln
, forknum
, v
) <= ((BlockNumber
) RELSEG_SIZE
));
1505 * _mdfd_getseg() -- Find the segment of the relation holding the
1508 * If the segment doesn't exist, we ereport, return NULL, or create the
1509 * segment, according to "behavior". Note: isTemp need only be correct
1510 * in the EXTENSION_CREATE case.
1513 _mdfd_getseg(SMgrRelation reln
, ForkNumber forknum
, BlockNumber blkno
,
1514 bool isTemp
, ExtensionBehavior behavior
)
1516 MdfdVec
*v
= mdopen(reln
, forknum
, behavior
);
1517 BlockNumber targetseg
;
1518 BlockNumber nextsegno
;
1521 return NULL
; /* only possible if EXTENSION_RETURN_NULL */
1523 targetseg
= blkno
/ ((BlockNumber
) RELSEG_SIZE
);
1524 for (nextsegno
= 1; nextsegno
<= targetseg
; nextsegno
++)
1526 Assert(nextsegno
== v
->mdfd_segno
+ 1);
1528 if (v
->mdfd_chain
== NULL
)
1531 * Normally we will create new segments only if authorized by the
1532 * caller (i.e., we are doing mdextend()). But when doing WAL
1533 * recovery, create segments anyway; this allows cases such as
1534 * replaying WAL data that has a write into a high-numbered
1535 * segment of a relation that was later deleted. We want to go
1536 * ahead and create the segments so we can finish out the replay.
1538 * We have to maintain the invariant that segments before the last
1539 * active segment are of size RELSEG_SIZE; therefore, pad them out
1540 * with zeroes if needed. (This only matters if caller is
1541 * extending the relation discontiguously, but that can happen in
1544 if (behavior
== EXTENSION_CREATE
|| InRecovery
)
1546 if (_mdnblocks(reln
, forknum
, v
) < RELSEG_SIZE
)
1548 char *zerobuf
= palloc0(BLCKSZ
);
1550 mdextend(reln
, forknum
,
1551 nextsegno
* ((BlockNumber
) RELSEG_SIZE
) - 1,
1555 v
->mdfd_chain
= _mdfd_openseg(reln
, forknum
, +nextsegno
, O_CREAT
);
1559 /* We won't create segment if not existent */
1560 v
->mdfd_chain
= _mdfd_openseg(reln
, forknum
, nextsegno
, 0);
1562 if (v
->mdfd_chain
== NULL
)
1564 if (behavior
== EXTENSION_RETURN_NULL
&&
1565 FILE_POSSIBLY_DELETED(errno
))
1568 (errcode_for_file_access(),
1569 errmsg("could not open segment %u of relation %s (target block %u): %m",
1571 relpath(reln
->smgr_rnode
, forknum
),
1581 * Get number of blocks present in a single disk file
1584 _mdnblocks(SMgrRelation reln
, ForkNumber forknum
, MdfdVec
*seg
)
1588 len
= FileSeek(seg
->mdfd_vfd
, 0L, SEEK_END
);
1591 (errcode_for_file_access(),
1592 errmsg("could not seek to end of segment %u of relation %s: %m",
1593 seg
->mdfd_segno
, relpath(reln
->smgr_rnode
, forknum
))));
1594 /* note that this calculation will ignore any partial block at EOF */
1595 return (BlockNumber
) (len
/ BLCKSZ
);