/*-------------------------------------------------------------------------
 *
 * toast_internals.c
 *	  Functions for internal use by the TOAST system.
 *
 * Copyright (c) 2000-2025, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/backend/access/common/toast_internals.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/detoast.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/heaptoast.h"
#include "access/table.h"
#include "access/toast_internals.h"
#include "access/xact.h"
#include "catalog/catalog.h"
#include "miscadmin.h"
#include "utils/fmgroids.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"
/*
 * File-local helpers that test whether a toast value ID is already present
 * in a toast relation, given either an open relation or the relation's OID.
 */
static bool toastrel_valueid_exists(Relation toastrel, Oid valueid);
static bool toastid_valueid_exists(Oid toastrelid, Oid valueid);
33 * toast_compress_datum -
35 * Create a compressed version of a varlena datum
37 * If we fail (ie, compressed result is actually bigger than original)
38 * then return NULL. We must not use compressed data if it'd expand
41 * We use VAR{SIZE,DATA}_ANY so we can handle short varlenas here without
42 * copying them. But we can't handle external or compressed datums.
46 toast_compress_datum(Datum value
, char cmethod
)
48 struct varlena
*tmp
= NULL
;
50 ToastCompressionId cmid
= TOAST_INVALID_COMPRESSION_ID
;
52 Assert(!VARATT_IS_EXTERNAL(DatumGetPointer(value
)));
53 Assert(!VARATT_IS_COMPRESSED(DatumGetPointer(value
)));
55 valsize
= VARSIZE_ANY_EXHDR(DatumGetPointer(value
));
57 /* If the compression method is not valid, use the current default */
58 if (!CompressionMethodIsValid(cmethod
))
59 cmethod
= default_toast_compression
;
62 * Call appropriate compression routine for the compression method.
66 case TOAST_PGLZ_COMPRESSION
:
67 tmp
= pglz_compress_datum((const struct varlena
*) value
);
68 cmid
= TOAST_PGLZ_COMPRESSION_ID
;
70 case TOAST_LZ4_COMPRESSION
:
71 tmp
= lz4_compress_datum((const struct varlena
*) value
);
72 cmid
= TOAST_LZ4_COMPRESSION_ID
;
75 elog(ERROR
, "invalid compression method %c", cmethod
);
79 return PointerGetDatum(NULL
);
82 * We recheck the actual size even if compression reports success, because
83 * it might be satisfied with having saved as little as one byte in the
84 * compressed data --- which could turn into a net loss once you consider
85 * header and alignment padding. Worst case, the compressed format might
86 * require three padding bytes (plus header, which is included in
87 * VARSIZE(tmp)), whereas the uncompressed format would take only one
88 * header byte and no padding if the value is short enough. So we insist
89 * on a savings of more than 2 bytes to ensure we have a gain.
91 if (VARSIZE(tmp
) < valsize
- 2)
93 /* successful compression */
94 Assert(cmid
!= TOAST_INVALID_COMPRESSION_ID
);
95 TOAST_COMPRESS_SET_SIZE_AND_COMPRESS_METHOD(tmp
, valsize
, cmid
);
96 return PointerGetDatum(tmp
);
100 /* incompressible data */
102 return PointerGetDatum(NULL
);
109 * Save one single datum into the secondary relation and return
110 * a Datum reference for it.
112 * rel: the main relation we're working with (not the toast rel!)
113 * value: datum to be pushed to toast storage
114 * oldexternal: if not NULL, toast pointer previously representing the datum
115 * options: options to be passed to heap_insert() for toast rows
119 toast_save_datum(Relation rel
, Datum value
,
120 struct varlena
*oldexternal
, int options
)
125 TupleDesc toasttupDesc
;
128 CommandId mycid
= GetCurrentCommandId(true);
129 struct varlena
*result
;
130 struct varatt_external toast_pointer
;
134 /* this is to make the union big enough for a chunk: */
135 char data
[TOAST_MAX_CHUNK_SIZE
+ VARHDRSZ
];
136 /* ensure union is aligned well enough: */
143 Pointer dval
= DatumGetPointer(value
);
147 Assert(!VARATT_IS_EXTERNAL(value
));
150 * Open the toast relation and its indexes. We can use the index to check
151 * uniqueness of the OID we assign to the toasted item, even though it has
152 * additional columns besides OID.
154 toastrel
= table_open(rel
->rd_rel
->reltoastrelid
, RowExclusiveLock
);
155 toasttupDesc
= toastrel
->rd_att
;
157 /* Open all the toast indexes and look for the valid one */
158 validIndex
= toast_open_indexes(toastrel
,
164 * Get the data pointer and length, and compute va_rawsize and va_extinfo.
166 * va_rawsize is the size of the equivalent fully uncompressed datum, so
167 * we have to adjust for short headers.
169 * va_extinfo stored the actual size of the data payload in the toast
170 * records and the compression method in first 2 bits if data is
173 if (VARATT_IS_SHORT(dval
))
175 data_p
= VARDATA_SHORT(dval
);
176 data_todo
= VARSIZE_SHORT(dval
) - VARHDRSZ_SHORT
;
177 toast_pointer
.va_rawsize
= data_todo
+ VARHDRSZ
; /* as if not short */
178 toast_pointer
.va_extinfo
= data_todo
;
180 else if (VARATT_IS_COMPRESSED(dval
))
182 data_p
= VARDATA(dval
);
183 data_todo
= VARSIZE(dval
) - VARHDRSZ
;
184 /* rawsize in a compressed datum is just the size of the payload */
185 toast_pointer
.va_rawsize
= VARDATA_COMPRESSED_GET_EXTSIZE(dval
) + VARHDRSZ
;
187 /* set external size and compression method */
188 VARATT_EXTERNAL_SET_SIZE_AND_COMPRESS_METHOD(toast_pointer
, data_todo
,
189 VARDATA_COMPRESSED_GET_COMPRESS_METHOD(dval
));
190 /* Assert that the numbers look like it's compressed */
191 Assert(VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer
));
195 data_p
= VARDATA(dval
);
196 data_todo
= VARSIZE(dval
) - VARHDRSZ
;
197 toast_pointer
.va_rawsize
= VARSIZE(dval
);
198 toast_pointer
.va_extinfo
= data_todo
;
202 * Insert the correct table OID into the result TOAST pointer.
204 * Normally this is the actual OID of the target toast table, but during
205 * table-rewriting operations such as CLUSTER, we have to insert the OID
206 * of the table's real permanent toast table instead. rd_toastoid is set
207 * if we have to substitute such an OID.
209 if (OidIsValid(rel
->rd_toastoid
))
210 toast_pointer
.va_toastrelid
= rel
->rd_toastoid
;
212 toast_pointer
.va_toastrelid
= RelationGetRelid(toastrel
);
215 * Choose an OID to use as the value ID for this toast value.
217 * Normally we just choose an unused OID within the toast table. But
218 * during table-rewriting operations where we are preserving an existing
219 * toast table OID, we want to preserve toast value OIDs too. So, if
220 * rd_toastoid is set and we had a prior external value from that same
221 * toast table, re-use its value ID. If we didn't have a prior external
222 * value (which is a corner case, but possible if the table's attstorage
223 * options have been changed), we have to pick a value ID that doesn't
224 * conflict with either new or existing toast value OIDs.
226 if (!OidIsValid(rel
->rd_toastoid
))
228 /* normal case: just choose an unused OID */
229 toast_pointer
.va_valueid
=
230 GetNewOidWithIndex(toastrel
,
231 RelationGetRelid(toastidxs
[validIndex
]),
236 /* rewrite case: check to see if value was in old toast table */
237 toast_pointer
.va_valueid
= InvalidOid
;
238 if (oldexternal
!= NULL
)
240 struct varatt_external old_toast_pointer
;
242 Assert(VARATT_IS_EXTERNAL_ONDISK(oldexternal
));
243 /* Must copy to access aligned fields */
244 VARATT_EXTERNAL_GET_POINTER(old_toast_pointer
, oldexternal
);
245 if (old_toast_pointer
.va_toastrelid
== rel
->rd_toastoid
)
247 /* This value came from the old toast table; reuse its OID */
248 toast_pointer
.va_valueid
= old_toast_pointer
.va_valueid
;
251 * There is a corner case here: the table rewrite might have
252 * to copy both live and recently-dead versions of a row, and
253 * those versions could easily reference the same toast value.
254 * When we copy the second or later version of such a row,
255 * reusing the OID will mean we select an OID that's already
256 * in the new toast table. Check for that, and if so, just
257 * fall through without writing the data again.
259 * While annoying and ugly-looking, this is a good thing
260 * because it ensures that we wind up with only one copy of
261 * the toast value when there is only one copy in the old
262 * toast table. Before we detected this case, we'd have made
263 * multiple copies, wasting space; and what's worse, the
264 * copies belonging to already-deleted heap tuples would not
265 * be reclaimed by VACUUM.
267 if (toastrel_valueid_exists(toastrel
,
268 toast_pointer
.va_valueid
))
270 /* Match, so short-circuit the data storage loop below */
275 if (toast_pointer
.va_valueid
== InvalidOid
)
278 * new value; must choose an OID that doesn't conflict in either
279 * old or new toast table
283 toast_pointer
.va_valueid
=
284 GetNewOidWithIndex(toastrel
,
285 RelationGetRelid(toastidxs
[validIndex
]),
287 } while (toastid_valueid_exists(rel
->rd_toastoid
,
288 toast_pointer
.va_valueid
));
293 * Initialize constant parts of the tuple data
295 t_values
[0] = ObjectIdGetDatum(toast_pointer
.va_valueid
);
296 t_values
[2] = PointerGetDatum(&chunk_data
);
302 * Split up the item into chunks
304 while (data_todo
> 0)
308 CHECK_FOR_INTERRUPTS();
311 * Calculate the size of this chunk
313 chunk_size
= Min(TOAST_MAX_CHUNK_SIZE
, data_todo
);
316 * Build a tuple and store it
318 t_values
[1] = Int32GetDatum(chunk_seq
++);
319 SET_VARSIZE(&chunk_data
, chunk_size
+ VARHDRSZ
);
320 memcpy(VARDATA(&chunk_data
), data_p
, chunk_size
);
321 toasttup
= heap_form_tuple(toasttupDesc
, t_values
, t_isnull
);
323 heap_insert(toastrel
, toasttup
, mycid
, options
, NULL
);
326 * Create the index entry. We cheat a little here by not using
327 * FormIndexDatum: this relies on the knowledge that the index columns
328 * are the same as the initial columns of the table for all the
329 * indexes. We also cheat by not providing an IndexInfo: this is okay
330 * for now because btree doesn't need one, but we might have to be
331 * more honest someday.
333 * Note also that there had better not be any user-created index on
334 * the TOAST table, since we don't bother to update anything else.
336 for (i
= 0; i
< num_indexes
; i
++)
338 /* Only index relations marked as ready can be updated */
339 if (toastidxs
[i
]->rd_index
->indisready
)
340 index_insert(toastidxs
[i
], t_values
, t_isnull
,
343 toastidxs
[i
]->rd_index
->indisunique
?
344 UNIQUE_CHECK_YES
: UNIQUE_CHECK_NO
,
351 heap_freetuple(toasttup
);
354 * Move on to next chunk
356 data_todo
-= chunk_size
;
357 data_p
+= chunk_size
;
361 * Done - close toast relation and its indexes but keep the lock until
362 * commit, so as a concurrent reindex done directly on the toast relation
363 * would be able to wait for this transaction.
365 toast_close_indexes(toastidxs
, num_indexes
, NoLock
);
366 table_close(toastrel
, NoLock
);
369 * Create the TOAST pointer value that we'll return
371 result
= (struct varlena
*) palloc(TOAST_POINTER_SIZE
);
372 SET_VARTAG_EXTERNAL(result
, VARTAG_ONDISK
);
373 memcpy(VARDATA_EXTERNAL(result
), &toast_pointer
, sizeof(toast_pointer
));
375 return PointerGetDatum(result
);
379 * toast_delete_datum -
381 * Delete a single external stored value.
385 toast_delete_datum(Relation rel
, Datum value
, bool is_speculative
)
387 struct varlena
*attr
= (struct varlena
*) DatumGetPointer(value
);
388 struct varatt_external toast_pointer
;
391 ScanKeyData toastkey
;
392 SysScanDesc toastscan
;
397 if (!VARATT_IS_EXTERNAL_ONDISK(attr
))
400 /* Must copy to access aligned fields */
401 VARATT_EXTERNAL_GET_POINTER(toast_pointer
, attr
);
404 * Open the toast relation and its indexes
406 toastrel
= table_open(toast_pointer
.va_toastrelid
, RowExclusiveLock
);
408 /* Fetch valid relation used for process */
409 validIndex
= toast_open_indexes(toastrel
,
415 * Setup a scan key to find chunks with matching va_valueid
417 ScanKeyInit(&toastkey
,
419 BTEqualStrategyNumber
, F_OIDEQ
,
420 ObjectIdGetDatum(toast_pointer
.va_valueid
));
423 * Find all the chunks. (We don't actually care whether we see them in
424 * sequence or not, but since we've already locked the index we might as
425 * well use systable_beginscan_ordered.)
427 toastscan
= systable_beginscan_ordered(toastrel
, toastidxs
[validIndex
],
428 get_toast_snapshot(), 1, &toastkey
);
429 while ((toasttup
= systable_getnext_ordered(toastscan
, ForwardScanDirection
)) != NULL
)
432 * Have a chunk, delete it
435 heap_abort_speculative(toastrel
, &toasttup
->t_self
);
437 simple_heap_delete(toastrel
, &toasttup
->t_self
);
441 * End scan and close relations but keep the lock until commit, so as a
442 * concurrent reindex done directly on the toast relation would be able to
443 * wait for this transaction.
445 systable_endscan_ordered(toastscan
);
446 toast_close_indexes(toastidxs
, num_indexes
, NoLock
);
447 table_close(toastrel
, NoLock
);
451 * toastrel_valueid_exists -
453 * Test whether a toast value with the given ID exists in the toast relation.
454 * For safety, we consider a value to exist if there are either live or dead
455 * toast rows with that ID; see notes for GetNewOidWithIndex().
459 toastrel_valueid_exists(Relation toastrel
, Oid valueid
)
462 ScanKeyData toastkey
;
463 SysScanDesc toastscan
;
468 /* Fetch a valid index relation */
469 validIndex
= toast_open_indexes(toastrel
,
475 * Setup a scan key to find chunks with matching va_valueid
477 ScanKeyInit(&toastkey
,
479 BTEqualStrategyNumber
, F_OIDEQ
,
480 ObjectIdGetDatum(valueid
));
483 * Is there any such chunk?
485 toastscan
= systable_beginscan(toastrel
,
486 RelationGetRelid(toastidxs
[validIndex
]),
487 true, SnapshotAny
, 1, &toastkey
);
489 if (systable_getnext(toastscan
) != NULL
)
492 systable_endscan(toastscan
);
495 toast_close_indexes(toastidxs
, num_indexes
, RowExclusiveLock
);
501 * toastid_valueid_exists -
503 * As above, but work from toast rel's OID not an open relation
507 toastid_valueid_exists(Oid toastrelid
, Oid valueid
)
512 toastrel
= table_open(toastrelid
, AccessShareLock
);
514 result
= toastrel_valueid_exists(toastrel
, valueid
);
516 table_close(toastrel
, AccessShareLock
);
522 * toast_get_valid_index
524 * Get OID of valid index associated to given toast relation. A toast
525 * relation can have only one valid index at the same time.
528 toast_get_valid_index(Oid toastoid
, LOCKMODE lock
)
536 /* Open the toast relation */
537 toastrel
= table_open(toastoid
, lock
);
539 /* Look for the valid index of the toast relation */
540 validIndex
= toast_open_indexes(toastrel
,
544 validIndexOid
= RelationGetRelid(toastidxs
[validIndex
]);
546 /* Close the toast relation and all its indexes */
547 toast_close_indexes(toastidxs
, num_indexes
, NoLock
);
548 table_close(toastrel
, NoLock
);
550 return validIndexOid
;
556 * Get an array of the indexes associated to the given toast relation
557 * and return as well the position of the valid index used by the toast
558 * relation in this array. It is the responsibility of the caller of this
559 * function to close the indexes as well as free them.
562 toast_open_indexes(Relation toastrel
,
564 Relation
**toastidxs
,
573 /* Get index list of the toast relation */
574 indexlist
= RelationGetIndexList(toastrel
);
575 Assert(indexlist
!= NIL
);
577 *num_indexes
= list_length(indexlist
);
579 /* Open all the index relations */
580 *toastidxs
= (Relation
*) palloc(*num_indexes
* sizeof(Relation
));
581 foreach(lc
, indexlist
)
582 (*toastidxs
)[i
++] = index_open(lfirst_oid(lc
), lock
);
584 /* Fetch the first valid index in list */
585 for (i
= 0; i
< *num_indexes
; i
++)
587 Relation toastidx
= (*toastidxs
)[i
];
589 if (toastidx
->rd_index
->indisvalid
)
598 * Free index list, not necessary anymore as relations are opened and a
599 * valid index has been found.
601 list_free(indexlist
);
604 * The toast relation should have one valid index, so something is going
605 * wrong if there is nothing.
608 elog(ERROR
, "no valid index found for toast relation with Oid %u",
609 RelationGetRelid(toastrel
));
615 * toast_close_indexes
617 * Close an array of indexes for a toast relation and free it. This should
618 * be called for a set of indexes opened previously with toast_open_indexes.
621 toast_close_indexes(Relation
*toastidxs
, int num_indexes
, LOCKMODE lock
)
625 /* Close relations and clean up things */
626 for (i
= 0; i
< num_indexes
; i
++)
627 index_close(toastidxs
[i
], lock
);
634 * Return the TOAST snapshot. Detoasting *must* happen in the same
635 * transaction that originally fetched the toast pointer.
638 get_toast_snapshot(void)
641 * We cannot directly check that detoasting happens in the same
642 * transaction that originally fetched the toast pointer, but at least
643 * check that the session has some active snapshots. It might not if, for
644 * example, a procedure fetches a toasted value into a local variable,
645 * commits, and then tries to detoast the value. Such coding is unsafe,
646 * because once we commit there is nothing to prevent the toast data from
647 * being deleted. (This is not very much protection, because in many
648 * scenarios the procedure would have already created a new transaction
649 * snapshot, preventing us from detecting the problem. But it's better
652 if (!HaveRegisteredOrActiveSnapshot())
653 elog(ERROR
, "cannot fetch toast data without an active snapshot");
655 return &SnapshotToastData
;