1 /*-------------------------------------------------------------------------
4 * delete & vacuum routines for the postgres GIN
7 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
12 *-------------------------------------------------------------------------
17 #include "access/genam.h"
18 #include "access/gin.h"
19 #include "commands/vacuum.h"
20 #include "miscadmin.h"
21 #include "storage/bufmgr.h"
22 #include "storage/freespace.h"
23 #include "storage/indexfsm.h"
24 #include "storage/lmgr.h"
29 IndexBulkDeleteResult
*result
;
30 IndexBulkDeleteCallback callback
;
33 BufferAccessStrategy strategy
;
38 * Cleans array of ItemPointer (removes dead pointers)
39 * Results are always stored in *cleaned, which will be allocated
40 * if it's needed. In case of *cleaned!=NULL caller is responsible to
41 * have allocated enough space. *cleaned and items may point to the same
46 ginVacuumPostingList(GinVacuumState
*gvs
, ItemPointerData
*items
, uint32 nitem
, ItemPointerData
**cleaned
)
52 * just scan over ItemPointer array
55 for (i
= 0; i
< nitem
; i
++)
57 if (gvs
->callback(items
+ i
, gvs
->callback_state
))
59 gvs
->result
->tuples_removed
+= 1;
62 *cleaned
= (ItemPointerData
*) palloc(sizeof(ItemPointerData
) * nitem
);
64 memcpy(*cleaned
, items
, sizeof(ItemPointerData
) * i
);
69 gvs
->result
->num_index_tuples
+= 1;
71 (*cleaned
)[j
] = items
[i
];
80 * fills WAL record for vacuum leaf page
83 xlogVacuumPage(Relation index
, Buffer buffer
)
85 Page page
= BufferGetPage(buffer
);
88 ginxlogVacuumPage data
;
93 Assert(GinPageIsLeaf(page
));
98 data
.node
= index
->rd_node
;
99 data
.blkno
= BufferGetBlockNumber(buffer
);
101 if (GinPageIsData(page
))
103 backup
= GinDataPageGetData(page
);
104 data
.nitem
= GinPageGetOpaque(page
)->maxoff
;
106 len
= MAXALIGN(sizeof(ItemPointerData
) * data
.nitem
);
113 ptr
= backup
= itups
;
114 for (i
= FirstOffsetNumber
; i
<= PageGetMaxOffsetNumber(page
); i
++)
116 IndexTuple itup
= (IndexTuple
) PageGetItem(page
, PageGetItemId(page
, i
));
118 memcpy(ptr
, itup
, IndexTupleSize(itup
));
119 ptr
+= MAXALIGN(IndexTupleSize(itup
));
122 data
.nitem
= PageGetMaxOffsetNumber(page
);
126 rdata
[0].buffer
= buffer
;
127 rdata
[0].buffer_std
= (GinPageIsData(page
)) ? FALSE
: TRUE
;
129 rdata
[0].data
= NULL
;
130 rdata
[0].next
= rdata
+ 1;
132 rdata
[1].buffer
= InvalidBuffer
;
133 rdata
[1].len
= sizeof(ginxlogVacuumPage
);
134 rdata
[1].data
= (char *) &data
;
138 rdata
[1].next
= NULL
;
142 rdata
[1].next
= rdata
+ 2;
144 rdata
[2].buffer
= InvalidBuffer
;
146 rdata
[2].data
= backup
;
147 rdata
[2].next
= NULL
;
150 recptr
= XLogInsert(RM_GIN_ID
, XLOG_GIN_VACUUM_PAGE
, rdata
);
151 PageSetLSN(page
, recptr
);
152 PageSetTLI(page
, ThisTimeLineID
);
156 ginVacuumPostingTreeLeaves(GinVacuumState
*gvs
, BlockNumber blkno
, bool isRoot
, Buffer
*rootBuffer
)
158 Buffer buffer
= ReadBufferWithStrategy(gvs
->index
, blkno
, gvs
->strategy
);
159 Page page
= BufferGetPage(buffer
);
160 bool hasVoidPage
= FALSE
;
163 * We should be sure that we don't concurrent with inserts, insert process
164 * never release root page until end (but it can unlock it and lock
165 * again). New scan can't start but previously started ones work
170 LockBufferForCleanup(buffer
);
172 LockBuffer(buffer
, GIN_EXCLUSIVE
);
174 Assert(GinPageIsData(page
));
176 if (GinPageIsLeaf(page
))
178 OffsetNumber newMaxOff
,
179 oldMaxOff
= GinPageGetOpaque(page
)->maxoff
;
180 ItemPointerData
*cleaned
= NULL
;
182 newMaxOff
= ginVacuumPostingList(gvs
,
183 (ItemPointer
) GinDataPageGetData(page
), oldMaxOff
, &cleaned
);
185 /* saves changes about deleted tuple ... */
186 if (oldMaxOff
!= newMaxOff
)
189 START_CRIT_SECTION();
192 memcpy(GinDataPageGetData(page
), cleaned
, sizeof(ItemPointerData
) * newMaxOff
);
194 GinPageGetOpaque(page
)->maxoff
= newMaxOff
;
196 MarkBufferDirty(buffer
);
197 xlogVacuumPage(gvs
->index
, buffer
);
201 /* if root is a leaf page, we don't desire further processing */
202 if (!isRoot
&& GinPageGetOpaque(page
)->maxoff
< FirstOffsetNumber
)
209 bool isChildHasVoid
= FALSE
;
211 for (i
= FirstOffsetNumber
; i
<= GinPageGetOpaque(page
)->maxoff
; i
++)
213 PostingItem
*pitem
= (PostingItem
*) GinDataPageGetItem(page
, i
);
215 if (ginVacuumPostingTreeLeaves(gvs
, PostingItemGetBlockNumber(pitem
), FALSE
, NULL
))
216 isChildHasVoid
= TRUE
;
224 * if we have root and theres void pages in tree, then we don't release
225 * lock to go further processing and guarantee that tree is unused
227 if (!(isRoot
&& hasVoidPage
))
229 UnlockReleaseBuffer(buffer
);
234 *rootBuffer
= buffer
;
241 ginDeletePage(GinVacuumState
*gvs
, BlockNumber deleteBlkno
, BlockNumber leftBlkno
,
242 BlockNumber parentBlkno
, OffsetNumber myoff
, bool isParentRoot
)
244 Buffer dBuffer
= ReadBufferWithStrategy(gvs
->index
, deleteBlkno
, gvs
->strategy
);
245 Buffer lBuffer
= (leftBlkno
== InvalidBlockNumber
) ?
246 InvalidBuffer
: ReadBufferWithStrategy(gvs
->index
, leftBlkno
, gvs
->strategy
);
247 Buffer pBuffer
= ReadBufferWithStrategy(gvs
->index
, parentBlkno
, gvs
->strategy
);
251 LockBuffer(dBuffer
, GIN_EXCLUSIVE
);
252 if (!isParentRoot
) /* parent is already locked by
253 * LockBufferForCleanup() */
254 LockBuffer(pBuffer
, GIN_EXCLUSIVE
);
255 if (leftBlkno
!= InvalidBlockNumber
)
256 LockBuffer(lBuffer
, GIN_EXCLUSIVE
);
258 START_CRIT_SECTION();
260 if (leftBlkno
!= InvalidBlockNumber
)
262 BlockNumber rightlink
;
264 page
= BufferGetPage(dBuffer
);
265 rightlink
= GinPageGetOpaque(page
)->rightlink
;
267 page
= BufferGetPage(lBuffer
);
268 GinPageGetOpaque(page
)->rightlink
= rightlink
;
271 parentPage
= BufferGetPage(pBuffer
);
272 #ifdef USE_ASSERT_CHECKING
275 PostingItem
*tod
= (PostingItem
*) GinDataPageGetItem(parentPage
, myoff
);
277 Assert(PostingItemGetBlockNumber(tod
) == deleteBlkno
);
280 PageDeletePostingItem(parentPage
, myoff
);
282 page
= BufferGetPage(dBuffer
);
285 * we shouldn't change rightlink field to save workability of running
288 GinPageGetOpaque(page
)->flags
= GIN_DELETED
;
290 MarkBufferDirty(pBuffer
);
291 if (leftBlkno
!= InvalidBlockNumber
)
292 MarkBufferDirty(lBuffer
);
293 MarkBufferDirty(dBuffer
);
295 if (!gvs
->index
->rd_istemp
)
298 XLogRecData rdata
[4];
299 ginxlogDeletePage data
;
302 data
.node
= gvs
->index
->rd_node
;
303 data
.blkno
= deleteBlkno
;
304 data
.parentBlkno
= parentBlkno
;
305 data
.parentOffset
= myoff
;
306 data
.leftBlkno
= leftBlkno
;
307 data
.rightLink
= GinPageGetOpaque(page
)->rightlink
;
309 rdata
[0].buffer
= dBuffer
;
310 rdata
[0].buffer_std
= FALSE
;
311 rdata
[0].data
= NULL
;
313 rdata
[0].next
= rdata
+ 1;
315 rdata
[1].buffer
= pBuffer
;
316 rdata
[1].buffer_std
= FALSE
;
317 rdata
[1].data
= NULL
;
319 rdata
[1].next
= rdata
+ 2;
321 if (leftBlkno
!= InvalidBlockNumber
)
323 rdata
[2].buffer
= lBuffer
;
324 rdata
[2].buffer_std
= FALSE
;
325 rdata
[2].data
= NULL
;
327 rdata
[2].next
= rdata
+ 3;
333 rdata
[n
].buffer
= InvalidBuffer
;
334 rdata
[n
].buffer_std
= FALSE
;
335 rdata
[n
].len
= sizeof(ginxlogDeletePage
);
336 rdata
[n
].data
= (char *) &data
;
337 rdata
[n
].next
= NULL
;
339 recptr
= XLogInsert(RM_GIN_ID
, XLOG_GIN_DELETE_PAGE
, rdata
);
340 PageSetLSN(page
, recptr
);
341 PageSetTLI(page
, ThisTimeLineID
);
342 PageSetLSN(parentPage
, recptr
);
343 PageSetTLI(parentPage
, ThisTimeLineID
);
344 if (leftBlkno
!= InvalidBlockNumber
)
346 page
= BufferGetPage(lBuffer
);
347 PageSetLSN(page
, recptr
);
348 PageSetTLI(page
, ThisTimeLineID
);
353 LockBuffer(pBuffer
, GIN_UNLOCK
);
354 ReleaseBuffer(pBuffer
);
356 if (leftBlkno
!= InvalidBlockNumber
)
357 UnlockReleaseBuffer(lBuffer
);
359 UnlockReleaseBuffer(dBuffer
);
363 gvs
->result
->pages_deleted
++;
366 typedef struct DataPageDeleteStack
368 struct DataPageDeleteStack
*child
;
369 struct DataPageDeleteStack
*parent
;
371 BlockNumber blkno
; /* current block number */
372 BlockNumber leftBlkno
; /* rightest non-deleted page on left */
374 } DataPageDeleteStack
;
377 * scans posting tree and deletes empty pages
380 ginScanToDelete(GinVacuumState
*gvs
, BlockNumber blkno
, bool isRoot
, DataPageDeleteStack
*parent
, OffsetNumber myoff
)
382 DataPageDeleteStack
*me
;
385 bool meDelete
= FALSE
;
395 me
= (DataPageDeleteStack
*) palloc0(sizeof(DataPageDeleteStack
));
398 me
->leftBlkno
= InvalidBlockNumber
;
404 buffer
= ReadBufferWithStrategy(gvs
->index
, blkno
, gvs
->strategy
);
405 page
= BufferGetPage(buffer
);
407 Assert(GinPageIsData(page
));
409 if (!GinPageIsLeaf(page
))
414 for (i
= FirstOffsetNumber
; i
<= GinPageGetOpaque(page
)->maxoff
; i
++)
416 PostingItem
*pitem
= (PostingItem
*) GinDataPageGetItem(page
, i
);
418 if (ginScanToDelete(gvs
, PostingItemGetBlockNumber(pitem
), FALSE
, me
, i
))
423 if (GinPageGetOpaque(page
)->maxoff
< FirstOffsetNumber
)
425 if (!(me
->leftBlkno
== InvalidBlockNumber
&& GinPageRightMost(page
)))
427 /* we never delete right most branch */
429 if (GinPageGetOpaque(page
)->maxoff
< FirstOffsetNumber
)
431 ginDeletePage(gvs
, blkno
, me
->leftBlkno
, me
->parent
->blkno
, myoff
, me
->parent
->isRoot
);
437 ReleaseBuffer(buffer
);
440 me
->leftBlkno
= blkno
;
446 ginVacuumPostingTree(GinVacuumState
*gvs
, BlockNumber rootBlkno
)
448 Buffer rootBuffer
= InvalidBuffer
;
449 DataPageDeleteStack root
,
453 if (ginVacuumPostingTreeLeaves(gvs
, rootBlkno
, TRUE
, &rootBuffer
) == FALSE
)
455 Assert(rootBuffer
== InvalidBuffer
);
459 memset(&root
, 0, sizeof(DataPageDeleteStack
));
460 root
.leftBlkno
= InvalidBlockNumber
;
463 vacuum_delay_point();
465 ginScanToDelete(gvs
, rootBlkno
, TRUE
, &root
, InvalidOffsetNumber
);
475 UnlockReleaseBuffer(rootBuffer
);
479 * returns modified page or NULL if page isn't modified.
480 * Function works with original page until first change is occurred,
481 * then page is copied into temporary one.
484 ginVacuumEntryPage(GinVacuumState
*gvs
, Buffer buffer
, BlockNumber
*roots
, uint32
*nroot
)
486 Page origpage
= BufferGetPage(buffer
),
489 maxoff
= PageGetMaxOffsetNumber(origpage
);
495 for (i
= FirstOffsetNumber
; i
<= maxoff
; i
++)
497 IndexTuple itup
= (IndexTuple
) PageGetItem(tmppage
, PageGetItemId(tmppage
, i
));
499 if (GinIsPostingTree(itup
))
502 * store posting tree's roots for further processing, we can't
503 * vacuum it just now due to risk of deadlocks with scans/inserts
505 roots
[*nroot
] = GinItemPointerGetBlockNumber(&itup
->t_tid
);
508 else if (GinGetNPosting(itup
) > 0)
511 * if we already create temporary page, we will make changes in
514 ItemPointerData
*cleaned
= (tmppage
== origpage
) ? NULL
: GinGetPosting(itup
);
515 uint32 newN
= ginVacuumPostingList(gvs
, GinGetPosting(itup
), GinGetNPosting(itup
), &cleaned
);
517 if (GinGetNPosting(itup
) != newN
)
523 * Some ItemPointers was deleted, so we should remake our
527 if (tmppage
== origpage
)
530 * On first difference we create temporary page in memory
531 * and copies content in to it.
533 tmppage
= GinPageGetCopyPage(origpage
);
537 Size pos
= ((char *) GinGetPosting(itup
)) - ((char *) origpage
);
539 memcpy(tmppage
+ pos
, cleaned
, sizeof(ItemPointerData
) * newN
);
544 /* set itup pointer to new page */
545 itup
= (IndexTuple
) PageGetItem(tmppage
, PageGetItemId(tmppage
, i
));
548 value
= gin_index_getattr(&gvs
->ginstate
, itup
);
549 attnum
= gintuple_get_attrnum(&gvs
->ginstate
, itup
);
550 itup
= GinFormTuple(&gvs
->ginstate
, attnum
, value
, GinGetPosting(itup
), newN
);
551 PageIndexTupleDelete(tmppage
, i
);
553 if (PageAddItem(tmppage
, (Item
) itup
, IndexTupleSize(itup
), i
, false, false) != i
)
554 elog(ERROR
, "failed to add item to index page in \"%s\"",
555 RelationGetRelationName(gvs
->index
));
562 return (tmppage
== origpage
) ? NULL
: tmppage
;
566 ginbulkdelete(PG_FUNCTION_ARGS
)
568 IndexVacuumInfo
*info
= (IndexVacuumInfo
*) PG_GETARG_POINTER(0);
569 IndexBulkDeleteResult
*stats
= (IndexBulkDeleteResult
*) PG_GETARG_POINTER(1);
570 IndexBulkDeleteCallback callback
= (IndexBulkDeleteCallback
) PG_GETARG_POINTER(2);
571 void *callback_state
= (void *) PG_GETARG_POINTER(3);
572 Relation index
= info
->index
;
573 BlockNumber blkno
= GIN_ROOT_BLKNO
;
576 BlockNumber rootOfPostingTree
[BLCKSZ
/ (sizeof(IndexTupleData
) + sizeof(ItemId
))];
579 /* first time through? */
581 stats
= (IndexBulkDeleteResult
*) palloc0(sizeof(IndexBulkDeleteResult
));
582 /* we'll re-count the tuples each time */
583 stats
->num_index_tuples
= 0;
587 gvs
.callback
= callback
;
588 gvs
.callback_state
= callback_state
;
589 gvs
.strategy
= info
->strategy
;
590 initGinState(&gvs
.ginstate
, index
);
592 buffer
= ReadBufferWithStrategy(index
, blkno
, info
->strategy
);
597 Page page
= BufferGetPage(buffer
);
600 LockBuffer(buffer
, GIN_SHARE
);
602 Assert(!GinPageIsData(page
));
604 if (GinPageIsLeaf(page
))
606 LockBuffer(buffer
, GIN_UNLOCK
);
607 LockBuffer(buffer
, GIN_EXCLUSIVE
);
609 if (blkno
== GIN_ROOT_BLKNO
&& !GinPageIsLeaf(page
))
611 LockBuffer(buffer
, GIN_UNLOCK
);
612 continue; /* check it one more */
617 Assert(PageGetMaxOffsetNumber(page
) >= FirstOffsetNumber
);
619 itup
= (IndexTuple
) PageGetItem(page
, PageGetItemId(page
, FirstOffsetNumber
));
620 blkno
= GinItemPointerGetBlockNumber(&(itup
)->t_tid
);
621 Assert(blkno
!= InvalidBlockNumber
);
623 UnlockReleaseBuffer(buffer
);
624 buffer
= ReadBufferWithStrategy(index
, blkno
, info
->strategy
);
627 /* right now we found leftmost page in entry's BTree */
631 Page page
= BufferGetPage(buffer
);
635 Assert(!GinPageIsData(page
));
637 resPage
= ginVacuumEntryPage(&gvs
, buffer
, rootOfPostingTree
, &nRoot
);
639 blkno
= GinPageGetOpaque(page
)->rightlink
;
643 START_CRIT_SECTION();
644 PageRestoreTempPage(resPage
, page
);
645 MarkBufferDirty(buffer
);
646 xlogVacuumPage(gvs
.index
, buffer
);
647 UnlockReleaseBuffer(buffer
);
652 UnlockReleaseBuffer(buffer
);
655 vacuum_delay_point();
657 for (i
= 0; i
< nRoot
; i
++)
659 ginVacuumPostingTree(&gvs
, rootOfPostingTree
[i
]);
660 vacuum_delay_point();
663 if (blkno
== InvalidBlockNumber
) /* rightmost page */
666 buffer
= ReadBufferWithStrategy(index
, blkno
, info
->strategy
);
667 LockBuffer(buffer
, GIN_EXCLUSIVE
);
670 PG_RETURN_POINTER(gvs
.result
);
674 ginvacuumcleanup(PG_FUNCTION_ARGS
)
676 IndexVacuumInfo
*info
= (IndexVacuumInfo
*) PG_GETARG_POINTER(0);
677 IndexBulkDeleteResult
*stats
= (IndexBulkDeleteResult
*) PG_GETARG_POINTER(1);
678 Relation index
= info
->index
;
682 BlockNumber totFreePages
;
683 BlockNumber lastBlock
= GIN_ROOT_BLKNO
,
684 lastFilledBlock
= GIN_ROOT_BLKNO
;
686 /* Set up all-zero stats if ginbulkdelete wasn't called */
688 stats
= (IndexBulkDeleteResult
*) palloc0(sizeof(IndexBulkDeleteResult
));
691 * XXX we always report the heap tuple count as the number of index
692 * entries. This is bogus if the index is partial, but it's real hard to
693 * tell how many distinct heap entries are referenced by a GIN index.
695 stats
->num_index_tuples
= info
->num_heap_tuples
;
698 * If vacuum full, we already have exclusive lock on the index. Otherwise,
699 * need lock unless it's local to this backend.
701 if (info
->vacuum_full
)
704 needLock
= !RELATION_IS_LOCAL(index
);
707 LockRelationForExtension(index
, ExclusiveLock
);
708 npages
= RelationGetNumberOfBlocks(index
);
710 UnlockRelationForExtension(index
, ExclusiveLock
);
714 for (blkno
= GIN_ROOT_BLKNO
+ 1; blkno
< npages
; blkno
++)
719 vacuum_delay_point();
721 buffer
= ReadBufferWithStrategy(index
, blkno
, info
->strategy
);
722 LockBuffer(buffer
, GIN_SHARE
);
723 page
= (Page
) BufferGetPage(buffer
);
725 if (GinPageIsDeleted(page
))
727 RecordFreeIndexPage(index
, blkno
);
731 lastFilledBlock
= blkno
;
733 UnlockReleaseBuffer(buffer
);
735 lastBlock
= npages
- 1;
737 if (info
->vacuum_full
&& lastBlock
> lastFilledBlock
)
739 /* try to truncate index */
740 FreeSpaceMapTruncateRel(index
, lastFilledBlock
+ 1);
741 RelationTruncate(index
, lastFilledBlock
+ 1);
743 stats
->pages_removed
= lastBlock
- lastFilledBlock
;
744 totFreePages
= totFreePages
- stats
->pages_removed
;
747 /* Finally, vacuum the FSM */
748 IndexFreeSpaceMapVacuum(info
->index
);
750 stats
->pages_free
= totFreePages
;
753 LockRelationForExtension(index
, ExclusiveLock
);
754 stats
->num_pages
= RelationGetNumberOfBlocks(index
);
756 UnlockRelationForExtension(index
, ExclusiveLock
);
758 PG_RETURN_POINTER(stats
);