1 /*-------------------------------------------------------------------------
4 * fetch tuples from a GIN scan.
7 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
12 *-------------------------------------------------------------------------
17 #include "access/gin.h"
18 #include "access/relscan.h"
19 #include "catalog/index.h"
20 #include "miscadmin.h"
21 #include "storage/bufmgr.h"
22 #include "utils/datum.h"
23 #include "utils/memutils.h"
27 * Tries to refind previously taken ItemPointer on page.
30 findItemInPage(Page page
, ItemPointer item
, OffsetNumber
*off
)
32 OffsetNumber maxoff
= GinPageGetOpaque(page
)->maxoff
;
35 if (GinPageGetOpaque(page
)->flags
& GIN_DELETED
)
36 /* page was deleted by concurrent vacuum */
40 * scan page to find equal or first greater value
43 for (*off
= FirstOffsetNumber
; *off
<= maxoff
; (*off
)++)
45 res
= compareItemPointers(item
, (ItemPointer
) GinDataPageGetItem(page
, *off
));
55 * Goes to the next page if current offset is outside of bounds
58 moveRightIfItNeeded( GinBtreeData
*btree
, GinBtreeStack
*stack
)
60 Page page
= BufferGetPage(stack
->buffer
);
62 if ( stack
->off
> PageGetMaxOffsetNumber(page
) )
65 * We scanned the whole page, so we should take right page
67 stack
->blkno
= GinPageGetOpaque(page
)->rightlink
;
69 if ( GinPageRightMost(page
) )
70 return false; /* no more pages */
72 LockBuffer(stack
->buffer
, GIN_UNLOCK
);
73 stack
->buffer
= ReleaseAndReadBuffer(stack
->buffer
, btree
->index
, stack
->blkno
);
74 LockBuffer(stack
->buffer
, GIN_SHARE
);
75 stack
->off
= FirstOffsetNumber
;
82 * Does fullscan of posting tree and saves ItemPointers
83 * in scanEntry->partialMatch TIDBitmap
86 scanForItems( Relation index
, GinScanEntry scanEntry
, BlockNumber rootPostingTree
)
88 GinPostingTreeScan
*gdi
;
93 gdi
= prepareScanPostingTree(index
, rootPostingTree
, TRUE
);
95 buffer
= scanBeginPostingTree(gdi
);
96 IncrBufferRefCount(buffer
); /* prevent unpin in freeGinBtreeStack */
98 freeGinBtreeStack(gdi
->stack
);
102 * Goes through all leaves
106 page
= BufferGetPage(buffer
);
108 if ((GinPageGetOpaque(page
)->flags
& GIN_DELETED
) == 0 && GinPageGetOpaque(page
)->maxoff
>= FirstOffsetNumber
)
110 tbm_add_tuples( scanEntry
->partialMatch
,
111 (ItemPointer
)GinDataPageGetItem(page
, FirstOffsetNumber
),
112 GinPageGetOpaque(page
)->maxoff
, false);
113 scanEntry
->predictNumberResult
+= GinPageGetOpaque(page
)->maxoff
;
116 blkno
= GinPageGetOpaque(page
)->rightlink
;
117 if ( GinPageRightMost(page
) )
119 UnlockReleaseBuffer(buffer
);
120 return; /* no more pages */
123 LockBuffer(buffer
, GIN_UNLOCK
);
124 buffer
= ReleaseAndReadBuffer(buffer
, index
, blkno
);
125 LockBuffer(buffer
, GIN_SHARE
);
130 * Collects all ItemPointer into the TIDBitmap struct
131 * for entries partially matched to search entry.
133 * Returns true if done, false if it's needed to restart scan from scratch
136 computePartialMatchList( GinBtreeData
*btree
, GinBtreeStack
*stack
, GinScanEntry scanEntry
)
143 scanEntry
->partialMatch
= tbm_create( work_mem
* 1024L );
148 * stack->off points to the interested entry, buffer is already locked
150 if ( moveRightIfItNeeded(btree
, stack
) == false )
153 page
= BufferGetPage(stack
->buffer
);
154 itup
= (IndexTuple
) PageGetItem(page
, PageGetItemId(page
, stack
->off
));
157 * If tuple stores another attribute then stop scan
159 if ( gintuple_get_attrnum( btree
->ginstate
, itup
) != scanEntry
->attnum
)
162 idatum
= gin_index_getattr( btree
->ginstate
, itup
);
166 * Check of partial match.
167 * case cmp == 0 => match
168 * case cmp > 0 => not match and finish scan
169 * case cmp < 0 => not match and continue scan
172 cmp
= DatumGetInt32(FunctionCall3(&btree
->ginstate
->comparePartialFn
[scanEntry
->attnum
-1],
175 UInt16GetDatum(scanEntry
->strategy
)));
185 if ( GinIsPostingTree(itup
) )
187 BlockNumber rootPostingTree
= GinGetPostingTree(itup
);
189 savedDatum
= datumCopy (
191 btree
->ginstate
->origTupdesc
->attrs
[scanEntry
->attnum
-1]->attbyval
,
192 btree
->ginstate
->origTupdesc
->attrs
[scanEntry
->attnum
-1]->attlen
195 * We should unlock current page (but not unpin) during
196 * tree scan to prevent deadlock with vacuum processes.
198 * We save current entry value (savedDatum) to be able to refind
199 * our tuple after re-locking
201 LockBuffer(stack
->buffer
, GIN_UNLOCK
);
202 scanForItems( btree
->index
, scanEntry
, rootPostingTree
);
205 * We lock again the entry page and while it was unlocked
206 * insert might occured, so we need to refind our position
208 LockBuffer(stack
->buffer
, GIN_SHARE
);
209 page
= BufferGetPage(stack
->buffer
);
210 if ( !GinPageIsLeaf(page
) )
213 * Root page becomes non-leaf while we unlock it. We
214 * will start again, this situation doesn't cause
215 * often - root can became a non-leaf only one per
224 if ( moveRightIfItNeeded(btree
, stack
) == false )
225 elog(ERROR
, "lost saved point in index"); /* must not happen !!! */
227 page
= BufferGetPage(stack
->buffer
);
228 itup
= (IndexTuple
) PageGetItem(page
, PageGetItemId(page
, stack
->off
));
229 newDatum
= gin_index_getattr( btree
->ginstate
, itup
);
231 if ( gintuple_get_attrnum( btree
->ginstate
, itup
) != scanEntry
->attnum
)
232 elog(ERROR
, "lost saved point in index"); /* must not happen !!! */
234 if ( compareEntries(btree
->ginstate
, scanEntry
->attnum
, newDatum
, savedDatum
) == 0 )
237 if ( btree
->ginstate
->origTupdesc
->attrs
[scanEntry
->attnum
-1]->attbyval
== false )
238 pfree( DatumGetPointer(savedDatum
) );
247 tbm_add_tuples( scanEntry
->partialMatch
, GinGetPosting(itup
), GinGetNPosting(itup
), false);
248 scanEntry
->predictNumberResult
+= GinGetNPosting(itup
);
252 * Ok, we save ItemPointers, go to the next entry
261 * Start* functions setup begining state of searches: finds correct buffer and pins it.
264 startScanEntry(Relation index
, GinState
*ginstate
, GinScanEntry entry
)
266 GinBtreeData btreeEntry
;
267 GinBtreeStack
*stackEntry
;
269 bool needUnlock
= TRUE
;
271 if (entry
->master
!= NULL
)
273 entry
->isFinished
= entry
->master
->isFinished
;
278 * we should find entry, and begin scan of posting tree
279 * or just store posting list in memory
282 prepareEntryScan(&btreeEntry
, index
, entry
->attnum
, entry
->entry
, ginstate
);
283 btreeEntry
.searchMode
= TRUE
;
284 stackEntry
= ginFindLeafPage(&btreeEntry
, NULL
);
285 page
= BufferGetPage(stackEntry
->buffer
);
287 entry
->isFinished
= TRUE
;
288 entry
->buffer
= InvalidBuffer
;
289 entry
->offset
= InvalidOffsetNumber
;
292 entry
->partialMatch
= NULL
;
293 entry
->partialMatchResult
= NULL
;
294 entry
->reduceResult
= FALSE
;
295 entry
->predictNumberResult
= 0;
297 if ( entry
->isPartialMatch
)
300 * btreeEntry.findItem points to the first equal or greater value
301 * than needed. So we will scan further and collect all
304 btreeEntry
.findItem(&btreeEntry
, stackEntry
);
305 if ( computePartialMatchList( &btreeEntry
, stackEntry
, entry
) == false )
308 * GIN tree was seriously restructured, so we will
309 * cleanup all found data and rescan. See comments near
310 * 'return false' in computePartialMatchList()
312 if ( entry
->partialMatch
)
314 tbm_free( entry
->partialMatch
);
315 entry
->partialMatch
= NULL
;
317 LockBuffer(stackEntry
->buffer
, GIN_UNLOCK
);
318 freeGinBtreeStack(stackEntry
);
320 startScanEntry(index
, ginstate
, entry
);
324 if ( entry
->partialMatch
&& !tbm_is_empty(entry
->partialMatch
) )
326 tbm_begin_iterate(entry
->partialMatch
);
327 entry
->isFinished
= FALSE
;
330 else if (btreeEntry
.findItem(&btreeEntry
, stackEntry
))
332 IndexTuple itup
= (IndexTuple
) PageGetItem(page
, PageGetItemId(page
, stackEntry
->off
));
334 if (GinIsPostingTree(itup
))
336 BlockNumber rootPostingTree
= GinGetPostingTree(itup
);
337 GinPostingTreeScan
*gdi
;
341 * We should unlock entry page before make deal with
342 * posting tree to prevent deadlocks with vacuum processes.
343 * Because entry is never deleted from page and posting tree is
344 * never reduced to the posting list, we can unlock page after
345 * getting BlockNumber of root of posting tree.
347 LockBuffer(stackEntry
->buffer
, GIN_UNLOCK
);
349 gdi
= prepareScanPostingTree(index
, rootPostingTree
, TRUE
);
351 entry
->buffer
= scanBeginPostingTree(gdi
);
353 * We keep buffer pinned because we need to prevent deletition
354 * page during scan. See GIN's vacuum implementation. RefCount
355 * is increased to keep buffer pinned after freeGinBtreeStack() call.
357 IncrBufferRefCount(entry
->buffer
);
359 page
= BufferGetPage(entry
->buffer
);
360 entry
->predictNumberResult
= gdi
->stack
->predictNumber
* GinPageGetOpaque(page
)->maxoff
;
363 * Keep page content in memory to prevent durable page locking
365 entry
->list
= (ItemPointerData
*) palloc( BLCKSZ
);
366 entry
->nlist
= GinPageGetOpaque(page
)->maxoff
;
367 memcpy( entry
->list
, GinDataPageGetItem(page
, FirstOffsetNumber
),
368 GinPageGetOpaque(page
)->maxoff
* sizeof(ItemPointerData
) );
370 LockBuffer(entry
->buffer
, GIN_UNLOCK
);
371 freeGinBtreeStack(gdi
->stack
);
373 entry
->isFinished
= FALSE
;
375 else if (GinGetNPosting(itup
) > 0)
377 entry
->nlist
= GinGetNPosting(itup
);
378 entry
->list
= (ItemPointerData
*) palloc(sizeof(ItemPointerData
) * entry
->nlist
);
379 memcpy(entry
->list
, GinGetPosting(itup
), sizeof(ItemPointerData
) * entry
->nlist
);
380 entry
->isFinished
= FALSE
;
385 LockBuffer(stackEntry
->buffer
, GIN_UNLOCK
);
386 freeGinBtreeStack(stackEntry
);
390 startScanKey(Relation index
, GinState
*ginstate
, GinScanKey key
)
397 for (i
= 0; i
< key
->nentries
; i
++)
398 startScanEntry(index
, ginstate
, key
->scanEntry
+ i
);
400 memset(key
->entryRes
, TRUE
, sizeof(bool) * key
->nentries
);
401 key
->isFinished
= FALSE
;
402 key
->firstCall
= FALSE
;
404 if (GinFuzzySearchLimit
> 0)
407 * If all of keys more than threshold we will try to reduce
408 * result, we hope (and only hope, for intersection operation of
409 * array our supposition isn't true), that total result will not
410 * more than minimal predictNumberResult.
413 for (i
= 0; i
< key
->nentries
; i
++)
414 if (key
->scanEntry
[i
].predictNumberResult
<= key
->nentries
* GinFuzzySearchLimit
)
417 for (i
= 0; i
< key
->nentries
; i
++)
418 if (key
->scanEntry
[i
].predictNumberResult
> key
->nentries
* GinFuzzySearchLimit
)
420 key
->scanEntry
[i
].predictNumberResult
/= key
->nentries
;
421 key
->scanEntry
[i
].reduceResult
= TRUE
;
427 startScan(IndexScanDesc scan
)
430 GinScanOpaque so
= (GinScanOpaque
) scan
->opaque
;
432 for (i
= 0; i
< so
->nkeys
; i
++)
433 startScanKey(scan
->indexRelation
, &so
->ginstate
, so
->keys
+ i
);
437 * Gets next ItemPointer from PostingTree. Note, that we copy
438 * page into GinScanEntry->list array and unlock page, but keep it pinned
439 * to prevent interference with vacuum
442 entryGetNextItem(Relation index
, GinScanEntry entry
)
451 if (entry
->offset
<= entry
->nlist
)
453 entry
->curItem
= entry
->list
[entry
->offset
- 1];
457 LockBuffer(entry
->buffer
, GIN_SHARE
);
458 page
= BufferGetPage(entry
->buffer
);
462 * It's needed to go by right link. During that we should refind
463 * first ItemPointer greater that stored
466 blkno
= GinPageGetOpaque(page
)->rightlink
;
468 LockBuffer(entry
->buffer
, GIN_UNLOCK
);
469 if (blkno
== InvalidBlockNumber
)
471 ReleaseBuffer(entry
->buffer
);
472 ItemPointerSet(&entry
->curItem
, InvalidBlockNumber
, InvalidOffsetNumber
);
473 entry
->buffer
= InvalidBuffer
;
474 entry
->isFinished
= TRUE
;
478 entry
->buffer
= ReleaseAndReadBuffer(entry
->buffer
, index
, blkno
);
479 LockBuffer(entry
->buffer
, GIN_SHARE
);
480 page
= BufferGetPage(entry
->buffer
);
482 entry
->offset
= InvalidOffsetNumber
;
483 if (!ItemPointerIsValid(&entry
->curItem
) || findItemInPage(page
, &entry
->curItem
, &entry
->offset
))
486 * Found position equal to or greater than stored
488 entry
->nlist
= GinPageGetOpaque(page
)->maxoff
;
489 memcpy( entry
->list
, GinDataPageGetItem(page
, FirstOffsetNumber
),
490 GinPageGetOpaque(page
)->maxoff
* sizeof(ItemPointerData
) );
492 LockBuffer(entry
->buffer
, GIN_UNLOCK
);
494 if ( !ItemPointerIsValid(&entry
->curItem
) ||
495 compareItemPointers( &entry
->curItem
, entry
->list
+ entry
->offset
- 1 ) == 0 )
498 * First pages are deleted or empty, or we found exact position,
499 * so break inner loop and continue outer one.
506 * Find greater than entry->curItem position, store it.
508 entry
->curItem
= entry
->list
[entry
->offset
- 1];
516 #define gin_rand() (((double) random()) / ((double) MAX_RANDOM_VALUE))
517 #define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) )
520 * Sets entry->curItem to new found heap item pointer for one
521 * entry of one scan key
524 entryGetItem(Relation index
, GinScanEntry entry
)
528 entry
->isFinished
= entry
->master
->isFinished
;
529 entry
->curItem
= entry
->master
->curItem
;
531 else if ( entry
->partialMatch
)
535 if ( entry
->partialMatchResult
== NULL
|| entry
->offset
>= entry
->partialMatchResult
->ntuples
)
537 entry
->partialMatchResult
= tbm_iterate( entry
->partialMatch
);
539 if ( entry
->partialMatchResult
== NULL
)
541 ItemPointerSet(&entry
->curItem
, InvalidBlockNumber
, InvalidOffsetNumber
);
542 entry
->isFinished
= TRUE
;
545 else if ( entry
->partialMatchResult
->ntuples
< 0 )
547 /* bitmap became lossy */
549 (errcode(ERRCODE_OUT_OF_MEMORY
),
550 errmsg("not enough memory to store result of partial match operator" ),
551 errhint("Increase the \"work_mem\" parameter.")));
556 ItemPointerSet(&entry
->curItem
,
557 entry
->partialMatchResult
->blockno
,
558 entry
->partialMatchResult
->offsets
[ entry
->offset
]);
561 } while (entry
->isFinished
== FALSE
&& entry
->reduceResult
== TRUE
&& dropItem(entry
));
563 else if (!BufferIsValid(entry
->buffer
))
566 if (entry
->offset
<= entry
->nlist
)
567 entry
->curItem
= entry
->list
[entry
->offset
- 1];
570 ItemPointerSet(&entry
->curItem
, InvalidBlockNumber
, InvalidOffsetNumber
);
571 entry
->isFinished
= TRUE
;
578 entryGetNextItem(index
, entry
);
579 } while (entry
->isFinished
== FALSE
&& entry
->reduceResult
== TRUE
&& dropItem(entry
));
582 return entry
->isFinished
;
586 * restart from saved position. Actually it's needed only for
587 * partial match. function is called only by ginrestpos()
590 ginrestartentry(GinScanEntry entry
)
592 ItemPointerData stopItem
= entry
->curItem
;
593 bool savedReduceResult
;
595 if ( entry
->master
|| entry
->partialMatch
== NULL
)
596 return; /* entry is slave or not a partial match type*/
598 if ( entry
->isFinished
)
599 return; /* entry was finished before ginmarkpos() call */
601 if ( ItemPointerGetBlockNumber(&stopItem
) == InvalidBlockNumber
)
602 return; /* entry wasn't began before ginmarkpos() call */
607 tbm_begin_iterate( entry
->partialMatch
);
608 entry
->partialMatchResult
= NULL
;
612 * Temporary reset reduceResult flag to guarantee refinding
615 savedReduceResult
= entry
->reduceResult
;
616 entry
->reduceResult
= FALSE
;
621 * We can use null instead of index because
622 * partial match doesn't use it
624 if ( entryGetItem( NULL
, entry
) == false )
625 elog(ERROR
, "cannot refind scan position"); /* must not be here! */
626 } while( compareItemPointers( &stopItem
, &entry
->curItem
) != 0 );
628 Assert( entry
->isFinished
== FALSE
);
630 entry
->reduceResult
= savedReduceResult
;
634 * Sets key->curItem to new found heap item pointer for one scan key
635 * Returns isFinished, ie TRUE means we did NOT get a new item pointer!
636 * Also, *keyrecheck is set true if recheck is needed for this scan key.
639 keyGetItem(Relation index
, GinState
*ginstate
, MemoryContext tempCtx
,
640 GinScanKey key
, bool *keyrecheck
)
645 MemoryContext oldCtx
;
653 * move forward from previously value and set new curItem, which is
654 * minimal from entries->curItems
656 ItemPointerSetMax(&key
->curItem
);
657 for (i
= 0; i
< key
->nentries
; i
++)
659 entry
= key
->scanEntry
+ i
;
661 if (key
->entryRes
[i
])
663 if (entry
->isFinished
== FALSE
&& entryGetItem(index
, entry
) == FALSE
)
665 if (compareItemPointers(&entry
->curItem
, &key
->curItem
) < 0)
666 key
->curItem
= entry
->curItem
;
669 key
->entryRes
[i
] = FALSE
;
671 else if (entry
->isFinished
== FALSE
)
673 if (compareItemPointers(&entry
->curItem
, &key
->curItem
) < 0)
674 key
->curItem
= entry
->curItem
;
678 if (ItemPointerIsMax(&key
->curItem
))
680 /* all entries are finished */
681 key
->isFinished
= TRUE
;
686 * if key->nentries == 1 then the consistentFn should always succeed,
687 * but we must call it anyway to find out the recheck status.
690 /* setting up array for consistentFn */
691 for (i
= 0; i
< key
->nentries
; i
++)
693 entry
= key
->scanEntry
+ i
;
695 if (entry
->isFinished
== FALSE
&&
696 compareItemPointers(&entry
->curItem
, &key
->curItem
) == 0)
697 key
->entryRes
[i
] = TRUE
;
699 key
->entryRes
[i
] = FALSE
;
703 * Initialize *keyrecheck in case the consistentFn doesn't know it
704 * should set it. The safe assumption in that case is to force
709 oldCtx
= MemoryContextSwitchTo(tempCtx
);
710 res
= DatumGetBool(FunctionCall4(&ginstate
->consistentFn
[key
->attnum
-1],
711 PointerGetDatum(key
->entryRes
),
712 UInt16GetDatum(key
->strategy
),
714 PointerGetDatum(keyrecheck
)));
715 MemoryContextSwitchTo(oldCtx
);
716 MemoryContextReset(tempCtx
);
723 * Get heap item pointer from scan
724 * returns true if found
727 scanGetItem(IndexScanDesc scan
, ItemPointerData
*item
, bool *recheck
)
729 GinScanOpaque so
= (GinScanOpaque
) scan
->opaque
;
734 * We return recheck = true if any of the keyGetItem calls return
735 * keyrecheck = true. Note that because the second loop might advance
736 * some keys, this could theoretically be too conservative. In practice
737 * though, we expect that a consistentFn's recheck result will depend
738 * only on the operator and the query, so for any one key it should
739 * stay the same regardless of advancing to new items. So it's not
740 * worth working harder.
744 ItemPointerSetMin(item
);
745 for (i
= 0; i
< so
->nkeys
; i
++)
747 GinScanKey key
= so
->keys
+ i
;
749 if (keyGetItem(scan
->indexRelation
, &so
->ginstate
, so
->tempCtx
,
751 return FALSE
; /* finished one of keys */
752 if (compareItemPointers(item
, &key
->curItem
) < 0)
753 *item
= key
->curItem
;
754 *recheck
|= keyrecheck
;
757 for (i
= 1; i
<= so
->nkeys
; i
++)
759 GinScanKey key
= so
->keys
+ i
- 1;
763 int cmp
= compareItemPointers(item
, &key
->curItem
);
769 if (keyGetItem(scan
->indexRelation
, &so
->ginstate
, so
->tempCtx
,
771 return FALSE
; /* finished one of keys */
772 *recheck
|= keyrecheck
;
775 { /* returns to begin */
776 *item
= key
->curItem
;
786 #define GinIsNewKey(s) ( ((GinScanOpaque) scan->opaque)->keys == NULL )
787 #define GinIsVoidRes(s) ( ((GinScanOpaque) scan->opaque)->isVoidRes == true )
790 gingetbitmap(PG_FUNCTION_ARGS
)
792 IndexScanDesc scan
= (IndexScanDesc
) PG_GETARG_POINTER(0);
793 TIDBitmap
*tbm
= (TIDBitmap
*) PG_GETARG_POINTER(1);
796 if (GinIsNewKey(scan
))
799 if (GinIsVoidRes(scan
))
807 ItemPointerData iptr
;
810 CHECK_FOR_INTERRUPTS();
812 if (!scanGetItem(scan
, &iptr
, &recheck
))
815 tbm_add_tuples(tbm
, &iptr
, 1, recheck
);
819 PG_RETURN_INT64(ntids
);
823 gingettuple(PG_FUNCTION_ARGS
)
825 IndexScanDesc scan
= (IndexScanDesc
) PG_GETARG_POINTER(0);
826 ScanDirection dir
= (ScanDirection
) PG_GETARG_INT32(1);
829 if (dir
!= ForwardScanDirection
)
830 elog(ERROR
, "GIN doesn't support other scan directions than forward");
832 if (GinIsNewKey(scan
))
835 if (GinIsVoidRes(scan
))
836 PG_RETURN_BOOL(false);
839 res
= scanGetItem(scan
, &scan
->xs_ctup
.t_self
, &scan
->xs_recheck
);