4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
22 /* Copyright (c) 1988 AT&T */
23 /* All Rights Reserved */
27 * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
28 * Use is subject to license terms.
31 #pragma ident "%Z%%M% %I% %E% SMI"
35 #include <sys/types.h>
36 #include <sys/sysmacros.h>
37 #include <sys/byteorder.h>
46 #define DEBUG 0 /* debugging code and realloc messages */
47 #define BLOCKSIZE 2 * BUFSIZ /* logical block size */
48 #define LINEMAX 1000 /* sorted posting line max size */
49 #define POSTINC 10000 /* posting buffer size increment */
50 #define SEP ' ' /* sorted posting field separator */
51 #define SETINC 100 /* posting set size increment */
52 #define STATS 0 /* print statistics */
53 #define SUPERINC 10000 /* super index size increment */
54 #define TERMMAX 512 /* term max size */
55 #define VERSION 1 /* inverted index format version */
56 #define ZIPFSIZE 200 /* zipf curve size */
57 #define FREAD "r" /* fopen for reading */
58 #define FREADP "r+" /* fopen for update */
59 #define FWRITE "w" /* fopen truncate or create for writing */
60 #define FWRITEP "w+" /* fopen truncate or create for update */
62 extern char *argv0
; /* command name (must be set in main function) */
67 int showzipf
; /* show postings per term distribution */
70 static POSTING
*item
, *enditem
, *item1
= NULL
, *item2
= NULL
;
71 static unsigned setsize1
, setsize2
;
72 static long numitems
, totterm
, zerolong
;
73 static char *indexfile
, *postingfile
;
74 static FILE *outfile
, *fpost
;
75 static unsigned supersize
= SUPERINC
, supintsize
;
76 static int numpost
, numlogblk
, amtused
, nextpost
,
77 lastinblk
, numinvitems
;
78 static POSTING
*POST
, *postptr
;
79 static unsigned long *SUPINT
, *supint
, nextsupfing
;
80 static char *SUPFING
, *supfing
;
81 static char thisterm
[TERMMAX
];
83 long invblk
[BLOCKSIZE
/ sizeof (long)];
84 char chrblk
[BLOCKSIZE
];
92 static int zipf
[ZIPFSIZE
+ 1];
95 static void invcannotalloc(size_t n
);
96 static void invcannotopen(char *file
);
97 static void invcannotwrite(char *file
);
98 static int invnewterm(void);
99 static int boolready(void);
102 invmake(char *invname
, char *invpost
, FILE *infile
)
108 unsigned postsize
= POSTINC
* sizeof (POSTING
);
109 unsigned long *intptr
;
116 unsigned maxtermlen
= 0;
119 if ((outfile
= vpfopen(invname
, FWRITEP
)) == NULL
) {
120 invcannotopen(invname
);
124 (void) fseek(outfile
, (long)BUFSIZ
, 0);
127 if ((fpost
= vpfopen(invpost
, FWRITE
)) == NULL
) {
128 invcannotopen(invpost
);
131 postingfile
= invpost
;
133 /* get space for the postings list */
134 if ((POST
= (POSTING
*)malloc(postsize
)) == NULL
) {
135 invcannotalloc(postsize
);
139 /* get space for the superfinger (superindex) */
140 if ((SUPFING
= malloc(supersize
)) == NULL
) {
141 invcannotalloc(supersize
);
145 supintsize
= supersize
/ 40;
146 /* also for the superfinger index */
147 if ((SUPINT
= malloc(supintsize
* sizeof (long))) == NULL
) {
148 invcannotalloc(supintsize
* sizeof (long));
152 supint
++; /* leave first term open for a count */
153 /* initialize using an empty term */
154 (void) strcpy(thisterm
, "");
166 * set up as though a block had come and gone, i.e., set up for
169 amtused
= 16; /* leave no space - init 3 words + one for luck */
172 lastinblk
= BLOCKSIZE
;
174 /* now loop as long as more to read (till eof) */
175 while (fgets(line
, LINEMAX
, infile
) != NULL
) {
179 s
= (unsigned char *) strchr(line
, SEP
);
180 if (s
== NULL
) /* where did this line come from ??? */
181 continue; /* workaround: just skip it */
184 if ((i
= strlen(line
)) > maxtermlen
) {
189 (void) printf("%ld: %s ", totpost
, line
);
190 (void) fflush(stdout
);
192 if (strcmp(thisterm
, line
) == 0) {
193 if (postptr
+ 10 > POST
+ postsize
/ sizeof (POSTING
)) {
195 postsize
+= POSTINC
* sizeof (POSTING
);
196 if ((POST
= realloc(POST
, postsize
)) == NULL
) {
197 invcannotalloc(postsize
);
202 (void) printf("reallocated post space to %u, "
203 "totpost=%ld\n", postsize
, totpost
);
208 /* have a new term */
212 (void) strcpy(thisterm
, line
);
217 /* get the new posting */
221 num
= BASE
* num
+ *++s
- '!';
222 } while (++i
< PRECISION
);
223 posting
.lineoffset
= num
;
224 while (++fileindex
< nsrcoffset
&& num
> srcoffset
[fileindex
]) {
227 posting
.fileindex
= --fileindex
;
232 while (*++s
!= '\n') {
233 num
= BASE
* num
+ *s
- '!';
235 posting
.fcnoffset
= num
;
237 posting
.fcnoffset
= 0;
239 *postptr
++ = posting
;
241 (void) printf("%ld %ld %ld %ld\n", posting
.fileindex
,
242 posting
.fcnoffset
, posting
.lineoffset
, posting
.type
);
243 (void) fflush(stdout
);
249 /* now clean up final block */
250 logicalblk
.invblk
[0] = numinvitems
;
251 /* loops pointer around to start */
252 logicalblk
.invblk
[1] = 0;
253 logicalblk
.invblk
[2] = numlogblk
- 1;
254 if (fwrite((char *)&logicalblk
, BLOCKSIZE
, 1, outfile
) == 0) {
258 /* write out block to save space. what in it doesn't matter */
259 if (fwrite((char *)&logicalblk
, BLOCKSIZE
, 1, outfile
) == 0) {
262 /* finish up the super finger */
264 /* add to the offsets the size of the offset pointers */
265 intptr
= (SUPINT
+ 1);
266 i
= (char *)supint
- (char *)SUPINT
;
267 while (intptr
< supint
)
269 /* write out the offsets (1 for the N at start) and the super finger */
270 if (fwrite((char *)SUPINT
, sizeof (*SUPINT
), numlogblk
+ 1,
272 fwrite(SUPFING
, 1, supfing
- SUPFING
, outfile
) == 0) {
275 /* save the size for reference later */
276 nextsupfing
= sizeof (long) + sizeof (long) * numlogblk
+
279 * make sure the file ends at a logical block boundary. This is
280 * necessary for invinsert to correctly create extended blocks
282 i
= nextsupfing
% BLOCKSIZE
;
283 /* write out junk to fill log blk */
284 if (fwrite(SUPFING
, BLOCKSIZE
- i
, 1, outfile
) == 0 ||
285 fflush(outfile
) == EOF
) {
286 /* rewind doesn't check for write failure */
289 /* write the control area */
291 param
.version
= VERSION
;
293 param
.sizeblk
= BLOCKSIZE
;
294 param
.startbyte
= (numlogblk
+ 1) * BLOCKSIZE
+ BUFSIZ
;
295 param
.supsize
= nextsupfing
;
296 param
.cntlsize
= BUFSIZ
;
298 if (fwrite((char *)¶m
, sizeof (param
), 1, outfile
) == 0) {
301 for (i
= 0; i
< 10; i
++) /* for future use */
302 if (fwrite((char *)&zerolong
, sizeof (zerolong
),
307 /* make first block loop backwards to last block */
308 if (fflush(outfile
) == EOF
) {
309 /* fseek doesn't check for write failure */
312 /* get to second word first block */
313 (void) fseek(outfile
, (long)BUFSIZ
+ 8, 0);
314 tlong
= numlogblk
- 1;
315 if (fwrite((char *)&tlong
, sizeof (tlong
), 1, outfile
) == 0 ||
316 fclose(outfile
) == EOF
) {
318 invcannotwrite(invname
);
321 if (fclose(fpost
) == EOF
) {
322 invcannotwrite(postingfile
);
325 --totterm
; /* don't count null term */
327 (void) printf("logical blocks = %d, postings = %ld, terms = %ld, "
328 "max term length = %d\n", numlogblk
, totpost
, totterm
, maxtermlen
);
331 "\n************* ZIPF curve ****************\n");
332 for (j
= ZIPFSIZE
; j
> 1; j
--)
335 for (i
= 1; i
< j
; ++i
) {
336 (void) printf("%3d -%6d ", i
, zipf
[i
]);
337 if (i
% 6 == 0) (void) putchar('\n');
339 (void) printf(">%d-%6d\n", ZIPFSIZE
, zipf
[0]);
342 /* free all malloc'd memory */
349 /* add a term to the data base */
354 int backupflag
, i
, j
, maxback
, holditems
, gooditems
, howfar
;
355 int len
, numwilluse
, wdlen
;
356 char *tptr
, *tptr2
, *tptr3
;
358 unsigned long packword
[2];
364 /* keep zipfian info on the distribution */
365 if (numpost
<= ZIPFSIZE
)
370 len
= strlen(thisterm
);
371 wdlen
= (len
+ (sizeof (long) - 1)) / sizeof (long);
372 numwilluse
= (wdlen
+ 3) * sizeof (long);
373 /* new block if at least 1 item in block */
374 if (numinvitems
&& numwilluse
+ amtused
> BLOCKSIZE
) {
375 /* set up new block */
376 if (supfing
+ 500 > SUPFING
+ supersize
) {
377 i
= supfing
- SUPFING
;
379 if ((SUPFING
= realloc(SUPFING
, supersize
)) == NULL
) {
380 invcannotalloc(supersize
);
383 supfing
= i
+ SUPFING
;
385 (void) printf("reallocated superfinger space to %d, "
386 "totpost=%ld\n", supersize
, totpost
);
389 /* check that room for the offset as well */
390 if ((numlogblk
+ 10) > supintsize
) {
392 supintsize
+= SUPERINC
;
393 if ((SUPINT
= realloc((char *)SUPINT
,
394 supintsize
* sizeof (long))) == NULL
) {
395 invcannotalloc(supintsize
* sizeof (long));
400 (void) printf("reallocated superfinger offset to %d, "
401 "totpost = %ld\n", supintsize
* sizeof (long),
405 /* See if backup is efficatious */
407 maxback
= strlen(thisterm
) / 10;
408 holditems
= numinvitems
;
409 if (maxback
> numinvitems
)
410 maxback
= numinvitems
- 2;
412 while (--maxback
> 0) {
414 iteminfo
.packword
[0] =
415 logicalblk
.invblk
[--holditems
* 2 +
416 (sizeof (long) - 1)];
417 if ((i
= iteminfo
.e
.size
/ 10) < maxback
) {
420 gooditems
= holditems
;
421 tptr2
= logicalblk
.chrblk
+ iteminfo
.e
.offset
;
424 /* see if backup will occur */
426 numinvitems
= gooditems
;
428 logicalblk
.invblk
[0] = numinvitems
;
429 /* set forward pointer pointing to next */
430 logicalblk
.invblk
[1] = numlogblk
+ 1;
431 /* set back pointer to last block */
432 logicalblk
.invblk
[2] = numlogblk
- 1;
433 if (fwrite((char *)logicalblk
.chrblk
, 1,
434 BLOCKSIZE
, outfile
) == 0) {
435 invcannotwrite(indexfile
);
440 /* check if had to back up, if so do it */
442 /* find out where the end of the new block is */
443 iteminfo
.packword
[0] =
444 logicalblk
.invblk
[numinvitems
* 2 + 1];
445 tptr3
= logicalblk
.chrblk
+ iteminfo
.e
.offset
;
446 /* move the index for this block */
447 for (i
= 3; i
<= (backupflag
* 2 + 2); i
++) {
448 logicalblk
.invblk
[i
] =
449 logicalblk
.invblk
[numinvitems
* 2+i
];
451 /* move the word into the super index */
452 iteminfo
.packword
[0] = logicalblk
.invblk
[3];
453 iteminfo
.packword
[1] = logicalblk
.invblk
[4];
454 tptr2
= logicalblk
.chrblk
+ iteminfo
.e
.offset
;
455 (void) strncpy(supfing
, tptr2
, (int)iteminfo
.e
.size
);
456 *(supfing
+ iteminfo
.e
.size
) = '\0';
458 (void) printf("backup %d at term=%s to term=%s\n",
459 backupflag
, thisterm
, supfing
);
461 *supint
++ = nextsupfing
;
462 nextsupfing
+= strlen(supfing
) + 1;
463 supfing
+= strlen(supfing
) + 1;
464 /* now fix up the logical block */
465 tptr
= logicalblk
.chrblk
+ lastinblk
;
466 lastinblk
= BLOCKSIZE
;
467 tptr2
= logicalblk
.chrblk
+ lastinblk
;
472 amtused
+= (8 * backupflag
+ j
);
473 for (i
= 3; i
< (backupflag
* 2 + 2); i
+= 2) {
474 iteminfo
.packword
[0] = logicalblk
.invblk
[i
];
475 iteminfo
.e
.offset
+= (tptr2
- tptr3
);
476 logicalblk
.invblk
[i
] = iteminfo
.packword
[0];
478 numinvitems
= backupflag
;
479 } else { /* no backup needed */
481 lastinblk
= BLOCKSIZE
;
482 /* add new term to superindex */
483 (void) strcpy(supfing
, thisterm
);
484 supfing
+= strlen(thisterm
) + 1;
485 *supint
++ = nextsupfing
;
486 nextsupfing
+= strlen(thisterm
) + 1;
489 lastinblk
-= (numwilluse
- 8);
490 iteminfo
.e
.offset
= lastinblk
;
491 iteminfo
.e
.size
= (char)len
;
492 iteminfo
.e
.space
= 0;
493 iteminfo
.e
.post
= numpost
;
494 (void) strncpy(logicalblk
.chrblk
+ lastinblk
, thisterm
, len
);
495 amtused
+= numwilluse
;
496 logicalblk
.invblk
[(lastinblk
/sizeof (long))+wdlen
] = nextpost
;
497 if ((i
= postptr
- POST
) > 0) {
498 if (fwrite((char *)POST
, sizeof (POSTING
), i
, fpost
) == 0) {
499 invcannotwrite(postingfile
);
502 nextpost
+= i
* sizeof (POSTING
);
504 logicalblk
.invblk
[3+2*numinvitems
++] = iteminfo
.packword
[0];
505 logicalblk
.invblk
[2+2*numinvitems
] = iteminfo
.packword
[1];
510 swap_ints(void *p
, size_t sz
)
513 uint32_t *e
= (uint32_t *)p
+ (sz
/ sizeof (uint32_t));
515 for (s
= p
; s
< e
; s
++)
520 write_param(INVCONTROL
*invcntl
)
523 swap_ints(&invcntl
->param
, sizeof (invcntl
->param
));
525 rewind(invcntl
->invfile
);
526 (void) fwrite((char *)&invcntl
->param
, sizeof (invcntl
->param
), 1,
530 swap_ints(&invcntl
->param
, sizeof (invcntl
->param
));
534 read_superfinger(INVCONTROL
*invcntl
)
538 (void) fseek(invcntl
->invfile
, invcntl
->param
.startbyte
, SEEK_SET
);
539 (void) fread(invcntl
->iindex
, (int)invcntl
->param
.supsize
,
540 1, invcntl
->invfile
);
544 * The superfinger consists of a count, followed by
545 * count offsets, followed by a string table (which
546 * the offsets reference).
548 * We need to swap the count and the offsets.
550 count
= 1 + BSWAP_32(*(uint32_t *)invcntl
->iindex
);
551 swap_ints(invcntl
->iindex
, count
* sizeof (unsigned long));
556 read_logblock(INVCONTROL
*invcntl
, int block
)
558 /* note always fetch it if the file is busy */
559 if ((block
!= invcntl
->numblk
) ||
560 (invcntl
->param
.filestat
>= INVBUSY
)) {
561 (void) fseek(invcntl
->invfile
,
562 (block
* invcntl
->param
.sizeblk
) + invcntl
->param
.cntlsize
,
564 invcntl
->numblk
= block
;
565 (void) fread(invcntl
->logblk
, (int)invcntl
->param
.sizeblk
,
566 1, invcntl
->invfile
);
574 * A logblock consists of a count, a next block id,
575 * and a previous block id, followed by count
576 * ENTRYs, followed by alternating strings and
579 swap_ints(invcntl
->logblk
, 3 * sizeof (unsigned long));
581 count
= *(uint32_t *)invcntl
->logblk
;
583 ecur
= (ENTRY
*)((uint32_t *)invcntl
->logblk
+ 3);
586 for (; ecur
< eend
; ecur
++) {
587 ecur
->offset
= BSWAP_16(ecur
->offset
);
588 ecur
->post
= BSWAP_32(ecur
->post
);
591 * After the string is the posting offset.
593 postptr
= (uint32_t *)(invcntl
->logblk
+
595 P2ROUNDUP(ecur
->size
, sizeof (long)));
597 *postptr
= BSWAP_32(*postptr
);
604 read_next_posting(INVCONTROL
*invcntl
, POSTING
*posting
)
606 (void) fread((char *)posting
, sizeof (*posting
), 1, invcntl
->postfile
);
608 posting
->lineoffset
= BSWAP_32(posting
->lineoffset
);
609 posting
->fcnoffset
= BSWAP_32(posting
->fcnoffset
);
611 * fileindex is a 24-bit field, so shift it before swapping
613 posting
->fileindex
= BSWAP_32(posting
->fileindex
<< 8);
618 invopen(INVCONTROL
*invcntl
, char *invname
, char *invpost
, int stat
)
622 if ((invcntl
->invfile
= vpfopen(invname
,
623 ((stat
== 0) ? FREAD
: FREADP
))) == NULL
) {
624 invcannotopen(invname
);
627 if (fread((char *)&invcntl
->param
, sizeof (invcntl
->param
), 1,
628 invcntl
->invfile
) == 0) {
629 (void) fprintf(stderr
, "%s: empty inverted file\n", argv0
);
632 if (invcntl
->param
.version
!= VERSION
&&
633 BSWAP_32(invcntl
->param
.version
) != VERSION
) {
634 (void) fprintf(stderr
,
635 "%s: cannot read old index format; use -U option to "
636 "force database to rebuild\n", argv0
);
639 invcntl
->swap
= (invcntl
->param
.version
!= VERSION
);
641 swap_ints(&invcntl
->param
, sizeof (invcntl
->param
));
643 if (stat
== 0 && invcntl
->param
.filestat
== INVALONE
) {
644 (void) fprintf(stderr
, "%s: inverted file is locked\n", argv0
);
647 if ((invcntl
->postfile
= vpfopen(invpost
,
648 ((stat
== 0) ? FREAD
: FREADP
))) == NULL
) {
649 invcannotopen(invpost
);
652 /* allocate core for a logical block */
653 if ((invcntl
->logblk
= malloc(invcntl
->param
.sizeblk
)) == NULL
) {
654 invcannotalloc((unsigned)invcntl
->param
.sizeblk
);
657 /* allocate for and read in superfinger */
659 invcntl
->iindex
= NULL
;
661 if (invcntl
->param
.share
== 1) {
662 key_t
ftok(), shm_key
;
663 struct shmid_ds shm_buf
;
667 /* see if the shared segment exists */
668 shm_key
= ftok(invname
, 2);
669 shm_id
= shmget(shm_key
, 0, 0);
671 * Failure simply means (hopefully) that segment doesn't
676 * Have to give general write permission due to AMdahl
677 * not having protected segments
679 shm_id
= shmget(shm_key
,
680 invcntl
->param
.supsize
+ sizeof (long),
683 perror("Could not create shared "
689 invcntl
->iindex
= shmat(shm_id
, 0,
690 ((read_index
) ? 0 : SHM_RDONLY
));
691 if (invcntl
->iindex
== (char *)ERR
) {
692 (void) fprintf(stderr
,
693 "%s: shared memory link failed\n", argv0
);
694 invcntl
->iindex
= NULL
;
700 if (invcntl
->iindex
== NULL
)
701 invcntl
->iindex
= malloc(invcntl
->param
.supsize
+ 16);
702 if (invcntl
->iindex
== NULL
) {
703 invcannotalloc((unsigned)invcntl
->param
.supsize
);
704 free(invcntl
->logblk
);
708 read_superfinger(invcntl
);
710 invcntl
->numblk
= -1;
711 if (boolready() == -1) {
713 (void) fclose(invcntl
->postfile
);
715 (void) fclose(invcntl
->invfile
);
718 /* write back out the control block if anything changed */
719 invcntl
->param
.filestat
= stat
;
720 if (stat
> invcntl
->param
.filestat
)
721 write_param(invcntl
);
725 /* invclose must be called to wrap things up and deallocate core */
727 invclose(INVCONTROL
*invcntl
)
729 /* write out the control block in case anything changed */
730 if (invcntl
->param
.filestat
> 0) {
731 invcntl
->param
.filestat
= 0;
732 write_param(invcntl
);
734 (void) fclose(invcntl
->invfile
);
735 (void) fclose(invcntl
->postfile
);
737 if (invcntl
->param
.share
> 0) {
738 shmdt(invcntl
->iindex
);
739 invcntl
->iindex
= NULL
;
742 if (invcntl
->iindex
!= NULL
)
743 free(invcntl
->iindex
);
744 free(invcntl
->logblk
);
747 /* invstep steps the inverted file forward one item */
749 invstep(INVCONTROL
*invcntl
)
751 if (invcntl
->keypnt
< (*(int *)invcntl
->logblk
- 1)) {
756 /* move forward a block else wrap */
757 read_logblock(invcntl
, *(int *)(invcntl
->logblk
+ sizeof (long)));
762 /* invforward moves forward one term in the inverted file */
764 invforward(INVCONTROL
*invcntl
)
767 /* skip things with 0 postings */
768 while (((ENTRY
*)(invcntl
->logblk
+ 12) + invcntl
->keypnt
)->post
== 0) {
771 /* Check for having wrapped - reached start of inverted file! */
772 if ((invcntl
->numblk
== 0) && (invcntl
->keypnt
== 0))
777 /* invterm gets the present term from the present logical block */
779 invterm(INVCONTROL
*invcntl
, char *term
)
783 entryptr
= (ENTRY
*)(invcntl
->logblk
+ 12) + invcntl
->keypnt
;
784 (void) strncpy(term
, entryptr
->offset
+ invcntl
->logblk
,
785 (int)entryptr
->size
);
786 *(term
+ entryptr
->size
) = '\0';
787 return (entryptr
->post
);
790 /* invfind searches for an individual item in the inverted file */
792 invfind(INVCONTROL
*invcntl
, char *searchterm
)
794 int imid
, ilow
, ihigh
;
797 unsigned long *intptr
, *intptr2
;
800 /* make sure it is initialized via invready */
801 if (invcntl
->invfile
== 0)
804 /* now search for the appropriate finger block */
805 intptr
= (unsigned long *)invcntl
->iindex
;
808 ihigh
= *intptr
++ - 1;
809 while (ilow
<= ihigh
) {
810 imid
= (ilow
+ ihigh
) / 2;
811 intptr2
= intptr
+ imid
;
812 i
= strcmp(searchterm
, (invcntl
->iindex
+ *intptr2
));
822 /* be careful about case where searchterm is after last in this block */
823 imid
= (ilow
) ? ilow
- 1 : 0;
825 /* fetch the appropriate logical block if not in core */
826 read_logblock(invcntl
, imid
);
829 /* now find the term in this block. tricky this */
830 intptr
= (unsigned long *)invcntl
->logblk
;
836 while (ilow
<= ihigh
) {
837 imid
= (ilow
+ ihigh
) / 2;
838 entryptr
= (ENTRY
*)intptr
+ imid
;
839 i
= strncmp(searchterm
, (invcntl
->logblk
+ entryptr
->offset
),
840 (int)entryptr
->size
);
842 i
= strlen(searchterm
) - entryptr
->size
;
848 num
= entryptr
->post
;
852 /* be careful about case where searchterm is after last in this block */
853 if (imid
>= *(long *)invcntl
->logblk
) {
854 invcntl
->keypnt
= *(long *)invcntl
->logblk
;
856 /* note if this happens the term could be in extended block */
857 if (invcntl
->param
.startbyte
<
858 invcntl
->numblk
* invcntl
->param
.sizeblk
)
861 invcntl
->keypnt
= imid
;
867 /* invdump dumps the block the term parameter is in */
869 invdump(INVCONTROL
*invcntl
, char *term
)
871 long i
, j
, n
, *longptr
;
873 char temp
[512], *ptr
;
875 /* dump superindex if term is "-" */
878 longptr
= (long *)invcntl
->iindex
;
880 (void) printf("Superindex dump, num blocks=%ld\n", n
);
882 while ((longptr
<= ((long *)invcntl
->iindex
) + n
) &&
884 (void) printf("%2ld %6ld %s\n", j
++, *longptr
,
885 invcntl
->iindex
+ *longptr
);
889 } else if (*term
== '#') {
891 /* fetch the appropriate logical block */
892 read_logblock(invcntl
, j
);
894 i
= abs((int)invfind(invcntl
, term
));
895 longptr
= (long *)invcntl
->logblk
;
897 (void) printf("Entry term to invdump=%s, postings=%ld, "
898 "forward ptr=%ld, back ptr=%ld\n", term
, i
, *(longptr
),
900 entryptr
= (ENTRY
*)(invcntl
->logblk
+ 12);
901 (void) printf("%ld terms in this block, block=%ld\n", n
,
903 (void) printf("\tterm\t\t\tposts\tsize\toffset\tspace\t1st word\n");
904 for (j
= 0; j
< n
&& invbreak
== 0; j
++) {
905 ptr
= invcntl
->logblk
+ entryptr
->offset
;
906 (void) strncpy(temp
, ptr
, (int)entryptr
->size
);
907 temp
[entryptr
->size
] = '\0';
908 ptr
+= (sizeof (long) *
909 (long)((entryptr
->size
+
910 (sizeof (long) - 1)) / sizeof (long)));
911 (void) printf("%2ld %-24s\t%5ld\t%3d\t%d\t%d\t%ld\n", j
, temp
,
912 entryptr
->post
, entryptr
->size
, entryptr
->offset
,
913 entryptr
->space
, *(long *)ptr
);
926 if ((item1
= (POSTING
*)malloc(SETINC
* sizeof (POSTING
))) == NULL
) {
927 invcannotalloc(SETINC
);
933 if ((item2
= (POSTING
*)malloc(SETINC
* sizeof (POSTING
))) == NULL
) {
934 invcannotalloc(SETINC
);
951 boolfile(INVCONTROL
*invcntl
, long *num
, int bool)
960 POSTING
*newsetp
, *set1p
;
961 long newsetc
, set1c
, set2c
;
963 entryptr
= (ENTRY
*) (invcntl
->logblk
+ 12) + invcntl
->keypnt
;
964 ptr
= invcntl
->logblk
+ entryptr
->offset
;
965 ptr2
= ((unsigned long *)ptr
) +
966 (entryptr
->size
+ (sizeof (long) - 1)) / sizeof (long);
967 *num
= entryptr
->post
;
976 /* make room for the new set */
981 newsetp
= set1p
= item
;
992 if ((item1
= (POSTING
*) realloc(item1
,
993 u
* sizeof (POSTING
))) == NULL
) {
1002 if ((item2
= (POSTING
*)realloc(item2
,
1003 u
* sizeof (POSTING
))) == NULL
) {
1005 invcannotalloc(u
* sizeof (POSTING
));
1017 file
= invcntl
->postfile
;
1018 (void) fseek(file
, (long)*ptr2
, SEEK_SET
);
1019 read_next_posting(invcntl
, &posting
);
1023 /* while something in both sets */
1026 for (set1c
= 0, set2c
= 0;
1027 set1c
< numitems
&& set2c
< *num
; newsetc
++) {
1028 if (set1p
->lineoffset
< posting
.lineoffset
) {
1029 *newsetp
++ = *set1p
++;
1031 } else if (set1p
->lineoffset
> posting
.lineoffset
) {
1032 *newsetp
++ = posting
;
1033 read_next_posting(invcntl
, &posting
);
1035 } else if (set1p
->type
< posting
.type
) {
1036 *newsetp
++ = *set1p
++;
1038 } else if (set1p
->type
> posting
.type
) {
1039 *newsetp
++ = posting
;
1040 read_next_posting(invcntl
, &posting
);
1042 } else { /* identical postings */
1043 *newsetp
++ = *set1p
++;
1045 read_next_posting(invcntl
, &posting
);
1049 /* find out what ran out and move the rest in */
1050 if (set1c
< numitems
) {
1051 newsetc
+= numitems
- set1c
;
1052 while (set1c
++ < numitems
) {
1053 *newsetp
++ = *set1p
++;
1056 while (set2c
++ < *num
) {
1057 *newsetp
++ = posting
;
1059 read_next_posting(invcntl
, &posting
);
1063 break; /* end of OR */
1068 while (set1c
< numitems
&& set2c
< *num
) {
1069 if (set1p
->lineoffset
< posting
.lineoffset
) {
1072 } else if (set1p
->lineoffset
> posting
.lineoffset
) {
1073 read_next_posting(invcntl
, &posting
);
1075 } else if (set1p
->type
< posting
.type
) {
1078 } else if (set1p
->type
> posting
.type
) {
1079 read_next_posting(invcntl
, &posting
);
1081 } else { /* identical postings */
1082 *newsetp
++ = *set1p
++;
1085 read_next_posting(invcntl
, &posting
);
1089 break; /* end of AND */
1094 while (set1c
< numitems
&& set2c
< *num
) {
1095 if (set1p
->lineoffset
< posting
.lineoffset
) {
1096 *newsetp
++ = *set1p
++;
1099 } else if (set1p
->lineoffset
> posting
.lineoffset
) {
1100 read_next_posting(invcntl
, &posting
);
1102 } else if (set1p
->type
< posting
.type
) {
1103 *newsetp
++ = *set1p
++;
1106 } else if (set1p
->type
> posting
.type
) {
1107 read_next_posting(invcntl
, &posting
);
1109 } else { /* identical postings */
1112 read_next_posting(invcntl
, &posting
);
1116 newsetc
+= numitems
- set1c
;
1117 while (set1c
++ < numitems
) {
1118 *newsetp
++ = *set1p
++;
1120 break; /* end of NOT */
1122 case REVERSENOT
: /* core NOT incoming set */
1125 while (set1c
< numitems
&& set2c
< *num
) {
1126 if (set1p
->lineoffset
< posting
.lineoffset
) {
1129 } else if (set1p
->lineoffset
> posting
.lineoffset
) {
1130 *newsetp
++ = posting
;
1131 read_next_posting(invcntl
, &posting
);
1133 } else if (set1p
->type
< posting
.type
) {
1136 } else if (set1p
->type
> posting
.type
) {
1137 *newsetp
++ = posting
;
1138 read_next_posting(invcntl
, &posting
);
1140 } else { /* identical postings */
1143 read_next_posting(invcntl
, &posting
);
1147 while (set2c
++ < *num
) {
1148 *newsetp
++ = posting
;
1150 read_next_posting(invcntl
, &posting
);
1153 break; /* end of REVERSENOT */
1158 enditem
= (POSTING
*)newsetp
;
1159 return ((POSTING
*)item
);
1168 POSTING
*oldstuff
, *newstuff
;
1170 if (numitems
== 0) {
1176 * if clear then give them what we have and use (void)
1177 * boolready to realloc
1181 /* free up the space we didn't give them */
1189 i
= (enditem
- item
) * sizeof (POSTING
) + 100;
1190 if ((ptr
= (POSTING
*)malloc(i
))r
== NULL
) {
1194 /* move present set into place */
1197 while (oldstuff
< enditem
)
1198 *newstuff
++ = *oldstuff
++;
1204 invcannotalloc(size_t n
)
1206 (void) fprintf(stderr
, "%s: cannot allocate %u bytes\n", argv0
, n
);
1210 invcannotopen(char *file
)
1212 (void) fprintf(stderr
, "%s: cannot open file %s\n", argv0
, file
);
1216 invcannotwrite(char *file
)
1218 (void) perror(argv0
); /* must be first to preserve errno */
1219 (void) fprintf(stderr
, "%s: write to file %s failed\n", argv0
, file
);