4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
27 #include <_string_table.h>
33 * This file provides the interfaces to build a Str_tbl suitable for use by
34 * either the sgsmsg message system, or a standard ELF string table (SHT_STRTAB)
35 * as created by ld(1).
37 * There are two modes which can be used when constructing a string table:
40 * standard string table - no compression. This is the
41 * traditional, fast method.
43 * st_new(FLG_STTAB_COMPRESS)
44 * builds a compressed string table which both eliminates
45 * duplicate strings, and permits strings with common suffixes
46 * (atexit vs. exit) to overlap in the table. This provides space
47 * savings for many string tables. Although more work than the
48 * traditional method, the algorithms used are designed to scale
49 * and keep any overhead at a minimum.
51 * These string tables are built with a common interface in a two-pass manner.
52 * The first pass finds all of the strings required for the string-table and
53 * calculates the size required for the final string table.
55 * The second pass allocates the string table, populates the strings into the
56 * table and returns the offsets the strings have been assigned.
58 * The calling sequence to build and populate a string table is:
60 * st_new(); // initialize strtab
62 * st_insert(st1); // first pass of strings ...
63 * // calculates size required for
66 * st_delstring(st?); // remove string previously
70 * st_getstrtab_sz(); // freezes strtab and computes
73 * st_setstrbuf(); // associates a final destination
74 * // for the string table
76 * st_setstring(st1); // populate the string table
77 * ... // offsets are based off of second
78 * // pass through the string table
81 * st_destroy(); // tear down string table
84 * String Suffix Compression Algorithm:
86 * Here's a quick high level overview of the Suffix String
87 * compression algorithm used. First - the heart of the algorithm
88 * is a Hash table list which represents a dictionary of all unique
89 * strings inserted into the string table. The hash function for
90 * this table is a standard string hash except that the hash starts
91 * at the last character in the string (&str[n - 1]) and works towards
92 * the first character in the function (&str[0]). As we compute the
93 * HASH value for a given string, we also compute the hash values
94 * for all of the possible suffix strings for that string.
96 * As we compute the hash - at each character see if the current
97 * suffix string for that hash is already present in the table. If
98 * it is, and the string is a master string. Then change that
99 * string to a suffix string of the new string being inserted.
101 * When the final hash value is found (hash for str[0...n]), check
102 * to see if it is in the hash table - if so increment the reference
103 * count for the string. If it is not yet in the table, insert a
104 * new hash table entry for a master string.
106 * The above method will find all suffixes of a given string given
107 * that the strings are inserted from shortest to longest. That is
108 * why this is a two phase method, we first collect all of the
109 * strings and store them based off of their length in an AVL tree.
110 * Once all of the strings have been submitted we then start the
111 * hash table build by traversing the AVL tree in order and
112 * inserting the strings from shortest to longest as described
119 avl_len_compare(const void *n1
, const void *n2
)
123 len1
= ((LenNode
*)n1
)->ln_strlen
;
124 len2
= ((LenNode
*)n2
)->ln_strlen
;
134 avl_str_compare(const void *n1
, const void *n2
)
136 const char *str1
, *str2
;
139 str1
= ((StrNode
*)n1
)->sn_str
;
140 str2
= ((StrNode
*)n2
)->sn_str
;
142 rc
= strcmp(str1
, str2
);
151 * Return an initialized Str_tbl - returns NULL on failure.
154 * FLG_STTAB_COMPRESS - build a compressed string table
161 if ((stp
= calloc(sizeof (*stp
), 1)) == NULL
)
165 * Start with a leading '\0' - it's tradition.
167 stp
->st_strsize
= stp
->st_fullstrsize
= stp
->st_nextoff
= 1;
170 * Do we compress this string table?
172 stp
->st_flags
= flags
;
173 if ((stp
->st_flags
& FLG_STTAB_COMPRESS
) == 0)
176 if ((stp
->st_lentree
= calloc(sizeof (*stp
->st_lentree
), 1)) == NULL
)
179 avl_create(stp
->st_lentree
, &avl_len_compare
, sizeof (LenNode
),
180 SGSOFFSETOF(LenNode
, ln_avlnode
));
186 * Insert a new string into the Str_tbl. There are two AVL trees used.
188 * - The first LenNode AVL tree maintains a tree of nodes based on string
190 * - Each LenNode maintains a StrNode AVL tree for each string. Large
191 * applications have been known to contribute thousands of strings of
192 * the same size. Should strings need to be removed (-z ignore), then
193 * the string AVL tree makes this removal efficient and scalable.
196 st_insert(Str_tbl
*stp
, const char *str
)
199 StrNode
*snp
, sn
= { 0 };
200 LenNode
*lnp
, ln
= { 0 };
204 * String table can't have been cooked
206 assert((stp
->st_flags
& FLG_STTAB_COOKED
) == 0);
209 * Null strings always point to the head of the string
210 * table - no reason to keep searching.
212 if ((len
= strlen(str
)) == 0)
215 stp
->st_fullstrsize
+= len
+ 1;
218 if ((stp
->st_flags
& FLG_STTAB_COMPRESS
) == 0)
222 * From the controlling string table, determine which LenNode AVL node
223 * provides for this string length. If the node doesn't exist, insert
224 * a new node to represent this string length.
227 if ((lnp
= avl_find(stp
->st_lentree
, &ln
, &where
)) == NULL
) {
228 if ((lnp
= calloc(sizeof (*lnp
), 1)) == NULL
)
230 lnp
->ln_strlen
= len
;
231 avl_insert(stp
->st_lentree
, lnp
, where
);
233 if ((lnp
->ln_strtree
= calloc(sizeof (*lnp
->ln_strtree
), 1)) ==
237 avl_create(lnp
->ln_strtree
, &avl_str_compare
, sizeof (StrNode
),
238 SGSOFFSETOF(StrNode
, sn_avlnode
));
242 * From the string length AVL node determine whether a StrNode AVL node
243 * provides this string. If the node doesn't exist, insert a new node
244 * to represent this string.
247 if ((snp
= avl_find(lnp
->ln_strtree
, &sn
, &where
)) == NULL
) {
248 if ((snp
= calloc(sizeof (*snp
), 1)) == NULL
)
251 avl_insert(lnp
->ln_strtree
, snp
, where
);
259 * Remove a previously inserted string from the Str_tbl.
262 st_delstring(Str_tbl
*stp
, const char *str
)
265 LenNode
*lnp
, ln
= { 0 };
266 StrNode
*snp
, sn
= { 0 };
269 * String table can't have been cooked
271 assert((stp
->st_flags
& FLG_STTAB_COOKED
) == 0);
274 stp
->st_fullstrsize
-= len
+ 1;
276 if ((stp
->st_flags
& FLG_STTAB_COMPRESS
) == 0)
280 * Determine which LenNode AVL node provides for this string length.
283 if ((lnp
= avl_find(stp
->st_lentree
, &ln
, 0)) != NULL
) {
285 if ((snp
= avl_find(lnp
->ln_strtree
, &sn
, 0)) != NULL
) {
287 * Reduce the reference count, and if zero remove the
290 if (--snp
->sn_refcnt
== 0)
291 avl_remove(lnp
->ln_strtree
, snp
);
297 * No strings of this length, or no string itself - someone goofed.
303 * Tear down a String_Table structure.
306 st_destroy(Str_tbl
*stp
)
308 Str_hash
*sthash
, *psthash
;
309 Str_master
*mstr
, *pmstr
;
313 * cleanup the master strings
315 for (mstr
= stp
->st_mstrlist
, pmstr
= 0; mstr
;
316 mstr
= mstr
->sm_next
) {
324 if (stp
->st_hashbcks
) {
325 for (i
= 0; i
< stp
->st_hbckcnt
; i
++) {
326 for (sthash
= stp
->st_hashbcks
[i
], psthash
= 0;
327 sthash
; sthash
= sthash
->hi_next
) {
335 free(stp
->st_hashbcks
);
342 * For a given string - copy it into the buffer associated with
343 * the string table - and return the offset it has been assigned.
345 * If a value of '-1' is returned - the string was not found in
349 st_setstring(Str_tbl
*stp
, const char *str
, size_t *stoff
)
358 * String table *must* have been previously cooked
360 assert(stp
->st_strbuf
);
362 assert(stp
->st_flags
& FLG_STTAB_COOKED
);
365 * Null string always points to head of string table
372 if ((stp
->st_flags
& FLG_STTAB_COMPRESS
) == 0) {
375 stlen
++; /* count for trailing '\0' */
376 _stoff
= stp
->st_nextoff
;
378 * Have we overflowed our assigned buffer?
380 if ((_stoff
+ stlen
) > stp
->st_fullstrsize
)
382 memcpy(stp
->st_strbuf
+ _stoff
, str
, stlen
);
384 stp
->st_nextoff
+= stlen
;
389 * Calculate reverse hash for string.
392 for (i
= stlen
; i
>= 0; i
--) {
393 hashval
= ((hashval
<< 5) + hashval
) +
394 str
[i
]; /* h = ((h * 33) + c) */
397 for (sthash
= stp
->st_hashbcks
[hashval
% stp
->st_hbckcnt
]; sthash
;
398 sthash
= sthash
->hi_next
) {
401 if (sthash
->hi_hashval
!= hashval
)
404 hstr
= &sthash
->hi_mstr
->sm_str
[sthash
->hi_mstr
->sm_strlen
-
406 if (strcmp(str
, hstr
) == 0)
411 * Did we find the string?
417 * Has this string been copied into the string table?
419 mstr
= sthash
->hi_mstr
;
420 if (mstr
->sm_stroff
== 0) {
421 size_t mstrlen
= mstr
->sm_strlen
+ 1;
423 mstr
->sm_stroff
= stp
->st_nextoff
;
426 * Have we overflowed our assigned buffer?
428 if ((mstr
->sm_stroff
+ mstrlen
) > stp
->st_fullstrsize
)
431 (void) memcpy(stp
->st_strbuf
+ mstr
->sm_stroff
,
432 mstr
->sm_str
, mstrlen
);
433 stp
->st_nextoff
+= mstrlen
;
437 * Calculate offset of (sub)string.
439 *stoff
= mstr
->sm_stroff
+ mstr
->sm_strlen
- sthash
->hi_strlen
;
446 st_hash_insert(Str_tbl
*stp
, const char *str
, size_t len
)
449 uint_t hashval
= HASHSEED
;
450 uint_t bckcnt
= stp
->st_hbckcnt
;
451 Str_hash
**hashbcks
= stp
->st_hashbcks
;
453 Str_master
*mstr
= 0;
456 * We use a classic 'Bernstein k=33' hash function. But
457 * instead of hashing from the start of the string to the
458 * end, we do it in reverse.
460 * This way - we are essentially building all of the
461 * suffix hashvalues as we go. We can check to see if
462 * any suffixes already exist in the tree as we generate
465 for (i
= len
; i
>= 0; i
--) {
466 hashval
= ((hashval
<< 5) + hashval
) +
467 str
[i
]; /* h = ((h * 33) + c) */
469 for (sthash
= hashbcks
[hashval
% bckcnt
];
470 sthash
; sthash
= sthash
->hi_next
) {
474 if (sthash
->hi_hashval
!= hashval
)
477 _mstr
= sthash
->hi_mstr
;
478 hstr
= &_mstr
->sm_str
[_mstr
->sm_strlen
-
481 if (strcmp(&str
[i
], hstr
))
486 * Entry already in table, increment refcnt and
493 * If this 'suffix' is presently a 'master
494 * string, then take over it's record.
496 if (sthash
->hi_strlen
== _mstr
->sm_strlen
) {
498 * we should only do this once.
508 * Do we need a new master string, or can we take over
509 * one we already found in the table?
513 * allocate a new master string
515 if ((mstr
= calloc(sizeof (*mstr
), 1)) == 0)
517 mstr
->sm_next
= stp
->st_mstrlist
;
518 stp
->st_mstrlist
= mstr
;
519 stp
->st_strsize
+= len
+ 1;
522 * We are taking over a existing master string, the string size
523 * only increments by the difference between the current string
524 * and the previous master.
526 assert(len
> mstr
->sm_strlen
);
527 stp
->st_strsize
+= len
- mstr
->sm_strlen
;
530 if ((sthash
= calloc(sizeof (*sthash
), 1)) == 0)
533 mstr
->sm_hashval
= sthash
->hi_hashval
= hashval
;
534 mstr
->sm_strlen
= sthash
->hi_strlen
= len
;
536 sthash
->hi_refcnt
= 1;
537 sthash
->hi_mstr
= mstr
;
540 * Insert string element into head of hash list
542 hashval
= hashval
% bckcnt
;
543 sthash
->hi_next
= hashbcks
[hashval
];
544 hashbcks
[hashval
] = sthash
;
549 * Return amount of space required for the string table.
552 st_getstrtab_sz(Str_tbl
*stp
)
554 assert(stp
->st_fullstrsize
> 0);
556 if ((stp
->st_flags
& FLG_STTAB_COMPRESS
) == 0) {
557 stp
->st_flags
|= FLG_STTAB_COOKED
;
558 return (stp
->st_fullstrsize
);
561 if ((stp
->st_flags
& FLG_STTAB_COOKED
) == 0) {
565 stp
->st_flags
|= FLG_STTAB_COOKED
;
567 * allocate a hash table about the size of # of
570 stp
->st_hbckcnt
= findprime(stp
->st_strcnt
);
571 if ((stp
->st_hashbcks
= calloc(sizeof (*stp
->st_hashbcks
),
572 stp
->st_hbckcnt
)) == NULL
)
576 * We now walk all of the strings in the list, from shortest to
577 * longest, and insert them into the hashtable.
579 if ((lnp
= avl_first(stp
->st_lentree
)) == NULL
) {
581 * Is it possible we have an empty string table, if so,
582 * the table still contains '\0', so return the size.
584 if (avl_numnodes(stp
->st_lentree
) == 0) {
585 assert(stp
->st_strsize
== 1);
586 return (stp
->st_strsize
);
595 * Walk the string lists and insert them into the hash
596 * list. Once a string is inserted we no longer need
597 * it's entry, so the string can be freed.
599 for (snp
= avl_first(lnp
->ln_strtree
); snp
;
600 snp
= AVL_NEXT(lnp
->ln_strtree
, snp
)) {
601 if (st_hash_insert(stp
, snp
->sn_str
,
602 lnp
->ln_strlen
) == -1)
607 * Now that the strings have been copied, walk the
608 * StrNode tree and free all the AVL nodes. Note,
609 * avl_destroy_nodes() beats avl_remove() as the
610 * latter balances the nodes as they are removed.
611 * We just want to tear the whole thing down fast.
614 while ((snp
= avl_destroy_nodes(lnp
->ln_strtree
,
617 avl_destroy(lnp
->ln_strtree
);
618 free(lnp
->ln_strtree
);
619 lnp
->ln_strtree
= NULL
;
622 * Move on to the next LenNode.
624 lnp
= AVL_NEXT(stp
->st_lentree
, lnp
);
628 * Now that all of the strings have been freed, walk the
629 * LenNode tree and free all of the AVL nodes. Note,
630 * avl_destroy_nodes() beats avl_remove() as the latter
631 * balances the nodes as they are removed. We just want to
632 * tear the whole thing down fast.
635 while ((lnp
= avl_destroy_nodes(stp
->st_lentree
,
638 avl_destroy(stp
->st_lentree
);
639 free(stp
->st_lentree
);
643 assert(stp
->st_strsize
> 0);
644 assert(stp
->st_fullstrsize
>= stp
->st_strsize
);
646 return (stp
->st_strsize
);
650 * Associate a buffer with a string table.
653 st_getstrbuf(Str_tbl
*stp
)
655 return (stp
->st_strbuf
);
659 st_setstrbuf(Str_tbl
*stp
, char *stbuf
, size_t bufsize
)
661 assert(stp
->st_flags
& FLG_STTAB_COOKED
);
663 if ((stp
->st_flags
& FLG_STTAB_COMPRESS
) == 0) {
664 if (bufsize
< stp
->st_fullstrsize
)
667 if (bufsize
< stp
->st_strsize
)
671 stp
->st_strbuf
= stbuf
;
674 * for debug builds - start with a stringtable filled in
675 * with '0xff'. This makes it very easy to spot unfilled
676 * holes in the strtab.
678 memset(stbuf
, 0xff, bufsize
);
681 memset(stbuf
, 0x0, bufsize
);