2 * Copyright 2014 Garrett D'Amore <garrett@damore.org>
3 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
4 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
5 * at Electronni Visti IA, Kiev, Ukraine.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
47 #include "localeimpl.h"
49 /* Check file format vs libc runtime. (See collatefile.h) */
50 #if COLL_WEIGHTS_MAX != COLLATE_WEIGHTS_MAX
51 #error "COLL_WEIGHTS_MAX != COLLATE_WEIGHTS_MAX"
55 * See the comments in usr/src/cmd/localedef/collate.c for further
56 * information. It would also be very helpful to have a copy of the
57 * POSIX standard for collation (in the locale format manual page)
58 * handy (www.opengroup.org).
62 * POSIX uses empty tables and falls down to strcmp.
64 struct lc_collate lc_collate_posix
= {
68 struct locdata __posix_collate_locdata
= {
70 .l_data
= { &lc_collate_posix
}
75 __lc_collate_load(const char *locname
)
84 struct locdata
*ldata
;
85 struct lc_collate
*lcc
;
88 * Slurp the locale file into the cache.
91 (void) snprintf(buf
, sizeof (buf
), "%s/%s/LC_COLLATE/LCL_DATA",
92 _PathLocale
, locname
);
94 if ((fd
= open(buf
, O_RDONLY
)) < 0) {
98 if (fstat(fd
, &sbuf
) < 0) {
103 if (sbuf
.st_size
< (COLLATE_STR_LEN
+ sizeof (info
))) {
108 map
= mmap(NULL
, sbuf
.st_size
, PROT_READ
, MAP_PRIVATE
, fd
, 0);
110 if ((TMP
= map
) == NULL
) {
115 if (strncmp(TMP
, COLLATE_VERSION
, COLLATE_STR_LEN
) != 0) {
116 (void) munmap(map
, sbuf
.st_size
);
120 TMP
+= COLLATE_STR_LEN
;
123 TMP
+= sizeof (*info
);
125 if ((info
->directive_count
< 1) ||
126 (info
->directive_count
>= COLL_WEIGHTS_MAX
) ||
127 ((chains
= info
->chain_count
) < 0)) {
128 (void) munmap(map
, sbuf
.st_size
);
133 i
= (sizeof (collate_char_t
) * (UCHAR_MAX
+ 1)) +
134 (sizeof (collate_chain_t
) * chains
) +
135 (sizeof (collate_large_t
) * info
->large_count
);
136 for (z
= 0; z
< info
->directive_count
; z
++) {
137 i
+= sizeof (collate_subst_t
) * info
->subst_count
[z
];
139 if (i
!= (sbuf
.st_size
- (TMP
- map
))) {
140 (void) munmap(map
, sbuf
.st_size
);
146 if ((ldata
= __locdata_alloc(locname
, sizeof (*lcc
))) == NULL
) {
147 (void) munmap(map
, sbuf
.st_size
);
150 lcc
= ldata
->l_data
[0];
152 ldata
->l_map_len
= sbuf
.st_size
;
155 lcc
->lc_directive_count
= info
->directive_count
;
156 lcc
->lc_large_count
= info
->large_count
;
158 for (z
= 0; z
< COLL_WEIGHTS_MAX
; z
++) {
159 lcc
->lc_directive
[z
] = info
->directive
[z
];
160 lcc
->lc_subst_count
[z
] = info
->subst_count
[z
];
161 lcc
->lc_pri_count
[z
] = info
->pri_count
[z
];
162 lcc
->lc_undef_pri
[z
] = info
->undef_pri
[z
];
165 lcc
->lc_char_table
= (void *)TMP
;
166 TMP
+= sizeof (collate_char_t
) * (UCHAR_MAX
+ 1);
168 for (z
= 0; z
< lcc
->lc_directive_count
; z
++) {
170 if ((count
= lcc
->lc_subst_count
[z
]) > 0) {
171 lcc
->lc_subst_table
[z
] = (void *)TMP
;
172 TMP
+= count
* sizeof (collate_subst_t
);
174 lcc
->lc_subst_table
[z
] = NULL
;
179 lcc
->lc_chain_table
= (void *)TMP
;
180 TMP
+= chains
* sizeof (collate_chain_t
);
182 lcc
->lc_chain_table
= NULL
;
183 lcc
->lc_chain_count
= chains
;
184 if (lcc
->lc_large_count
> 0)
185 lcc
->lc_large_table
= (void *)TMP
;
187 lcc
->lc_large_table
= NULL
;
192 static const int32_t *
193 substsearch(const struct lc_collate
*lcc
, const wchar_t key
, int pass
)
195 const collate_subst_t
*p
;
196 int n
= lcc
->lc_subst_count
[pass
];
201 if (pass
>= lcc
->lc_directive_count
)
204 if (!(key
& COLLATE_SUBST_PRIORITY
))
207 p
= lcc
->lc_subst_table
[pass
] + (key
& ~COLLATE_SUBST_PRIORITY
);
208 assert(p
->key
== key
);
213 * Note: for performance reasons, we have expanded bsearch here. This avoids
214 * function call overhead with each comparison.
217 static collate_chain_t
*
218 chainsearch(const struct lc_collate
*lcc
, const wchar_t *key
, int *len
)
221 int high
= lcc
->lc_chain_count
- 1;
224 collate_chain_t
*tab
= lcc
->lc_chain_table
;
229 while (low
<= high
) {
230 next
= (low
+ high
) / 2;
232 compar
= *key
- *p
->str
;
234 l
= wcsnlen(p
->str
, COLLATE_STR_LEN
);
235 compar
= wcsncmp(key
, p
->str
, l
);
249 static collate_large_t
*
250 largesearch(const struct lc_collate
*lcc
, const wchar_t key
)
253 int high
= lcc
->lc_info
->large_count
- 1;
256 collate_large_t
*tab
= lcc
->lc_large_table
;
261 while (low
<= high
) {
262 next
= (low
+ high
) / 2;
264 compar
= key
- p
->val
;
276 _collate_lookup(const struct lc_collate
*lcc
, const wchar_t *t
,
277 int *len
, int *pri
, int which
, const int **state
)
280 collate_large_t
*match
;
285 * If this is the "last" pass for the UNDEFINED, then
286 * we just return the priority itself.
288 if (which
>= lcc
->lc_directive_count
) {
296 * If we have remaining substitution data from a previous
297 * call, consume it first.
299 if ((sptr
= *state
) != NULL
) {
302 if ((sptr
== *state
) || (sptr
== NULL
))
310 /* No active substitutions */
314 * Check for composites such as dipthongs that collate as a
315 * single element (aka chains or collating-elements).
317 if (((p2
= chainsearch(lcc
, t
, &l
)) != NULL
) &&
318 ((p
= p2
->pri
[which
]) >= 0)) {
323 } else if (*t
<= UCHAR_MAX
) {
326 * Character is a small (8-bit) character.
327 * We just look these up directly for speed.
329 *pri
= lcc
->lc_char_table
[*t
].pri
[which
];
331 } else if ((lcc
->lc_info
->large_count
> 0) &&
332 ((match
= largesearch(lcc
, *t
)) != NULL
)) {
335 * Character was found in the extended table.
337 *pri
= match
->pri
.pri
[which
];
341 * Character lacks a specific definition.
343 if (lcc
->lc_directive
[which
] & DIRECTIVE_UNDEFINED
) {
344 /* Mask off sign bit to prevent ordering confusion. */
345 *pri
= (*t
& COLLATE_MAX_PRIORITY
);
347 *pri
= lcc
->lc_undef_pri
[which
];
349 /* No substitutions for undefined characters! */
354 * Try substituting (expanding) the character. We are
355 * currently doing this *after* the chain compression. I
356 * think it should not matter, but this way might be slightly
359 * We do this after the priority search, as this will help us
360 * to identify a single key value. In order for this to work,
361 * its important that the priority assigned to a given element
362 * to be substituted be unique for that level. The localedef
363 * code ensures this for us.
365 if ((sptr
= substsearch(lcc
, *pri
, which
)) != NULL
) {
366 if ((*pri
= *sptr
) > 0) {
368 *state
= *sptr
? sptr
: NULL
;
375 * This is the meaty part of wcsxfrm & strxfrm. Note that it does
376 * NOT NULL terminate. That is left to the caller.
379 _collate_wxfrm(const struct lc_collate
*lcc
, const wchar_t *src
, wchar_t *xf
,
388 const int32_t *state
;
391 int ndir
= lcc
->lc_directive_count
;
395 for (pass
= 0; pass
<= ndir
; pass
++) {
400 /* insert level separator from the previous pass */
408 /* special pass for undefined */
410 direc
= DIRECTIVE_FORWARD
| DIRECTIVE_UNDEFINED
;
412 direc
= lcc
->lc_directive
[pass
];
417 if (direc
& DIRECTIVE_BACKWARD
) {
420 if ((tr
= wcsdup(t
)) == NULL
) {
425 fp
= tr
+ wcslen(tr
) - 1;
431 t
= (const wchar_t *)tr
;
434 if (direc
& DIRECTIVE_POSITION
) {
435 while (*t
|| state
) {
436 _collate_lookup(lcc
, t
, &len
, &pri
, pass
,
445 pri
= COLLATE_MAX_PRIORITY
;
455 while (*t
|| state
) {
456 _collate_lookup(lcc
, t
, &len
, &pri
, pass
,
483 return ((size_t)(-1));
487 * In the non-POSIX case, we transform each character into a string of
488 * characters representing the character's priority. Since char is usually
489 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add
490 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6
493 * It turns out that we sometimes have real priorities that are
494 * 31-bits wide. (But: be careful using priorities where the high
495 * order bit is set -- i.e. the priority is negative. The sort order
496 * may be surprising!)
498 * TODO: This would be a good area to optimize somewhat. It turns out
499 * that real prioririties *except for the last UNDEFINED pass* are generally
500 * very small. We need the localedef code to precalculate the max
501 * priority for us, and ideally also give us a mask, and then we could
502 * severely limit what we expand to.
505 #define XFRM_OFFSET ('0') /* make all printable characters */
507 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1)
508 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */
511 xfrm(locale_t loc
, unsigned char *p
, int pri
, int pass
)
513 /* we use unsigned to ensure zero fill on right shift */
514 uint32_t val
= (uint32_t)loc
->collate
->lc_pri_count
[pass
];
518 *p
= (pri
& XFRM_MASK
) + XFRM_OFFSET
;
528 _collate_sxfrm(const wchar_t *src
, char *xf
, size_t room
, locale_t loc
)
536 const int32_t *state
;
540 uint8_t buf
[XFRM_BYTES
];
541 const struct lc_collate
*lcc
= loc
->collate
;
542 int ndir
= lcc
->lc_directive_count
;
546 for (pass
= 0; pass
<= ndir
; pass
++) {
551 /* insert level separator from the previous pass */
559 /* special pass for undefined */
561 direc
= DIRECTIVE_FORWARD
| DIRECTIVE_UNDEFINED
;
563 direc
= lcc
->lc_directive
[pass
];
568 if (direc
& DIRECTIVE_BACKWARD
) {
571 if ((tr
= wcsdup(t
)) == NULL
) {
576 fp
= tr
+ wcslen(tr
) - 1;
582 t
= (const wchar_t *)tr
;
585 if (direc
& DIRECTIVE_POSITION
) {
586 while (*t
|| state
) {
588 _collate_lookup(lcc
, t
, &len
, &pri
, pass
,
597 pri
= COLLATE_MAX_PRIORITY
;
600 b
= xfrm(loc
, buf
, pri
, pass
);
614 while (*t
|| state
) {
615 _collate_lookup(lcc
, t
, &len
, &pri
, pass
,
627 b
= xfrm(loc
, buf
, pri
, pass
);
650 return ((size_t)(-1));