2 * Copyright 2014 Garrett D'Amore <garrett@damore.org>
3 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
4 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
5 * at Electronni Visti IA, Kiev, Ukraine.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
47 #include "localeimpl.h"
49 /* Check file format vs libc runtime. (See collatefile.h) */
50 #if COLL_WEIGHTS_MAX != COLLATE_WEIGHTS_MAX
51 #error "COLL_WEIGHTS_MAX != COLLATE_WEIGHTS_MAX"
55 * See the comments in usr/src/cmd/localedef/collate.c for further
56 * information. It would also be very helpful to have a copy of the
57 * POSIX standard for collation (in the locale format manual page)
58 * handy (www.opengroup.org).
62 * POSIX uses empty tables and falls down to strcmp.
64 struct lc_collate lc_collate_posix
= {
68 struct locdata __posix_collate_locdata
= {
70 .l_data
= { &lc_collate_posix
}
75 __lc_collate_load(const char *locname
)
84 struct locdata
*ldata
;
85 struct lc_collate
*lcc
;
88 * Slurp the locale file into the cache.
91 (void) snprintf(buf
, sizeof (buf
), "%s/%s/LC_COLLATE/LCL_DATA",
92 _PathLocale
, locname
);
94 if ((fd
= open(buf
, O_RDONLY
)) < 0) {
98 if (fstat(fd
, &sbuf
) < 0) {
103 if (sbuf
.st_size
< (COLLATE_STR_LEN
+ sizeof (info
))) {
108 map
= mmap(NULL
, sbuf
.st_size
, PROT_READ
, MAP_PRIVATE
, fd
, 0);
110 if ((TMP
= map
) == NULL
) {
115 if (strncmp(TMP
, COLLATE_VERSION
, COLLATE_STR_LEN
) != 0) {
116 (void) munmap(map
, sbuf
.st_size
);
120 TMP
+= COLLATE_STR_LEN
;
123 TMP
+= sizeof (*info
);
125 if ((info
->directive_count
< 1) ||
126 (info
->directive_count
>= COLL_WEIGHTS_MAX
) ||
127 ((chains
= info
->chain_count
) < 0)) {
128 (void) munmap(map
, sbuf
.st_size
);
133 i
= (sizeof (collate_char_t
) * (UCHAR_MAX
+ 1)) +
134 (sizeof (collate_chain_t
) * chains
) +
135 (sizeof (collate_large_t
) * info
->large_count
);
136 for (z
= 0; z
< info
->directive_count
; z
++) {
137 i
+= sizeof (collate_subst_t
) * info
->subst_count
[z
];
139 if (i
!= (sbuf
.st_size
- (TMP
- map
))) {
140 (void) munmap(map
, sbuf
.st_size
);
146 if ((ldata
= __locdata_alloc(locname
, sizeof (*lcc
))) == NULL
) {
147 (void) munmap(map
, sbuf
.st_size
);
150 lcc
= ldata
->l_data
[0];
152 ldata
->l_map_len
= sbuf
.st_size
;
155 lcc
->lc_directive_count
= info
->directive_count
;
156 lcc
->lc_large_count
= info
->large_count
;
158 for (z
= 0; z
< COLL_WEIGHTS_MAX
; z
++) {
159 lcc
->lc_directive
[z
] = info
->directive
[z
];
160 lcc
->lc_subst_count
[z
] = info
->subst_count
[z
];
161 lcc
->lc_pri_count
[z
] = info
->pri_count
[z
];
162 lcc
->lc_undef_pri
[z
] = info
->undef_pri
[z
];
165 lcc
->lc_char_table
= (void *)TMP
;
166 TMP
+= sizeof (collate_char_t
) * (UCHAR_MAX
+ 1);
168 for (z
= 0; z
< lcc
->lc_directive_count
; z
++) {
170 if ((count
= lcc
->lc_subst_count
[z
]) > 0) {
171 lcc
->lc_subst_table
[z
] = (void *)TMP
;
172 TMP
+= count
* sizeof (collate_subst_t
);
174 lcc
->lc_subst_table
[z
] = NULL
;
179 lcc
->lc_chain_table
= (void *)TMP
;
180 TMP
+= chains
* sizeof (collate_chain_t
);
182 lcc
->lc_chain_table
= NULL
;
183 lcc
->lc_chain_count
= chains
;
184 if (lcc
->lc_large_count
> 0)
185 lcc
->lc_large_table
= (void *)TMP
;
187 lcc
->lc_large_table
= NULL
;
192 static const int32_t *
193 substsearch(const struct lc_collate
*lcc
, const wchar_t key
, int pass
)
195 const collate_subst_t
*p
;
196 int n
= lcc
->lc_subst_count
[pass
];
201 if (pass
>= lcc
->lc_directive_count
)
204 if (!(key
& COLLATE_SUBST_PRIORITY
))
207 p
= lcc
->lc_subst_table
[pass
] + (key
& ~COLLATE_SUBST_PRIORITY
);
208 assert(p
->key
== key
);
212 static collate_chain_t
*
213 chainsearch(const struct lc_collate
*lcc
, const wchar_t *key
, int *len
)
216 int high
= lcc
->lc_info
->chain_count
- 1;
219 collate_chain_t
*tab
= lcc
->lc_chain_table
;
224 while (low
<= high
) {
225 next
= (low
+ high
) / 2;
227 compar
= *key
- *p
->str
;
229 l
= wcsnlen(p
->str
, COLLATE_STR_LEN
);
230 compar
= wcsncmp(key
, p
->str
, l
);
244 static collate_large_t
*
245 largesearch(const struct lc_collate
*lcc
, const wchar_t key
)
248 int high
= lcc
->lc_info
->large_count
- 1;
251 collate_large_t
*tab
= lcc
->lc_large_table
;
256 while (low
<= high
) {
257 next
= (low
+ high
) / 2;
259 compar
= key
- p
->val
;
271 _collate_lookup(const struct lc_collate
*lcc
, const wchar_t *t
,
272 int *len
, int *pri
, int which
, const int **state
)
275 collate_large_t
*match
;
280 * If this is the "last" pass for the UNDEFINED, then
281 * we just return the priority itself.
283 if (which
>= lcc
->lc_directive_count
) {
291 * If we have remaining substitution data from a previous
292 * call, consume it first.
294 if ((sptr
= *state
) != NULL
) {
297 if ((sptr
== *state
) || (sptr
== NULL
))
305 /* No active substitutions */
309 * Check for composites such as dipthongs that collate as a
310 * single element (aka chains or collating-elements).
312 if (((p2
= chainsearch(lcc
, t
, &l
)) != NULL
) &&
313 ((p
= p2
->pri
[which
]) >= 0)) {
318 } else if (*t
<= UCHAR_MAX
) {
321 * Character is a small (8-bit) character.
322 * We just look these up directly for speed.
324 *pri
= lcc
->lc_char_table
[*t
].pri
[which
];
326 } else if ((lcc
->lc_info
->large_count
> 0) &&
327 ((match
= largesearch(lcc
, *t
)) != NULL
)) {
330 * Character was found in the extended table.
332 *pri
= match
->pri
.pri
[which
];
336 * Character lacks a specific definition.
338 if (lcc
->lc_directive
[which
] & DIRECTIVE_UNDEFINED
) {
339 /* Mask off sign bit to prevent ordering confusion. */
340 *pri
= (*t
& COLLATE_MAX_PRIORITY
);
342 *pri
= lcc
->lc_undef_pri
[which
];
344 /* No substitutions for undefined characters! */
349 * Try substituting (expanding) the character. We are
350 * currently doing this *after* the chain compression. I
351 * think it should not matter, but this way might be slightly
354 * We do this after the priority search, as this will help us
355 * to identify a single key value. In order for this to work,
356 * its important that the priority assigned to a given element
357 * to be substituted be unique for that level. The localedef
358 * code ensures this for us.
360 if ((sptr
= substsearch(lcc
, *pri
, which
)) != NULL
) {
361 if ((*pri
= *sptr
) > 0) {
363 *state
= *sptr
? sptr
: NULL
;
370 * This is the meaty part of wcsxfrm & strxfrm. Note that it does
371 * NOT NULL terminate. That is left to the caller.
374 _collate_wxfrm(const struct lc_collate
*lcc
, const wchar_t *src
, wchar_t *xf
,
383 const int32_t *state
;
386 int ndir
= lcc
->lc_directive_count
;
390 for (pass
= 0; pass
<= ndir
; pass
++) {
395 /* insert level separator from the previous pass */
403 /* special pass for undefined */
405 direc
= DIRECTIVE_FORWARD
| DIRECTIVE_UNDEFINED
;
407 direc
= lcc
->lc_directive
[pass
];
412 if (direc
& DIRECTIVE_BACKWARD
) {
416 if ((tr
= wcsdup(t
)) == NULL
) {
421 fp
= tr
+ wcslen(tr
) - 1;
427 t
= (const wchar_t *)tr
;
430 if (direc
& DIRECTIVE_POSITION
) {
431 while (*t
|| state
) {
432 _collate_lookup(lcc
, t
, &len
, &pri
, pass
,
441 pri
= COLLATE_MAX_PRIORITY
;
451 while (*t
|| state
) {
452 _collate_lookup(lcc
, t
, &len
, &pri
, pass
,
481 return ((size_t)(-1));
485 * In the non-POSIX case, we transform each character into a string of
486 * characters representing the character's priority. Since char is usually
487 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add
488 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6
491 * It turns out that we sometimes have real priorities that are
492 * 31-bits wide. (But: be careful using priorities where the high
493 * order bit is set -- i.e. the priority is negative. The sort order
494 * may be surprising!)
496 * TODO: This would be a good area to optimize somewhat. It turns out
497 * that real prioririties *except for the last UNDEFINED pass* are generally
498 * very small. We need the localedef code to precalculate the max
499 * priority for us, and ideally also give us a mask, and then we could
500 * severely limit what we expand to.
503 #define XFRM_OFFSET ('0') /* make all printable characters */
505 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1)
506 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */
509 xfrm(locale_t loc
, unsigned char *p
, int pri
, int pass
)
511 /* we use unsigned to ensure zero fill on right shift */
512 uint32_t val
= (uint32_t)loc
->collate
->lc_pri_count
[pass
];
516 *p
= (pri
& XFRM_MASK
) + XFRM_OFFSET
;
526 _collate_sxfrm(const wchar_t *src
, char *xf
, size_t room
, locale_t loc
)
534 const int32_t *state
;
538 uint8_t buf
[XFRM_BYTES
];
539 const struct lc_collate
*lcc
= loc
->collate
;
540 int ndir
= lcc
->lc_directive_count
;
544 for (pass
= 0; pass
<= ndir
; pass
++) {
549 /* insert level separator from the previous pass */
557 /* special pass for undefined */
559 direc
= DIRECTIVE_FORWARD
| DIRECTIVE_UNDEFINED
;
561 direc
= lcc
->lc_directive
[pass
];
566 if (direc
& DIRECTIVE_BACKWARD
) {
570 if ((tr
= wcsdup(t
)) == NULL
) {
575 fp
= tr
+ wcslen(tr
) - 1;
581 t
= (const wchar_t *)tr
;
584 if (direc
& DIRECTIVE_POSITION
) {
585 while (*t
|| state
) {
587 _collate_lookup(lcc
, t
, &len
, &pri
, pass
,
596 pri
= COLLATE_MAX_PRIORITY
;
599 b
= xfrm(loc
, buf
, pri
, pass
);
613 while (*t
|| state
) {
614 _collate_lookup(lcc
, t
, &len
, &pri
, pass
,
626 b
= xfrm(loc
, buf
, pri
, pass
);
651 return ((size_t)(-1));