usr/src/lib/libslp/clib/slp_utf8.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License, Version 1.0 only
   6  * (the "License").  You may not use this file except in compliance
   7  * with the License.
   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 1999 by Sun Microsystems, Inc.
  24  * All rights reserved.
  25  */
  26
  27 #pragma ident   "%Z%%M% %I%     %E% SMI"
  28
  29 /*
  30  * UTF-8 encoded Unicode parsing routines. For efficiency, we convert
  31  * to wide chars only when absolutely needed. The following interfaces
  32  * are exported to libslp:
  33  *
  34  * slp_utf_strchr:      same semantics as strchr, but handles UTF-8 strings
   35  * slp_fold_space:      folds white space around and in between words;
  36  *                              handles UTF-8 strings
  37  * slp_strcasecmp:      same semantics as strcasecmp, but also folds white
  38  *                              space and attempts locale-specific
  39  *                              case-insensitive comparisons.
  40  */
  41
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <widec.h>
#include <stdlib.h>
#include <syslog.h>
#include <slp-internal.h>
  48
  49 /*
  50  * Same semantics as strchr.
  51  * Assumes that we start on a char boundry, and that c is a 7-bit
  52  * ASCII char.
  53  */
  54 char *slp_utf_strchr(const char *s, char c) {
  55         int len;
  56         char *p;
  57
  58         for (p = (char *)s; *p; p += len) {
  59                 len = mblen(p, MB_CUR_MAX);
  60                 if (len == 1 && *p == c)
  61                         return (p);
  62         }
  63         return (NULL);
  64 }
  65
  66 /*
  67  * folds white space around and in between words.
  68  * " aa    bb   " becomes "aa bb".
  69  * returns NULL if it couldn't allocate memory. The caller must free
  70  * the result when done.
  71  */
  72 static char *slp_fold_space(const char *s) {
  73         int len;
  74         char *folded, *f;
  75
  76         if (!(folded = malloc(strlen(s) + 1))) {
  77                 slp_err(LOG_CRIT, 0, "slp_fold_space", "out of memory");
  78                 return (NULL);
  79         }
  80
  81         f = folded;
  82         for (;;) {
  83                 /* step 1: skip white space */
  84                 for (; *s; s++) {
  85                         len = mblen(s, MB_CUR_MAX);
  86                         if (len != 1)
  87                                 break;
  88                         if (!isspace(*s))
  89                                 break;
  90                 }
  91
  92                 if (!*s) {
  93                         /* end of string */
  94                         *f = 0;
  95                         return (folded);
  96                 }
  97                 /* if we are in between words, keep one space */
  98                 if (f != folded)
  99                         *f++ = ' ';
 100
 101                 /* step 2: copy into folded until we hit more white space */
 102                 while (*s) {
 103                         int i;
 104                         len = mblen(s, MB_CUR_MAX);
 105                         if (len == 1 && isspace(*s))
 106                                 break;
 107
 108                         for (i = 0; i < len; i++)
 109                                 *f++ = *s++;
 110                 }
 111                 *f = *s;
 112                 if (!*s++)
 113                         return (folded);
 114         }
 115 }
 116
 117 /*
 118  * performs like strcasecmp, but also folds white space before comparing,
 119  * and will handle UTF-8 comparisons (including case). Note that the
 120  * application's locale must have been set to a UTF-8 locale for this
 121  * to work properly.
 122  */
 123 int slp_strcasecmp(const char *s1, const char *s2) {
 124         int diff = -1;
 125         char *p1, *p2;
 126         size_t wcslen1, wcslen2;
 127         wchar_t *wcs1, *wcs2;
 128
 129         p1 = p2 = NULL; wcs1 = wcs2 = NULL;
 130
 131         /* optimization: try simple case first */
 132         if (strcasecmp(s1, s2) == 0)
 133                 return (0);
 134
 135         /* fold white space, and try again */
 136         p1 = slp_fold_space(s1);
 137         p2 = slp_fold_space(s2);
 138         if (!p1 || !p2)
 139                 goto cleanup;
 140
 141         if ((diff = strcasecmp(p1, p2)) == 0)
 142                 goto cleanup;
 143
 144         /*
 145          * try converting to wide char -- we must be in a locale which
 146          * supports the UTF8 codeset for this to work.
 147          */
 148         if ((wcslen1 = mbstowcs(NULL, p1, 0)) == (size_t)-1)
 149                 goto cleanup;
 150
 151         if (!(wcs1 = malloc(sizeof (*wcs1) * (wcslen1 + 1)))) {
 152                 slp_err(LOG_CRIT, 0, "slp_strcasecmp", "out of memory");
 153                 goto cleanup;
 154         }
 155
 156         if ((wcslen2 = mbstowcs(NULL, p2, 0)) == (size_t)-1)
 157                 goto cleanup;
 158
 159         if (!(wcs2 = malloc(sizeof (*wcs2) * (wcslen2 + 1)))) {
 160                 slp_err(LOG_CRIT, 0, "slp_strcasecmp", "out of memory");
 161                 goto cleanup;
 162         }
 163         if (mbstowcs(wcs1, p1, wcslen1 + 1) == (size_t)-1)
 164                 goto cleanup;
 165         if (mbstowcs(wcs2, p2, wcslen2 + 1) == (size_t)-1)
 166                 goto cleanup;
 167
 168         diff = wscasecmp(wcs1, wcs2);
 169
 170 cleanup:
 171         if (p1) free(p1);
 172         if (p2) free(p2);
 173         if (wcs1) free(wcs1);
 174         if (wcs2) free(wcs2);
 175         return (diff);
 176 }