src/common/unicode_case.c

   1 /*-------------------------------------------------------------------------
   2  * unicode_case.c
   3  *              Unicode case mapping and case conversion.
   4  *
   5  * Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
   6  *
   7  * IDENTIFICATION
   8  *        src/common/unicode_case.c
   9  *
  10  *-------------------------------------------------------------------------
  11  */
  12 #ifndef FRONTEND
  13 #include "postgres.h"
  14 #else
  15 #include "postgres_fe.h"
  16 #endif
  17
  18 #include "common/unicode_case.h"
  19 #include "common/unicode_case_table.h"
  20 #include "mb/pg_wchar.h"
  21
  22 static const pg_case_map *find_case_map(pg_wchar ucs);
  23 static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
  24                                                    CaseKind str_casekind, WordBoundaryNext wbnext,
  25                                                    void *wbstate);
  26
  27 pg_wchar
  28 unicode_lowercase_simple(pg_wchar code)
  29 {
  30         const pg_case_map *map = find_case_map(code);
  31
  32         return map ? map->simplemap[CaseLower] : code;
  33 }
  34
  35 pg_wchar
  36 unicode_titlecase_simple(pg_wchar code)
  37 {
  38         const pg_case_map *map = find_case_map(code);
  39
  40         return map ? map->simplemap[CaseTitle] : code;
  41 }
  42
  43 pg_wchar
  44 unicode_uppercase_simple(pg_wchar code)
  45 {
  46         const pg_case_map *map = find_case_map(code);
  47
  48         return map ? map->simplemap[CaseUpper] : code;
  49 }
  50
  51 /*
  52  * unicode_strlower()
  53  *
  54  * Convert src to lowercase, and return the result length (not including
  55  * terminating NUL).
  56  *
  57  * String src must be encoded in UTF-8. If srclen < 0, src must be
  58  * NUL-terminated.
  59  *
  60  * Result string is stored in dst, truncating if larger than dstsize. If
  61  * dstsize is greater than the result length, dst will be NUL-terminated;
  62  * otherwise not.
  63  *
  64  * If dstsize is zero, dst may be NULL. This is useful for calculating the
  65  * required buffer size before allocating.
  66  */
  67 size_t
  68 unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
  69 {
  70         return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
  71 }
  72
  73 /*
  74  * unicode_strtitle()
  75  *
  76  * Convert src to titlecase, and return the result length (not including
  77  * terminating NUL).
  78  *
  79  * String src must be encoded in UTF-8. If srclen < 0, src must be
  80  * NUL-terminated.
  81  *
  82  * Result string is stored in dst, truncating if larger than dstsize. If
  83  * dstsize is greater than the result length, dst will be NUL-terminated;
  84  * otherwise not.
  85  *
  86  * If dstsize is zero, dst may be NULL. This is useful for calculating the
  87  * required buffer size before allocating.
  88  *
  89  * Titlecasing requires knowledge about word boundaries, which is provided by
  90  * the callback wbnext. A word boundary is the offset of the start of a word
  91  * or the offset of the character immediately following a word.
  92  *
  93  * The caller is expected to initialize and free the callback state
  94  * wbstate. The callback should first return offset 0 for the first boundary;
  95  * then the offset of each subsequent word boundary; then the total length of
  96  * the string to indicate the final boundary.
  97  */
  98 size_t
  99 unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 100                                  WordBoundaryNext wbnext, void *wbstate)
 101 {
 102         return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
 103                                                 wbstate);
 104 }
 105
 106 /*
 107  * unicode_strupper()
 108  *
 109  * Convert src to uppercase, and return the result length (not including
 110  * terminating NUL).
 111  *
 112  * String src must be encoded in UTF-8. If srclen < 0, src must be
 113  * NUL-terminated.
 114  *
 115  * Result string is stored in dst, truncating if larger than dstsize. If
 116  * dstsize is greater than the result length, dst will be NUL-terminated;
 117  * otherwise not.
 118  *
 119  * If dstsize is zero, dst may be NULL. This is useful for calculating the
 120  * required buffer size before allocating.
 121  */
 122 size_t
 123 unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 124 {
 125         return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
 126 }
 127
 128 /*
 129  * If str_casekind is CaseLower or CaseUpper, map each character in the string
 130  * for which a mapping is available.
 131  *
 132  * If str_casekind is CaseTitle, maps characters found on a word boundary to
 133  * uppercase and other characters to lowercase.
 134  */
 135 static size_t
 136 convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 137                          CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
 138 {
 139         /* character CaseKind varies while titlecasing */
 140         CaseKind        chr_casekind = str_casekind;
 141         size_t          srcoff = 0;
 142         size_t          result_len = 0;
 143         size_t          boundary = 0;
 144
 145         Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
 146                    (str_casekind != CaseTitle && !wbnext && !wbstate));
 147
 148         if (str_casekind == CaseTitle)
 149         {
 150                 boundary = wbnext(wbstate);
 151                 Assert(boundary == 0);  /* start of text is always a boundary */
 152         }
 153
 154         while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
 155         {
 156                 pg_wchar        u1 = utf8_to_unicode((unsigned char *) src + srcoff);
 157                 int                     u1len = unicode_utf8len(u1);
 158                 const pg_case_map *casemap = find_case_map(u1);
 159
 160                 if (str_casekind == CaseTitle)
 161                 {
 162                         if (srcoff == boundary)
 163                         {
 164                                 chr_casekind = CaseUpper;
 165                                 boundary = wbnext(wbstate);
 166                         }
 167                         else
 168                                 chr_casekind = CaseLower;
 169                 }
 170
 171                 /* perform mapping, update result_len, and write to dst */
 172                 if (casemap)
 173                 {
 174                         pg_wchar        u2 = casemap->simplemap[chr_casekind];
 175                         pg_wchar        u2len = unicode_utf8len(u2);
 176
 177                         if (result_len + u2len <= dstsize)
 178                                 unicode_to_utf8(u2, (unsigned char *) dst + result_len);
 179
 180                         result_len += u2len;
 181                 }
 182                 else
 183                 {
 184                         /* no mapping; copy bytes from src */
 185                         if (result_len + u1len <= dstsize)
 186                                 memcpy(dst + result_len, src + srcoff, u1len);
 187
 188                         result_len += u1len;
 189                 }
 190
 191                 srcoff += u1len;
 192         }
 193
 194         if (result_len < dstsize)
 195                 dst[result_len] = '\0';
 196
 197         return result_len;
 198 }
 199
 200 /* find entry in simple case map, if any */
 201 static const pg_case_map *
 202 find_case_map(pg_wchar ucs)
 203 {
 204         int                     min;
 205         int                     mid;
 206         int                     max;
 207
 208         /* all chars <= 0x80 are stored in array for fast lookup */
 209         Assert(lengthof(case_map) >= 0x80);
 210         if (ucs < 0x80)
 211         {
 212                 const pg_case_map *map = &case_map[ucs];
 213
 214                 Assert(map->codepoint == ucs);
 215                 return map;
 216         }
 217
 218         /* otherwise, binary search */
 219         min = 0x80;
 220         max = lengthof(case_map) - 1;
 221         while (max >= min)
 222         {
 223                 mid = (min + max) / 2;
 224                 if (ucs > case_map[mid].codepoint)
 225                         min = mid + 1;
 226                 else if (ucs < case_map[mid].codepoint)
 227                         max = mid - 1;
 228                 else
 229                         return &case_map[mid];
 230         }
 231
 232         return NULL;
 233 }