src/backend/tsearch/ts_locale.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * ts_locale.c
   4  *              locale compatibility layer for tsearch
   5  *
   6  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
   7  *
   8  *
   9  * IDENTIFICATION
  10  *        $PostgreSQL$
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14 #include "postgres.h"
  15
  16 #include "storage/fd.h"
  17 #include "tsearch/ts_locale.h"
  18 #include "tsearch/ts_public.h"
  19
   20 static void tsearch_readline_callback(void *arg);	/* error-context hook installed by tsearch_readline_begin(); defined below */
  21
  22
  23 #ifdef USE_WIDE_UPPER_LOWER
  24
  25 int
  26 t_isdigit(const char *ptr)
  27 {
  28         int                     clen = pg_mblen(ptr);
  29         wchar_t         character[2];
  30
  31         if (clen == 1 || lc_ctype_is_c())
  32                 return isdigit(TOUCHAR(ptr));
  33
  34         char2wchar(character, 2, ptr, clen);
  35
  36         return iswdigit((wint_t) character[0]);
  37 }
  38
  39 int
  40 t_isspace(const char *ptr)
  41 {
  42         int                     clen = pg_mblen(ptr);
  43         wchar_t         character[2];
  44
  45         if (clen == 1 || lc_ctype_is_c())
  46                 return isspace(TOUCHAR(ptr));
  47
  48         char2wchar(character, 2, ptr, clen);
  49
  50         return iswspace((wint_t) character[0]);
  51 }
  52
  53 int
  54 t_isalpha(const char *ptr)
  55 {
  56         int                     clen = pg_mblen(ptr);
  57         wchar_t         character[2];
  58
  59         if (clen == 1 || lc_ctype_is_c())
  60                 return isalpha(TOUCHAR(ptr));
  61
  62         char2wchar(character, 2, ptr, clen);
  63
  64         return iswalpha((wint_t) character[0]);
  65 }
  66
  67 int
  68 t_isprint(const char *ptr)
  69 {
  70         int                     clen = pg_mblen(ptr);
  71         wchar_t         character[2];
  72
  73         if (clen == 1 || lc_ctype_is_c())
  74                 return isprint(TOUCHAR(ptr));
  75
  76         char2wchar(character, 2, ptr, clen);
  77
  78         return iswprint((wint_t) character[0]);
  79 }
  80 #endif   /* USE_WIDE_UPPER_LOWER */
  81
  82
  83 /*
  84  * Set up to read a file using tsearch_readline().  This facility is
  85  * better than just reading the file directly because it provides error
  86  * context pointing to the specific line where a problem is detected.
  87  *
  88  * Expected usage is:
  89  *
  90  *              tsearch_readline_state trst;
  91  *
  92  *              if (!tsearch_readline_begin(&trst, filename))
  93  *                      ereport(ERROR,
  94  *                                      (errcode(ERRCODE_CONFIG_FILE_ERROR),
  95  *                                       errmsg("could not open stop-word file \"%s\": %m",
  96  *                                                      filename)));
  97  *              while ((line = tsearch_readline(&trst)) != NULL)
  98  *                      process line;
  99  *              tsearch_readline_end(&trst);
 100  *
 101  * Note that the caller supplies the ereport() for file open failure;
 102  * this is so that a custom message can be provided.  The filename string
 103  * passed to tsearch_readline_begin() must remain valid through
 104  * tsearch_readline_end().
 105  */
 106 bool
 107 tsearch_readline_begin(tsearch_readline_state *stp,
 108                                            const char *filename)
 109 {
 110         if ((stp->fp = AllocateFile(filename, "r")) == NULL)
 111                 return false;
 112         stp->filename = filename;
 113         stp->lineno = 0;
 114         stp->curline = NULL;
 115         /* Setup error traceback support for ereport() */
 116         stp->cb.callback = tsearch_readline_callback;
 117         stp->cb.arg = (void *) stp;
 118         stp->cb.previous = error_context_stack;
 119         error_context_stack = &stp->cb;
 120         return true;
 121 }
 122
 123 /*
 124  * Read the next line from a tsearch data file (expected to be in UTF-8), and
 125  * convert it to database encoding if needed. The returned string is palloc'd.
 126  * NULL return means EOF.
 127  */
 128 char *
 129 tsearch_readline(tsearch_readline_state *stp)
 130 {
 131         char       *result;
 132
 133         stp->lineno++;
 134         stp->curline = NULL;
 135         result = t_readline(stp->fp);
 136         stp->curline = result;
 137         return result;
 138 }
 139
 140 /*
 141  * Close down after reading a file with tsearch_readline()
 142  */
 143 void
 144 tsearch_readline_end(tsearch_readline_state *stp)
 145 {
 146         FreeFile(stp->fp);
 147         /* Pop the error context stack */
 148         error_context_stack = stp->cb.previous;
 149 }
 150
 151 /*
 152  * Error context callback for errors occurring while reading a tsearch
 153  * configuration file.
 154  */
 155 static void
 156 tsearch_readline_callback(void *arg)
 157 {
 158         tsearch_readline_state *stp = (tsearch_readline_state *) arg;
 159
 160         /*
 161          * We can't include the text of the config line for errors that occur
 162          * during t_readline() itself.  This is only partly a consequence of
 163          * our arms-length use of that routine: the major cause of such
 164          * errors is encoding violations, and we daren't try to print error
 165          * messages containing badly-encoded data.
 166          */
 167         if (stp->curline)
 168                 errcontext("line %d of configuration file \"%s\": \"%s\"",
 169                                    stp->lineno,
 170                                    stp->filename,
 171                                    stp->curline);
 172         else
 173                 errcontext("line %d of configuration file \"%s\"",
 174                                    stp->lineno,
 175                                    stp->filename);
 176 }
 177
 178
 179 /*
 180  * Read the next line from a tsearch data file (expected to be in UTF-8), and
 181  * convert it to database encoding if needed. The returned string is palloc'd.
 182  * NULL return means EOF.
 183  *
 184  * Note: direct use of this function is now deprecated.  Go through
 185  * tsearch_readline() to provide better error reporting.
 186  */
 187 char *
 188 t_readline(FILE *fp)
 189 {
 190         int                     len;
 191         char       *recoded;
 192         char            buf[4096];              /* lines must not be longer than this */
 193
 194         if (fgets(buf, sizeof(buf), fp) == NULL)
 195                 return NULL;
 196
 197         len = strlen(buf);
 198
 199         /* Make sure the input is valid UTF-8 */
 200         (void) pg_verify_mbstr(PG_UTF8, buf, len, false);
 201
 202         /* And convert */
 203         recoded = (char *) pg_do_encoding_conversion((unsigned char *) buf,
 204                                                                                                  len,
 205                                                                                                  PG_UTF8,
 206                                                                                                  GetDatabaseEncoding());
 207         if (recoded == buf)
 208         {
 209                 /*
 210                  * conversion didn't pstrdup, so we must. We can use the length of the
 211                  * original string, because no conversion was done.
 212                  */
 213                 recoded = pnstrdup(recoded, len);
 214         }
 215
 216         return recoded;
 217 }
 218
 219 /*
 220  * lowerstr --- fold null-terminated string to lower case
 221  *
 222  * Returned string is palloc'd
 223  */
 224 char *
 225 lowerstr(const char *str)
 226 {
 227         return lowerstr_with_len(str, strlen(str));
 228 }
 229
 230 /*
 231  * lowerstr_with_len --- fold string to lower case
 232  *
 233  * Input string need not be null-terminated.
 234  *
 235  * Returned string is palloc'd
 236  */
 237 char *
 238 lowerstr_with_len(const char *str, int len)
 239 {
 240         char       *out;
 241
 242         if (len == 0)
 243                 return pstrdup("");
 244
 245 #ifdef USE_WIDE_UPPER_LOWER
 246
 247         /*
 248          * Use wide char code only when max encoding length > 1 and ctype != C.
 249          * Some operating systems fail with multi-byte encodings and a C locale.
 250          * Also, for a C locale there is no need to process as multibyte. From
 251          * backend/utils/adt/oracle_compat.c Teodor
 252          */
 253         if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
 254         {
 255                 wchar_t    *wstr,
 256                                    *wptr;
 257                 int                     wlen;
 258
 259                 /*
 260                  * alloc number of wchar_t for worst case, len contains number of
 261                  * bytes >= number of characters and alloc 1 wchar_t for 0, because
 262                  * wchar2char wants zero-terminated string
 263                  */
 264                 wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
 265
 266                 wlen = char2wchar(wstr, len + 1, str, len);
 267                 Assert(wlen <= len);
 268
 269                 while (*wptr)
 270                 {
 271                         *wptr = towlower((wint_t) *wptr);
 272                         wptr++;
 273                 }
 274
 275                 /*
 276                  * Alloc result string for worst case + '\0'
 277                  */
 278                 len = pg_database_encoding_max_length() * wlen + 1;
 279                 out = (char *) palloc(len);
 280
 281                 wlen = wchar2char(out, wstr, len);
 282
 283                 pfree(wstr);
 284
 285                 if (wlen < 0)
 286                         ereport(ERROR,
 287                                         (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 288                                          errmsg("conversion from wchar_t to server encoding failed: %m")));
 289                 Assert(wlen < len);
 290         }
 291         else
 292 #endif   /* USE_WIDE_UPPER_LOWER */
 293         {
 294                 const char *ptr = str;
 295                 char       *outptr;
 296
 297                 outptr = out = (char *) palloc(sizeof(char) * (len + 1));
 298                 while ((ptr - str) < len && *ptr)
 299                 {
 300                         *outptr++ = tolower(TOUCHAR(ptr));
 301                         ptr++;
 302                 }
 303                 *outptr = '\0';
 304         }
 305
 306         return out;
 307 }