/*-------------------------------------------------------------------------
 *
 * ts_locale.c
 *		locale compatibility layer for tsearch
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 *
 * src/backend/tsearch/ts_locale.c
 *
 *-------------------------------------------------------------------------
 */
16 #include "common/string.h"
17 #include "storage/fd.h"
18 #include "tsearch/ts_locale.h"
/* Error context callback that tsearch_readline_begin() installs */
static void tsearch_readline_callback(void *arg);
/*
 * The reason these functions use a 3-wchar_t output buffer, not 2 as you
 * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
 * getting from char2wchar() is UTF16 not UTF32.  A single input character
 * may therefore produce a surrogate pair rather than just one wchar_t;
 * we also need room for a trailing null.  When we do get a surrogate pair,
 * we pass just the first code to iswalpha() etc, so that these functions will
 * always return false for characters outside the Basic Multilingual Plane.
 */
#define WC_BUF_LEN  3
35 t_isalpha(const char *ptr
)
37 int clen
= pg_mblen(ptr
);
38 wchar_t character
[WC_BUF_LEN
];
39 pg_locale_t mylocale
= 0; /* TODO */
41 if (clen
== 1 || database_ctype_is_c
)
42 return isalpha(TOUCHAR(ptr
));
44 char2wchar(character
, WC_BUF_LEN
, ptr
, clen
, mylocale
);
46 return iswalpha((wint_t) character
[0]);
50 t_isalnum(const char *ptr
)
52 int clen
= pg_mblen(ptr
);
53 wchar_t character
[WC_BUF_LEN
];
54 pg_locale_t mylocale
= 0; /* TODO */
56 if (clen
== 1 || database_ctype_is_c
)
57 return isalnum(TOUCHAR(ptr
));
59 char2wchar(character
, WC_BUF_LEN
, ptr
, clen
, mylocale
);
61 return iswalnum((wint_t) character
[0]);
66 * Set up to read a file using tsearch_readline(). This facility is
67 * better than just reading the file directly because it provides error
68 * context pointing to the specific line where a problem is detected.
72 * tsearch_readline_state trst;
74 * if (!tsearch_readline_begin(&trst, filename))
76 * (errcode(ERRCODE_CONFIG_FILE_ERROR),
77 * errmsg("could not open stop-word file \"%s\": %m",
79 * while ((line = tsearch_readline(&trst)) != NULL)
81 * tsearch_readline_end(&trst);
83 * Note that the caller supplies the ereport() for file open failure;
84 * this is so that a custom message can be provided. The filename string
85 * passed to tsearch_readline_begin() must remain valid through
86 * tsearch_readline_end().
89 tsearch_readline_begin(tsearch_readline_state
*stp
,
92 if ((stp
->fp
= AllocateFile(filename
, "r")) == NULL
)
94 stp
->filename
= filename
;
96 initStringInfo(&stp
->buf
);
98 /* Setup error traceback support for ereport() */
99 stp
->cb
.callback
= tsearch_readline_callback
;
101 stp
->cb
.previous
= error_context_stack
;
102 error_context_stack
= &stp
->cb
;
107 * Read the next line from a tsearch data file (expected to be in UTF-8), and
108 * convert it to database encoding if needed. The returned string is palloc'd.
109 * NULL return means EOF.
112 tsearch_readline(tsearch_readline_state
*stp
)
116 /* Advance line number to use in error reports */
119 /* Clear curline, it's no longer relevant */
122 if (stp
->curline
!= stp
->buf
.data
)
127 /* Collect next line, if there is one */
128 if (!pg_get_line_buf(stp
->fp
, &stp
->buf
))
131 /* Validate the input as UTF-8, then convert to DB encoding if needed */
132 recoded
= pg_any_to_server(stp
->buf
.data
, stp
->buf
.len
, PG_UTF8
);
134 /* Save the correctly-encoded string for possible error reports */
135 stp
->curline
= recoded
; /* might be equal to buf.data */
138 * We always return a freshly pstrdup'd string. This is clearly necessary
139 * if pg_any_to_server() returned buf.data, and we need a second copy even
140 * if encoding conversion did occur. The caller is entitled to pfree the
141 * returned string at any time, which would leave curline pointing to
142 * recycled storage, causing problems if an error occurs after that point.
143 * (It's preferable to return the result of pstrdup instead of the output
144 * of pg_any_to_server, because the conversion result tends to be
145 * over-allocated. Since callers might save the result string directly
146 * into a long-lived dictionary structure, we don't want it to be a larger
147 * palloc chunk than necessary. We'll reclaim the conversion result on
150 return pstrdup(recoded
);
154 * Close down after reading a file with tsearch_readline()
157 tsearch_readline_end(tsearch_readline_state
*stp
)
159 /* Suppress use of curline in any error reported below */
162 if (stp
->curline
!= stp
->buf
.data
)
167 /* Release other resources */
168 pfree(stp
->buf
.data
);
171 /* Pop the error context stack */
172 error_context_stack
= stp
->cb
.previous
;
176 * Error context callback for errors occurring while reading a tsearch
177 * configuration file.
180 tsearch_readline_callback(void *arg
)
182 tsearch_readline_state
*stp
= (tsearch_readline_state
*) arg
;
185 * We can't include the text of the config line for errors that occur
186 * during tsearch_readline() itself. The major cause of such errors is
187 * encoding violations, and we daren't try to print error messages
188 * containing badly-encoded data.
191 errcontext("line %d of configuration file \"%s\": \"%s\"",
196 errcontext("line %d of configuration file \"%s\"",