1 /*-------------------------------------------------------------------------
3 * Multibyte character printing support for frontend code
6 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
9 * src/fe_utils/mbprint.c
11 *-------------------------------------------------------------------------
13 #include "postgres_fe.h"
15 #include "fe_utils/mbprint.h"
21 * To avoid version-skew problems, this file must not use declarations
22 * from pg_wchar.h: the encoding IDs we are dealing with are determined
23 * by the libpq.so we are linked with, and that might not match the
24 * numbers we see at compile time. (If this file were inside libpq,
25 * the problem would go away...)
27 * Hence, we have our own definition of pg_wchar, and we get the values
28 * of any needed encoding IDs on-the-fly.
31 typedef unsigned int pg_wchar
;
36 static int utf8_id
= -1;
39 utf8_id
= pg_char_to_encoding("utf8");
43 #define PG_UTF8 pg_get_utf8_id()
47 * Convert a UTF-8 character to a Unicode code point.
48 * This is a one-character version of pg_utf2wchar_with_len.
50 * No error checks here, c must point to a long-enough string.
53 utf8_to_unicode(const unsigned char *c
)
56 return (pg_wchar
) c
[0];
57 else if ((*c
& 0xe0) == 0xc0)
58 return (pg_wchar
) (((c
[0] & 0x1f) << 6) |
60 else if ((*c
& 0xf0) == 0xe0)
61 return (pg_wchar
) (((c
[0] & 0x0f) << 12) |
62 ((c
[1] & 0x3f) << 6) |
64 else if ((*c
& 0xf8) == 0xf0)
65 return (pg_wchar
) (((c
[0] & 0x07) << 18) |
66 ((c
[1] & 0x3f) << 12) |
67 ((c
[2] & 0x3f) << 6) |
70 /* that is an invalid code on purpose */
76 * Unicode 3.1 compliant validation: for each category, it checks the
77 * combination of each byte to make sure it maps to a valid range. It also
78 * returns -1 for the following UCS values: ucs > 0x10ffff; (ucs & 0xfffe) ==
79 * 0xfffe; 0xfdd0 < ucs < 0xfdef; (ucs & 0xdb00) == 0xd800 (surrogates).
82 utf_charcheck(const unsigned char *c
)
86 else if ((*c
& 0xe0) == 0xc0)
89 if (((c
[1] & 0xc0) == 0x80) && ((c
[0] & 0x1f) > 0x01))
93 else if ((*c
& 0xf0) == 0xe0)
96 if (((c
[1] & 0xc0) == 0x80) &&
97 (((c
[0] & 0x0f) != 0x00) || ((c
[1] & 0x20) == 0x20)) &&
98 ((c
[2] & 0xc0) == 0x80))
101 int yx
= ((c
[1] & 0x3f) << 6) | (c
[0] & 0x3f);
104 /* check 0xfffe/0xffff, 0xfdd0..0xfdef range, surrogates */
106 (((yx
& 0xffe) == 0xffe) ||
107 (((yx
& 0xf80) == 0xd80) && (lx
>= 0x30) && (lx
<= 0x4f)))) ||
108 ((z
== 0x0d) && ((yx
& 0xb00) == 0x800)))
114 else if ((*c
& 0xf8) == 0xf0)
116 int u
= ((c
[0] & 0x07) << 2) | ((c
[1] & 0x30) >> 4);
119 if (((c
[1] & 0xc0) == 0x80) &&
120 (u
> 0x00) && (u
<= 0x10) &&
121 ((c
[2] & 0xc0) == 0x80) && ((c
[3] & 0xc0) == 0x80))
123 /* test for 0xzzzzfffe/0xzzzzffff */
124 if (((c
[1] & 0x0f) == 0x0f) && ((c
[2] & 0x3f) == 0x3f) &&
125 ((c
[3] & 0x3e) == 0x3e))
136 mb_utf_validate(unsigned char *pwcs
)
138 unsigned char *p
= pwcs
;
144 if ((len
= utf_charcheck(pwcs
)) > 0)
150 for (i
= 0; i
< len
; i
++)
160 /* we skip the char */
168 * public functions : wcswidth and mbvalidate
172 * pg_wcswidth is the dumb display-width function.
173 * It assumes that everything will appear on one line.
174 * OTOH it is easier to use than pg_wcssize if this applies to you.
177 pg_wcswidth(const char *pwcs
, size_t len
, int encoding
)
186 chlen
= PQmblen(pwcs
, encoding
);
187 if (len
< (size_t) chlen
)
188 break; /* Invalid string */
190 chwidth
= PQdsplen(pwcs
, encoding
);
201 * pg_wcssize takes the given string in the given encoding and returns three values:
203 * result_width: Width in display characters of the longest line in string
204 * result_height: Number of lines in display output
205 * result_format_size: Number of bytes required to store formatted
206 * representation of string
208 * This MUST be kept in sync with pg_wcsformat!
211 pg_wcssize(const unsigned char *pwcs
, size_t len
, int encoding
,
212 int *result_width
, int *result_height
, int *result_format_size
)
221 for (; *pwcs
&& len
> 0; pwcs
+= chlen
)
223 chlen
= PQmblen((const char *) pwcs
, encoding
);
224 if (len
< (size_t) chlen
)
226 w
= PQdsplen((const char *) pwcs
, encoding
);
228 if (chlen
== 1) /* single-byte char */
230 if (*pwcs
== '\n') /* Newline */
232 if (linewidth
> width
)
236 format_size
+= 1; /* For NUL char */
238 else if (*pwcs
== '\r') /* Carriage return */
243 else if (*pwcs
== '\t') /* Tab */
249 } while (linewidth
% 8 != 0);
251 else if (w
< 0) /* Other control char */
256 else /* Output it as-is */
262 else if (w
< 0) /* Non-ascii control char */
264 linewidth
+= 6; /* \u0000 */
267 else /* All other chars */
270 format_size
+= chlen
;
274 if (linewidth
> width
)
276 format_size
+= 1; /* For NUL char */
280 *result_width
= width
;
282 *result_height
= height
;
283 if (result_format_size
)
284 *result_format_size
= format_size
;
288 * Format a string into one or more "struct lineptr" lines.
289 * lines[i].ptr == NULL indicates the end of the array.
291 * This MUST be kept in sync with pg_wcssize!
294 pg_wcsformat(const unsigned char *pwcs
, size_t len
, int encoding
,
295 struct lineptr
*lines
, int count
)
300 unsigned char *ptr
= lines
->ptr
; /* Pointer to data area */
302 for (; *pwcs
&& len
> 0; pwcs
+= chlen
)
304 chlen
= PQmblen((const char *) pwcs
, encoding
);
305 if (len
< (size_t) chlen
)
307 w
= PQdsplen((const char *) pwcs
, encoding
);
309 if (chlen
== 1) /* single-byte char */
311 if (*pwcs
== '\n') /* Newline */
314 lines
->width
= linewidth
;
319 exit(1); /* Screwup */
321 /* make next line point to remaining memory */
324 else if (*pwcs
== '\r') /* Carriage return */
326 strcpy((char *) ptr
, "\\r");
330 else if (*pwcs
== '\t') /* Tab */
336 } while (linewidth
% 8 != 0);
338 else if (w
< 0) /* Other control char */
340 sprintf((char *) ptr
, "\\x%02X", *pwcs
);
344 else /* Output it as-is */
350 else if (w
< 0) /* Non-ascii control char */
352 if (encoding
== PG_UTF8
)
353 sprintf((char *) ptr
, "\\u%04X", utf8_to_unicode(pwcs
));
357 * This case cannot happen in the current code because only
358 * UTF-8 signals multibyte control characters. But we may need
359 * to support it at some stage
361 sprintf((char *) ptr
, "\\u????");
366 else /* All other chars */
370 for (i
= 0; i
< chlen
; i
++)
376 lines
->width
= linewidth
;
377 *ptr
++ = '\0'; /* Terminate formatted string */
380 exit(1); /* Screwup */
382 (lines
+ 1)->ptr
= NULL
; /* terminate line array */
387 * Encoding validation: delete any unvalidatable characters from the string
389 * This seems redundant with existing functionality elsewhere?
392 mbvalidate(unsigned char *pwcs
, int encoding
)
394 if (encoding
== PG_UTF8
)
395 mb_utf_validate(pwcs
);
399 * other encodings needing validation should add their own routines