Add support for user-defined I/O conversion casts.
[PostgreSQL.git] / src / backend / utils / mb / conv.c
blob5de21d7bc78f2e8c863441bbb8cac67e49282ee1
/*-------------------------------------------------------------------------
 *
 *	  Utility functions for conversion procs.
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  $PostgreSQL$
 *
 *-------------------------------------------------------------------------
 */
13 #include "postgres.h"
14 #include "mb/pg_wchar.h"
/*
 * LATINn ---> MIC when the charset's local codes map directly to MIC
 *
 * l points to the source string of length len
 * p is the output area (must be large enough!)
 * lc is the mule character set id for the local encoding
 * encoding is the PG identifier for the local encoding
 *
 * ASCII bytes are copied as-is; every high-bit-set byte is emitted as the
 * two-byte sequence (lc, byte).  A zero byte in the input is reported via
 * report_invalid_encoding().  The output is NUL-terminated.
 */
void
latin2mic(const unsigned char *l, unsigned char *p, int len,
		  int lc, int encoding)
{
	for (; len > 0; l++, len--)
	{
		int			ch = *l;

		/* embedded zero bytes are not acceptable input */
		if (ch == 0)
			report_invalid_encoding(encoding, (const char *) l, len);

		/* non-ASCII bytes are prefixed with the charset id */
		if (IS_HIGHBIT_SET(ch))
			*p++ = lc;
		*p++ = ch;
	}

	*p = '\0';
}
46 * MIC ---> LATINn when the charset's local codes map directly to MIC
48 * mic points to the source string of length len
49 * p is the output area (must be large enough!)
50 * lc is the mule character set id for the local encoding
51 * encoding is the PG identifier for the local encoding
53 void
54 mic2latin(const unsigned char *mic, unsigned char *p, int len,
55 int lc, int encoding)
57 int c1;
59 while (len > 0)
61 c1 = *mic;
62 if (c1 == 0)
63 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
64 if (!IS_HIGHBIT_SET(c1))
66 /* easy for ASCII */
67 *p++ = c1;
68 mic++;
69 len--;
71 else
73 int l = pg_mic_mblen(mic);
75 if (len < l)
76 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
77 len);
78 if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
79 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
80 (const char *) mic, len);
81 *p++ = mic[1];
82 mic += 2;
83 len -= 2;
86 *p = '\0';
91 * ASCII ---> MIC
93 * While ordinarily SQL_ASCII encoding is forgiving of high-bit-set
94 * characters, here we must take a hard line because we don't know
95 * the appropriate MIC equivalent.
97 void
98 pg_ascii2mic(const unsigned char *l, unsigned char *p, int len)
100 int c1;
102 while (len > 0)
104 c1 = *l;
105 if (c1 == 0 || IS_HIGHBIT_SET(c1))
106 report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len);
107 *p++ = c1;
108 l++;
109 len--;
111 *p = '\0';
115 * MIC ---> ASCII
117 void
118 pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len)
120 int c1;
122 while (len > 0)
124 c1 = *mic;
125 if (c1 == 0 || IS_HIGHBIT_SET(c1))
126 report_untranslatable_char(PG_MULE_INTERNAL, PG_SQL_ASCII,
127 (const char *) mic, len);
128 *p++ = c1;
129 mic++;
130 len--;
132 *p = '\0';
136 * latin2mic_with_table: a generic single byte charset encoding
137 * conversion from a local charset to the mule internal code.
139 * l points to the source string of length len
140 * p is the output area (must be large enough!)
141 * lc is the mule character set id for the local encoding
142 * encoding is the PG identifier for the local encoding
143 * tab holds conversion entries for the local charset
144 * starting from 128 (0x80). each entry in the table
145 * holds the corresponding code point for the mule internal code.
147 void
148 latin2mic_with_table(const unsigned char *l,
149 unsigned char *p,
150 int len,
151 int lc,
152 int encoding,
153 const unsigned char *tab)
155 unsigned char c1,
158 while (len > 0)
160 c1 = *l;
161 if (c1 == 0)
162 report_invalid_encoding(encoding, (const char *) l, len);
163 if (!IS_HIGHBIT_SET(c1))
164 *p++ = c1;
165 else
167 c2 = tab[c1 - HIGHBIT];
168 if (c2)
170 *p++ = lc;
171 *p++ = c2;
173 else
174 report_untranslatable_char(encoding, PG_MULE_INTERNAL,
175 (const char *) l, len);
177 l++;
178 len--;
180 *p = '\0';
184 * mic2latin_with_table: a generic single byte charset encoding
185 * conversion from the mule internal code to a local charset.
187 * mic points to the source string of length len
188 * p is the output area (must be large enough!)
189 * lc is the mule character set id for the local encoding
190 * encoding is the PG identifier for the local encoding
191 * tab holds conversion entries for the mule internal code's
192 * second byte, starting from 128 (0x80). each entry in the table
193 * holds the corresponding code point for the local charset.
195 void
196 mic2latin_with_table(const unsigned char *mic,
197 unsigned char *p,
198 int len,
199 int lc,
200 int encoding,
201 const unsigned char *tab)
203 unsigned char c1,
206 while (len > 0)
208 c1 = *mic;
209 if (c1 == 0)
210 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
211 if (!IS_HIGHBIT_SET(c1))
213 /* easy for ASCII */
214 *p++ = c1;
215 mic++;
216 len--;
218 else
220 int l = pg_mic_mblen(mic);
222 if (len < l)
223 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
224 len);
225 if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
226 (c2 = tab[mic[1] - HIGHBIT]) == 0)
228 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
229 (const char *) mic, len);
230 break; /* keep compiler quiet */
232 *p++ = c2;
233 mic += 2;
234 len -= 2;
237 *p = '\0';
241 * comparison routine for bsearch()
242 * this routine is intended for UTF8 -> local code
244 static int
245 compare1(const void *p1, const void *p2)
247 uint32 v1,
250 v1 = *(uint32 *) p1;
251 v2 = ((pg_utf_to_local *) p2)->utf;
252 return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
256 * comparison routine for bsearch()
257 * this routine is intended for local code -> UTF8
259 static int
260 compare2(const void *p1, const void *p2)
262 uint32 v1,
265 v1 = *(uint32 *) p1;
266 v2 = ((pg_local_to_utf *) p2)->code;
267 return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
271 * comparison routine for bsearch()
272 * this routine is intended for combined UTF8 -> local code
274 static int
275 compare3(const void *p1, const void *p2)
277 uint32 s1,
282 s1 = *(uint32 *) p1;
283 s2 = *((uint32 *) p1 + 1);
284 d1 = ((pg_utf_to_local_combined *) p2)->utf1;
285 d2 = ((pg_utf_to_local_combined *) p2)->utf2;
286 return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
290 * comparison routine for bsearch()
291 * this routine is intended for local code -> combined UTF8
293 static int
294 compare4(const void *p1, const void *p2)
296 uint32 v1,
299 v1 = *(uint32 *) p1;
300 v2 = ((pg_local_to_utf_combined *) p2)->code;
301 return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
305 * convert 32bit wide character to mutibye stream pointed to by iso
307 static unsigned char *
308 set_iso_code(unsigned char *iso, uint32 code)
310 if (code & 0xff000000)
311 *iso++ = code >> 24;
312 if (code & 0x00ff0000)
313 *iso++ = (code & 0x00ff0000) >> 16;
314 if (code & 0x0000ff00)
315 *iso++ = (code & 0x0000ff00) >> 8;
316 if (code & 0x000000ff)
317 *iso++ = code & 0x000000ff;
318 return iso;
322 * UTF8 ---> local code
324 * utf: input UTF8 string (need not be null-terminated).
325 * iso: pointer to the output area (must be large enough!)
326 * map: the conversion map.
327 * cmap: the conversion map for combined characters.
328 * (optional)
329 * size1: the size of the conversion map.
330 * size2: the size of the conversion map for combined characters
331 * (optional)
332 * encoding: the PG identifier for the local encoding.
333 * len: length of input string.
335 void
336 UtfToLocal(const unsigned char *utf, unsigned char *iso,
337 const pg_utf_to_local *map, const pg_utf_to_local_combined *cmap,
338 int size1, int size2, int encoding, int len)
340 uint32 iutf;
341 uint32 cutf[2];
342 uint32 code;
343 pg_utf_to_local *p;
344 pg_utf_to_local_combined *cp;
345 int l;
347 for (; len > 0; len -= l)
349 /* "break" cases all represent errors */
350 if (*utf == '\0')
351 break;
353 l = pg_utf_mblen(utf);
355 if (len < l)
356 break;
358 if (!pg_utf8_islegal(utf, l))
359 break;
361 if (l == 1)
363 /* ASCII case is easy */
364 *iso++ = *utf++;
365 continue;
367 else if (l == 2)
369 iutf = *utf++ << 8;
370 iutf |= *utf++;
372 else if (l == 3)
374 iutf = *utf++ << 16;
375 iutf |= *utf++ << 8;
376 iutf |= *utf++;
378 else if (l == 4)
380 iutf = *utf++ << 24;
381 iutf |= *utf++ << 16;
382 iutf |= *utf++ << 8;
383 iutf |= *utf++;
387 * first, try with combined map if possible
389 if (cmap && len > l)
391 const unsigned char *utf_save = utf;
392 int len_save = len;
393 int l_save = l;
395 len -= l;
397 l = pg_utf_mblen(utf);
398 if (len < l)
399 break;
401 if (!pg_utf8_islegal(utf, l))
402 break;
404 cutf[0] = iutf;
406 if (l == 1)
408 if (len_save > 1)
410 p = bsearch(&cutf[0], map, size1,
411 sizeof(pg_utf_to_local), compare1);
412 if (p == NULL)
413 report_untranslatable_char(PG_UTF8, encoding,
414 (const char *) (utf_save - l_save), len_save);
415 iso = set_iso_code(iso, p->code);
418 /* ASCII case is easy */
419 *iso++ = *utf++;
420 continue;
422 else if (l == 2)
424 iutf = *utf++ << 8;
425 iutf |= *utf++;
427 else if (l == 3)
429 iutf = *utf++ << 16;
430 iutf |= *utf++ << 8;
431 iutf |= *utf++;
433 else if (l == 4)
435 iutf = *utf++ << 24;
436 iutf |= *utf++ << 16;
437 iutf |= *utf++ << 8;
438 iutf |= *utf++;
441 cutf[1] = iutf;
442 cp = bsearch(cutf, cmap, size2,
443 sizeof(pg_utf_to_local_combined), compare3);
444 if (cp)
445 code = cp->code;
446 else
448 /* not found in combined map. try with ordinary map */
449 p = bsearch(&cutf[0], map, size1,
450 sizeof(pg_utf_to_local), compare1);
451 if (p == NULL)
452 report_untranslatable_char(PG_UTF8, encoding,
453 (const char *) (utf_save - l_save), len_save);
454 iso = set_iso_code(iso, p->code);
456 p = bsearch(&cutf[1], map, size1,
457 sizeof(pg_utf_to_local), compare1);
458 if (p == NULL)
459 report_untranslatable_char(PG_UTF8, encoding,
460 (const char *) (utf - l), len);
461 code = p->code;
464 else /* no cmap or no remaining data */
466 p = bsearch(&iutf, map, size1,
467 sizeof(pg_utf_to_local), compare1);
468 if (p == NULL)
469 report_untranslatable_char(PG_UTF8, encoding,
470 (const char *) (utf - l), len);
471 code = p->code;
473 iso = set_iso_code(iso, code);
476 if (len > 0)
477 report_invalid_encoding(PG_UTF8, (const char *) utf, len);
479 *iso = '\0';
483 * local code ---> UTF8
485 * iso: input local string (need not be null-terminated).
486 * utf: pointer to the output area (must be large enough!)
487 * map: the conversion map.
488 * cmap: the conversion map for combined characters.
489 * (optional)
490 * size1: the size of the conversion map.
491 * size2: the size of the conversion map for combined characters
492 * (optional)
493 * encoding: the PG identifier for the local encoding.
494 * len: length of input string.
496 void
497 LocalToUtf(const unsigned char *iso, unsigned char *utf,
498 const pg_local_to_utf *map, const pg_local_to_utf_combined *cmap,
499 int size1, int size2, int encoding, int len)
501 unsigned int iiso;
502 int l;
503 pg_local_to_utf *p;
504 pg_local_to_utf_combined *cp;
506 if (!PG_VALID_ENCODING(encoding))
507 ereport(ERROR,
508 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
509 errmsg("invalid encoding number: %d", encoding)));
511 for (; len > 0; len -= l)
513 /* "break" cases all represent errors */
514 if (*iso == '\0')
515 break;
517 if (!IS_HIGHBIT_SET(*iso))
519 /* ASCII case is easy */
520 *utf++ = *iso++;
521 l = 1;
522 continue;
525 l = pg_encoding_verifymb(encoding, (const char *) iso, len);
526 if (l < 0)
527 break;
529 if (l == 1)
530 iiso = *iso++;
531 else if (l == 2)
533 iiso = *iso++ << 8;
534 iiso |= *iso++;
536 else if (l == 3)
538 iiso = *iso++ << 16;
539 iiso |= *iso++ << 8;
540 iiso |= *iso++;
542 else if (l == 4)
544 iiso = *iso++ << 24;
545 iiso |= *iso++ << 16;
546 iiso |= *iso++ << 8;
547 iiso |= *iso++;
550 p = bsearch(&iiso, map, size1,
551 sizeof(pg_local_to_utf), compare2);
553 if (p == NULL)
556 * not found in the ordinary map. if there's a combined character
557 * map, try with it
559 if (cmap)
561 cp = bsearch(&iiso, cmap, size2,
562 sizeof(pg_local_to_utf_combined), compare4);
564 if (cp)
566 if (cp->utf1 & 0xff000000)
567 *utf++ = cp->utf1 >> 24;
568 if (cp->utf1 & 0x00ff0000)
569 *utf++ = (cp->utf1 & 0x00ff0000) >> 16;
570 if (cp->utf1 & 0x0000ff00)
571 *utf++ = (cp->utf1 & 0x0000ff00) >> 8;
572 if (cp->utf1 & 0x000000ff)
573 *utf++ = cp->utf1 & 0x000000ff;
575 if (cp->utf2 & 0xff000000)
576 *utf++ = cp->utf2 >> 24;
577 if (cp->utf2 & 0x00ff0000)
578 *utf++ = (cp->utf2 & 0x00ff0000) >> 16;
579 if (cp->utf2 & 0x0000ff00)
580 *utf++ = (cp->utf2 & 0x0000ff00) >> 8;
581 if (cp->utf2 & 0x000000ff)
582 *utf++ = cp->utf2 & 0x000000ff;
584 continue;
588 report_untranslatable_char(encoding, PG_UTF8,
589 (const char *) (iso - l), len);
592 else
594 if (p->utf & 0xff000000)
595 *utf++ = p->utf >> 24;
596 if (p->utf & 0x00ff0000)
597 *utf++ = (p->utf & 0x00ff0000) >> 16;
598 if (p->utf & 0x0000ff00)
599 *utf++ = (p->utf & 0x0000ff00) >> 8;
600 if (p->utf & 0x000000ff)
601 *utf++ = p->utf & 0x000000ff;
605 if (len > 0)
606 report_invalid_encoding(encoding, (const char *) iso, len);
608 *utf = '\0';