1 /*-------------------------------------------------------------------------
3 * Utility functions for conversion procs.
5 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
6 * Portions Copyright (c) 1994, Regents of the University of California
11 *-------------------------------------------------------------------------
14 #include "mb/pg_wchar.h"
18 * LATINn ---> MIC when the charset's local codes map directly to MIC
20 * l points to the source string of length len
21 * p is the output area (must be large enough!)
22 * lc is the mule character set id for the local encoding
23 * encoding is the PG identifier for the local encoding
26 latin2mic(const unsigned char *l
, unsigned char *p
, int len
,
35 report_invalid_encoding(encoding
, (const char *) l
, len
);
36 if (IS_HIGHBIT_SET(c1
))
46 * MIC ---> LATINn when the charset's local codes map directly to MIC
48 * mic points to the source string of length len
49 * p is the output area (must be large enough!)
50 * lc is the mule character set id for the local encoding
51 * encoding is the PG identifier for the local encoding
54 mic2latin(const unsigned char *mic
, unsigned char *p
, int len
,
63 report_invalid_encoding(PG_MULE_INTERNAL
, (const char *) mic
, len
);
64 if (!IS_HIGHBIT_SET(c1
))
73 int l
= pg_mic_mblen(mic
);
76 report_invalid_encoding(PG_MULE_INTERNAL
, (const char *) mic
,
78 if (l
!= 2 || c1
!= lc
|| !IS_HIGHBIT_SET(mic
[1]))
79 report_untranslatable_char(PG_MULE_INTERNAL
, encoding
,
80 (const char *) mic
, len
);
93 * While ordinarily SQL_ASCII encoding is forgiving of high-bit-set
94 * characters, here we must take a hard line because we don't know
95 * the appropriate MIC equivalent.
98 pg_ascii2mic(const unsigned char *l
, unsigned char *p
, int len
)
105 if (c1
== 0 || IS_HIGHBIT_SET(c1
))
106 report_invalid_encoding(PG_SQL_ASCII
, (const char *) l
, len
);
118 pg_mic2ascii(const unsigned char *mic
, unsigned char *p
, int len
)
125 if (c1
== 0 || IS_HIGHBIT_SET(c1
))
126 report_untranslatable_char(PG_MULE_INTERNAL
, PG_SQL_ASCII
,
127 (const char *) mic
, len
);
136 * latin2mic_with_table: a generic single byte charset encoding
137 * conversion from a local charset to the mule internal code.
139 * l points to the source string of length len
140 * p is the output area (must be large enough!)
141 * lc is the mule character set id for the local encoding
142 * encoding is the PG identifier for the local encoding
143 * tab holds conversion entries for the local charset
144 * starting from 128 (0x80). Each entry in the table
145 * holds the corresponding code point for the mule internal code.
148 latin2mic_with_table(const unsigned char *l
,
153 const unsigned char *tab
)
162 report_invalid_encoding(encoding
, (const char *) l
, len
);
163 if (!IS_HIGHBIT_SET(c1
))
167 c2
= tab
[c1
- HIGHBIT
];
174 report_untranslatable_char(encoding
, PG_MULE_INTERNAL
,
175 (const char *) l
, len
);
184 * mic2latin_with_table: a generic single byte charset encoding
185 * conversion from the mule internal code to a local charset.
187 * mic points to the source string of length len
188 * p is the output area (must be large enough!)
189 * lc is the mule character set id for the local encoding
190 * encoding is the PG identifier for the local encoding
191 * tab holds conversion entries for the mule internal code's
192 * second byte, starting from 128 (0x80). Each entry in the table
193 * holds the corresponding code point for the local charset.
196 mic2latin_with_table(const unsigned char *mic
,
201 const unsigned char *tab
)
210 report_invalid_encoding(PG_MULE_INTERNAL
, (const char *) mic
, len
);
211 if (!IS_HIGHBIT_SET(c1
))
220 int l
= pg_mic_mblen(mic
);
223 report_invalid_encoding(PG_MULE_INTERNAL
, (const char *) mic
,
225 if (l
!= 2 || c1
!= lc
|| !IS_HIGHBIT_SET(mic
[1]) ||
226 (c2
= tab
[mic
[1] - HIGHBIT
]) == 0)
228 report_untranslatable_char(PG_MULE_INTERNAL
, encoding
,
229 (const char *) mic
, len
);
230 break; /* keep compiler quiet */
241 * comparison routine for bsearch()
242 * this routine is intended for UTF8 -> local code
245 compare1(const void *p1
, const void *p2
)
251 v2
= ((pg_utf_to_local
*) p2
)->utf
;
252 return (v1
> v2
) ? 1 : ((v1
== v2
) ? 0 : -1);
256 * comparison routine for bsearch()
257 * this routine is intended for local code -> UTF8
260 compare2(const void *p1
, const void *p2
)
266 v2
= ((pg_local_to_utf
*) p2
)->code
;
267 return (v1
> v2
) ? 1 : ((v1
== v2
) ? 0 : -1);
271 * comparison routine for bsearch()
272 * this routine is intended for combined UTF8 -> local code
275 compare3(const void *p1
, const void *p2
)
283 s2
= *((uint32
*) p1
+ 1);
284 d1
= ((pg_utf_to_local_combined
*) p2
)->utf1
;
285 d2
= ((pg_utf_to_local_combined
*) p2
)->utf2
;
286 return (s1
> d1
|| (s1
== d1
&& s2
> d2
)) ? 1 : ((s1
== d1
&& s2
== d2
) ? 0 : -1);
290 * comparison routine for bsearch()
291 * this routine is intended for local code -> combined UTF8
294 compare4(const void *p1
, const void *p2
)
300 v2
= ((pg_local_to_utf_combined
*) p2
)->code
;
301 return (v1
> v2
) ? 1 : ((v1
== v2
) ? 0 : -1);
305 * convert 32bit wide character to multibyte stream pointed to by iso
307 static unsigned char *
308 set_iso_code(unsigned char *iso
, uint32 code
)
310 if (code
& 0xff000000)
312 if (code
& 0x00ff0000)
313 *iso
++ = (code
& 0x00ff0000) >> 16;
314 if (code
& 0x0000ff00)
315 *iso
++ = (code
& 0x0000ff00) >> 8;
316 if (code
& 0x000000ff)
317 *iso
++ = code
& 0x000000ff;
322 * UTF8 ---> local code
324 * utf: input UTF8 string (need not be null-terminated).
325 * iso: pointer to the output area (must be large enough!)
326 * map: the conversion map.
327 * cmap: the conversion map for combined characters.
329 * size1: the size of the conversion map.
330 * size2: the size of the conversion map for combined characters
332 * encoding: the PG identifier for the local encoding.
333 * len: length of input string.
336 UtfToLocal(const unsigned char *utf
, unsigned char *iso
,
337 const pg_utf_to_local
*map
, const pg_utf_to_local_combined
*cmap
,
338 int size1
, int size2
, int encoding
, int len
)
344 pg_utf_to_local_combined
*cp
;
347 for (; len
> 0; len
-= l
)
349 /* "break" cases all represent errors */
353 l
= pg_utf_mblen(utf
);
358 if (!pg_utf8_islegal(utf
, l
))
363 /* ASCII case is easy */
381 iutf
|= *utf
++ << 16;
387 * first, try with combined map if possible
391 const unsigned char *utf_save
= utf
;
397 l
= pg_utf_mblen(utf
);
401 if (!pg_utf8_islegal(utf
, l
))
410 p
= bsearch(&cutf
[0], map
, size1
,
411 sizeof(pg_utf_to_local
), compare1
);
413 report_untranslatable_char(PG_UTF8
, encoding
,
414 (const char *) (utf_save
- l_save
), len_save
);
415 iso
= set_iso_code(iso
, p
->code
);
418 /* ASCII case is easy */
436 iutf
|= *utf
++ << 16;
442 cp
= bsearch(cutf
, cmap
, size2
,
443 sizeof(pg_utf_to_local_combined
), compare3
);
448 /* not found in combined map. try with ordinary map */
449 p
= bsearch(&cutf
[0], map
, size1
,
450 sizeof(pg_utf_to_local
), compare1
);
452 report_untranslatable_char(PG_UTF8
, encoding
,
453 (const char *) (utf_save
- l_save
), len_save
);
454 iso
= set_iso_code(iso
, p
->code
);
456 p
= bsearch(&cutf
[1], map
, size1
,
457 sizeof(pg_utf_to_local
), compare1
);
459 report_untranslatable_char(PG_UTF8
, encoding
,
460 (const char *) (utf
- l
), len
);
464 else /* no cmap or no remaining data */
466 p
= bsearch(&iutf
, map
, size1
,
467 sizeof(pg_utf_to_local
), compare1
);
469 report_untranslatable_char(PG_UTF8
, encoding
,
470 (const char *) (utf
- l
), len
);
473 iso
= set_iso_code(iso
, code
);
477 report_invalid_encoding(PG_UTF8
, (const char *) utf
, len
);
483 * local code ---> UTF8
485 * iso: input local string (need not be null-terminated).
486 * utf: pointer to the output area (must be large enough!)
487 * map: the conversion map.
488 * cmap: the conversion map for combined characters.
490 * size1: the size of the conversion map.
491 * size2: the size of the conversion map for combined characters
493 * encoding: the PG identifier for the local encoding.
494 * len: length of input string.
497 LocalToUtf(const unsigned char *iso
, unsigned char *utf
,
498 const pg_local_to_utf
*map
, const pg_local_to_utf_combined
*cmap
,
499 int size1
, int size2
, int encoding
, int len
)
504 pg_local_to_utf_combined
*cp
;
506 if (!PG_VALID_ENCODING(encoding
))
508 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
509 errmsg("invalid encoding number: %d", encoding
)));
511 for (; len
> 0; len
-= l
)
513 /* "break" cases all represent errors */
517 if (!IS_HIGHBIT_SET(*iso
))
519 /* ASCII case is easy */
525 l
= pg_encoding_verifymb(encoding
, (const char *) iso
, len
);
545 iiso
|= *iso
++ << 16;
550 p
= bsearch(&iiso
, map
, size1
,
551 sizeof(pg_local_to_utf
), compare2
);
556 * not found in the ordinary map. If there's a combined character
561 cp
= bsearch(&iiso
, cmap
, size2
,
562 sizeof(pg_local_to_utf_combined
), compare4
);
566 if (cp
->utf1
& 0xff000000)
567 *utf
++ = cp
->utf1
>> 24;
568 if (cp
->utf1
& 0x00ff0000)
569 *utf
++ = (cp
->utf1
& 0x00ff0000) >> 16;
570 if (cp
->utf1
& 0x0000ff00)
571 *utf
++ = (cp
->utf1
& 0x0000ff00) >> 8;
572 if (cp
->utf1
& 0x000000ff)
573 *utf
++ = cp
->utf1
& 0x000000ff;
575 if (cp
->utf2
& 0xff000000)
576 *utf
++ = cp
->utf2
>> 24;
577 if (cp
->utf2
& 0x00ff0000)
578 *utf
++ = (cp
->utf2
& 0x00ff0000) >> 16;
579 if (cp
->utf2
& 0x0000ff00)
580 *utf
++ = (cp
->utf2
& 0x0000ff00) >> 8;
581 if (cp
->utf2
& 0x000000ff)
582 *utf
++ = cp
->utf2
& 0x000000ff;
588 report_untranslatable_char(encoding
, PG_UTF8
,
589 (const char *) (iso
- l
), len
);
594 if (p
->utf
& 0xff000000)
595 *utf
++ = p
->utf
>> 24;
596 if (p
->utf
& 0x00ff0000)
597 *utf
++ = (p
->utf
& 0x00ff0000) >> 16;
598 if (p
->utf
& 0x0000ff00)
599 *utf
++ = (p
->utf
& 0x0000ff00) >> 8;
600 if (p
->utf
& 0x000000ff)
601 *utf
++ = p
->utf
& 0x000000ff;
606 report_invalid_encoding(encoding
, (const char *) iso
, len
);