From e91c0ce377731c90d3b0aea4786737343f4b061a Mon Sep 17 00:00:00 2001 From: Bruno Haible Date: Thu, 12 Apr 2001 12:55:41 +0000 Subject: [PATCH] Add UTF-32 encodings. --- ChangeLog | 15 +++++++ NEWS | 5 +++ NOTES | 2 + README | 1 + lib/converters.h | 3 ++ lib/encodings.def | 15 +++++++ lib/ucs4.h | 10 ++--- lib/utf32.h | 91 +++++++++++++++++++++++++++++++++++++++++++ lib/{ucs4.h => utf32be.h} | 42 +++++++------------- lib/utf32le.h | 55 ++++++++++++++++++++++++++ man/iconv_open.3 | 3 +- tests/Makefile.in | 3 ++ tests/Makefile.msvc | 3 ++ tests/Makefile.os2 | 3 ++ tests/UTF-32-snippet | Bin 0 -> 1068 bytes tests/UTF-32-snippet.UTF-8 | 6 +++ tests/UTF-32BE-snippet | Bin 0 -> 1064 bytes tests/UTF-32BE-snippet.UTF-8 | 6 +++ tests/UTF-32LE-snippet | Bin 0 -> 1064 bytes tests/UTF-32LE-snippet.UTF-8 | 6 +++ 20 files changed, 236 insertions(+), 33 deletions(-) create mode 100644 lib/utf32.h copy lib/{ucs4.h => utf32be.h} (52%) create mode 100644 lib/utf32le.h create mode 100644 tests/UTF-32-snippet create mode 100644 tests/UTF-32-snippet.UTF-8 create mode 100644 tests/UTF-32BE-snippet create mode 100644 tests/UTF-32BE-snippet.UTF-8 create mode 100644 tests/UTF-32LE-snippet create mode 100644 tests/UTF-32LE-snippet.UTF-8 diff --git a/ChangeLog b/ChangeLog index 4153ea3..1398539 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,20 @@ 2001-04-11 Bruno Haible + Implement and document UTF-32, UTF-32BE, UTF-32LE. + * lib/utf32.h, lib/utf32be.h, lib/utf32le.h: New files. + * lib/converters.h: Include them. + * lib/encodings.def (UTF-32, UTF-32BE, UTF-32LE): New encodings. + * README, man/iconv_open.3: Add UTF-32, UTF-32BE, UTF-32LE. + * tests/Makefile.in (check): Check UTF-32, UTF-32BE, UTF-32LE. + * tests/Makefile.os2 (check): Likewise. + * tests/Makefile.msvc (check): Likewise. + * tests/UTF-32*snippet*: New files. + + * lib/ucs4.h (ucs4_mbtowc): Fix value of other-endian byte order mark. + (ucs4_wctomb): Allow any 31-bit codepoint. 
+ +2001-04-11 Bruno Haible + * tests/GB18030.TXT: Add mappings for all of U+0000..U+FFFF, including unassigned code points. * tests/table-from.c (main); When dumping GB18030, don't print code diff --git a/NEWS b/NEWS index 9e754d7..51c0157 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,8 @@ +New in 1.7: + +* Added UTF-32, UTF-32BE, UTF-32LE converters. +* Fixed a bug in the byte order mark treatment of the UCS-4 decoder. + New in 1.6: * The iconv program's -f and -t options are now optional. * Many more transliterations. diff --git a/NOTES b/NOTES index 2d8133d..47b2c51 100644 --- a/NOTES +++ b/NOTES @@ -290,6 +290,8 @@ A: libiconv, as an internationalization library, supports those character We implement these, because UTF-16 is still the favourite encoding of the president of the Unicode Consortium (for political reasons), and because they appear in RFC 2781. + * UTF-32, UTF-32BE, UTF-32LE + We implement these because they are part of Unicode 3.1. * UTF-7 We implement this because it is essential functionality for mail applications. 
diff --git a/README b/README index 84e0023..93c0755 100644 --- a/README +++ b/README @@ -38,6 +38,7 @@ It provides support for the encodings: UCS-2, UCS-2BE, UCS-2LE UCS-4, UCS-4BE, UCS-4LE UTF-16, UTF-16BE, UTF-16LE + UTF-32, UTF-32BE, UTF-32LE UTF-7 JAVA Full Unicode, in terms of `uint16_t' or `uint32_t' diff --git a/lib/converters.h b/lib/converters.h index 2d1ee51..3456724 100644 --- a/lib/converters.h +++ b/lib/converters.h @@ -107,6 +107,9 @@ struct conv_struct { #include "utf16.h" #include "utf16be.h" #include "utf16le.h" +#include "utf32.h" +#include "utf32be.h" +#include "utf32le.h" #include "utf7.h" #include "ucs2internal.h" #include "ucs2swapped.h" diff --git a/lib/encodings.def b/lib/encodings.def index e6b64b8..60c4a93 100644 --- a/lib/encodings.def +++ b/lib/encodings.def @@ -112,6 +112,21 @@ DEFENCODING(( "UTF-16LE", /* RFC 2781 */ utf16le, { utf16le_mbtowc }, { utf16le_wctomb, NULL }) +DEFENCODING(( "UTF-32", /* Unicode 3.1 */ + ), + utf32, + { utf32_mbtowc }, { utf32_wctomb, NULL }) + +DEFENCODING(( "UTF-32BE", /* Unicode 3.1 */ + ), + utf32be, + { utf32be_mbtowc }, { utf32be_wctomb, NULL }) + +DEFENCODING(( "UTF-32LE", /* Unicode 3.1 */ + ), + utf32le, + { utf32le_mbtowc }, { utf32le_wctomb, NULL }) + DEFENCODING(( "UTF-7", /* IANA, RFC 2152 */ "UNICODE-1-1-UTF-7", /* IANA, RFC 1642 */ "csUnicode11UTF7", /* IANA */ diff --git a/lib/ucs4.h b/lib/ucs4.h index 927e994..e4f1c79 100644 --- a/lib/ucs4.h +++ b/lib/ucs4.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 1999-2000 Free Software Foundation, Inc. + * Copyright (C) 1999-2001 Free Software Foundation, Inc. * This file is part of the GNU LIBICONV Library. * * The GNU LIBICONV Library is free software; you can redistribute it @@ -22,7 +22,7 @@ * UCS-4 */ -/* Here we accept 0000FFFE/0000FEFF marks as endianness indicators everywhere +/* Here we accept FFFE0000/0000FEFF marks as endianness indicators everywhere in the stream, not just at the beginning. The default is big-endian. 
*/ /* The state is 0 if big-endian, 1 if little-endian. */ static int @@ -35,8 +35,8 @@ ucs4_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n) ? s[0] + (s[1] << 8) + (s[2] << 16) + (s[3] << 24) : (s[0] << 24) + (s[1] << 16) + (s[2] << 8) + s[3]); s += 4; n -= 4; count += 4; - if (wc == 0xfeff) { - } else if (wc == 0xfffe) { + if (wc == 0x0000feff) { + } else if (wc == 0xfffe0000u) { state ^= 1; } else if (wc <= 0x7fffffff) { *pwc = wc; @@ -53,7 +53,7 @@ ucs4_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n) static int ucs4_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n) { - if (wc != 0xfffe) { + if (wc <= 0x7fffffff) { if (n >= 4) { r[0] = (unsigned char) (wc >> 24); r[1] = (unsigned char) (wc >> 16); diff --git a/lib/utf32.h b/lib/utf32.h new file mode 100644 index 0000000..442a9a2 --- /dev/null +++ b/lib/utf32.h @@ -0,0 +1,91 @@ +/* + * Copyright (C) 1999-2001 Free Software Foundation, Inc. + * This file is part of the GNU LIBICONV Library. + * + * The GNU LIBICONV Library is free software; you can redistribute it + * and/or modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * The GNU LIBICONV Library is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with the GNU LIBICONV Library; see the file COPYING.LIB. + * If not, write to the Free Software Foundation, Inc., 59 Temple Place - + * Suite 330, Boston, MA 02111-1307, USA. 
+ */ + +/* + * UTF-32 + */ + +/* Specification: Unicode 3.1 Standard Annex #19 */ + +/* Here we accept FFFE0000/0000FEFF marks as endianness indicators + everywhere in the stream, not just at the beginning. (This is contrary + to what #19 D36c specifies, but it allows concatenation of byte + sequences to work flawlessly, while disagreeing with #19 behaviour + only for strings containing U+FEFF characters, which is quite rare.) + The default is big-endian. */ +/* The state is 0 if big-endian, 1 if little-endian. Stores one character in *pwc and returns the number of bytes consumed (skipped marks included), returns RET_ILSEQ for a surrogate or a value >= 0x110000, or RET_TOOFEW(count) when fewer than 4 bytes remain. */ +static int +utf32_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n) +{ + state_t state = conv->istate; + int count = 0; + for (; n >= 4;) { + ucs4_t wc = (state /* bytes are widened to ucs4_t before shifting: an int-promoted byte >= 0x80 shifted left by 24 overflows int */ + ? s[0] + ((ucs4_t) s[1] << 8) + ((ucs4_t) s[2] << 16) + ((ucs4_t) s[3] << 24) + : ((ucs4_t) s[0] << 24) + ((ucs4_t) s[1] << 16) + ((ucs4_t) s[2] << 8) + s[3]); + count += 4; + if (wc == 0x0000feff) { + } else if (wc == 0xfffe0000u) { + state ^= 1; + } else { + if (wc < 0x110000 && !(wc >= 0xd800 && wc < 0xe000)) { + *pwc = wc; + conv->istate = state; + return count; + } else + return RET_ILSEQ; + } + s += 4; n -= 4; + } + conv->istate = state; + return RET_TOOFEW(count); +} + +/* We output UTF-32 in big-endian order, with byte-order mark. */ +/* The state is 0 at the beginning, 1 after the BOM has been written. 
A call that stores the BOM but then returns RET_TOOSMALL leaves the state at 0, so the BOM is stored again by the next call. */ +static int +utf32_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n) +{ + if (wc < 0x110000 && !(wc >= 0xd800 && wc < 0xe000)) { + int count = 0; + if (!conv->ostate) { + if (n >= 4) { + r[0] = 0x00; + r[1] = 0x00; + r[2] = 0xFE; + r[3] = 0xFF; + r += 4; n -= 4; count += 4; + } else + return RET_TOOSMALL; + } + if (wc < 0x110000) { /* always true here: the outer condition already checked it */ + if (n >= 4) { + r[0] = 0; /* wc < 0x110000, so the top byte is always zero */ + r[1] = (unsigned char) (wc >> 16); + r[2] = (unsigned char) (wc >> 8); + r[3] = (unsigned char) wc; + conv->ostate = 1; + return count+4; /* 4 bytes for the character, plus 4 if the BOM was stored in this call */ + } else + return RET_TOOSMALL; + } + } + return RET_ILSEQ; +} diff --git a/lib/ucs4.h b/lib/utf32be.h similarity index 52% copy from lib/ucs4.h copy to lib/utf32be.h index 927e994..21875a9 100644 --- a/lib/ucs4.h +++ b/lib/utf32be.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 1999-2000 Free Software Foundation, Inc. + * Copyright (C) 1999-2001 Free Software Foundation, Inc. * This file is part of the GNU LIBICONV Library. * * The GNU LIBICONV Library is free software; you can redistribute it @@ -19,49 +19,37 @@ */ /* - * UCS-4 + * UTF-32BE */ -/* Here we accept 0000FFFE/0000FEFF marks as endianness indicators everywhere - in the stream, not just at the beginning. The default is big-endian. */ -/* The state is 0 if big-endian, 1 if little-endian. */ +/* Specification: Unicode 3.1 Standard Annex #19 */ + static int -ucs4_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n) +utf32be_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n) { - state_t state = conv->istate; - int count = 0; - for (; n >= 4;) { - ucs4_t wc = (state - ? 
s[0] + (s[1] << 8) + (s[2] << 16) + (s[3] << 24) - : (s[0] << 24) + (s[1] << 16) + (s[2] << 8) + s[3]); - s += 4; n -= 4; count += 4; - if (wc == 0xfeff) { - } else if (wc == 0xfffe) { - state ^= 1; - } else if (wc <= 0x7fffffff) { + if (n >= 4) { + ucs4_t wc = ((ucs4_t) s[0] << 24) + ((ucs4_t) s[1] << 16) + ((ucs4_t) s[2] << 8) + s[3]; /* big-endian code unit; bytes are widened before shifting because an int-promoted byte >= 0x80 shifted left by 24 overflows int */ + if (wc < 0x110000 && !(wc >= 0xd800 && wc < 0xe000)) { *pwc = wc; - conv->istate = state; - return count; + return 4; } else return RET_ILSEQ; } - conv->istate = state; - return RET_TOOFEW(count); + return RET_TOOFEW(0); } -/* But we output UCS-4 in big-endian order, without byte-order mark. */ static int -ucs4_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n) +utf32be_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n) { - if (wc != 0xfffe) { + if (wc < 0x110000 && !(wc >= 0xd800 && wc < 0xe000)) { if (n >= 4) { - r[0] = (unsigned char) (wc >> 24); + r[0] = 0; /* wc < 0x110000, so the top byte is always zero */ r[1] = (unsigned char) (wc >> 16); r[2] = (unsigned char) (wc >> 8); r[3] = (unsigned char) wc; return 4; } else return RET_TOOSMALL; - } else - return RET_ILSEQ; + } + return RET_ILSEQ; } diff --git a/lib/utf32le.h b/lib/utf32le.h new file mode 100644 index 0000000..c065a1d --- /dev/null +++ b/lib/utf32le.h @@ -0,0 +1,55 @@ +/* + * Copyright (C) 1999-2001 Free Software Foundation, Inc. + * This file is part of the GNU LIBICONV Library. + * + * The GNU LIBICONV Library is free software; you can redistribute it + * and/or modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * The GNU LIBICONV Library is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. 
+ * + * You should have received a copy of the GNU Library General Public + * License along with the GNU LIBICONV Library; see the file COPYING.LIB. + * If not, write to the Free Software Foundation, Inc., 59 Temple Place - + * Suite 330, Boston, MA 02111-1307, USA. + */ + +/* + * UTF-32LE + */ + +/* Specification: Unicode 3.1 Standard Annex #19. Code units are always little-endian; no byte-order mark is skipped on input or written on output. */ + +static int +utf32le_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n) +{ + if (n >= 4) { + ucs4_t wc = s[0] + ((ucs4_t) s[1] << 8) + ((ucs4_t) s[2] << 16) + ((ucs4_t) s[3] << 24); /* bytes are widened before shifting because an int-promoted byte >= 0x80 shifted left by 24 overflows int */ + if (wc < 0x110000 && !(wc >= 0xd800 && wc < 0xe000)) { + *pwc = wc; + return 4; + } else + return RET_ILSEQ; + } + return RET_TOOFEW(0); +} + +static int +utf32le_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n) +{ + if (wc < 0x110000 && !(wc >= 0xd800 && wc < 0xe000)) { + if (n >= 4) { + r[0] = (unsigned char) wc; + r[1] = (unsigned char) (wc >> 8); + r[2] = (unsigned char) (wc >> 16); + r[3] = 0; /* wc < 0x110000, so the top byte is always zero */ + return 4; + } else + return RET_TOOSMALL; + } + return RET_ILSEQ; +} diff --git a/man/iconv_open.3 b/man/iconv_open.3 index 699fde7..a3735d9 100644 --- a/man/iconv_open.3 +++ b/man/iconv_open.3 @@ -9,7 +9,7 @@ .\" GNU glibc-2 source code and manual .\" OpenGroup's Single Unix specification http://www.UNIX-systems.org/online.html .\" -.TH ICONV_OPEN 3 "January 5, 2001" "GNU" "Linux Programmer's Manual" +.TH ICONV_OPEN 3 "April 12, 2001" "GNU" "Linux Programmer's Manual" .SH NAME iconv_open \- allocate descriptor for character set conversion .SH SYNOPSIS @@ -72,6 +72,7 @@ UTF-8 UCS-2, UCS-2BE, UCS-2LE UCS-4, UCS-4BE, UCS-4LE UTF-16, UTF-16BE, UTF-16LE +UTF-32, UTF-32BE, UTF-32LE UTF-7 JAVA .fi diff --git a/tests/Makefile.in b/tests/Makefile.in index f27af9e..40f8887 100644 --- a/tests/Makefile.in +++ b/tests/Makefile.in @@ -41,6 +41,9 @@ check : all table-from table-to ../src/iconv $(srcdir)/check-stateful $(srcdir) UTF-16 $(srcdir)/check-stateful $(srcdir) UTF-16BE $(srcdir)/check-stateful $(srcdir) UTF-16LE + $(srcdir)/check-stateful $(srcdir) 
UTF-32 + $(srcdir)/check-stateful $(srcdir) UTF-32BE + $(srcdir)/check-stateful $(srcdir) UTF-32LE $(srcdir)/check-stateful $(srcdir) UTF-7 # /* 8-bit encodings */ $(srcdir)/check-stateless $(srcdir) ISO-8859-1 diff --git a/tests/Makefile.msvc b/tests/Makefile.msvc index f340975..6bc3b18 100644 --- a/tests/Makefile.msvc +++ b/tests/Makefile.msvc @@ -63,6 +63,9 @@ check : all table-from.exe table-to.exe ../src/iconv.exe uniq-u.exe $(srcdir)\check-stateful.bat $(srcdir) UTF-16 $(srcdir)\check-stateful.bat $(srcdir) UTF-16BE $(srcdir)\check-stateful.bat $(srcdir) UTF-16LE + $(srcdir)\check-stateful.bat $(srcdir) UTF-32 + $(srcdir)\check-stateful.bat $(srcdir) UTF-32BE + $(srcdir)\check-stateful.bat $(srcdir) UTF-32LE $(srcdir)\check-stateful.bat $(srcdir) UTF-7 # /* 8-bit encodings */ $(srcdir)\check-stateless.bat $(srcdir) ISO-8859-1 diff --git a/tests/Makefile.os2 b/tests/Makefile.os2 index 3e1ed56..9cb324b 100644 --- a/tests/Makefile.os2 +++ b/tests/Makefile.os2 @@ -31,6 +31,9 @@ check : all table-from.exe table-to.exe ../src/iconv.exe genutf8.exe $(srcdir)\check-stateful $(srcdir) UTF-16 $(srcdir)\check-stateful $(srcdir) UTF-16BE $(srcdir)\check-stateful $(srcdir) UTF-16LE + $(srcdir)\check-stateful $(srcdir) UTF-32 + $(srcdir)\check-stateful $(srcdir) UTF-32BE + $(srcdir)\check-stateful $(srcdir) UTF-32LE $(srcdir)\check-stateful $(srcdir) UTF-7 # /* 8-bit encodings */ $(srcdir)\check-stateless $(srcdir) ISO-8859-1 diff --git a/tests/UTF-32-snippet b/tests/UTF-32-snippet new file mode 100644 index 0000000000000000000000000000000000000000..6aa4dcb8453cb36ea9ab7aa09e97a374854829fa GIT binary patch literal 1068 zcwUv$)oxWm6oBDX&Mq!>$KBnF)Qh{`K|=z8K!SwyF>trwZ-cuOhnE1p%{hT!fnI#c zTx|JgWhUbDmx#iWBcu!2h-lKBEM8k literal 0 HcwPel00001 diff --git a/tests/UTF-32-snippet.UTF-8 b/tests/UTF-32-snippet.UTF-8 new file mode 100644 index 0000000..4229c88 --- /dev/null +++ b/tests/UTF-32-snippet.UTF-8 @@ -0,0 +1,6 @@ +ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ <- Greek 
+𐌀𐌁𐌂𐌃𐌄𐌅𐌆𐌇𐌈𐌉𐌊𐌋𐌌𐌍𐌎𐌏𐌐𐌑𐌒𐌓𐌔𐌕𐌖𐌗𐌘𐌙𐌚𐌛𐌜𐌝 <- Etruscan +ABCDEFGHIJKLMNOPQRSTUVWXYZ <- Latin +АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ <- Cyrillic +𐌰𐌱𐌲𐌳𐌴𐌵𐌶𐌷𐌸𐌹𐌺𐌻𐌼𐌽𐌾𐌿𐍀𐍁𐍂𐍃𐍄𐍅𐍆𐍇𐍈 <- Gothic +אבגדהוזחטיךכלםמןנסעףפץצקרש <- Hebrew diff --git a/tests/UTF-32BE-snippet b/tests/UTF-32BE-snippet new file mode 100644 index 0000000000000000000000000000000000000000..72e4761c60351ceea2d1840a8b1e7eae65e85681 GIT binary patch literal 1064 zcwUv$)oxZn6oAoP{!?7)j=Q@TsTX&>gN6hGfdmQRF>trwZ-cu#Z^1hJClD;qi_iqVW=EaT9WOkm<) zTGP25iu{bz)?Jx&-4Xe-F^S1cVJg#@&J1QUi`mR!F7?b~J_}e#1C2D%Obe~F(H_6g z^G|*~$9p0_>)G`(`r#tXca~B81EIa)qm0<2pC!qo14H;x>1<%RTP%fQLNdF;95PGpfS> zsi!7Bw{ezZv6EyI5?LXOaU@8RVmuR=$Rx6v%oL_Fjp^i&OCI?YP)HHQlu$|;Ot!NrsS19{ChdND;-9P)Zr)R8YwbW>Up0s+r9k<}#1@EMOsvSj-Y? zSjw_w=GJI#Js~~gsP$AVI(Nt2%l)SntYj6ntY!^sSw|h~*+4xT*~DhH&_E+iG}A&W zZM0|DgG@&6@tKgG@$pf;&r>11;3cnk%^TkGj`w`vBLjTmGhg`1Am8}T4}S8C-~8b( VL#eU#M!Oi@x1(X?{#}fFhXKv=QcwT@ literal 0 HcwPel00001 diff --git a/tests/UTF-32LE-snippet.UTF-8 b/tests/UTF-32LE-snippet.UTF-8 new file mode 100644 index 0000000..4229c88 --- /dev/null +++ b/tests/UTF-32LE-snippet.UTF-8 @@ -0,0 +1,6 @@ +ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ <- Greek +𐌀𐌁𐌂𐌃𐌄𐌅𐌆𐌇𐌈𐌉𐌊𐌋𐌌𐌍𐌎𐌏𐌐𐌑𐌒𐌓𐌔𐌕𐌖𐌗𐌘𐌙𐌚𐌛𐌜𐌝 <- Etruscan +ABCDEFGHIJKLMNOPQRSTUVWXYZ <- Latin +АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ <- Cyrillic +𐌰𐌱𐌲𐌳𐌴𐌵𐌶𐌷𐌸𐌹𐌺𐌻𐌼𐌽𐌾𐌿𐍀𐍁𐍂𐍃𐍄𐍅𐍆𐍇𐍈 <- Gothic +אבגדהוזחטיךכלםמןנסעףפץצקרש <- Hebrew -- 2.11.4.GIT