1 /* xgettext C# backend.
2 Copyright (C) 2003 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2003.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
34 #include "error-progname.h"
38 #include "po-charset.h"
39 #include "utf8-ucs4.h"
40 #include "ucs4-utf8.h"
43 #define _(s) gettext(s)
45 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
48 /* The C# syntax is defined in ECMA-334, second edition. */
51 /* ====================== Keyword set customization. ====================== */
53 /* If true extract all strings. */
54 static bool extract_all
= false;
56 static hash_table keywords
;
57 static bool default_keywords
= true;
61 x_csharp_extract_all ()
67 /* Processes a --keyword option.
68 Non-ASCII function names can be used if given in UTF-8 encoding.
   NAME is the keyword specification; it is passed to split_keywordspec()
   together with the addresses of END, ARGNUM1 and ARGNUM2.
   NOTE(review): several lines of this function (declarations, braces and
   some branch conditions) are not visible in this listing; the comments
   below describe only the statements that are shown. */
70 x_csharp_keyword (const char *name
)
73 default_keywords
= false;
/* Create the keyword hash table on first use; 100 is the size argument
   given to init_hash(). */
81 if (keywords
.table
== NULL
)
82 init_hash (&keywords
, 100);
/* Presumably sets END to the end of the identifier part of NAME and fills
   in the two argument numbers -- confirm against split_keywordspec(). */
84 split_keywordspec (name
, &end
, &argnum1
, &argnum2
);
86 /* The characters between name and end should form a valid C#
87 identifier sequence with dots.
88 A colon means an invalid parse in split_keywordspec(). */
89 colon
= strchr (name
, ':');
/* True when NAME contains no colon at all, or when its first colon lies
   at or after END. */
90 if (colon
== NULL
|| colon
>= end
)
/* The key is the END - NAME bytes starting at NAME.  Both argument numbers
   are packed into one integer -- ARGNUM1 in the low bits, ARGNUM2 shifted
   left by 10 -- which is cast to a pointer and stored as the entry's
   value. */
94 insert_entry (&keywords
, name
, end
- name
,
95 (void *) (long) (argnum1
+ (argnum2
<< 10)));
100 /* Finish initializing the keywords hash table.
101 Called after argument processing, before each file is processed. */
105 if (default_keywords
)
107 x_csharp_keyword ("GetString"); /* Resource{Manager,Set}.GetString */
108 x_csharp_keyword ("GetPluralString:1,2"); /* GettextResource{Manager,Set}.GetPluralString */
109 default_keywords
= false;
/* Registers four flag specifications through xgettext_record_flag():
   "pass-csharp-format" for GetString:1, GetPluralString:1 and
   GetPluralString:2, and "csharp-format" for String.Format:1.
   NOTE(review): the number in each specification is presumably an argument
   position -- confirm against xgettext_record_flag(). */
114 init_flag_table_csharp ()
116 xgettext_record_flag ("GetString:1:pass-csharp-format");
117 xgettext_record_flag ("GetPluralString:1:pass-csharp-format");
118 xgettext_record_flag ("GetPluralString:2:pass-csharp-format");
119 xgettext_record_flag ("String.Format:1:csharp-format");
123 /* ======================== Reading of characters. ======================== */
125 /* Real filename, used in error messages about the input file. */
126 static const char *real_file_name
;
128 /* Logical filename and line number, used to label the extracted messages. */
129 static char *logical_file_name
;
130 static int line_number
;
132 /* The input file stream. */
136 /* Phase 1: line_number handling. */
138 /* Maximum used, roughly a safer MB_LEN_MAX. */
139 #define MAX_PHASE1_PUSHBACK 16
140 static unsigned char phase1_pushback
[MAX_PHASE1_PUSHBACK
];
141 static int phase1_pushback_length
;
143 /* Read the next single byte from the input file. */
149 if (phase1_pushback_length
)
151 c
= phase1_pushback
[--phase1_pushback_length
];
161 error (EXIT_FAILURE
, errno
, _("error while reading \"%s\""),
171 /* Supports MAX_PHASE1_PUSHBACK characters of pushback.
   The pushback array is used as a stack: C is stored at index
   phase1_pushback_length, which is then incremented; the phase 1 reader
   takes values back from the top of the same array.
   NOTE(review): parts of this function are not visible in this listing. */
173 phase1_ungetc (int c
)
/* Buffer-full test; the statement it guards is not visible here. */
179 if (phase1_pushback_length
== SIZEOF (phase1_pushback
))
181 phase1_pushback
[phase1_pushback_length
++] = c
;
186 /* Phase 2: Conversion to Unicode.
187 This is done early because ECMA-334 section 9.1. says that the source is
188 "an ordered sequence of Unicode characters", and because the recognition
189 of the line terminators (ECMA-334 section 9.3.1) is hardly possible without
190 prior conversion to Unicode. */
192 /* End-of-file indicator for functions returning a UCS-4 character. */
195 /* Newline Unicode character. */
198 static int phase2_pushback
[1];
199 static int phase2_pushback_length
;
201 /* Read the next Unicode UCS-4 character from the input file. */
205 if (phase2_pushback_length
)
206 return phase2_pushback
[--phase2_pushback_length
];
208 if (xgettext_current_source_encoding
== po_charset_ascii
)
210 int c
= phase1_getc ();
216 sprintf (buffer
, ":%ld", (long) line_number
);
217 multiline_error (xstrdup (""),
219 Non-ASCII string at %s%s.\n\
220 Please specify the source encoding through --from-code.\n"),
221 real_file_name
, buffer
));
226 else if (xgettext_current_source_encoding
!= po_charset_utf8
)
229 /* Use iconv on an increasing number of bytes. Read only as many bytes
230 through phase1_getc as needed. This is needed to give reasonable
231 interactive behaviour when fp is connected to an interactive tty. */
232 unsigned char buf
[MAX_PHASE1_PUSHBACK
];
237 unsigned char scratchbuf
[6];
238 const char *inptr
= (const char *) &buf
[0];
239 size_t insize
= bufcount
;
240 char *outptr
= (char *) &scratchbuf
[0];
241 size_t outsize
= sizeof (scratchbuf
);
243 size_t res
= iconv (xgettext_current_source_iconv
,
244 (ICONV_CONST
char **) &inptr
, &insize
,
246 /* We expect that a character has been produced if and only if
247 some input bytes have been consumed. */
248 if ((insize
< bufcount
) != (outsize
< sizeof (scratchbuf
)))
250 if (outsize
== sizeof (scratchbuf
))
252 /* No character has been produced. Must be an error. */
253 if (res
!= (size_t)(-1))
258 /* An invalid multibyte sequence was encountered. */
259 multiline_error (xstrdup (""),
261 %s:%d: Invalid multibyte sequence.\n\
262 Please specify the correct source encoding through --from-code.\n"),
263 real_file_name
, line_number
));
266 else if (errno
== EINVAL
)
268 /* An incomplete multibyte character. */
271 if (bufcount
== MAX_PHASE1_PUSHBACK
)
273 /* An overlong incomplete multibyte sequence was encountered. */
275 multiline_error (xstrdup (""),
277 %s:%d: Long incomplete multibyte sequence.\n\
278 Please specify the correct source encoding through --from-code.\n"),
279 real_file_name
, line_number
));
283 /* Read one more byte and retry iconv. */
287 multiline_error (xstrdup (""),
289 %s:%d: Incomplete multibyte sequence at end of file.\n\
290 Please specify the correct source encoding through --from-code.\n"),
291 real_file_name
, line_number
));
296 multiline_error (xstrdup (""),
298 %s:%d: Incomplete multibyte sequence at end of line.\n\
299 Please specify the correct source encoding through --from-code.\n"),
300 real_file_name
, line_number
- 1));
303 buf
[bufcount
++] = (unsigned char) c
;
306 error (EXIT_FAILURE
, errno
, _("%s:%d: iconv failure"),
307 real_file_name
, line_number
);
311 size_t outbytes
= sizeof (scratchbuf
) - outsize
;
312 size_t bytes
= bufcount
- insize
;
315 /* We expect that one character has been produced. */
320 /* Push back the unused bytes. */
322 phase1_ungetc (buf
[--insize
]);
323 /* Convert the character from UTF-8 to UCS-4. */
324 if (u8_mbtouc (&uc
, scratchbuf
, outbytes
) < outbytes
)
326 /* scratchbuf contains an out-of-range Unicode character. */
328 multiline_error (xstrdup (""),
330 %s:%d: Invalid multibyte sequence.\n\
331 Please specify the source encoding through --from-code.\n"),
332 real_file_name
, line_number
));
339 /* If we don't have iconv(), the only supported values for
340 xgettext_global_source_encoding and thus also for
341 xgettext_current_source_encoding are ASCII and UTF-8. */
347 /* Read a UTF-8 encoded character. */
348 unsigned char buf
[6];
369 && ((buf
[1] ^ 0x80) < 0x40))
379 && ((buf
[1] ^ 0x80) < 0x40)
380 && ((buf
[2] ^ 0x80) < 0x40))
390 && ((buf
[1] ^ 0x80) < 0x40)
391 && ((buf
[2] ^ 0x80) < 0x40)
392 && ((buf
[3] ^ 0x80) < 0x40))
402 && ((buf
[1] ^ 0x80) < 0x40)
403 && ((buf
[2] ^ 0x80) < 0x40)
404 && ((buf
[3] ^ 0x80) < 0x40)
405 && ((buf
[4] ^ 0x80) < 0x40))
414 u8_mbtouc (&uc
, buf
, count
);
419 /* Supports only one pushback character.
   phase2_pushback is declared with a single element, so the buffer is
   full as soon as phase2_pushback_length reaches 1.
   NOTE(review): parts of this function are not visible in this listing. */
421 phase2_ungetc (int c
)
/* Buffer-full test; the statement it guards is not visible here. */
425 if (phase2_pushback_length
== SIZEOF (phase2_pushback
))
427 phase2_pushback
[phase2_pushback_length
++] = c
;
432 /* Phase 3: Convert all line terminators to LF.
433 See ECMA-334 section 9.3.1. */
435 /* Line number defined in terms of phase3. */
436 static int logical_line_number
;
438 static int phase3_pushback
[9];
439 static int phase3_pushback_length
;
441 /* Read the next Unicode UCS-4 character from the input file, mapping
442 all line terminators to U+000A, and dropping U+001A at the end of file. */
448 if (phase3_pushback_length
)
450 c
= phase3_pushback
[--phase3_pushback_length
];
452 ++logical_line_number
;
460 int c1
= phase2_getc ();
462 if (c1
!= UEOF
&& c1
!= 0x000a)
465 /* Seen line terminator CR or CR/LF. */
466 ++logical_line_number
;
470 if (c
== 0x0085 || c
== 0x2028 || c
== 0x2029)
472 /* Seen Unicode word processor newline. */
473 ++logical_line_number
;
479 int c1
= phase2_getc ();
482 /* Seen U+001A right before the end of file. */
489 ++logical_line_number
;
493 /* Supports 9 characters of pushback.
   C is stored on top of the phase3_pushback stack (9 elements).
   NOTE(review): parts of this function are not visible in this listing. */
495 phase3_ungetc (int c
)
/* NOTE(review): the condition guarding this decrement is not visible.  A
   caller elsewhere in the file pushes back a newline specifically "to
   decrement logical_line_number", so it presumably applies only when C is
   a newline -- confirm. */
500 --logical_line_number
;
/* Buffer-full test; the statement it guards is not visible here. */
501 if (phase3_pushback_length
== SIZEOF (phase3_pushback
))
503 phase3_pushback
[phase3_pushback_length
++] = c
;
508 /* ========================= Accumulating strings. ======================== */
510 /* A string buffer type that allows appending Unicode characters.
511 Returns the entire string in UTF-8 encoding. */
515 /* The part of the string that has already been converted to UTF-8. */
518 size_t utf8_allocated
;
521 /* Initialize a 'struct string_buffer' to empty.
   No storage is allocated: the buffer pointer is set to NULL and the
   allocated capacity to 0.
   NOTE(review): the original line between the two assignments is not
   visible in this listing. */
523 init_string_buffer (struct string_buffer
*bp
)
525 bp
->utf8_buffer
= NULL
;
527 bp
->utf8_allocated
= 0;
530 /* Auxiliary function: Ensure count more bytes are available in bp->utf8.
   When utf8_buflen + count exceeds utf8_allocated, the capacity becomes
   2 * utf8_allocated + 10, or utf8_buflen + count if that is larger, and
   the buffer is resized with xrealloc().  Otherwise nothing is changed. */
532 string_buffer_append_unicode_grow (struct string_buffer
*bp
, size_t count
)
534 if (bp
->utf8_buflen
+ count
> bp
->utf8_allocated
)
/* Roughly double the capacity so that repeated appends do not resize the
   buffer every time. */
536 size_t new_allocated
= 2 * bp
->utf8_allocated
+ 10;
/* Doubling may still be too small for a large request. */
537 if (new_allocated
< bp
->utf8_buflen
+ count
)
538 new_allocated
= bp
->utf8_buflen
+ count
;
539 bp
->utf8_allocated
= new_allocated
;
540 bp
->utf8_buffer
= xrealloc (bp
->utf8_buffer
, new_allocated
);
544 /* Auxiliary function: Append a Unicode character to bp->utf8.
545 uc must be < 0x110000.
   The character is first encoded into a local 6-byte buffer with
   u8_uctomb(); COUNT receives that call's return value and is then used
   as the number of bytes to append. */
547 string_buffer_append_unicode (struct string_buffer
*bp
, unsigned int uc
)
549 unsigned char utf8buf
[6];
550 int count
= u8_uctomb (utf8buf
, uc
, 6);
553 /* The caller should have ensured that uc is not out-of-range. */
/* Make room for COUNT more bytes, copy them to the current end of the
   buffer, and advance the length. */
556 string_buffer_append_unicode_grow (bp
, count
);
557 memcpy (bp
->utf8_buffer
+ bp
->utf8_buflen
, utf8buf
, count
);
558 bp
->utf8_buflen
+= count
;
561 /* Return the string buffer's contents.
   The returned pointer is bp->utf8_buffer itself, not a copy.  In the
   lines visible here the terminating NUL is stored at index utf8_buflen
   and utf8_buflen is not incremented. */
563 string_buffer_result (struct string_buffer
*bp
)
565 /* NUL-terminate it. */
566 string_buffer_append_unicode_grow (bp
, 1);
567 bp
->utf8_buffer
[bp
->utf8_buflen
] = '\0';
569 return bp
->utf8_buffer
;
572 /* Free the memory pointed to by a 'struct string_buffer'.
   Only bp->utf8_buffer is released; the struct itself is not freed, and
   the pointer field is not reset in the lines visible here. */
574 free_string_buffer (struct string_buffer
*bp
)
576 free (bp
->utf8_buffer
);
580 /* ======================== Accumulating comments. ======================== */
583 /* Accumulating a single comment line. */
585 static struct string_buffer comment_buffer
;
590 comment_buffer
.utf8_buflen
= 0;
596 return (comment_buffer
.utf8_buflen
== 0);
602 string_buffer_append_unicode (&comment_buffer
, c
);
/* Finishes the comment line accumulated in comment_buffer and hands it to
   savable_comment_add().  CHARS_TO_REMOVE is subtracted from the length
   of the accumulated text before the string is re-terminated.
   NOTE(review): some lines of this function, including the start of the
   statement that tests for trailing spaces and tabs, are not visible in
   this listing. */
606 comment_line_end (size_t chars_to_remove
)
/* Get the NUL-terminated text and its length in bytes. */
608 char *buffer
= string_buffer_result (&comment_buffer
);
609 size_t buflen
= strlen (buffer
);
611 buflen
-= chars_to_remove
;
/* Second half of a condition that tests whether the last remaining byte
   is a space or a tab. */
613 && (buffer
[buflen
- 1] == ' ' || buffer
[buflen
- 1] == '\t'))
/* Cut the string at the final length before saving it. */
615 buffer
[buflen
] = '\0';
616 savable_comment_add (buffer
);
620 /* These are for tracking whether comments count as immediately before a keyword. */
622 static int last_comment_line
;
623 static int last_non_comment_line
;
626 /* Phase 4: Replace each comment that is not inside a character constant or
627 string literal with a space or newline character.
628 See ECMA-334 section 9.3.2. */
648 /* C style comment. */
650 last_was_star
= false;
656 /* We skip all leading white space, but not EOLs. */
657 if (!(comment_at_start () && (c
== ' ' || c
== '\t')))
662 comment_line_end (1);
664 last_was_star
= false;
668 last_was_star
= true;
674 comment_line_end (2);
680 last_was_star
= false;
685 last_comment_line
= logical_line_number
;
689 /* C++ style comment. */
690 last_comment_line
= logical_line_number
;
695 if (c
== UNL
|| c
== UEOF
)
697 /* We skip all leading white space, but not EOLs. */
698 if (!(comment_at_start () && (c
== ' ' || c
== '\t')))
701 phase3_ungetc (c
); /* push back the newline, to decrement logical_line_number */
702 comment_line_end (0);
703 phase3_getc (); /* read the newline again */
708 /* Supports only one pushback character. */
710 phase4_ungetc (int c
)
716 /* ======================= Character classification. ====================== */
719 /* Return true if a given character is white space.
720 See ECMA-334 section 9.3.3.
   NOTE(review): the dispatch that selects among the return statements
   below is not visible in this listing. */
722 is_whitespace (int c
)
724 /* Unicode character class Zs, as of Unicode 4.0. */
725 /* grep '^[^;]*;[^;]*;Zs;' UnicodeData-4.0.0.txt */
/* U+0020 SPACE, U+00A0 NO-BREAK SPACE. */
729 return (c
== 0x0020 || c
== 0x00a0);
/* U+1680 OGHAM SPACE MARK. */
731 return (c
== 0x1680);
/* U+180E MONGOLIAN VOWEL SEPARATOR. */
733 return (c
== 0x180e);
/* U+2000..U+200B (EN QUAD through ZERO WIDTH SPACE), U+202F NARROW
   NO-BREAK SPACE, U+205F MEDIUM MATHEMATICAL SPACE. */
735 return ((c
>= 0x2000 && c
<= 0x200b) || c
== 0x202f || c
== 0x205f);
/* U+3000 IDEOGRAPHIC SPACE. */
737 return (c
== 0x3000);
744 /* C# allows identifiers containing many Unicode characters. We recognize
745 them; to use an identifier with Unicode characters in a --keyword option,
746 it must be specified in UTF-8. */
/* Tests one bit of a three-level bitmap.  TABLE is read as an array of
   int:
   - element 0 is the exclusive upper bound for the first-level index
     (uc >> 16);
   - element 1 + index1 gives the offset of a second-level block;
   - that block, indexed by bits 9..15 of UC, gives the offset of a
     third-level block;
   - that block, indexed by bits 5..8 of UC, holds a word whose bit
     (uc & 0x1f) is the result.
   NOTE(review): the tables in this file contain -1 entries, presumably
   marking absent blocks; the checks for them and the fallback return value
   are not visible in this listing. */
749 bitmap_lookup (const void *table
, unsigned int uc
)
751 unsigned int index1
= uc
>> 16;
752 if (index1
< ((const int *) table
)[0])
754 int lookup1
= ((const int *) table
)[1 + index1
];
/* 7-bit index into the second level. */
757 unsigned int index2
= (uc
>> 9) & 0x7f;
758 int lookup2
= ((const int *) table
)[lookup1
+ index2
];
/* 4-bit index into the third level. */
761 unsigned int index3
= (uc
>> 5) & 0xf;
762 unsigned int lookup3
= ((const int *) table
)[lookup2
+ index3
];
/* The low 5 bits of UC select the bit within the word. */
764 return (lookup3
>> (uc
& 0x1f)) & 1;
771 /* Unicode character classes Lu, Ll, Lt, Lm, Lo, Nl, as of Unicode 4.0,
772 plus the underscore. */
779 /*unsigned*/ int level3
[34 << 4];
781 table_identifier_start
=
786 388, 404, 420, 436, 452, 468, 484, 500,
787 516, 532, 548, 564, 580, -1, 596, 612,
788 628, -1, -1, -1, -1, -1, -1, -1,
789 644, -1, 660, 660, 660, 660, 660, 660,
790 660, 660, 660, 660, 660, 660, 676, 660,
791 660, 660, 660, 660, 660, 660, 660, 660,
792 660, 660, 660, 660, 660, 660, 660, 660,
793 660, 660, 660, 660, 660, 660, 660, 660,
794 660, 660, 660, 660, 660, 660, 660, 660,
795 660, 660, 660, 660, 660, 660, 660, 692,
796 660, 660, 708, -1, -1, -1, 660, 660,
797 660, 660, 660, 660, 660, 660, 660, 660,
798 660, 660, 660, 660, 660, 660, 660, 660,
799 660, 660, 660, 724, -1, -1, -1, -1,
800 -1, -1, -1, -1, -1, -1, -1, -1,
801 -1, -1, -1, -1, 740, 756, 772, 788,
802 804, 820, 836, -1, 852, -1, -1, -1,
803 -1, -1, -1, -1, -1, -1, -1, -1,
804 -1, -1, -1, -1, -1, -1, -1, -1,
805 -1, -1, -1, -1, -1, -1, -1, -1,
806 -1, -1, -1, -1, -1, -1, -1, -1,
807 -1, -1, -1, -1, -1, -1, -1, -1,
808 -1, -1, -1, -1, -1, -1, -1, -1,
809 -1, -1, -1, -1, -1, -1, -1, -1,
810 -1, -1, -1, -1, -1, -1, -1, -1,
811 -1, -1, -1, -1, -1, -1, -1, -1,
812 -1, -1, -1, -1, -1, -1, -1, -1,
813 -1, -1, -1, -1, -1, -1, -1, -1,
814 -1, -1, -1, -1, -1, -1, -1, -1,
815 -1, -1, 868, 884, -1, -1, -1, -1,
816 -1, -1, -1, -1, -1, -1, -1, -1,
817 -1, -1, -1, -1, -1, -1, -1, -1,
818 660, 660, 660, 660, 660, 660, 660, 660,
819 660, 660, 660, 660, 660, 660, 660, 660,
820 660, 660, 660, 660, 660, 660, 660, 660,
821 660, 660, 660, 660, 660, 660, 660, 660,
822 660, 660, 660, 660, 660, 660, 660, 660,
823 660, 660, 660, 660, 660, 660, 660, 660,
824 660, 660, 660, 660, 660, 660, 660, 660,
825 660, 660, 660, 660, 660, 660, 660, 660,
826 660, 660, 660, 660, 660, 660, 660, 660,
827 660, 660, 660, 660, 660, 660, 660, 660,
828 660, 660, 660, 900, -1, -1, -1, -1,
829 -1, -1, -1, -1, -1, -1, -1, -1,
830 -1, -1, -1, -1, -1, -1, -1, -1,
831 -1, -1, -1, -1, -1, -1, -1, -1,
832 -1, -1, -1, -1, -1, -1, -1, -1,
833 -1, -1, -1, -1, 660, 916, -1, -1
836 0x00000000, 0x00000000, 0x87FFFFFE, 0x07FFFFFE,
837 0x00000000, 0x04200400, 0xFF7FFFFF, 0xFF7FFFFF,
838 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
839 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
840 0xFFFFFFFF, 0x007FFFFF, 0xFFFF0000, 0xFFFFFFFF,
841 0xFFFFFFFF, 0xFFFFFFFF, 0x0003FFC3, 0x0000401F,
842 0x00000000, 0x00000000, 0x00000000, 0x04000000,
843 0xFFFFD740, 0xFFFFFFFB, 0xFFFF7FFF, 0x0FBFFFFF,
844 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
845 0xFFFFFC03, 0xFFFFFFFF, 0xFFFF7FFF, 0x033FFFFF,
846 0x0000FFFF, 0xFFFE0000, 0x027FFFFF, 0xFFFFFFFE,
847 0x000000FF, 0x00000000, 0xFFFF0000, 0x000707FF,
848 0x00000000, 0x07FFFFFE, 0x000007FF, 0xFFFEC000,
849 0xFFFFFFFF, 0xFFFFFFFF, 0x002FFFFF, 0x9C00C060,
850 0xFFFD0000, 0x0000FFFF, 0x0000E000, 0x00000000,
851 0xFFFFFFFF, 0x0002003F, 0x00000000, 0x00000000,
852 0x00000000, 0x00000000, 0x00000000, 0x00000000,
853 0x00000000, 0x00000000, 0x00000000, 0x00000000,
854 0xFFFFFFF0, 0x23FFFFFF, 0xFF010000, 0x00000003,
855 0xFFF99FE0, 0x23C5FDFF, 0xB0000000, 0x00030003,
856 0xFFF987E0, 0x036DFDFF, 0x5E000000, 0x001C0000,
857 0xFFFBBFE0, 0x23EDFDFF, 0x00010000, 0x00000003,
858 0xFFF99FE0, 0x23EDFDFF, 0xB0000000, 0x00020003,
859 0xD63DC7E8, 0x03BFC718, 0x00000000, 0x00000000,
860 0xFFFDDFE0, 0x03EFFDFF, 0x00000000, 0x00000003,
861 0xFFFDDFE0, 0x23EFFDFF, 0x40000000, 0x00000003,
862 0xFFFDDFE0, 0x03FFFDFF, 0x00000000, 0x00000003,
863 0xFC7FFFE0, 0x2FFBFFFF, 0x0000007F, 0x00000000,
864 0xFFFFFFFE, 0x000DFFFF, 0x0000007F, 0x00000000,
865 0xFEF02596, 0x200DECAE, 0x3000005F, 0x00000000,
866 0x00000001, 0x00000000, 0xFFFFFEFF, 0x000007FF,
867 0x00000F00, 0x00000000, 0x00000000, 0x00000000,
868 0xFFFFFFFF, 0x000006FB, 0x003F0000, 0x00000000,
869 0x00000000, 0xFFFFFFFF, 0xFFFF003F, 0x01FFFFFF,
870 0xFFFFFFFF, 0xFFFFFFFF, 0x83FFFFFF, 0xFFFFFFFF,
871 0xFFFFFFFF, 0xFFFFFF07, 0xFFFFFFFF, 0x03FFFFFF,
872 0xFFFFFF7F, 0xFFFFFFFF, 0x3D7F3D7F, 0xFFFFFFFF,
873 0xFFFF3D7F, 0x7F3D7FFF, 0xFF7F7F3D, 0xFFFF7FFF,
874 0x7F3D7FFF, 0xFFFFFFFF, 0x07FFFF7F, 0x00000000,
875 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x001FFFFF,
876 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
877 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
878 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
879 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
880 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x007F9FFF,
881 0x07FFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0x0001C7FF,
882 0x0003DFFF, 0x0003FFFF, 0x0003FFFF, 0x0001DFFF,
883 0xFFFFFFFF, 0x000FFFFF, 0x10800000, 0x00000000,
884 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00FFFFFF,
885 0xFFFFFFFF, 0x000001FF, 0x00000000, 0x00000000,
886 0x1FFFFFFF, 0x00000000, 0xFFFF0000, 0x001F3FFF,
887 0x00000000, 0x00000000, 0x00000000, 0x00000000,
888 0x00000000, 0x00000000, 0x00000000, 0x00000000,
889 0x00000000, 0x00000000, 0x00000000, 0x00000000,
890 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000FFF,
891 0x00000000, 0x00000000, 0x00000000, 0x00000000,
892 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
893 0x0FFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x03FFFFFF,
894 0x3F3FFFFF, 0xFFFFFFFF, 0xAAFF3F3F, 0x3FFFFFFF,
895 0xFFFFFFFF, 0x5FDFFFFF, 0x0FCF1FDC, 0x1FDC1FFF,
896 0x00000000, 0x00000000, 0x00000000, 0x80020000,
897 0x00000000, 0x00000000, 0x00000000, 0x00000000,
898 0x3E2FFC84, 0xE3FBBD50, 0x000003E0, 0xFFFFFFFF,
899 0x0000000F, 0x00000000, 0x00000000, 0x00000000,
900 0x000000E0, 0x1F3E03FE, 0xFFFFFFFE, 0xFFFFFFFF,
901 0xE07FFFFF, 0xFFFFFFFE, 0xFFFFFFFF, 0xF7FFFFFF,
902 0xFFFFFFE0, 0xFFFE1FFF, 0xFFFFFFFF, 0xFFFFFFFF,
903 0x00007FFF, 0x00FFFFFF, 0x00000000, 0xFFFF0000,
904 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
905 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
906 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
907 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
908 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
909 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
910 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
911 0xFFFFFFFF, 0x003FFFFF, 0x00000000, 0x00000000,
912 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
913 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
914 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
915 0xFFFFFFFF, 0x0000003F, 0x00000000, 0x00000000,
916 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
917 0x00001FFF, 0x00000000, 0x00000000, 0x00000000,
918 0x00000000, 0x00000000, 0x00000000, 0x00000000,
919 0x00000000, 0x00000000, 0x00000000, 0x00000000,
920 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
921 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
922 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
923 0xFFFFFFFF, 0x0000000F, 0x00000000, 0x00000000,
924 0x00000000, 0x00000000, 0x00000000, 0x00000000,
925 0x00000000, 0x00000000, 0x00000000, 0x00000000,
926 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
927 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
928 0xFFFFFFFF, 0xFFFF3FFF, 0xFFFFFFFF, 0x000007FF,
929 0x00000000, 0x00000000, 0x00000000, 0x00000000,
930 0xA0F8007F, 0x5F7FFDFF, 0xFFFFFFDB, 0xFFFFFFFF,
931 0xFFFFFFFF, 0x0003FFFF, 0xFFF80000, 0xFFFFFFFF,
932 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
933 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
934 0xFFFFFFFF, 0x3FFFFFFF, 0xFFFF0000, 0xFFFFFFFF,
935 0xFFFCFFFF, 0xFFFFFFFF, 0x000000FF, 0x0FFF0000,
936 0x00000000, 0x00000000, 0x00000000, 0xFFDF0000,
937 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x1FFFFFFF,
938 0x00000000, 0x07FFFFFE, 0x07FFFFFE, 0xFFFFFFC0,
939 0xFFFFFFFF, 0x7FFFFFFF, 0x1CFCFCFC, 0x00000000,
940 0xFFFFEFFF, 0xB7FFFF7F, 0x3FFF3FFF, 0x00000000,
941 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x07FFFFFF,
942 0x00000000, 0x00000000, 0x00000000, 0x00000000,
943 0x00000000, 0x00000000, 0x00000000, 0x00000000,
944 0x00000000, 0x00000000, 0x00000000, 0x00000000,
945 0x00000000, 0x00000000, 0x00000000, 0x00000000,
946 0x7FFFFFFF, 0xFFFF0000, 0x000007FF, 0x00000000,
947 0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
948 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
949 0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
950 0x00000000, 0x00000000, 0x00000000, 0x00000000,
951 0x00000000, 0x00000000, 0x00000000, 0x00000000,
952 0xFFFFFD3F, 0x91BFFFFF, 0x00000000, 0x00000000,
953 0x00000000, 0x00000000, 0x00000000, 0x00000000,
954 0x00000000, 0x00000000, 0x00000000, 0x00000000,
955 0x00000000, 0x00000000, 0x00000000, 0x00000000,
956 0xFFFFFFFF, 0xFFFFFFFF, 0xFFDFFFFF, 0xFFFFFFFF,
957 0xDFFFFFFF, 0xEBFFDE64, 0xFFFFFFEF, 0xFFFFFFFF,
958 0xDFDFE7BF, 0x7BFFFFFF, 0xFFFDFC5F, 0xFFFFFFFF,
959 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
960 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
961 0xFFFFFFFF, 0xFFFFFF0F, 0xF7FFFFFD, 0xF7FFFFFF,
962 0xFFDFFFFF, 0xFFDFFFFF, 0xFFFF7FFF, 0xFFFF7FFF,
963 0xFFFFFDFF, 0xFFFFFDFF, 0x000003F7, 0x00000000,
964 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
965 0xFFFFFFFF, 0xFFFFFFFF, 0x007FFFFF, 0x00000000,
966 0x00000000, 0x00000000, 0x00000000, 0x00000000,
967 0x00000000, 0x00000000, 0x00000000, 0x00000000,
968 0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
969 0x00000000, 0x00000000, 0x00000000, 0x00000000,
970 0x00000000, 0x00000000, 0x00000000, 0x00000000,
971 0x00000000, 0x00000000, 0x00000000, 0x00000000
975 /* Unicode character classes Lu, Ll, Lt, Lm, Lo, Nl, Nd, Pc, Mn, Mc, Cf,
976 as of Unicode 4.0. */
983 /*unsigned*/ int level3
[36 << 4];
985 table_identifier_part
=
989 16, 144, 272, -1, -1, -1, -1, -1,
990 -1, -1, -1, -1, -1, -1, 400
993 528, 544, 560, 576, 592, 608, 624, 640,
994 656, 672, 688, 704, 720, -1, 736, 752,
995 768, -1, -1, -1, -1, -1, -1, -1,
996 784, -1, 800, 800, 800, 800, 800, 800,
997 800, 800, 800, 800, 800, 800, 816, 800,
998 800, 800, 800, 800, 800, 800, 800, 800,
999 800, 800, 800, 800, 800, 800, 800, 800,
1000 800, 800, 800, 800, 800, 800, 800, 800,
1001 800, 800, 800, 800, 800, 800, 800, 800,
1002 800, 800, 800, 800, 800, 800, 800, 832,
1003 800, 800, 848, -1, -1, -1, 800, 800,
1004 800, 800, 800, 800, 800, 800, 800, 800,
1005 800, 800, 800, 800, 800, 800, 800, 800,
1006 800, 800, 800, 864, -1, -1, -1, -1,
1007 -1, -1, -1, -1, -1, -1, -1, -1,
1008 -1, -1, -1, -1, 880, 896, 912, 928,
1009 944, 960, 976, -1, 992, -1, -1, -1,
1010 -1, -1, -1, -1, -1, -1, -1, -1,
1011 -1, -1, -1, -1, -1, -1, -1, -1,
1012 -1, -1, -1, -1, -1, -1, -1, -1,
1013 -1, -1, -1, -1, -1, -1, -1, -1,
1014 -1, -1, -1, -1, -1, -1, -1, -1,
1015 -1, -1, -1, -1, -1, -1, -1, -1,
1016 -1, -1, -1, -1, -1, -1, -1, -1,
1017 -1, -1, -1, -1, -1, -1, -1, -1,
1018 -1, -1, -1, -1, -1, -1, -1, -1,
1019 -1, -1, -1, -1, -1, -1, -1, -1,
1020 -1, -1, -1, -1, -1, -1, -1, -1,
1021 -1, -1, -1, -1, -1, -1, -1, -1,
1022 1008, -1, 1024, 1040, -1, -1, -1, -1,
1023 -1, -1, -1, -1, -1, -1, -1, -1,
1024 -1, -1, -1, -1, -1, -1, -1, -1,
1025 800, 800, 800, 800, 800, 800, 800, 800,
1026 800, 800, 800, 800, 800, 800, 800, 800,
1027 800, 800, 800, 800, 800, 800, 800, 800,
1028 800, 800, 800, 800, 800, 800, 800, 800,
1029 800, 800, 800, 800, 800, 800, 800, 800,
1030 800, 800, 800, 800, 800, 800, 800, 800,
1031 800, 800, 800, 800, 800, 800, 800, 800,
1032 800, 800, 800, 800, 800, 800, 800, 800,
1033 800, 800, 800, 800, 800, 800, 800, 800,
1034 800, 800, 800, 800, 800, 800, 800, 800,
1035 800, 800, 800, 1056, -1, -1, -1, -1,
1036 -1, -1, -1, -1, -1, -1, -1, -1,
1037 -1, -1, -1, -1, -1, -1, -1, -1,
1038 -1, -1, -1, -1, -1, -1, -1, -1,
1039 -1, -1, -1, -1, -1, -1, -1, -1,
1040 -1, -1, -1, -1, 800, 1072, -1, -1,
1041 1088, -1, -1, -1, -1, -1, -1, -1,
1042 -1, -1, -1, -1, -1, -1, -1, -1,
1043 -1, -1, -1, -1, -1, -1, -1, -1,
1044 -1, -1, -1, -1, -1, -1, -1, -1,
1045 -1, -1, -1, -1, -1, -1, -1, -1,
1046 -1, -1, -1, -1, -1, -1, -1, -1,
1047 -1, -1, -1, -1, -1, -1, -1, -1,
1048 -1, -1, -1, -1, -1, -1, -1, -1,
1049 -1, -1, -1, -1, -1, -1, -1, -1,
1050 -1, -1, -1, -1, -1, -1, -1, -1,
1051 -1, -1, -1, -1, -1, -1, -1, -1,
1052 -1, -1, -1, -1, -1, -1, -1, -1,
1053 -1, -1, -1, -1, -1, -1, -1, -1,
1054 -1, -1, -1, -1, -1, -1, -1, -1,
1055 -1, -1, -1, -1, -1, -1, -1, -1,
1056 -1, -1, -1, -1, -1, -1, -1, -1
1059 0x00000000, 0x03FF0000, 0x87FFFFFE, 0x07FFFFFE,
1060 0x00000000, 0x04202400, 0xFF7FFFFF, 0xFF7FFFFF,
1061 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1062 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1063 0xFFFFFFFF, 0x007FFFFF, 0xFFFF0000, 0xFFFFFFFF,
1064 0xFFFFFFFF, 0xFFFFFFFF, 0x0003FFC3, 0x0000401F,
1065 0xFFFFFFFF, 0xFFFFFFFF, 0xE0FFFFFF, 0x0400FFFF,
1066 0xFFFFD740, 0xFFFFFFFB, 0xFFFF7FFF, 0x0FBFFFFF,
1067 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1068 0xFFFFFC7B, 0xFFFFFFFF, 0xFFFF7FFF, 0x033FFFFF,
1069 0x0000FFFF, 0xFFFE0000, 0x027FFFFF, 0xFFFFFFFE,
1070 0xFFFE00FF, 0xBBFFFFFB, 0xFFFF0016, 0x000707FF,
1071 0x003F000F, 0x07FFFFFE, 0x01FFFFFF, 0xFFFFC3FF,
1072 0xFFFFFFFF, 0xFFFFFFFF, 0xBFEFFFFF, 0x9FFFFDFF,
1073 0xFFFF8000, 0xFFFFFFFF, 0x0000E7FF, 0x00000000,
1074 0xFFFFFFFF, 0x0003FFFF, 0x00000000, 0x00000000,
1075 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1076 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1077 0xFFFFFFFE, 0xF3FFFFFF, 0xFF1F3FFF, 0x0000FFCF,
1078 0xFFF99FEE, 0xF3C5FDFF, 0xB080399F, 0x0003FFCF,
1079 0xFFF987EE, 0xD36DFDFF, 0x5E003987, 0x001FFFC0,
1080 0xFFFBBFEE, 0xF3EDFDFF, 0x00013BBF, 0x0000FFCF,
1081 0xFFF99FEE, 0xF3EDFDFF, 0xB0C0398F, 0x0002FFC3,
1082 0xD63DC7EC, 0xC3BFC718, 0x00803DC7, 0x0000FF80,
1083 0xFFFDDFEE, 0xC3EFFDFF, 0x00603DDF, 0x0000FFC3,
1084 0xFFFDDFEC, 0xF3EFFDFF, 0x40603DDF, 0x0000FFC3,
1085 0xFFFDDFEC, 0xC3FFFDFF, 0x00803DCF, 0x0000FFC3,
1086 0xFC7FFFEC, 0x2FFBFFFF, 0xFF5F847F, 0x000C0000,
1087 0xFFFFFFFE, 0x07FFFFFF, 0x03FF7FFF, 0x00000000,
1088 0xFEF02596, 0x3BFFECAE, 0x33FF3F5F, 0x00000000,
1089 0x03000001, 0xC2A003FF, 0xFFFFFEFF, 0xFFFE07FF,
1090 0xFEFF0FDF, 0x1FFFFFFF, 0x00000040, 0x00000000,
1091 0xFFFFFFFF, 0x03C7F6FB, 0x03FF03FF, 0x00000000,
1092 0x00000000, 0xFFFFFFFF, 0xFFFF003F, 0x01FFFFFF,
1093 0xFFFFFFFF, 0xFFFFFFFF, 0x83FFFFFF, 0xFFFFFFFF,
1094 0xFFFFFFFF, 0xFFFFFF07, 0xFFFFFFFF, 0x03FFFFFF,
1095 0xFFFFFF7F, 0xFFFFFFFF, 0x3D7F3D7F, 0xFFFFFFFF,
1096 0xFFFF3D7F, 0x7F3D7FFF, 0xFF7F7F3D, 0xFFFF7FFF,
1097 0x7F3D7FFF, 0xFFFFFFFF, 0x07FFFF7F, 0x0003FE00,
1098 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x001FFFFF,
1099 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1100 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1101 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1102 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1103 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x007F9FFF,
1104 0x07FFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0x0001C7FF,
1105 0x001FDFFF, 0x001FFFFF, 0x000FFFFF, 0x000DDFFF,
1106 0xFFFFFFFF, 0xFFFFFFFF, 0x308FFFFF, 0x000003FF,
1107 0x03FF3800, 0xFFFFFFFF, 0xFFFFFFFF, 0x00FFFFFF,
1108 0xFFFFFFFF, 0x000003FF, 0x00000000, 0x00000000,
1109 0x1FFFFFFF, 0x0FFF0FFF, 0xFFFFFFC0, 0x001F3FFF,
1110 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1111 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1112 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1113 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000FFF,
1114 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1115 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1116 0x0FFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x03FFFFFF,
1117 0x3F3FFFFF, 0xFFFFFFFF, 0xAAFF3F3F, 0x3FFFFFFF,
1118 0xFFFFFFFF, 0x5FDFFFFF, 0x0FCF1FDC, 0x1FDC1FFF,
1119 0x0000F000, 0x80007C00, 0x00100001, 0x8002FC0F,
1120 0x00000000, 0x00000000, 0x1FFF0000, 0x000007E2,
1121 0x3E2FFC84, 0xE3FBBD50, 0x000003E0, 0xFFFFFFFF,
1122 0x0000000F, 0x00000000, 0x00000000, 0x00000000,
1123 0x000000E0, 0x1F3EFFFE, 0xFFFFFFFE, 0xFFFFFFFF,
1124 0xE67FFFFF, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF,
1125 0xFFFFFFE0, 0xFFFE1FFF, 0xFFFFFFFF, 0xFFFFFFFF,
1126 0x00007FFF, 0x00FFFFFF, 0x00000000, 0xFFFF0000,
1127 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1128 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1129 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1130 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1131 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1132 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1133 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1134 0xFFFFFFFF, 0x003FFFFF, 0x00000000, 0x00000000,
1135 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1136 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1137 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1138 0xFFFFFFFF, 0x0000003F, 0x00000000, 0x00000000,
1139 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1140 0x00001FFF, 0x00000000, 0x00000000, 0x00000000,
1141 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1142 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1143 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1144 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1145 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1146 0xFFFFFFFF, 0x0000000F, 0x00000000, 0x00000000,
1147 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1148 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1149 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1150 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1151 0xFFFFFFFF, 0xFFFF3FFF, 0xFFFFFFFF, 0x000007FF,
1152 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1153 0xE0F8007F, 0x5F7FFDFF, 0xFFFFFFDB, 0xFFFFFFFF,
1154 0xFFFFFFFF, 0x0003FFFF, 0xFFF80000, 0xFFFFFFFF,
1155 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1156 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1157 0xFFFFFFFF, 0x3FFFFFFF, 0xFFFF0000, 0xFFFFFFFF,
1158 0xFFFCFFFF, 0xFFFFFFFF, 0x000000FF, 0x0FFF0000,
1159 0x0000FFFF, 0x0018000F, 0x0000E000, 0xFFDF0000,
1160 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x9FFFFFFF,
1161 0x03FF0000, 0x87FFFFFE, 0x07FFFFFE, 0xFFFFFFE0,
1162 0xFFFFFFFF, 0x7FFFFFFF, 0x1CFCFCFC, 0x0E000000,
1163 0xFFFFEFFF, 0xB7FFFF7F, 0x3FFF3FFF, 0x00000000,
1164 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x07FFFFFF,
1165 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1166 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1167 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1168 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1169 0x7FFFFFFF, 0xFFFF0000, 0x000007FF, 0x00000000,
1170 0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
1171 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1172 0x3FFFFFFF, 0x000003FF, 0x00000000, 0x00000000,
1173 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1174 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1175 0xFFFFFD3F, 0x91BFFFFF, 0x00000000, 0x00000000,
1176 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1177 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1178 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1179 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1180 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1181 0x00000000, 0x00000000, 0x00000000, 0xFFFFE3E0,
1182 0x00000FE7, 0x00003C00, 0x00000000, 0x00000000,
1183 0xFFFFFFFF, 0xFFFFFFFF, 0xFFDFFFFF, 0xFFFFFFFF,
1184 0xDFFFFFFF, 0xEBFFDE64, 0xFFFFFFEF, 0xFFFFFFFF,
1185 0xDFDFE7BF, 0x7BFFFFFF, 0xFFFDFC5F, 0xFFFFFFFF,
1186 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1187 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1188 0xFFFFFFFF, 0xFFFFFF0F, 0xF7FFFFFD, 0xF7FFFFFF,
1189 0xFFDFFFFF, 0xFFDFFFFF, 0xFFFF7FFF, 0xFFFF7FFF,
1190 0xFFFFFDFF, 0xFFFFFDFF, 0xFFFFC3F7, 0xFFFFFFFF,
1191 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1192 0xFFFFFFFF, 0xFFFFFFFF, 0x007FFFFF, 0x00000000,
1193 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1194 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1195 0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
1196 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1197 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1198 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1199 0x00000002, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1200 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1201 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1202 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0000FFFF
1206 /* Return true if a given character can occur as first character of an
1207 identifier. See ECMA-334 section 9.4.2. */
1209 is_identifier_start (int c
)
1211 return bitmap_lookup (&table_identifier_start
, c
);
1212 /* In ASCII only this would be:
1213 return ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_');
1217 /* Return true if a given character can occur as character of an identifier.
1218 See ECMA-334 section 9.4.2. */
1220 is_identifier_part (int c
)
1222 return bitmap_lookup (&table_identifier_part
, c
);
1223 /* In ASCII only this would be:
1224 return ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1225 || (c >= '0' && c <= '9') || c == '_');
1230 is_any_character (int c
)
1236 /* ======================= Preprocessor directives. ======================= */
1239 /* Phase 5: Remove preprocessor lines. See ECMA-334 section 9.5.
1240 As a side effect, this also removes initial whitespace on every line;
1241 this whitespace doesn't matter. */
1243 static int phase5_pushback
[10];
1244 static int phase5_pushback_length
;
1251 if (phase5_pushback_length
)
1252 return phase5_pushback
[--phase5_pushback_length
];
1260 while (c
!= UEOF
&& is_whitespace (c
));
1264 /* Ignore the entire line containing the preprocessor directive
1265 (including the // comment if it contains one). */
1268 while (c
!= UEOF
&& c
!= UNL
);
1280 phase5_ungetc (int c
)
1284 if (phase5_pushback_length
== SIZEOF (phase5_pushback
))
1286 phase5_pushback
[phase5_pushback_length
++] = c
;
1292 /* ========================== Reading of tokens. ========================== */
1297 token_type_lparen
, /* ( */
1298 token_type_rparen
, /* ) */
1299 token_type_lbrace
, /* { */
1300 token_type_rbrace
, /* } */
1301 token_type_comma
, /* , */
1302 token_type_dot
, /* . */
1303 token_type_string_literal
, /* "abc", @"abc" */
1304 token_type_number
, /* 1.23 */
1305 token_type_symbol
, /* identifier, keyword, null */
1306 token_type_plus
, /* + */
1307 token_type_other
/* character literal, misc. operator */
1309 typedef enum token_type_ty token_type_ty
;
1311 typedef struct token_ty token_ty
;
1315 char *string
; /* for token_type_string_literal, token_type_symbol */
1316 refcounted_string_list_ty
*comment
; /* for token_type_string_literal */
1318 int logical_line_number
;
1322 /* Free the memory pointed to by a 'struct token_ty'. */
1324 free_token (token_ty
*tp
)
1326 if (tp
->type
== token_type_string_literal
|| tp
->type
== token_type_symbol
)
1328 if (tp
->type
== token_type_string_literal
)
1329 drop_reference (tp
->comment
);
1333 /* Read a Unicode escape sequence outside string/character literals.
1334 Reject Unicode escapes that don't fulfill the given predicate.
1335 See ECMA-334 section 9.4.2. */
1337 do_getc_unicode_escaped (bool (*predicate
) (int))
1341 /* Use phase 3, because phase 4 elides comments. */
1345 if (c
== 'u' || c
== 'U')
1347 unsigned char buf
[8];
1352 expect
= (c
== 'U' ? 8 : 4);
1354 for (i
= 0; i
< expect
; i
++)
1356 int c1
= phase3_getc ();
1358 if (c1
>= '0' && c1
<= '9')
1359 n
= (n
<< 4) + (c1
- '0');
1360 else if (c1
>= 'A' && c1
<= 'F')
1361 n
= (n
<< 4) + (c1
- 'A' + 10);
1362 else if (c1
>= 'a' && c1
<= 'f')
1363 n
= (n
<< 4) + (c1
- 'a' + 10);
1368 phase3_ungetc (buf
[i
]);
1378 error_with_progname
= false;
1379 error (0, 0, _("%s:%d: warning: invalid Unicode character"),
1380 logical_file_name
, line_number
);
1381 error_with_progname
= true;
1383 else if (predicate (n
))
1387 phase3_ungetc (buf
[i
]);
1394 /* Read an escape sequence inside a string literal or character literal.
1395 See ECMA-334 sections 9.4.4.4., 9.4.4.5. */
1403 /* Use phase 3, because phase 4 elides comments. */
1437 phase3_ungetc ('x');
1440 case '0': case '1': case '2': case '3': case '4':
1441 case '5': case '6': case '7': case '8': case '9':
1442 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1443 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1454 case '0': case '1': case '2': case '3': case '4':
1455 case '5': case '6': case '7': case '8': case '9':
1456 n
= n
* 16 + c
- '0';
1458 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1459 n
= n
* 16 + 10 + c
- 'A';
1461 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1462 n
= n
* 16 + 10 + c
- 'a';
1472 return do_getc_unicode_escaped (is_any_character
);
1474 /* Invalid escape sequence. */
1480 /* Read a regular string literal or character literal.
1481 See ECMA-334 sections 9.4.4.4., 9.4.4.5. */
1483 accumulate_escaped (struct string_buffer
*literal
, int delimiter
)
1489 /* Use phase 3, because phase 4 elides comments. */
1491 if (c
== UEOF
|| c
== delimiter
)
1496 error_with_progname
= false;
1497 if (delimiter
== '\'')
1498 error (0, 0, _("%s:%d: warning: unterminated character constant"),
1499 logical_file_name
, line_number
);
1501 error (0, 0, _("%s:%d: warning: unterminated string constant"),
1502 logical_file_name
, line_number
);
1503 error_with_progname
= true;
1507 c
= do_getc_escaped ();
1508 string_buffer_append_unicode (literal
, c
);
1513 /* Combine characters into tokens. Discard whitespace. */
1515 /* Maximum used guaranteed to be < 4. */
1516 static token_ty phase6_pushback
[4];
1517 static int phase6_pushback_length
;
1520 phase6_get (token_ty
*tp
)
1524 if (phase6_pushback_length
)
1526 *tp
= phase6_pushback
[--phase6_pushback_length
];
1533 tp
->line_number
= line_number
;
1534 tp
->logical_line_number
= logical_line_number
;
1539 tp
->type
= token_type_eof
;
1546 if (last_non_comment_line
> last_comment_line
)
1547 savable_comment_reset ();
1552 /* Ignore whitespace and comments. */
1556 last_non_comment_line
= tp
->logical_line_number
;
1561 tp
->type
= token_type_lparen
;
1565 tp
->type
= token_type_rparen
;
1569 tp
->type
= token_type_lbrace
;
1573 tp
->type
= token_type_rbrace
;
1577 tp
->type
= token_type_comma
;
1582 if (!(c
>= '0' && c
<= '9'))
1585 tp
->type
= token_type_dot
;
1590 case '0': case '1': case '2': case '3': case '4':
1591 case '5': case '6': case '7': case '8': case '9':
1593 /* Don't need to verify the complicated syntax of integers and
1594 floating-point numbers. We assume a valid C# input.
1595 The simplified syntax that we recognize as number is: any
1596 sequence of alphanumeric characters, additionally '+' and '-'
1597 immediately after 'e' or 'E' except in hexadecimal numbers. */
1598 bool hexadecimal
= false;
1603 if (c
>= '0' && c
<= '9')
1605 if ((c
>= 'A' && c
<= 'Z') || (c
>= 'a' &&c
<= 'z'))
1607 if (c
== 'X' || c
== 'x')
1609 if ((c
== 'E' || c
== 'e') && !hexadecimal
)
1612 if (!(c
== '+' || c
== '-'))
1622 tp
->type
= token_type_number
;
1627 /* Regular string literal. */
1629 struct string_buffer literal
;
1631 init_string_buffer (&literal
);
1632 accumulate_escaped (&literal
, '"');
1633 tp
->string
= xstrdup (string_buffer_result (&literal
));
1634 free_string_buffer (&literal
);
1635 tp
->comment
= add_reference (savable_comment
);
1636 tp
->type
= token_type_string_literal
;
1641 /* Character literal. */
1643 struct string_buffer literal
;
1645 init_string_buffer (&literal
);
1646 accumulate_escaped (&literal
, '\'');
1647 free_string_buffer (&literal
);
1648 tp
->type
= token_type_other
;
1656 tp
->type
= token_type_other
;
1659 tp
->type
= token_type_other
;
1664 tp
->type
= token_type_plus
;
1672 /* Verbatim string literal. */
1673 struct string_buffer literal
;
1675 init_string_buffer (&literal
);
1678 /* Use phase 2, because phase 4 elides comments and phase 3
1679 mixes up the newline characters. */
1692 /* No special treatment of newline and backslash here. */
1693 string_buffer_append_unicode (&literal
, c
);
1695 tp
->string
= xstrdup (string_buffer_result (&literal
));
1696 free_string_buffer (&literal
);
1697 tp
->comment
= add_reference (savable_comment
);
1698 tp
->type
= token_type_string_literal
;
1701 /* FALLTHROUGH, so that @identifier is recognized. */
1705 c
= do_getc_unicode_escaped (is_identifier_start
);
1706 if (is_identifier_start (c
))
1708 static struct string_buffer buffer
;
1709 buffer
.utf8_buflen
= 0;
1712 string_buffer_append_unicode (&buffer
, c
);
1715 c
= do_getc_unicode_escaped (is_identifier_part
);
1716 if (!is_identifier_part (c
))
1720 tp
->string
= xstrdup (string_buffer_result (&buffer
));
1721 tp
->type
= token_type_symbol
;
1726 /* Misc. operator. */
1727 tp
->type
= token_type_other
;
1734 /* Supports 3 tokens of pushback. */
1736 phase6_unget (token_ty
*tp
)
1738 if (tp
->type
!= token_type_eof
)
1740 if (phase6_pushback_length
== SIZEOF (phase6_pushback
))
1742 phase6_pushback
[phase6_pushback_length
++] = *tp
;
1747 /* Compile-time optimization of string literal concatenation.
1748 Combine "string1" + ... + "stringN" to the concatenated string if
1749 - the token after this expression is not '.' (because then the last
1750 string could be part of a method call expression). */
1752 static token_ty phase7_pushback
[2];
1753 static int phase7_pushback_length
;
1756 phase7_get (token_ty
*tp
)
1758 if (phase7_pushback_length
)
1760 *tp
= phase7_pushback
[--phase7_pushback_length
];
1765 if (tp
->type
== token_type_string_literal
)
1767 char *sum
= tp
->string
;
1768 size_t sum_len
= strlen (sum
);
1774 phase6_get (&token2
);
1775 if (token2
.type
== token_type_plus
)
1779 phase6_get (&token3
);
1780 if (token3
.type
== token_type_string_literal
)
1782 token_ty token_after
;
1784 phase6_get (&token_after
);
1785 if (token_after
.type
!= token_type_dot
)
1787 char *addend
= token3
.string
;
1788 size_t addend_len
= strlen (addend
);
1790 sum
= (char *) xrealloc (sum
, sum_len
+ addend_len
+ 1);
1791 memcpy (sum
+ sum_len
, addend
, addend_len
+ 1);
1792 sum_len
+= addend_len
;
1794 phase6_unget (&token_after
);
1795 free_token (&token3
);
1796 free_token (&token2
);
1799 phase6_unget (&token_after
);
1801 phase6_unget (&token3
);
1803 phase6_unget (&token2
);
1810 /* Supports 2 tokens of pushback. */
1812 phase7_unget (token_ty
*tp
)
1814 if (tp
->type
!= token_type_eof
)
1816 if (phase7_pushback_length
== SIZEOF (phase7_pushback
))
1818 phase7_pushback
[phase7_pushback_length
++] = *tp
;
1824 x_csharp_lex (token_ty
*tp
)
1829 /* Supports 2 tokens of pushback. */
1831 x_csharp_unlex (token_ty
*tp
)
1837 /* ========================= Extracting strings. ========================== */
1840 /* Context lookup table. */
1841 static flag_context_list_table_ty
*flag_context_list_table
;
1844 /* The file is broken into tokens. Scan the token stream, looking for
1845 a keyword, followed by a left paren, followed by a string. When we
1846 see this sequence, we have something to remember. We assume we are
1847 looking at a valid C# program, and leave the complaints about
1848 the grammar to the compiler.
1850 Normal handling: Look for
1851 keyword ( ... msgid ... )
1852 Plural handling: Look for
1853 keyword ( ... msgid ... msgid_plural ... )
1855 We use recursion because the arguments before msgid or between msgid
1856 and msgid_plural can contain subexpressions of the same form. */
1859 /* Extract messages until the next balanced closing parenthesis or brace,
1860 depending on TERMINATOR.
1861 Extracted messages are added to MLP.
1862 When a specific argument shall be extracted, COMMAS_TO_SKIP >= 0 and,
1863 if also a plural argument shall be extracted, PLURAL_COMMAS > 0,
1864 otherwise PLURAL_COMMAS = 0.
1865 When no specific argument shall be extracted, COMMAS_TO_SKIP < 0.
1866 Return true upon eof, false upon closing parenthesis or brace. */
1868 extract_parenthesized (message_list_ty
*mlp
, token_type_ty terminator
,
1869 flag_context_ty outer_context
,
1870 flag_context_list_iterator_ty context_iter
,
1871 int commas_to_skip
, int plural_commas
)
1873 /* Remember the message containing the msgid, for msgid_plural. */
1874 message_ty
*plural_mp
= NULL
;
1876 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1878 /* Parameters of the keyword just seen. Defined only in state 1. */
1879 int next_commas_to_skip
= -1;
1880 int next_plural_commas
= 0;
1881 /* Context iterator that will be used if the next token is a '('. */
1882 flag_context_list_iterator_ty next_context_iter
=
1883 passthrough_context_list_iterator
;
1884 /* Current context. */
1885 flag_context_ty inner_context
=
1886 inherited_context (outer_context
,
1887 flag_context_list_iterator_advance (&context_iter
));
1889 /* Start state is 0. */
1896 x_csharp_lex (&token
);
1899 case token_type_symbol
:
1901 /* Combine symbol1 . ... . symbolN to a single string, so that
1902 we can recognize static function calls like
1903 GettextResource.gettext. The information present for
1904 symbolI.....symbolN has precedence over the information for
1905 symbolJ.....symbolN with J > I. */
1906 char *sum
= token
.string
;
1907 size_t sum_len
= strlen (sum
);
1908 const char *dottedname
;
1909 flag_context_list_ty
*context_list
;
1915 x_csharp_lex (&token2
);
1916 if (token2
.type
== token_type_dot
)
1920 x_csharp_lex (&token3
);
1921 if (token3
.type
== token_type_symbol
)
1923 char *addend
= token3
.string
;
1924 size_t addend_len
= strlen (addend
);
1927 (char *) xrealloc (sum
, sum_len
+ 1 + addend_len
+ 1);
1929 memcpy (sum
+ sum_len
+ 1, addend
, addend_len
+ 1);
1930 sum_len
+= 1 + addend_len
;
1932 free_token (&token3
);
1933 free_token (&token2
);
1936 x_csharp_unlex (&token3
);
1938 x_csharp_unlex (&token2
);
1942 for (dottedname
= sum
;;)
1944 void *keyword_value
;
1946 if (find_entry (&keywords
, dottedname
, strlen (dottedname
),
1950 int argnum1
= (int) (long) keyword_value
& ((1 << 10) - 1);
1951 int argnum2
= (int) (long) keyword_value
>> 10;
1953 next_commas_to_skip
= argnum1
- 1;
1954 next_plural_commas
= (argnum2
> argnum1
? argnum2
- argnum1
: 0);
1959 dottedname
= strchr (dottedname
, '.');
1960 if (dottedname
== NULL
)
1968 for (dottedname
= sum
;;)
1971 flag_context_list_table_lookup (
1972 flag_context_list_table
,
1973 dottedname
, strlen (dottedname
));
1974 if (context_list
!= NULL
)
1977 dottedname
= strchr (dottedname
, '.');
1978 if (dottedname
== NULL
)
1982 next_context_iter
= flag_context_list_iterator (context_list
);
1988 case token_type_lparen
:
1989 if (extract_parenthesized (mlp
, token_type_rparen
,
1990 inner_context
, next_context_iter
,
1991 state
? next_commas_to_skip
: -1,
1992 state
? next_plural_commas
: 0))
1994 next_context_iter
= null_context_list_iterator
;
1998 case token_type_rparen
:
1999 if (terminator
== token_type_rparen
)
2001 if (terminator
== token_type_rbrace
)
2003 error_with_progname
= false;
2005 _("%s:%d: warning: ')' found where '}' was expected"),
2006 logical_file_name
, token
.line_number
);
2007 error_with_progname
= true;
2009 next_context_iter
= null_context_list_iterator
;
2013 case token_type_lbrace
:
2014 if (extract_parenthesized (mlp
, token_type_rbrace
,
2015 null_context
, null_context_list_iterator
,
2018 next_context_iter
= null_context_list_iterator
;
2022 case token_type_rbrace
:
2023 if (terminator
== token_type_rbrace
)
2025 if (terminator
== token_type_rparen
)
2027 error_with_progname
= false;
2029 _("%s:%d: warning: '}' found where ')' was expected"),
2030 logical_file_name
, token
.line_number
);
2031 error_with_progname
= true;
2033 next_context_iter
= null_context_list_iterator
;
2037 case token_type_comma
:
2038 if (commas_to_skip
>= 0)
2040 if (commas_to_skip
> 0)
2043 if (plural_mp
!= NULL
&& plural_commas
> 0)
2045 commas_to_skip
= plural_commas
- 1;
2049 commas_to_skip
= -1;
2052 inherited_context (outer_context
,
2053 flag_context_list_iterator_advance (
2055 next_context_iter
= passthrough_context_list_iterator
;
2059 case token_type_string_literal
:
2062 pos
.file_name
= logical_file_name
;
2063 pos
.line_number
= token
.line_number
;
2067 xgettext_current_source_encoding
= po_charset_utf8
;
2068 savable_comment_to_xgettext_comment (token
.comment
);
2069 remember_a_message (mlp
, token
.string
, inner_context
, &pos
);
2070 savable_comment_reset ();
2071 xgettext_current_source_encoding
= xgettext_global_source_encoding
;
2075 if (commas_to_skip
== 0)
2077 if (plural_mp
== NULL
)
2079 /* Seen an msgid. */
2082 xgettext_current_source_encoding
= po_charset_utf8
;
2083 savable_comment_to_xgettext_comment (token
.comment
);
2084 mp
= remember_a_message (mlp
, token
.string
,
2085 inner_context
, &pos
);
2086 savable_comment_reset ();
2087 xgettext_current_source_encoding
= xgettext_global_source_encoding
;
2088 if (plural_commas
> 0)
2093 /* Seen an msgid_plural. */
2094 xgettext_current_source_encoding
= po_charset_utf8
;
2095 remember_a_message_plural (plural_mp
, token
.string
,
2096 inner_context
, &pos
);
2097 xgettext_current_source_encoding
= xgettext_global_source_encoding
;
2102 free (token
.string
);
2105 drop_reference (token
.comment
);
2106 next_context_iter
= null_context_list_iterator
;
2110 case token_type_eof
:
2113 case token_type_dot
:
2114 case token_type_number
:
2115 case token_type_plus
:
2116 case token_type_other
:
2117 next_context_iter
= null_context_list_iterator
;
2129 extract_csharp (FILE *f
,
2130 const char *real_filename
, const char *logical_filename
,
2131 flag_context_list_table_ty
*flag_table
,
2132 msgdomain_list_ty
*mdlp
)
2134 message_list_ty
*mlp
= mdlp
->item
[0]->messages
;
2137 real_file_name
= real_filename
;
2138 logical_file_name
= xstrdup (logical_filename
);
2141 logical_line_number
= 1;
2142 last_comment_line
= -1;
2143 last_non_comment_line
= -1;
2145 flag_context_list_table
= flag_table
;
2149 /* Eat tokens until eof is seen. When extract_parenthesized returns
2150 due to an unbalanced closing parenthesis, just restart it. */
2151 while (!extract_parenthesized (mlp
, token_type_eof
,
2152 null_context
, null_context_list_iterator
,
2157 real_file_name
= NULL
;
2158 logical_file_name
= NULL
;