No empty .Rs/.Re
[netbsd-mini2440.git] / gnu / dist / gettext / gettext-tools / src / x-csharp.c
blob4dd27e467edf555e7fd613d52e7ec63a55f597e3
1 /* xgettext C# backend.
2 Copyright (C) 2003 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2003.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
23 #include <errno.h>
24 #include <stdbool.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
29 #include "message.h"
30 #include "xgettext.h"
31 #include "x-csharp.h"
32 #include "c-ctype.h"
33 #include "error.h"
34 #include "error-progname.h"
35 #include "xalloc.h"
36 #include "exit.h"
37 #include "hash.h"
38 #include "po-charset.h"
39 #include "utf8-ucs4.h"
40 #include "ucs4-utf8.h"
41 #include "gettext.h"
/* Shorthand for looking up the translation of the message S.  */
#define _(s) gettext (s)

/* Number of elements of the array A.  Only meaningful for true arrays,
   not for pointers.  The argument is parenthesized so that any array
   expression can be passed.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
48 /* The C# syntax is defined in ECMA-334, second edition. */
51 /* ====================== Keyword set customization. ====================== */
53 /* If true extract all strings. */
54 static bool extract_all = false;
56 static hash_table keywords;
57 static bool default_keywords = true;
60 void
61 x_csharp_extract_all ()
63 extract_all = true;
67 /* Processes a --keyword option.
68 Non-ASCII function names can be used if given in UTF-8 encoding. */
69 void
70 x_csharp_keyword (const char *name)
72 if (name == NULL)
73 default_keywords = false;
74 else
76 const char *end;
77 int argnum1;
78 int argnum2;
79 const char *colon;
81 if (keywords.table == NULL)
82 init_hash (&keywords, 100);
84 split_keywordspec (name, &end, &argnum1, &argnum2);
86 /* The characters between name and end should form a valid C#
87 identifier sequence with dots.
88 A colon means an invalid parse in split_keywordspec(). */
89 colon = strchr (name, ':');
90 if (colon == NULL || colon >= end)
92 if (argnum1 == 0)
93 argnum1 = 1;
94 insert_entry (&keywords, name, end - name,
95 (void *) (long) (argnum1 + (argnum2 << 10)));
100 /* Finish initializing the keywords hash table.
101 Called after argument processing, before each file is processed. */
102 static void
103 init_keywords ()
105 if (default_keywords)
107 x_csharp_keyword ("GetString"); /* Resource{Manager,Set}.GetString */
108 x_csharp_keyword ("GetPluralString:1,2"); /* GettextResource{Manager,Set}.GetPluralString */
109 default_keywords = false;
/* Records the format string flags of the built-in C# keywords through
   xgettext_record_flag ().  */
void
init_flag_table_csharp (void)
{
  xgettext_record_flag ("GetString:1:pass-csharp-format");
  xgettext_record_flag ("GetPluralString:1:pass-csharp-format");
  xgettext_record_flag ("GetPluralString:2:pass-csharp-format");
  xgettext_record_flag ("String.Format:1:csharp-format");
}
123 /* ======================== Reading of characters. ======================== */
/* Real filename, used in error messages about the input file.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.  */
static char *logical_file_name;
static int line_number;

/* The input file stream.  */
static FILE *fp;
/* Phase 1: line_number handling.  */

/* Maximum used, roughly a safer MB_LEN_MAX.  */
#define MAX_PHASE1_PUSHBACK 16

/* Stack of pushed-back bytes; the most recently pushed byte is read
   first.  */
static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
static int phase1_pushback_length;
143 /* Read the next single byte from the input file. */
144 static int
145 phase1_getc ()
147 int c;
149 if (phase1_pushback_length)
151 c = phase1_pushback[--phase1_pushback_length];
152 if (c == '\n')
153 ++line_number;
154 return c;
157 c = getc (fp);
158 if (c == EOF)
160 if (ferror (fp))
161 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
162 real_file_name);
163 return EOF;
166 if (c == '\n')
167 ++line_number;
168 return c;
171 /* Supports MAX_PHASE1_PUSHBACK characters of pushback. */
172 static void
173 phase1_ungetc (int c)
175 if (c != EOF)
177 if (c == '\n')
178 --line_number;
179 if (phase1_pushback_length == SIZEOF (phase1_pushback))
180 abort ();
181 phase1_pushback[phase1_pushback_length++] = c;
/* Phase 2: Conversion to Unicode.
   This is done early because ECMA-334 section 9.1. says that the source is
   "an ordered sequence of Unicode characters", and because the recognition
   of the line terminators (ECMA-334 section 9.3.1) is hardly possible without
   prior conversion to Unicode.  */

/* End-of-file indicator for functions returning an UCS-4 character.
   Parenthesized so that the negative constant is safe in any expression.  */
#define UEOF (-1)

/* Newline Unicode character.  */
#define UNL 0x000a

/* Single-slot pushback for phase2_getc ().  */
static int phase2_pushback[1];
static int phase2_pushback_length;
201 /* Read the next Unicode UCS-4 character from the input file. */
202 static int
203 phase2_getc ()
205 if (phase2_pushback_length)
206 return phase2_pushback[--phase2_pushback_length];
208 if (xgettext_current_source_encoding == po_charset_ascii)
210 int c = phase1_getc ();
211 if (c == EOF)
212 return UEOF;
213 if (!c_isascii (c))
215 char buffer[21];
216 sprintf (buffer, ":%ld", (long) line_number);
217 multiline_error (xstrdup (""),
218 xasprintf (_("\
219 Non-ASCII string at %s%s.\n\
220 Please specify the source encoding through --from-code.\n"),
221 real_file_name, buffer));
222 exit (EXIT_FAILURE);
224 return c;
226 else if (xgettext_current_source_encoding != po_charset_utf8)
228 #if HAVE_ICONV
229 /* Use iconv on an increasing number of bytes. Read only as many bytes
230 through phase1_getc as needed. This is needed to give reasonable
231 interactive behaviour when fp is connected to an interactive tty. */
232 unsigned char buf[MAX_PHASE1_PUSHBACK];
233 size_t bufcount = 0;
235 for (;;)
237 unsigned char scratchbuf[6];
238 const char *inptr = (const char *) &buf[0];
239 size_t insize = bufcount;
240 char *outptr = (char *) &scratchbuf[0];
241 size_t outsize = sizeof (scratchbuf);
243 size_t res = iconv (xgettext_current_source_iconv,
244 (ICONV_CONST char **) &inptr, &insize,
245 &outptr, &outsize);
246 /* We expect that a character has been produced if and only if
247 some input bytes have been consumed. */
248 if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
249 abort ();
250 if (outsize == sizeof (scratchbuf))
252 /* No character has been produced. Must be an error. */
253 if (res != (size_t)(-1))
254 abort ();
256 if (errno == EILSEQ)
258 /* An invalid multibyte sequence was encountered. */
259 multiline_error (xstrdup (""),
260 xasprintf (_("\
261 %s:%d: Invalid multibyte sequence.\n\
262 Please specify the correct source encoding through --from-code.\n"),
263 real_file_name, line_number));
264 exit (EXIT_FAILURE);
266 else if (errno == EINVAL)
268 /* An incomplete multibyte character. */
269 int c;
271 if (bufcount == MAX_PHASE1_PUSHBACK)
273 /* An overlong incomplete multibyte sequence was
274 encountered. */
275 multiline_error (xstrdup (""),
276 xasprintf (_("\
277 %s:%d: Long incomplete multibyte sequence.\n\
278 Please specify the correct source encoding through --from-code.\n"),
279 real_file_name, line_number));
280 exit (EXIT_FAILURE);
283 /* Read one more byte and retry iconv. */
284 c = phase1_getc ();
285 if (c == EOF)
287 multiline_error (xstrdup (""),
288 xasprintf (_("\
289 %s:%d: Incomplete multibyte sequence at end of file.\n\
290 Please specify the correct source encoding through --from-code.\n"),
291 real_file_name, line_number));
292 exit (EXIT_FAILURE);
294 if (c == '\n')
296 multiline_error (xstrdup (""),
297 xasprintf (_("\
298 %s:%d: Incomplete multibyte sequence at end of line.\n\
299 Please specify the correct source encoding through --from-code.\n"),
300 real_file_name, line_number - 1));
301 exit (EXIT_FAILURE);
303 buf[bufcount++] = (unsigned char) c;
305 else
306 error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
307 real_file_name, line_number);
309 else
311 size_t outbytes = sizeof (scratchbuf) - outsize;
312 size_t bytes = bufcount - insize;
313 unsigned int uc;
315 /* We expect that one character has been produced. */
316 if (bytes == 0)
317 abort ();
318 if (outbytes == 0)
319 abort ();
320 /* Push back the unused bytes. */
321 while (insize > 0)
322 phase1_ungetc (buf[--insize]);
323 /* Convert the character from UTF-8 to UCS-4. */
324 if (u8_mbtouc (&uc, scratchbuf, outbytes) < outbytes)
326 /* scratchbuf contains an out-of-range Unicode character
327 (> 0x10ffff). */
328 multiline_error (xstrdup (""),
329 xasprintf (_("\
330 %s:%d: Invalid multibyte sequence.\n\
331 Please specify the source encoding through --from-code.\n"),
332 real_file_name, line_number));
333 exit (EXIT_FAILURE);
335 return uc;
338 #else
339 /* If we don't have iconv(), the only supported values for
340 xgettext_global_source_encoding and thus also for
341 xgettext_current_source_encoding are ASCII and UTF-8. */
342 abort ();
343 #endif
345 else
347 /* Read an UTF-8 encoded character. */
348 unsigned char buf[6];
349 unsigned int count;
350 int c;
351 unsigned int uc;
353 c = phase1_getc ();
354 if (c == EOF)
355 return UEOF;
356 buf[0] = c;
357 count = 1;
359 if (buf[0] >= 0xc0)
361 c = phase1_getc ();
362 if (c == EOF)
363 return UEOF;
364 buf[1] = c;
365 count = 2;
368 if (buf[0] >= 0xe0
369 && ((buf[1] ^ 0x80) < 0x40))
371 c = phase1_getc ();
372 if (c == EOF)
373 return UEOF;
374 buf[2] = c;
375 count = 3;
378 if (buf[0] >= 0xf0
379 && ((buf[1] ^ 0x80) < 0x40)
380 && ((buf[2] ^ 0x80) < 0x40))
382 c = phase1_getc ();
383 if (c == EOF)
384 return UEOF;
385 buf[3] = c;
386 count = 4;
389 if (buf[0] >= 0xf8
390 && ((buf[1] ^ 0x80) < 0x40)
391 && ((buf[2] ^ 0x80) < 0x40)
392 && ((buf[3] ^ 0x80) < 0x40))
394 c = phase1_getc ();
395 if (c == EOF)
396 return UEOF;
397 buf[4] = c;
398 count = 5;
401 if (buf[0] >= 0xfc
402 && ((buf[1] ^ 0x80) < 0x40)
403 && ((buf[2] ^ 0x80) < 0x40)
404 && ((buf[3] ^ 0x80) < 0x40)
405 && ((buf[4] ^ 0x80) < 0x40))
407 c = phase1_getc ();
408 if (c == EOF)
409 return UEOF;
410 buf[5] = c;
411 count = 6;
414 u8_mbtouc (&uc, buf, count);
415 return uc;
419 /* Supports only one pushback character. */
420 static void
421 phase2_ungetc (int c)
423 if (c != UEOF)
425 if (phase2_pushback_length == SIZEOF (phase2_pushback))
426 abort ();
427 phase2_pushback[phase2_pushback_length++] = c;
/* Phase 3: Convert all line terminators to LF.
   See ECMA-334 section 9.3.1.  */

/* Line number defined in terms of phase3.  */
static int logical_line_number;

/* Stack of pushed-back characters; the most recently pushed character is
   read first.  */
static int phase3_pushback[9];
static int phase3_pushback_length;
441 /* Read the next Unicode UCS-4 character from the input file, mapping
442 all line terminators to U+000A, and dropping U+001A at the end of file. */
443 static int
444 phase3_getc ()
446 int c;
448 if (phase3_pushback_length)
450 c = phase3_pushback[--phase3_pushback_length];
451 if (c == UNL)
452 ++logical_line_number;
453 return c;
456 c = phase2_getc ();
458 if (c == 0x000d)
460 int c1 = phase2_getc ();
462 if (c1 != UEOF && c1 != 0x000a)
463 phase2_ungetc (c1);
465 /* Seen line terminator CR or CR/LF. */
466 ++logical_line_number;
467 return UNL;
470 if (c == 0x0085 || c == 0x2028 || c == 0x2029)
472 /* Seen Unicode word processor newline. */
473 ++logical_line_number;
474 return UNL;
477 if (c == 0x001a)
479 int c1 = phase2_getc ();
481 if (c1 == UEOF)
482 /* Seen U+001A right before the end of file. */
483 return UEOF;
485 phase2_ungetc (c1);
488 if (c == UNL)
489 ++logical_line_number;
490 return c;
493 /* Supports 9 characters of pushback. */
494 static void
495 phase3_ungetc (int c)
497 if (c != UEOF)
499 if (c == UNL)
500 --logical_line_number;
501 if (phase3_pushback_length == SIZEOF (phase3_pushback))
502 abort ();
503 phase3_pushback[phase3_pushback_length++] = c;
508 /* ========================= Accumulating strings. ======================== */
/* A string buffer type that allows appending Unicode characters.
   Returns the entire string in UTF-8 encoding.  */

struct string_buffer
{
  /* The part of the string that has already been converted to UTF-8.  */
  char *utf8_buffer;
  /* Number of bytes of utf8_buffer currently in use.  */
  size_t utf8_buflen;
  /* Number of bytes allocated for utf8_buffer.  */
  size_t utf8_allocated;
};
521 /* Initialize a 'struct string_buffer' to empty. */
522 static inline void
523 init_string_buffer (struct string_buffer *bp)
525 bp->utf8_buffer = NULL;
526 bp->utf8_buflen = 0;
527 bp->utf8_allocated = 0;
530 /* Auxiliary function: Ensure count more bytes are available in bp->utf8. */
531 static inline void
532 string_buffer_append_unicode_grow (struct string_buffer *bp, size_t count)
534 if (bp->utf8_buflen + count > bp->utf8_allocated)
536 size_t new_allocated = 2 * bp->utf8_allocated + 10;
537 if (new_allocated < bp->utf8_buflen + count)
538 new_allocated = bp->utf8_buflen + count;
539 bp->utf8_allocated = new_allocated;
540 bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
544 /* Auxiliary function: Append a Unicode character to bp->utf8.
545 uc must be < 0x110000. */
546 static inline void
547 string_buffer_append_unicode (struct string_buffer *bp, unsigned int uc)
549 unsigned char utf8buf[6];
550 int count = u8_uctomb (utf8buf, uc, 6);
552 if (count < 0)
553 /* The caller should have ensured that uc is not out-of-range. */
554 abort ();
556 string_buffer_append_unicode_grow (bp, count);
557 memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
558 bp->utf8_buflen += count;
561 /* Return the string buffer's contents. */
562 static char *
563 string_buffer_result (struct string_buffer *bp)
565 /* NUL-terminate it. */
566 string_buffer_append_unicode_grow (bp, 1);
567 bp->utf8_buffer[bp->utf8_buflen] = '\0';
568 /* Return it. */
569 return bp->utf8_buffer;
572 /* Free the memory pointed to by a 'struct string_buffer'. */
573 static inline void
574 free_string_buffer (struct string_buffer *bp)
576 free (bp->utf8_buffer);
580 /* ======================== Accumulating comments. ======================== */
583 /* Accumulating a single comment line. */
585 static struct string_buffer comment_buffer;
587 static inline void
588 comment_start ()
590 comment_buffer.utf8_buflen = 0;
593 static inline bool
594 comment_at_start ()
596 return (comment_buffer.utf8_buflen == 0);
599 static inline void
600 comment_add (int c)
602 string_buffer_append_unicode (&comment_buffer, c);
605 static inline void
606 comment_line_end (size_t chars_to_remove)
608 char *buffer = string_buffer_result (&comment_buffer);
609 size_t buflen = strlen (buffer);
611 buflen -= chars_to_remove;
612 while (buflen >= 1
613 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
614 --buflen;
615 buffer[buflen] = '\0';
616 savable_comment_add (buffer);
/* These are for tracking whether comments count as immediately before
   keyword.  */
static int last_comment_line;
static int last_non_comment_line;
626 /* Phase 4: Replace each comment that is not inside a character constant or
627 string literal with a space or newline character.
628 See ECMA-334 section 9.3.2. */
630 static int
631 phase4_getc ()
633 int c0;
634 int c;
635 bool last_was_star;
637 c0 = phase3_getc ();
638 if (c0 != '/')
639 return c0;
640 c = phase3_getc ();
641 switch (c)
643 default:
644 phase3_ungetc (c);
645 return c0;
647 case '*':
648 /* C style comment. */
649 comment_start ();
650 last_was_star = false;
651 for (;;)
653 c = phase3_getc ();
654 if (c == UEOF)
655 break;
656 /* We skip all leading white space, but not EOLs. */
657 if (!(comment_at_start () && (c == ' ' || c == '\t')))
658 comment_add (c);
659 switch (c)
661 case UNL:
662 comment_line_end (1);
663 comment_start ();
664 last_was_star = false;
665 continue;
667 case '*':
668 last_was_star = true;
669 continue;
671 case '/':
672 if (last_was_star)
674 comment_line_end (2);
675 break;
677 /* FALLTHROUGH */
679 default:
680 last_was_star = false;
681 continue;
683 break;
685 last_comment_line = logical_line_number;
686 return ' ';
688 case '/':
689 /* C++ style comment. */
690 last_comment_line = logical_line_number;
691 comment_start ();
692 for (;;)
694 c = phase3_getc ();
695 if (c == UNL || c == UEOF)
696 break;
697 /* We skip all leading white space, but not EOLs. */
698 if (!(comment_at_start () && (c == ' ' || c == '\t')))
699 comment_add (c);
701 phase3_ungetc (c); /* push back the newline, to decrement logical_line_number */
702 comment_line_end (0);
703 phase3_getc (); /* read the newline again */
704 return UNL;
/* Push the character C back by forwarding it to phase3_ungetc ().
   Supports only one pushback character.  */
static void
phase4_ungetc (int c)
{
  phase3_ungetc (c);
}
716 /* ======================= Character classification. ====================== */
/* Return true if a given character is white space.
   See ECMA-334 section 9.3.3.
   Any other value, including UEOF, yields false.  */
static bool
is_whitespace (int c)
{
  /* Unicode character class Zs, as of Unicode 4.0.  */
  /* grep '^[^;]*;[^;]*;Zs;' UnicodeData-4.0.0.txt */
  /* The code points are compared directly, so that no shift is ever
     applied to a negative value.  */
  return (c == 0x0020 || c == 0x00a0
          || c == 0x1680
          || c == 0x180e
          || (c >= 0x2000 && c <= 0x200b) || c == 0x202f || c == 0x205f
          || c == 0x3000);
}
/* C# allows identifiers containing many Unicode characters.  We recognize
   them; to use an identifier with Unicode characters in a --keyword option,
   it must be specified in UTF-8.  */

/* Look up the bit for the character UC in a three-level bitmap TABLE.
   TABLE is read as an array of ints: element 0 is the number of level 1
   entries, which follow immediately.  A level 1 entry, selected by
   uc >> 16, is the index of a 128-entry level 2 block or negative.  A
   level 2 entry, selected by bits 9..15 of uc, is the index of a 16-word
   level 3 block or negative.  The level 3 word selected by bits 5..8 of uc
   holds one bit per character.  Returns that bit, or 0 when any level is
   absent.  */
static inline int
bitmap_lookup (const void *table, unsigned int uc)
{
  const int *words = (const int *) table;
  unsigned int index1 = uc >> 16;
  int lookup1;
  int lookup2;
  unsigned int bits;

  if (index1 >= (unsigned int) words[0])
    return 0;

  lookup1 = words[1 + index1];
  if (lookup1 < 0)
    return 0;

  lookup2 = words[lookup1 + ((uc >> 9) & 0x7f)];
  if (lookup2 < 0)
    return 0;

  /* Treat the stored word as unsigned so that the shift is logical.  */
  bits = (unsigned int) words[lookup2 + ((uc >> 5) & 0xf)];
  return (bits >> (uc & 0x1f)) & 1;
}
771 /* Unicode character classes Lu, Ll, Lt, Lm, Lo, Nl, as of Unicode 4.0,
772 plus the underscore. */
773 static const
774 struct
776 int header[1];
777 int level1[3];
778 int level2[3 << 7];
779 /*unsigned*/ int level3[34 << 4];
781 table_identifier_start =
783 { 3 },
784 { 4, 132, 260 },
786 388, 404, 420, 436, 452, 468, 484, 500,
787 516, 532, 548, 564, 580, -1, 596, 612,
788 628, -1, -1, -1, -1, -1, -1, -1,
789 644, -1, 660, 660, 660, 660, 660, 660,
790 660, 660, 660, 660, 660, 660, 676, 660,
791 660, 660, 660, 660, 660, 660, 660, 660,
792 660, 660, 660, 660, 660, 660, 660, 660,
793 660, 660, 660, 660, 660, 660, 660, 660,
794 660, 660, 660, 660, 660, 660, 660, 660,
795 660, 660, 660, 660, 660, 660, 660, 692,
796 660, 660, 708, -1, -1, -1, 660, 660,
797 660, 660, 660, 660, 660, 660, 660, 660,
798 660, 660, 660, 660, 660, 660, 660, 660,
799 660, 660, 660, 724, -1, -1, -1, -1,
800 -1, -1, -1, -1, -1, -1, -1, -1,
801 -1, -1, -1, -1, 740, 756, 772, 788,
802 804, 820, 836, -1, 852, -1, -1, -1,
803 -1, -1, -1, -1, -1, -1, -1, -1,
804 -1, -1, -1, -1, -1, -1, -1, -1,
805 -1, -1, -1, -1, -1, -1, -1, -1,
806 -1, -1, -1, -1, -1, -1, -1, -1,
807 -1, -1, -1, -1, -1, -1, -1, -1,
808 -1, -1, -1, -1, -1, -1, -1, -1,
809 -1, -1, -1, -1, -1, -1, -1, -1,
810 -1, -1, -1, -1, -1, -1, -1, -1,
811 -1, -1, -1, -1, -1, -1, -1, -1,
812 -1, -1, -1, -1, -1, -1, -1, -1,
813 -1, -1, -1, -1, -1, -1, -1, -1,
814 -1, -1, -1, -1, -1, -1, -1, -1,
815 -1, -1, 868, 884, -1, -1, -1, -1,
816 -1, -1, -1, -1, -1, -1, -1, -1,
817 -1, -1, -1, -1, -1, -1, -1, -1,
818 660, 660, 660, 660, 660, 660, 660, 660,
819 660, 660, 660, 660, 660, 660, 660, 660,
820 660, 660, 660, 660, 660, 660, 660, 660,
821 660, 660, 660, 660, 660, 660, 660, 660,
822 660, 660, 660, 660, 660, 660, 660, 660,
823 660, 660, 660, 660, 660, 660, 660, 660,
824 660, 660, 660, 660, 660, 660, 660, 660,
825 660, 660, 660, 660, 660, 660, 660, 660,
826 660, 660, 660, 660, 660, 660, 660, 660,
827 660, 660, 660, 660, 660, 660, 660, 660,
828 660, 660, 660, 900, -1, -1, -1, -1,
829 -1, -1, -1, -1, -1, -1, -1, -1,
830 -1, -1, -1, -1, -1, -1, -1, -1,
831 -1, -1, -1, -1, -1, -1, -1, -1,
832 -1, -1, -1, -1, -1, -1, -1, -1,
833 -1, -1, -1, -1, 660, 916, -1, -1
836 0x00000000, 0x00000000, 0x87FFFFFE, 0x07FFFFFE,
837 0x00000000, 0x04200400, 0xFF7FFFFF, 0xFF7FFFFF,
838 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
839 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
840 0xFFFFFFFF, 0x007FFFFF, 0xFFFF0000, 0xFFFFFFFF,
841 0xFFFFFFFF, 0xFFFFFFFF, 0x0003FFC3, 0x0000401F,
842 0x00000000, 0x00000000, 0x00000000, 0x04000000,
843 0xFFFFD740, 0xFFFFFFFB, 0xFFFF7FFF, 0x0FBFFFFF,
844 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
845 0xFFFFFC03, 0xFFFFFFFF, 0xFFFF7FFF, 0x033FFFFF,
846 0x0000FFFF, 0xFFFE0000, 0x027FFFFF, 0xFFFFFFFE,
847 0x000000FF, 0x00000000, 0xFFFF0000, 0x000707FF,
848 0x00000000, 0x07FFFFFE, 0x000007FF, 0xFFFEC000,
849 0xFFFFFFFF, 0xFFFFFFFF, 0x002FFFFF, 0x9C00C060,
850 0xFFFD0000, 0x0000FFFF, 0x0000E000, 0x00000000,
851 0xFFFFFFFF, 0x0002003F, 0x00000000, 0x00000000,
852 0x00000000, 0x00000000, 0x00000000, 0x00000000,
853 0x00000000, 0x00000000, 0x00000000, 0x00000000,
854 0xFFFFFFF0, 0x23FFFFFF, 0xFF010000, 0x00000003,
855 0xFFF99FE0, 0x23C5FDFF, 0xB0000000, 0x00030003,
856 0xFFF987E0, 0x036DFDFF, 0x5E000000, 0x001C0000,
857 0xFFFBBFE0, 0x23EDFDFF, 0x00010000, 0x00000003,
858 0xFFF99FE0, 0x23EDFDFF, 0xB0000000, 0x00020003,
859 0xD63DC7E8, 0x03BFC718, 0x00000000, 0x00000000,
860 0xFFFDDFE0, 0x03EFFDFF, 0x00000000, 0x00000003,
861 0xFFFDDFE0, 0x23EFFDFF, 0x40000000, 0x00000003,
862 0xFFFDDFE0, 0x03FFFDFF, 0x00000000, 0x00000003,
863 0xFC7FFFE0, 0x2FFBFFFF, 0x0000007F, 0x00000000,
864 0xFFFFFFFE, 0x000DFFFF, 0x0000007F, 0x00000000,
865 0xFEF02596, 0x200DECAE, 0x3000005F, 0x00000000,
866 0x00000001, 0x00000000, 0xFFFFFEFF, 0x000007FF,
867 0x00000F00, 0x00000000, 0x00000000, 0x00000000,
868 0xFFFFFFFF, 0x000006FB, 0x003F0000, 0x00000000,
869 0x00000000, 0xFFFFFFFF, 0xFFFF003F, 0x01FFFFFF,
870 0xFFFFFFFF, 0xFFFFFFFF, 0x83FFFFFF, 0xFFFFFFFF,
871 0xFFFFFFFF, 0xFFFFFF07, 0xFFFFFFFF, 0x03FFFFFF,
872 0xFFFFFF7F, 0xFFFFFFFF, 0x3D7F3D7F, 0xFFFFFFFF,
873 0xFFFF3D7F, 0x7F3D7FFF, 0xFF7F7F3D, 0xFFFF7FFF,
874 0x7F3D7FFF, 0xFFFFFFFF, 0x07FFFF7F, 0x00000000,
875 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x001FFFFF,
876 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
877 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
878 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
879 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
880 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x007F9FFF,
881 0x07FFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0x0001C7FF,
882 0x0003DFFF, 0x0003FFFF, 0x0003FFFF, 0x0001DFFF,
883 0xFFFFFFFF, 0x000FFFFF, 0x10800000, 0x00000000,
884 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00FFFFFF,
885 0xFFFFFFFF, 0x000001FF, 0x00000000, 0x00000000,
886 0x1FFFFFFF, 0x00000000, 0xFFFF0000, 0x001F3FFF,
887 0x00000000, 0x00000000, 0x00000000, 0x00000000,
888 0x00000000, 0x00000000, 0x00000000, 0x00000000,
889 0x00000000, 0x00000000, 0x00000000, 0x00000000,
890 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000FFF,
891 0x00000000, 0x00000000, 0x00000000, 0x00000000,
892 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
893 0x0FFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x03FFFFFF,
894 0x3F3FFFFF, 0xFFFFFFFF, 0xAAFF3F3F, 0x3FFFFFFF,
895 0xFFFFFFFF, 0x5FDFFFFF, 0x0FCF1FDC, 0x1FDC1FFF,
896 0x00000000, 0x00000000, 0x00000000, 0x80020000,
897 0x00000000, 0x00000000, 0x00000000, 0x00000000,
898 0x3E2FFC84, 0xE3FBBD50, 0x000003E0, 0xFFFFFFFF,
899 0x0000000F, 0x00000000, 0x00000000, 0x00000000,
900 0x000000E0, 0x1F3E03FE, 0xFFFFFFFE, 0xFFFFFFFF,
901 0xE07FFFFF, 0xFFFFFFFE, 0xFFFFFFFF, 0xF7FFFFFF,
902 0xFFFFFFE0, 0xFFFE1FFF, 0xFFFFFFFF, 0xFFFFFFFF,
903 0x00007FFF, 0x00FFFFFF, 0x00000000, 0xFFFF0000,
904 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
905 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
906 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
907 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
908 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
909 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
910 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
911 0xFFFFFFFF, 0x003FFFFF, 0x00000000, 0x00000000,
912 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
913 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
914 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
915 0xFFFFFFFF, 0x0000003F, 0x00000000, 0x00000000,
916 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
917 0x00001FFF, 0x00000000, 0x00000000, 0x00000000,
918 0x00000000, 0x00000000, 0x00000000, 0x00000000,
919 0x00000000, 0x00000000, 0x00000000, 0x00000000,
920 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
921 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
922 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
923 0xFFFFFFFF, 0x0000000F, 0x00000000, 0x00000000,
924 0x00000000, 0x00000000, 0x00000000, 0x00000000,
925 0x00000000, 0x00000000, 0x00000000, 0x00000000,
926 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
927 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
928 0xFFFFFFFF, 0xFFFF3FFF, 0xFFFFFFFF, 0x000007FF,
929 0x00000000, 0x00000000, 0x00000000, 0x00000000,
930 0xA0F8007F, 0x5F7FFDFF, 0xFFFFFFDB, 0xFFFFFFFF,
931 0xFFFFFFFF, 0x0003FFFF, 0xFFF80000, 0xFFFFFFFF,
932 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
933 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
934 0xFFFFFFFF, 0x3FFFFFFF, 0xFFFF0000, 0xFFFFFFFF,
935 0xFFFCFFFF, 0xFFFFFFFF, 0x000000FF, 0x0FFF0000,
936 0x00000000, 0x00000000, 0x00000000, 0xFFDF0000,
937 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x1FFFFFFF,
938 0x00000000, 0x07FFFFFE, 0x07FFFFFE, 0xFFFFFFC0,
939 0xFFFFFFFF, 0x7FFFFFFF, 0x1CFCFCFC, 0x00000000,
940 0xFFFFEFFF, 0xB7FFFF7F, 0x3FFF3FFF, 0x00000000,
941 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x07FFFFFF,
942 0x00000000, 0x00000000, 0x00000000, 0x00000000,
943 0x00000000, 0x00000000, 0x00000000, 0x00000000,
944 0x00000000, 0x00000000, 0x00000000, 0x00000000,
945 0x00000000, 0x00000000, 0x00000000, 0x00000000,
946 0x7FFFFFFF, 0xFFFF0000, 0x000007FF, 0x00000000,
947 0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
948 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
949 0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
950 0x00000000, 0x00000000, 0x00000000, 0x00000000,
951 0x00000000, 0x00000000, 0x00000000, 0x00000000,
952 0xFFFFFD3F, 0x91BFFFFF, 0x00000000, 0x00000000,
953 0x00000000, 0x00000000, 0x00000000, 0x00000000,
954 0x00000000, 0x00000000, 0x00000000, 0x00000000,
955 0x00000000, 0x00000000, 0x00000000, 0x00000000,
956 0xFFFFFFFF, 0xFFFFFFFF, 0xFFDFFFFF, 0xFFFFFFFF,
957 0xDFFFFFFF, 0xEBFFDE64, 0xFFFFFFEF, 0xFFFFFFFF,
958 0xDFDFE7BF, 0x7BFFFFFF, 0xFFFDFC5F, 0xFFFFFFFF,
959 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
960 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
961 0xFFFFFFFF, 0xFFFFFF0F, 0xF7FFFFFD, 0xF7FFFFFF,
962 0xFFDFFFFF, 0xFFDFFFFF, 0xFFFF7FFF, 0xFFFF7FFF,
963 0xFFFFFDFF, 0xFFFFFDFF, 0x000003F7, 0x00000000,
964 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
965 0xFFFFFFFF, 0xFFFFFFFF, 0x007FFFFF, 0x00000000,
966 0x00000000, 0x00000000, 0x00000000, 0x00000000,
967 0x00000000, 0x00000000, 0x00000000, 0x00000000,
968 0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
969 0x00000000, 0x00000000, 0x00000000, 0x00000000,
970 0x00000000, 0x00000000, 0x00000000, 0x00000000,
971 0x00000000, 0x00000000, 0x00000000, 0x00000000
975 /* Unicode character classes Lu, Ll, Lt, Lm, Lo, Nl, Nd, Pc, Mn, Mc, Cf,
976 as of Unicode 4.0. */
977 static const
978 struct
980 int header[1];
981 int level1[15];
982 int level2[4 << 7];
983 /*unsigned*/ int level3[36 << 4];
985 table_identifier_part =
987 { 15 },
989 16, 144, 272, -1, -1, -1, -1, -1,
990 -1, -1, -1, -1, -1, -1, 400
993 528, 544, 560, 576, 592, 608, 624, 640,
994 656, 672, 688, 704, 720, -1, 736, 752,
995 768, -1, -1, -1, -1, -1, -1, -1,
996 784, -1, 800, 800, 800, 800, 800, 800,
997 800, 800, 800, 800, 800, 800, 816, 800,
998 800, 800, 800, 800, 800, 800, 800, 800,
999 800, 800, 800, 800, 800, 800, 800, 800,
1000 800, 800, 800, 800, 800, 800, 800, 800,
1001 800, 800, 800, 800, 800, 800, 800, 800,
1002 800, 800, 800, 800, 800, 800, 800, 832,
1003 800, 800, 848, -1, -1, -1, 800, 800,
1004 800, 800, 800, 800, 800, 800, 800, 800,
1005 800, 800, 800, 800, 800, 800, 800, 800,
1006 800, 800, 800, 864, -1, -1, -1, -1,
1007 -1, -1, -1, -1, -1, -1, -1, -1,
1008 -1, -1, -1, -1, 880, 896, 912, 928,
1009 944, 960, 976, -1, 992, -1, -1, -1,
1010 -1, -1, -1, -1, -1, -1, -1, -1,
1011 -1, -1, -1, -1, -1, -1, -1, -1,
1012 -1, -1, -1, -1, -1, -1, -1, -1,
1013 -1, -1, -1, -1, -1, -1, -1, -1,
1014 -1, -1, -1, -1, -1, -1, -1, -1,
1015 -1, -1, -1, -1, -1, -1, -1, -1,
1016 -1, -1, -1, -1, -1, -1, -1, -1,
1017 -1, -1, -1, -1, -1, -1, -1, -1,
1018 -1, -1, -1, -1, -1, -1, -1, -1,
1019 -1, -1, -1, -1, -1, -1, -1, -1,
1020 -1, -1, -1, -1, -1, -1, -1, -1,
1021 -1, -1, -1, -1, -1, -1, -1, -1,
1022 1008, -1, 1024, 1040, -1, -1, -1, -1,
1023 -1, -1, -1, -1, -1, -1, -1, -1,
1024 -1, -1, -1, -1, -1, -1, -1, -1,
1025 800, 800, 800, 800, 800, 800, 800, 800,
1026 800, 800, 800, 800, 800, 800, 800, 800,
1027 800, 800, 800, 800, 800, 800, 800, 800,
1028 800, 800, 800, 800, 800, 800, 800, 800,
1029 800, 800, 800, 800, 800, 800, 800, 800,
1030 800, 800, 800, 800, 800, 800, 800, 800,
1031 800, 800, 800, 800, 800, 800, 800, 800,
1032 800, 800, 800, 800, 800, 800, 800, 800,
1033 800, 800, 800, 800, 800, 800, 800, 800,
1034 800, 800, 800, 800, 800, 800, 800, 800,
1035 800, 800, 800, 1056, -1, -1, -1, -1,
1036 -1, -1, -1, -1, -1, -1, -1, -1,
1037 -1, -1, -1, -1, -1, -1, -1, -1,
1038 -1, -1, -1, -1, -1, -1, -1, -1,
1039 -1, -1, -1, -1, -1, -1, -1, -1,
1040 -1, -1, -1, -1, 800, 1072, -1, -1,
1041 1088, -1, -1, -1, -1, -1, -1, -1,
1042 -1, -1, -1, -1, -1, -1, -1, -1,
1043 -1, -1, -1, -1, -1, -1, -1, -1,
1044 -1, -1, -1, -1, -1, -1, -1, -1,
1045 -1, -1, -1, -1, -1, -1, -1, -1,
1046 -1, -1, -1, -1, -1, -1, -1, -1,
1047 -1, -1, -1, -1, -1, -1, -1, -1,
1048 -1, -1, -1, -1, -1, -1, -1, -1,
1049 -1, -1, -1, -1, -1, -1, -1, -1,
1050 -1, -1, -1, -1, -1, -1, -1, -1,
1051 -1, -1, -1, -1, -1, -1, -1, -1,
1052 -1, -1, -1, -1, -1, -1, -1, -1,
1053 -1, -1, -1, -1, -1, -1, -1, -1,
1054 -1, -1, -1, -1, -1, -1, -1, -1,
1055 -1, -1, -1, -1, -1, -1, -1, -1,
1056 -1, -1, -1, -1, -1, -1, -1, -1
1059 0x00000000, 0x03FF0000, 0x87FFFFFE, 0x07FFFFFE,
1060 0x00000000, 0x04202400, 0xFF7FFFFF, 0xFF7FFFFF,
1061 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1062 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1063 0xFFFFFFFF, 0x007FFFFF, 0xFFFF0000, 0xFFFFFFFF,
1064 0xFFFFFFFF, 0xFFFFFFFF, 0x0003FFC3, 0x0000401F,
1065 0xFFFFFFFF, 0xFFFFFFFF, 0xE0FFFFFF, 0x0400FFFF,
1066 0xFFFFD740, 0xFFFFFFFB, 0xFFFF7FFF, 0x0FBFFFFF,
1067 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1068 0xFFFFFC7B, 0xFFFFFFFF, 0xFFFF7FFF, 0x033FFFFF,
1069 0x0000FFFF, 0xFFFE0000, 0x027FFFFF, 0xFFFFFFFE,
1070 0xFFFE00FF, 0xBBFFFFFB, 0xFFFF0016, 0x000707FF,
1071 0x003F000F, 0x07FFFFFE, 0x01FFFFFF, 0xFFFFC3FF,
1072 0xFFFFFFFF, 0xFFFFFFFF, 0xBFEFFFFF, 0x9FFFFDFF,
1073 0xFFFF8000, 0xFFFFFFFF, 0x0000E7FF, 0x00000000,
1074 0xFFFFFFFF, 0x0003FFFF, 0x00000000, 0x00000000,
1075 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1076 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1077 0xFFFFFFFE, 0xF3FFFFFF, 0xFF1F3FFF, 0x0000FFCF,
1078 0xFFF99FEE, 0xF3C5FDFF, 0xB080399F, 0x0003FFCF,
1079 0xFFF987EE, 0xD36DFDFF, 0x5E003987, 0x001FFFC0,
1080 0xFFFBBFEE, 0xF3EDFDFF, 0x00013BBF, 0x0000FFCF,
1081 0xFFF99FEE, 0xF3EDFDFF, 0xB0C0398F, 0x0002FFC3,
1082 0xD63DC7EC, 0xC3BFC718, 0x00803DC7, 0x0000FF80,
1083 0xFFFDDFEE, 0xC3EFFDFF, 0x00603DDF, 0x0000FFC3,
1084 0xFFFDDFEC, 0xF3EFFDFF, 0x40603DDF, 0x0000FFC3,
1085 0xFFFDDFEC, 0xC3FFFDFF, 0x00803DCF, 0x0000FFC3,
1086 0xFC7FFFEC, 0x2FFBFFFF, 0xFF5F847F, 0x000C0000,
1087 0xFFFFFFFE, 0x07FFFFFF, 0x03FF7FFF, 0x00000000,
1088 0xFEF02596, 0x3BFFECAE, 0x33FF3F5F, 0x00000000,
1089 0x03000001, 0xC2A003FF, 0xFFFFFEFF, 0xFFFE07FF,
1090 0xFEFF0FDF, 0x1FFFFFFF, 0x00000040, 0x00000000,
1091 0xFFFFFFFF, 0x03C7F6FB, 0x03FF03FF, 0x00000000,
1092 0x00000000, 0xFFFFFFFF, 0xFFFF003F, 0x01FFFFFF,
1093 0xFFFFFFFF, 0xFFFFFFFF, 0x83FFFFFF, 0xFFFFFFFF,
1094 0xFFFFFFFF, 0xFFFFFF07, 0xFFFFFFFF, 0x03FFFFFF,
1095 0xFFFFFF7F, 0xFFFFFFFF, 0x3D7F3D7F, 0xFFFFFFFF,
1096 0xFFFF3D7F, 0x7F3D7FFF, 0xFF7F7F3D, 0xFFFF7FFF,
1097 0x7F3D7FFF, 0xFFFFFFFF, 0x07FFFF7F, 0x0003FE00,
1098 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x001FFFFF,
1099 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1100 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1101 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1102 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1103 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x007F9FFF,
1104 0x07FFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0x0001C7FF,
1105 0x001FDFFF, 0x001FFFFF, 0x000FFFFF, 0x000DDFFF,
1106 0xFFFFFFFF, 0xFFFFFFFF, 0x308FFFFF, 0x000003FF,
1107 0x03FF3800, 0xFFFFFFFF, 0xFFFFFFFF, 0x00FFFFFF,
1108 0xFFFFFFFF, 0x000003FF, 0x00000000, 0x00000000,
1109 0x1FFFFFFF, 0x0FFF0FFF, 0xFFFFFFC0, 0x001F3FFF,
1110 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1111 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1112 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1113 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000FFF,
1114 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1115 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1116 0x0FFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x03FFFFFF,
1117 0x3F3FFFFF, 0xFFFFFFFF, 0xAAFF3F3F, 0x3FFFFFFF,
1118 0xFFFFFFFF, 0x5FDFFFFF, 0x0FCF1FDC, 0x1FDC1FFF,
1119 0x0000F000, 0x80007C00, 0x00100001, 0x8002FC0F,
1120 0x00000000, 0x00000000, 0x1FFF0000, 0x000007E2,
1121 0x3E2FFC84, 0xE3FBBD50, 0x000003E0, 0xFFFFFFFF,
1122 0x0000000F, 0x00000000, 0x00000000, 0x00000000,
1123 0x000000E0, 0x1F3EFFFE, 0xFFFFFFFE, 0xFFFFFFFF,
1124 0xE67FFFFF, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF,
1125 0xFFFFFFE0, 0xFFFE1FFF, 0xFFFFFFFF, 0xFFFFFFFF,
1126 0x00007FFF, 0x00FFFFFF, 0x00000000, 0xFFFF0000,
1127 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1128 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1129 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1130 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1131 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1132 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1133 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1134 0xFFFFFFFF, 0x003FFFFF, 0x00000000, 0x00000000,
1135 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1136 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1137 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1138 0xFFFFFFFF, 0x0000003F, 0x00000000, 0x00000000,
1139 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1140 0x00001FFF, 0x00000000, 0x00000000, 0x00000000,
1141 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1142 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1143 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1144 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1145 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1146 0xFFFFFFFF, 0x0000000F, 0x00000000, 0x00000000,
1147 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1148 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1149 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1150 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1151 0xFFFFFFFF, 0xFFFF3FFF, 0xFFFFFFFF, 0x000007FF,
1152 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1153 0xE0F8007F, 0x5F7FFDFF, 0xFFFFFFDB, 0xFFFFFFFF,
1154 0xFFFFFFFF, 0x0003FFFF, 0xFFF80000, 0xFFFFFFFF,
1155 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1156 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1157 0xFFFFFFFF, 0x3FFFFFFF, 0xFFFF0000, 0xFFFFFFFF,
1158 0xFFFCFFFF, 0xFFFFFFFF, 0x000000FF, 0x0FFF0000,
1159 0x0000FFFF, 0x0018000F, 0x0000E000, 0xFFDF0000,
1160 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x9FFFFFFF,
1161 0x03FF0000, 0x87FFFFFE, 0x07FFFFFE, 0xFFFFFFE0,
1162 0xFFFFFFFF, 0x7FFFFFFF, 0x1CFCFCFC, 0x0E000000,
1163 0xFFFFEFFF, 0xB7FFFF7F, 0x3FFF3FFF, 0x00000000,
1164 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x07FFFFFF,
1165 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1166 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1167 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1168 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1169 0x7FFFFFFF, 0xFFFF0000, 0x000007FF, 0x00000000,
1170 0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
1171 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1172 0x3FFFFFFF, 0x000003FF, 0x00000000, 0x00000000,
1173 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1174 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1175 0xFFFFFD3F, 0x91BFFFFF, 0x00000000, 0x00000000,
1176 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1177 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1178 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1179 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1180 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1181 0x00000000, 0x00000000, 0x00000000, 0xFFFFE3E0,
1182 0x00000FE7, 0x00003C00, 0x00000000, 0x00000000,
1183 0xFFFFFFFF, 0xFFFFFFFF, 0xFFDFFFFF, 0xFFFFFFFF,
1184 0xDFFFFFFF, 0xEBFFDE64, 0xFFFFFFEF, 0xFFFFFFFF,
1185 0xDFDFE7BF, 0x7BFFFFFF, 0xFFFDFC5F, 0xFFFFFFFF,
1186 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1187 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1188 0xFFFFFFFF, 0xFFFFFF0F, 0xF7FFFFFD, 0xF7FFFFFF,
1189 0xFFDFFFFF, 0xFFDFFFFF, 0xFFFF7FFF, 0xFFFF7FFF,
1190 0xFFFFFDFF, 0xFFFFFDFF, 0xFFFFC3F7, 0xFFFFFFFF,
1191 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1192 0xFFFFFFFF, 0xFFFFFFFF, 0x007FFFFF, 0x00000000,
1193 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1194 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1195 0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
1196 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1197 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1198 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1199 0x00000002, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1200 0x00000000, 0x00000000, 0x00000000, 0x00000000,
1201 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1202 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0000FFFF
1206 /* Return true if a given character can occur as first character of an
1207 identifier. See ECMA-334 section 9.4.2. */
1208 static bool
1209 is_identifier_start (int c)
1211 return bitmap_lookup (&table_identifier_start, c);
1212 /* In ASCII only this would be:
1213 return ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_');
1217 /* Return true if a given character can occur as character of an identifier.
1218 See ECMA-334 section 9.4.2. */
1219 static bool
1220 is_identifier_part (int c)
1222 return bitmap_lookup (&table_identifier_part, c);
1223 /* In ASCII only this would be:
1224 return ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1225 || (c >= '0' && c <= '9') || c == '_');
/* Predicate that accepts every character.  It has the same signature as
   is_identifier_start and is_identifier_part, so that it can be passed to
   do_getc_unicode_escaped when no restriction applies.  */
static bool
is_any_character (int c)
{
  (void) c;
  return true;
}
1236 /* ======================= Preprocessor directives. ======================= */
1239 /* Phase 5: Remove preprocessor lines. See ECMA-334 section 9.5.
1240 As a side effect, this also removes initial whitespace on every line;
1241 this whitespace doesn't matter. */
1243 static int phase5_pushback[10];
1244 static int phase5_pushback_length;
1246 static int
1247 phase5_getc ()
1249 int c;
1251 if (phase5_pushback_length)
1252 return phase5_pushback[--phase5_pushback_length];
1254 c = phase4_getc ();
1255 if (c != UNL)
1256 return c;
1259 c = phase3_getc ();
1260 while (c != UEOF && is_whitespace (c));
1262 if (c == '#')
1264 /* Ignore the entire line containing the preprocessor directive
1265 (including the // comment if it contains one). */
1267 c = phase3_getc ();
1268 while (c != UEOF && c != UNL);
1269 return c;
1271 else
1273 phase3_ungetc (c);
1274 return UNL;
#ifdef unused
/* Push the character C back onto the phase 5 stream, so that the next
   phase5_getc call returns it.  UEOF is never stored.  Aborts when the
   pushback buffer is already full.  */
static void
phase5_ungetc (int c)
{
  if (c != UEOF)
    {
      if (phase5_pushback_length == SIZEOF (phase5_pushback))
        abort ();
      phase5_pushback[phase5_pushback_length++] = c;
    }
}
#endif
1292 /* ========================== Reading of tokens. ========================== */
/* The kinds of tokens produced by the tokenizer (phase 6 and later).  */
enum token_type_ty
{
  token_type_eof,
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_lbrace,            /* { */
  token_type_rbrace,            /* } */
  token_type_comma,             /* , */
  token_type_dot,               /* . */
  token_type_string_literal,    /* "abc", @"abc" */
  token_type_number,            /* 1.23 */
  token_type_symbol,            /* identifier, keyword, null */
  token_type_plus,              /* + */
  token_type_other              /* character literal, misc. operator */
};
typedef enum token_type_ty token_type_ty;
1311 typedef struct token_ty token_ty;
1312 struct token_ty
1314 token_type_ty type;
1315 char *string; /* for token_type_string_literal, token_type_symbol */
1316 refcounted_string_list_ty *comment; /* for token_type_string_literal */
1317 int line_number;
1318 int logical_line_number;
1322 /* Free the memory pointed to by a 'struct token_ty'. */
1323 static inline void
1324 free_token (token_ty *tp)
1326 if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
1327 free (tp->string);
1328 if (tp->type == token_type_string_literal)
1329 drop_reference (tp->comment);
1333 /* Read a Unicode escape sequence outside string/character literals.
1334 Reject Unicode escapes that don't fulfill the given predicate.
1335 See ECMA-334 section 9.4.2. */
1336 static int
1337 do_getc_unicode_escaped (bool (*predicate) (int))
1339 int c;
1341 /* Use phase 3, because phase 4 elides comments. */
1342 c = phase3_getc ();
1343 if (c == UEOF)
1344 return '\\';
1345 if (c == 'u' || c == 'U')
1347 unsigned char buf[8];
1348 int expect;
1349 unsigned int n;
1350 int i;
1352 expect = (c == 'U' ? 8 : 4);
1353 n = 0;
1354 for (i = 0; i < expect; i++)
1356 int c1 = phase3_getc ();
1358 if (c1 >= '0' && c1 <= '9')
1359 n = (n << 4) + (c1 - '0');
1360 else if (c1 >= 'A' && c1 <= 'F')
1361 n = (n << 4) + (c1 - 'A' + 10);
1362 else if (c1 >= 'a' && c1 <= 'f')
1363 n = (n << 4) + (c1 - 'a' + 10);
1364 else
1366 phase3_ungetc (c1);
1367 while (--i >= 0)
1368 phase3_ungetc (buf[i]);
1369 phase3_ungetc (c);
1370 return '\\';
1373 buf[i] = c1;
1376 if (n >= 0x110000)
1378 error_with_progname = false;
1379 error (0, 0, _("%s:%d: warning: invalid Unicode character"),
1380 logical_file_name, line_number);
1381 error_with_progname = true;
1383 else if (predicate (n))
1384 return n;
1386 while (--i >= 0)
1387 phase3_ungetc (buf[i]);
1389 phase3_ungetc (c);
1390 return '\\';
1394 /* Read an escape sequence inside a string literal or character literal.
1395 See ECMA-334 sections 9.4.4.4., 9.4.4.5. */
1396 static int
1397 do_getc_escaped ()
1399 int c;
1400 int n;
1401 int i;
1403 /* Use phase 3, because phase 4 elides comments. */
1404 c = phase3_getc ();
1405 if (c == UEOF)
1406 return '\\';
1407 switch (c)
1409 case 'a':
1410 return 0x0007;
1411 case 'b':
1412 return 0x0008;
1413 case 't':
1414 return 0x0009;
1415 case 'n':
1416 return 0x000a;
1417 case 'v':
1418 return 0x000b;
1419 case 'f':
1420 return 0x000c;
1421 case 'r':
1422 return 0x000d;
1423 case '"':
1424 return '"';
1425 case '\'':
1426 return '\'';
1427 case '\\':
1428 return '\\';
1429 case '0':
1430 return 0x0000;
1431 case 'x':
1432 c = phase3_getc ();
1433 switch (c)
1435 default:
1436 phase3_ungetc (c);
1437 phase3_ungetc ('x');
1438 return '\\';
1440 case '0': case '1': case '2': case '3': case '4':
1441 case '5': case '6': case '7': case '8': case '9':
1442 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1443 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1444 break;
1446 n = 0;
1447 for (i = 0;; i++)
1449 switch (c)
1451 default:
1452 phase3_ungetc (c);
1453 return n;
1454 case '0': case '1': case '2': case '3': case '4':
1455 case '5': case '6': case '7': case '8': case '9':
1456 n = n * 16 + c - '0';
1457 break;
1458 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1459 n = n * 16 + 10 + c - 'A';
1460 break;
1461 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1462 n = n * 16 + 10 + c - 'a';
1463 break;
1465 if (i == 3)
1466 break;
1467 c = phase3_getc ();
1469 return n;
1470 case 'u': case 'U':
1471 phase3_ungetc (c);
1472 return do_getc_unicode_escaped (is_any_character);
1473 default:
1474 /* Invalid escape sequence. */
1475 phase3_ungetc (c);
1476 return '\\';
1480 /* Read a regular string literal or character literal.
1481 See ECMA-334 sections 9.4.4.4., 9.4.4.5. */
1482 static void
1483 accumulate_escaped (struct string_buffer *literal, int delimiter)
1485 int c;
1487 for (;;)
1489 /* Use phase 3, because phase 4 elides comments. */
1490 c = phase3_getc ();
1491 if (c == UEOF || c == delimiter)
1492 break;
1493 if (c == UNL)
1495 phase3_ungetc (c);
1496 error_with_progname = false;
1497 if (delimiter == '\'')
1498 error (0, 0, _("%s:%d: warning: unterminated character constant"),
1499 logical_file_name, line_number);
1500 else
1501 error (0, 0, _("%s:%d: warning: unterminated string constant"),
1502 logical_file_name, line_number);
1503 error_with_progname = true;
1504 break;
1506 if (c == '\\')
1507 c = do_getc_escaped ();
1508 string_buffer_append_unicode (literal, c);
1513 /* Combine characters into tokens. Discard whitespace. */
1515 /* Maximum used guaranteed to be < 4. */
1516 static token_ty phase6_pushback[4];
1517 static int phase6_pushback_length;
1519 static void
1520 phase6_get (token_ty *tp)
1522 int c;
1524 if (phase6_pushback_length)
1526 *tp = phase6_pushback[--phase6_pushback_length];
1527 return;
1529 tp->string = NULL;
1531 for (;;)
1533 tp->line_number = line_number;
1534 tp->logical_line_number = logical_line_number;
1535 c = phase5_getc ();
1537 if (c == UEOF)
1539 tp->type = token_type_eof;
1540 return;
1543 switch (c)
1545 case UNL:
1546 if (last_non_comment_line > last_comment_line)
1547 savable_comment_reset ();
1548 /* FALLTHROUGH */
1549 case ' ':
1550 case '\t':
1551 case '\f':
1552 /* Ignore whitespace and comments. */
1553 continue;
1556 last_non_comment_line = tp->logical_line_number;
1558 switch (c)
1560 case '(':
1561 tp->type = token_type_lparen;
1562 return;
1564 case ')':
1565 tp->type = token_type_rparen;
1566 return;
1568 case '{':
1569 tp->type = token_type_lbrace;
1570 return;
1572 case '}':
1573 tp->type = token_type_rbrace;
1574 return;
1576 case ',':
1577 tp->type = token_type_comma;
1578 return;
1580 case '.':
1581 c = phase4_getc ();
1582 if (!(c >= '0' && c <= '9'))
1584 phase4_ungetc (c);
1585 tp->type = token_type_dot;
1586 return;
1588 /* FALLTHROUGH */
1590 case '0': case '1': case '2': case '3': case '4':
1591 case '5': case '6': case '7': case '8': case '9':
1593 /* Don't need to verify the complicated syntax of integers and
1594 floating-point numbers. We assume a valid C# input.
1595 The simplified syntax that we recognize as number is: any
1596 sequence of alphanumeric characters, additionally '+' and '-'
1597 immediately after 'e' or 'E' except in hexadecimal numbers. */
1598 bool hexadecimal = false;
1600 for (;;)
1602 c = phase4_getc ();
1603 if (c >= '0' && c <= '9')
1604 continue;
1605 if ((c >= 'A' && c <= 'Z') || (c >= 'a' &&c <= 'z'))
1607 if (c == 'X' || c == 'x')
1608 hexadecimal = true;
1609 if ((c == 'E' || c == 'e') && !hexadecimal)
1611 c = phase4_getc ();
1612 if (!(c == '+' || c == '-'))
1613 phase4_ungetc (c);
1615 continue;
1617 if (c == '.')
1618 continue;
1619 break;
1621 phase4_ungetc (c);
1622 tp->type = token_type_number;
1623 return;
1626 case '"':
1627 /* Regular string literal. */
1629 struct string_buffer literal;
1631 init_string_buffer (&literal);
1632 accumulate_escaped (&literal, '"');
1633 tp->string = xstrdup (string_buffer_result (&literal));
1634 free_string_buffer (&literal);
1635 tp->comment = add_reference (savable_comment);
1636 tp->type = token_type_string_literal;
1637 return;
1640 case '\'':
1641 /* Character literal. */
1643 struct string_buffer literal;
1645 init_string_buffer (&literal);
1646 accumulate_escaped (&literal, '\'');
1647 free_string_buffer (&literal);
1648 tp->type = token_type_other;
1649 return;
1652 case '+':
1653 c = phase4_getc ();
1654 if (c == '+')
1655 /* Operator ++ */
1656 tp->type = token_type_other;
1657 else if (c == '=')
1658 /* Operator += */
1659 tp->type = token_type_other;
1660 else
1662 /* Operator + */
1663 phase4_ungetc (c);
1664 tp->type = token_type_plus;
1666 return;
1668 case '@':
1669 c = phase4_getc ();
1670 if (c == '"')
1672 /* Verbatim string literal. */
1673 struct string_buffer literal;
1675 init_string_buffer (&literal);
1676 for (;;)
1678 /* Use phase 2, because phase 4 elides comments and phase 3
1679 mixes up the newline characters. */
1680 c = phase2_getc ();
1681 if (c == UEOF)
1682 break;
1683 if (c == '"')
1685 c = phase2_getc ();
1686 if (c != '"')
1688 phase2_ungetc (c);
1689 break;
1692 /* No special treatment of newline and backslash here. */
1693 string_buffer_append_unicode (&literal, c);
1695 tp->string = xstrdup (string_buffer_result (&literal));
1696 free_string_buffer (&literal);
1697 tp->comment = add_reference (savable_comment);
1698 tp->type = token_type_string_literal;
1699 return;
1701 /* FALLTHROUGH, so that @identifier is recognized. */
1703 default:
1704 if (c == '\\')
1705 c = do_getc_unicode_escaped (is_identifier_start);
1706 if (is_identifier_start (c))
1708 static struct string_buffer buffer;
1709 buffer.utf8_buflen = 0;
1710 for (;;)
1712 string_buffer_append_unicode (&buffer, c);
1713 c = phase4_getc ();
1714 if (c == '\\')
1715 c = do_getc_unicode_escaped (is_identifier_part);
1716 if (!is_identifier_part (c))
1717 break;
1719 phase4_ungetc (c);
1720 tp->string = xstrdup (string_buffer_result (&buffer));
1721 tp->type = token_type_symbol;
1722 return;
1724 else
1726 /* Misc. operator. */
1727 tp->type = token_type_other;
1728 return;
1734 /* Supports 3 tokens of pushback. */
1735 static void
1736 phase6_unget (token_ty *tp)
1738 if (tp->type != token_type_eof)
1740 if (phase6_pushback_length == SIZEOF (phase6_pushback))
1741 abort ();
1742 phase6_pushback[phase6_pushback_length++] = *tp;
1747 /* Compile-time optimization of string literal concatenation.
1748 Combine "string1" + ... + "stringN" to the concatenated string if
1749 - the token after this expression is not '.' (because then the last
1750 string could be part of a method call expression). */
1752 static token_ty phase7_pushback[2];
1753 static int phase7_pushback_length;
1755 static void
1756 phase7_get (token_ty *tp)
1758 if (phase7_pushback_length)
1760 *tp = phase7_pushback[--phase7_pushback_length];
1761 return;
1764 phase6_get (tp);
1765 if (tp->type == token_type_string_literal)
1767 char *sum = tp->string;
1768 size_t sum_len = strlen (sum);
1770 for (;;)
1772 token_ty token2;
1774 phase6_get (&token2);
1775 if (token2.type == token_type_plus)
1777 token_ty token3;
1779 phase6_get (&token3);
1780 if (token3.type == token_type_string_literal)
1782 token_ty token_after;
1784 phase6_get (&token_after);
1785 if (token_after.type != token_type_dot)
1787 char *addend = token3.string;
1788 size_t addend_len = strlen (addend);
1790 sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
1791 memcpy (sum + sum_len, addend, addend_len + 1);
1792 sum_len += addend_len;
1794 phase6_unget (&token_after);
1795 free_token (&token3);
1796 free_token (&token2);
1797 continue;
1799 phase6_unget (&token_after);
1801 phase6_unget (&token3);
1803 phase6_unget (&token2);
1804 break;
1806 tp->string = sum;
1810 /* Supports 2 tokens of pushback. */
1811 static void
1812 phase7_unget (token_ty *tp)
1814 if (tp->type != token_type_eof)
1816 if (phase7_pushback_length == SIZEOF (phase7_pushback))
1817 abort ();
1818 phase7_pushback[phase7_pushback_length++] = *tp;
1823 static void
1824 x_csharp_lex (token_ty *tp)
1826 phase7_get (tp);
1829 /* Supports 2 tokens of pushback. */
1830 static void
1831 x_csharp_unlex (token_ty *tp)
1833 phase7_unget (tp);
1837 /* ========================= Extracting strings. ========================== */
1840 /* Context lookup table. */
1841 static flag_context_list_table_ty *flag_context_list_table;
/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid C# program, and leave the complaints about
   the grammar to the compiler.

     Normal handling: Look for
       keyword ( ... msgid ... )
     Plural handling: Look for
       keyword ( ... msgid ... msgid_plural ... )

   We use recursion because the arguments before msgid or between msgid
   and msgid_plural can contain subexpressions of the same form.  */
1859 /* Extract messages until the next balanced closing parenthesis or brace,
1860 depending on TERMINATOR.
1861 Extracted messages are added to MLP.
1862 When a specific argument shall be extracted, COMMAS_TO_SKIP >= 0 and,
1863 if also a plural argument shall be extracted, PLURAL_COMMAS > 0,
1864 otherwise PLURAL_COMMAS = 0.
1865 When no specific argument shall be extracted, COMMAS_TO_SKIP < 0.
1866 Return true upon eof, false upon closing parenthesis or brace. */
1867 static bool
1868 extract_parenthesized (message_list_ty *mlp, token_type_ty terminator,
1869 flag_context_ty outer_context,
1870 flag_context_list_iterator_ty context_iter,
1871 int commas_to_skip, int plural_commas)
1873 /* Remember the message containing the msgid, for msgid_plural. */
1874 message_ty *plural_mp = NULL;
1876 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1877 int state;
1878 /* Parameters of the keyword just seen. Defined only in state 1. */
1879 int next_commas_to_skip = -1;
1880 int next_plural_commas = 0;
1881 /* Context iterator that will be used if the next token is a '('. */
1882 flag_context_list_iterator_ty next_context_iter =
1883 passthrough_context_list_iterator;
1884 /* Current context. */
1885 flag_context_ty inner_context =
1886 inherited_context (outer_context,
1887 flag_context_list_iterator_advance (&context_iter));
1889 /* Start state is 0. */
1890 state = 0;
1892 for (;;)
1894 token_ty token;
1896 x_csharp_lex (&token);
1897 switch (token.type)
1899 case token_type_symbol:
1901 /* Combine symbol1 . ... . symbolN to a single strings, so that
1902 we can recognize static function calls like
1903 GettextResource.gettext. The information present for
1904 symbolI.....symbolN has precedence over the information for
1905 symbolJ.....symbolN with J > I. */
1906 char *sum = token.string;
1907 size_t sum_len = strlen (sum);
1908 const char *dottedname;
1909 flag_context_list_ty *context_list;
1911 for (;;)
1913 token_ty token2;
1915 x_csharp_lex (&token2);
1916 if (token2.type == token_type_dot)
1918 token_ty token3;
1920 x_csharp_lex (&token3);
1921 if (token3.type == token_type_symbol)
1923 char *addend = token3.string;
1924 size_t addend_len = strlen (addend);
1926 sum =
1927 (char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
1928 sum[sum_len] = '.';
1929 memcpy (sum + sum_len + 1, addend, addend_len + 1);
1930 sum_len += 1 + addend_len;
1932 free_token (&token3);
1933 free_token (&token2);
1934 continue;
1936 x_csharp_unlex (&token3);
1938 x_csharp_unlex (&token2);
1939 break;
1942 for (dottedname = sum;;)
1944 void *keyword_value;
1946 if (find_entry (&keywords, dottedname, strlen (dottedname),
1947 &keyword_value)
1948 == 0)
1950 int argnum1 = (int) (long) keyword_value & ((1 << 10) - 1);
1951 int argnum2 = (int) (long) keyword_value >> 10;
1953 next_commas_to_skip = argnum1 - 1;
1954 next_plural_commas = (argnum2 > argnum1 ? argnum2 - argnum1 : 0);
1955 state = 1;
1956 break;
1959 dottedname = strchr (dottedname, '.');
1960 if (dottedname == NULL)
1962 state = 0;
1963 break;
1965 dottedname++;
1968 for (dottedname = sum;;)
1970 context_list =
1971 flag_context_list_table_lookup (
1972 flag_context_list_table,
1973 dottedname, strlen (dottedname));
1974 if (context_list != NULL)
1975 break;
1977 dottedname = strchr (dottedname, '.');
1978 if (dottedname == NULL)
1979 break;
1980 dottedname++;
1982 next_context_iter = flag_context_list_iterator (context_list);
1984 free (sum);
1985 continue;
1988 case token_type_lparen:
1989 if (extract_parenthesized (mlp, token_type_rparen,
1990 inner_context, next_context_iter,
1991 state ? next_commas_to_skip : -1,
1992 state ? next_plural_commas : 0))
1993 return true;
1994 next_context_iter = null_context_list_iterator;
1995 state = 0;
1996 continue;
1998 case token_type_rparen:
1999 if (terminator == token_type_rparen)
2000 return false;
2001 if (terminator == token_type_rbrace)
2003 error_with_progname = false;
2004 error (0, 0,
2005 _("%s:%d: warning: ')' found where '}' was expected"),
2006 logical_file_name, token.line_number);
2007 error_with_progname = true;
2009 next_context_iter = null_context_list_iterator;
2010 state = 0;
2011 continue;
2013 case token_type_lbrace:
2014 if (extract_parenthesized (mlp, token_type_rbrace,
2015 null_context, null_context_list_iterator,
2016 -1, 0))
2017 return true;
2018 next_context_iter = null_context_list_iterator;
2019 state = 0;
2020 continue;
2022 case token_type_rbrace:
2023 if (terminator == token_type_rbrace)
2024 return false;
2025 if (terminator == token_type_rparen)
2027 error_with_progname = false;
2028 error (0, 0,
2029 _("%s:%d: warning: '}' found where ')' was expected"),
2030 logical_file_name, token.line_number);
2031 error_with_progname = true;
2033 next_context_iter = null_context_list_iterator;
2034 state = 0;
2035 continue;
2037 case token_type_comma:
2038 if (commas_to_skip >= 0)
2040 if (commas_to_skip > 0)
2041 commas_to_skip--;
2042 else
2043 if (plural_mp != NULL && plural_commas > 0)
2045 commas_to_skip = plural_commas - 1;
2046 plural_commas = 0;
2048 else
2049 commas_to_skip = -1;
2051 inner_context =
2052 inherited_context (outer_context,
2053 flag_context_list_iterator_advance (
2054 &context_iter));
2055 next_context_iter = passthrough_context_list_iterator;
2056 state = 0;
2057 continue;
2059 case token_type_string_literal:
2061 lex_pos_ty pos;
2062 pos.file_name = logical_file_name;
2063 pos.line_number = token.line_number;
2065 if (extract_all)
2067 xgettext_current_source_encoding = po_charset_utf8;
2068 savable_comment_to_xgettext_comment (token.comment);
2069 remember_a_message (mlp, token.string, inner_context, &pos);
2070 savable_comment_reset ();
2071 xgettext_current_source_encoding = xgettext_global_source_encoding;
2073 else
2075 if (commas_to_skip == 0)
2077 if (plural_mp == NULL)
2079 /* Seen an msgid. */
2080 message_ty *mp;
2082 xgettext_current_source_encoding = po_charset_utf8;
2083 savable_comment_to_xgettext_comment (token.comment);
2084 mp = remember_a_message (mlp, token.string,
2085 inner_context, &pos);
2086 savable_comment_reset ();
2087 xgettext_current_source_encoding = xgettext_global_source_encoding;
2088 if (plural_commas > 0)
2089 plural_mp = mp;
2091 else
2093 /* Seen an msgid_plural. */
2094 xgettext_current_source_encoding = po_charset_utf8;
2095 remember_a_message_plural (plural_mp, token.string,
2096 inner_context, &pos);
2097 xgettext_current_source_encoding = xgettext_global_source_encoding;
2098 plural_mp = NULL;
2101 else
2102 free (token.string);
2105 drop_reference (token.comment);
2106 next_context_iter = null_context_list_iterator;
2107 state = 0;
2108 continue;
2110 case token_type_eof:
2111 return true;
2113 case token_type_dot:
2114 case token_type_number:
2115 case token_type_plus:
2116 case token_type_other:
2117 next_context_iter = null_context_list_iterator;
2118 state = 0;
2119 continue;
2121 default:
2122 abort ();
2128 void
2129 extract_csharp (FILE *f,
2130 const char *real_filename, const char *logical_filename,
2131 flag_context_list_table_ty *flag_table,
2132 msgdomain_list_ty *mdlp)
2134 message_list_ty *mlp = mdlp->item[0]->messages;
2136 fp = f;
2137 real_file_name = real_filename;
2138 logical_file_name = xstrdup (logical_filename);
2139 line_number = 1;
2141 logical_line_number = 1;
2142 last_comment_line = -1;
2143 last_non_comment_line = -1;
2145 flag_context_list_table = flag_table;
2147 init_keywords ();
2149 /* Eat tokens until eof is seen. When extract_parenthesized returns
2150 due to an unbalanced closing parenthesis, just restart it. */
2151 while (!extract_parenthesized (mlp, token_type_eof,
2152 null_context, null_context_list_iterator,
2153 -1, 0))
2156 fp = NULL;
2157 real_file_name = NULL;
2158 logical_file_name = NULL;
2159 line_number = 0;