Updates
[glib.git] / glib / gmarkup.c
blobe1e9aa579bbea95fba858a6615a23768f9dfdb9d
1 /* gmarkup.c - Simple XML-like parser
3 * Copyright 2000, 2003 Red Hat, Inc.
5 * GLib is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU Lesser General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
10 * GLib is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with GLib; see the file COPYING.LIB. If not,
17 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 * Boston, MA 02111-1307, USA.
21 #include "config.h"
23 #include <stdarg.h>
24 #include <string.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <errno.h>
29 #include "glib.h"
30 #include "glibintl.h"
31 #include "galias.h"
33 GQuark
34 g_markup_error_quark (void)
36 return g_quark_from_static_string ("g-markup-error-quark");
/* States of the parser state machine; the current one is kept in
 * GMarkupParseContext.state.
 */
typedef enum
{
  STATE_START,
  STATE_AFTER_OPEN_ANGLE,
  STATE_AFTER_CLOSE_ANGLE,
  STATE_AFTER_ELISION_SLASH, /* the slash that obviates need for end element */
  STATE_INSIDE_OPEN_TAG_NAME,
  STATE_INSIDE_ATTRIBUTE_NAME,
  STATE_AFTER_ATTRIBUTE_NAME,
  STATE_BETWEEN_ATTRIBUTES,
  STATE_AFTER_ATTRIBUTE_EQUALS_SIGN,
  STATE_INSIDE_ATTRIBUTE_VALUE_SQ, /* value delimited by single quotes */
  STATE_INSIDE_ATTRIBUTE_VALUE_DQ, /* value delimited by double quotes */
  STATE_INSIDE_TEXT,
  STATE_AFTER_CLOSE_TAG_SLASH,
  STATE_INSIDE_CLOSE_TAG_NAME,
  STATE_AFTER_CLOSE_TAG_NAME,
  STATE_INSIDE_PASSTHROUGH,
  STATE_ERROR                /* entered via mark_error() */
} GMarkupParseState;
60 struct _GMarkupParseContext
62 const GMarkupParser *parser;
64 GMarkupParseFlags flags;
66 gint line_number;
67 gint char_number;
69 gpointer user_data;
70 GDestroyNotify dnotify;
72 /* A piece of character data or an element that
73 * hasn't "ended" yet so we haven't yet called
74 * the callback for it.
76 GString *partial_chunk;
78 GMarkupParseState state;
79 GSList *tag_stack;
80 gchar **attr_names;
81 gchar **attr_values;
82 gint cur_attr;
83 gint alloc_attrs;
85 const gchar *current_text;
86 gssize current_text_len;
87 const gchar *current_text_end;
89 GString *leftover_char_portion;
91 /* used to save the start of the last interesting thingy */
92 const gchar *start;
94 const gchar *iter;
96 guint document_empty : 1;
97 guint parsing : 1;
98 gint balance;
102 * g_markup_parse_context_new:
103 * @parser: a #GMarkupParser
104 * @flags: one or more #GMarkupParseFlags
105 * @user_data: user data to pass to #GMarkupParser functions
106 * @user_data_dnotify: user data destroy notifier called when the parse context is freed
108 * Creates a new parse context. A parse context is used to parse
109 * marked-up documents. You can feed any number of documents into
110 * a context, as long as no errors occur; once an error occurs,
111 * the parse context can't continue to parse text (you have to free it
112 * and create a new parse context).
114 * Return value: a new #GMarkupParseContext
116 GMarkupParseContext *
117 g_markup_parse_context_new (const GMarkupParser *parser,
118 GMarkupParseFlags flags,
119 gpointer user_data,
120 GDestroyNotify user_data_dnotify)
122 GMarkupParseContext *context;
124 g_return_val_if_fail (parser != NULL, NULL);
126 context = g_new (GMarkupParseContext, 1);
128 context->parser = parser;
129 context->flags = flags;
130 context->user_data = user_data;
131 context->dnotify = user_data_dnotify;
133 context->line_number = 1;
134 context->char_number = 1;
136 context->partial_chunk = NULL;
138 context->state = STATE_START;
139 context->tag_stack = NULL;
140 context->attr_names = NULL;
141 context->attr_values = NULL;
142 context->cur_attr = -1;
143 context->alloc_attrs = 0;
145 context->current_text = NULL;
146 context->current_text_len = -1;
147 context->current_text_end = NULL;
148 context->leftover_char_portion = NULL;
150 context->start = NULL;
151 context->iter = NULL;
153 context->document_empty = TRUE;
154 context->parsing = FALSE;
156 context->balance = 0;
158 return context;
162 * g_markup_parse_context_free:
163 * @context: a #GMarkupParseContext
165 * Frees a #GMarkupParseContext. Can't be called from inside
166 * one of the #GMarkupParser functions.
169 void
170 g_markup_parse_context_free (GMarkupParseContext *context)
172 g_return_if_fail (context != NULL);
173 g_return_if_fail (!context->parsing);
175 if (context->dnotify)
176 (* context->dnotify) (context->user_data);
178 g_strfreev (context->attr_names);
179 g_strfreev (context->attr_values);
181 g_slist_foreach (context->tag_stack, (GFunc)g_free, NULL);
182 g_slist_free (context->tag_stack);
184 if (context->partial_chunk)
185 g_string_free (context->partial_chunk, TRUE);
187 if (context->leftover_char_portion)
188 g_string_free (context->leftover_char_portion, TRUE);
190 g_free (context);
193 static void
194 mark_error (GMarkupParseContext *context,
195 GError *error)
197 context->state = STATE_ERROR;
199 if (context->parser->error)
200 (*context->parser->error) (context, error, context->user_data);
203 static void set_error (GMarkupParseContext *context,
204 GError **error,
205 GMarkupError code,
206 const gchar *format,
207 ...) G_GNUC_PRINTF (4, 5);
209 static void
210 set_error (GMarkupParseContext *context,
211 GError **error,
212 GMarkupError code,
213 const gchar *format,
214 ...)
216 GError *tmp_error;
217 gchar *s;
218 va_list args;
220 va_start (args, format);
221 s = g_strdup_vprintf (format, args);
222 va_end (args);
224 tmp_error = g_error_new_literal (G_MARKUP_ERROR, code, s);
225 g_free (s);
227 g_prefix_error (&tmp_error,
228 _("Error on line %d char %d: "),
229 context->line_number,
230 context->char_number);
232 mark_error (context, tmp_error);
234 g_propagate_error (error, tmp_error);
237 static void
238 propagate_error (GMarkupParseContext *context,
239 GError **dest,
240 GError *src)
242 if (context->flags & G_MARKUP_PREFIX_ERROR_POSITION)
243 g_prefix_error (&src,
244 _("Error on line %d char %d: "),
245 context->line_number,
246 context->char_number);
248 mark_error (context, src);
250 g_propagate_error (dest, src);
/* Speed hack for the name tests below: try the ASCII-only checks
 * first, then rule out the characters that usually terminate a name,
 * and fall back to the expensive Unicode property lookup only when
 * neither applies. Non-ASCII tag/attribute names are rare in practice,
 * so the guniprop call is almost always avoided.
 */
#define IS_COMMON_NAME_END_CHAR(c) \
  ((c) == ' ' || (c) == '>' || (c) == '/' || (c) == '=')
262 static gboolean
263 is_name_start_char (const gchar *p)
265 if (g_ascii_isalpha (*p) ||
266 (!IS_COMMON_NAME_END_CHAR (*p) &&
267 (*p == '_' ||
268 *p == ':' ||
269 g_unichar_isalpha (g_utf8_get_char (p)))))
270 return TRUE;
271 else
272 return FALSE;
275 static gboolean
276 is_name_char (const gchar *p)
278 if (g_ascii_isalnum (*p) ||
279 (!IS_COMMON_NAME_END_CHAR (*p) &&
280 (*p == '.' ||
281 *p == '-' ||
282 *p == '_' ||
283 *p == ':' ||
284 g_unichar_isalpha (g_utf8_get_char (p)))))
285 return TRUE;
286 else
287 return FALSE;
291 static gchar*
292 char_str (gunichar c,
293 gchar *buf)
295 memset (buf, 0, 8);
296 g_unichar_to_utf8 (c, buf);
297 return buf;
300 static gchar*
301 utf8_str (const gchar *utf8,
302 gchar *buf)
304 char_str (g_utf8_get_char (utf8), buf);
305 return buf;
308 static void
309 set_unescape_error (GMarkupParseContext *context,
310 GError **error,
311 const gchar *remaining_text,
312 const gchar *remaining_text_end,
313 GMarkupError code,
314 const gchar *format,
315 ...)
317 GError *tmp_error;
318 gchar *s;
319 va_list args;
320 gint remaining_newlines;
321 const gchar *p;
323 remaining_newlines = 0;
324 p = remaining_text;
325 while (p != remaining_text_end)
327 if (*p == '\n')
328 ++remaining_newlines;
329 ++p;
332 va_start (args, format);
333 s = g_strdup_vprintf (format, args);
334 va_end (args);
336 tmp_error = g_error_new (G_MARKUP_ERROR,
337 code,
338 _("Error on line %d: %s"),
339 context->line_number - remaining_newlines,
342 g_free (s);
344 mark_error (context, tmp_error);
346 g_propagate_error (error, tmp_error);
/* States of the small state machine driven by unescape_text(). */
typedef enum
{
  USTATE_INSIDE_TEXT,        /* ordinary character data */
  USTATE_AFTER_AMPERSAND,    /* just consumed '&' */
  USTATE_INSIDE_ENTITY_NAME, /* reading the name in "&name;" */
  USTATE_AFTER_CHARREF_HASH  /* just consumed "&#" */
} UnescapeState;
357 typedef struct
359 GMarkupParseContext *context;
360 GString *str;
361 UnescapeState state;
362 const gchar *text;
363 const gchar *text_end;
364 const gchar *entity_start;
365 } UnescapeContext;
367 static const gchar*
368 unescape_text_state_inside_text (UnescapeContext *ucontext,
369 const gchar *p,
370 GError **error)
372 const gchar *start;
373 gboolean normalize_attribute;
375 if (ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ ||
376 ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ)
377 normalize_attribute = TRUE;
378 else
379 normalize_attribute = FALSE;
381 start = p;
383 while (p != ucontext->text_end)
385 if (*p == '&')
387 break;
389 else if (normalize_attribute && (*p == '\t' || *p == '\n'))
391 g_string_append_len (ucontext->str, start, p - start);
392 g_string_append_c (ucontext->str, ' ');
393 p = g_utf8_next_char (p);
394 start = p;
396 else if (*p == '\r')
398 g_string_append_len (ucontext->str, start, p - start);
399 g_string_append_c (ucontext->str, normalize_attribute ? ' ' : '\n');
400 p = g_utf8_next_char (p);
401 if (p != ucontext->text_end && *p == '\n')
402 p = g_utf8_next_char (p);
403 start = p;
405 else
406 p = g_utf8_next_char (p);
409 if (p != start)
410 g_string_append_len (ucontext->str, start, p - start);
412 if (p != ucontext->text_end && *p == '&')
414 p = g_utf8_next_char (p);
415 ucontext->state = USTATE_AFTER_AMPERSAND;
418 return p;
421 static const gchar*
422 unescape_text_state_after_ampersand (UnescapeContext *ucontext,
423 const gchar *p,
424 GError **error)
426 ucontext->entity_start = NULL;
428 if (*p == '#')
430 p = g_utf8_next_char (p);
432 ucontext->entity_start = p;
433 ucontext->state = USTATE_AFTER_CHARREF_HASH;
435 else if (!is_name_start_char (p))
437 if (*p == ';')
439 set_unescape_error (ucontext->context, error,
440 p, ucontext->text_end,
441 G_MARKUP_ERROR_PARSE,
442 _("Empty entity '&;' seen; valid "
443 "entities are: &amp; &quot; &lt; &gt; &apos;"));
445 else
447 gchar buf[8];
449 set_unescape_error (ucontext->context, error,
450 p, ucontext->text_end,
451 G_MARKUP_ERROR_PARSE,
452 _("Character '%s' is not valid at "
453 "the start of an entity name; "
454 "the & character begins an entity; "
455 "if this ampersand isn't supposed "
456 "to be an entity, escape it as "
457 "&amp;"),
458 utf8_str (p, buf));
461 else
463 ucontext->entity_start = p;
464 ucontext->state = USTATE_INSIDE_ENTITY_NAME;
467 return p;
470 static const gchar*
471 unescape_text_state_inside_entity_name (UnescapeContext *ucontext,
472 const gchar *p,
473 GError **error)
475 while (p != ucontext->text_end)
477 if (*p == ';')
478 break;
479 else if (!is_name_char (p))
481 gchar ubuf[8];
483 set_unescape_error (ucontext->context, error,
484 p, ucontext->text_end,
485 G_MARKUP_ERROR_PARSE,
486 _("Character '%s' is not valid "
487 "inside an entity name"),
488 utf8_str (p, ubuf));
489 break;
492 p = g_utf8_next_char (p);
495 if (ucontext->context->state != STATE_ERROR)
497 if (p != ucontext->text_end)
499 gint len = p - ucontext->entity_start;
501 /* move to after semicolon */
502 p = g_utf8_next_char (p);
503 ucontext->state = USTATE_INSIDE_TEXT;
505 if (strncmp (ucontext->entity_start, "lt", len) == 0)
506 g_string_append_c (ucontext->str, '<');
507 else if (strncmp (ucontext->entity_start, "gt", len) == 0)
508 g_string_append_c (ucontext->str, '>');
509 else if (strncmp (ucontext->entity_start, "amp", len) == 0)
510 g_string_append_c (ucontext->str, '&');
511 else if (strncmp (ucontext->entity_start, "quot", len) == 0)
512 g_string_append_c (ucontext->str, '"');
513 else if (strncmp (ucontext->entity_start, "apos", len) == 0)
514 g_string_append_c (ucontext->str, '\'');
515 else
517 gchar *name;
519 name = g_strndup (ucontext->entity_start, len);
520 set_unescape_error (ucontext->context, error,
521 p, ucontext->text_end,
522 G_MARKUP_ERROR_PARSE,
523 _("Entity name '%s' is not known"),
524 name);
525 g_free (name);
528 else
530 set_unescape_error (ucontext->context, error,
531 /* give line number of the & */
532 ucontext->entity_start, ucontext->text_end,
533 G_MARKUP_ERROR_PARSE,
534 _("Entity did not end with a semicolon; "
535 "most likely you used an ampersand "
536 "character without intending to start "
537 "an entity - escape ampersand as &amp;"));
540 #undef MAX_ENT_LEN
542 return p;
545 static const gchar*
546 unescape_text_state_after_charref_hash (UnescapeContext *ucontext,
547 const gchar *p,
548 GError **error)
550 gboolean is_hex = FALSE;
551 const char *start;
553 start = ucontext->entity_start;
555 if (*p == 'x')
557 is_hex = TRUE;
558 p = g_utf8_next_char (p);
559 start = p;
562 while (p != ucontext->text_end && *p != ';')
563 p = g_utf8_next_char (p);
565 if (p != ucontext->text_end)
567 g_assert (*p == ';');
569 /* digit is between start and p */
571 if (start != p)
573 gulong l;
574 gchar *end = NULL;
576 errno = 0;
577 if (is_hex)
578 l = strtoul (start, &end, 16);
579 else
580 l = strtoul (start, &end, 10);
582 if (end != p || errno != 0)
584 set_unescape_error (ucontext->context, error,
585 start, ucontext->text_end,
586 G_MARKUP_ERROR_PARSE,
587 _("Failed to parse '%-.*s', which "
588 "should have been a digit "
589 "inside a character reference "
590 "(&#234; for example) - perhaps "
591 "the digit is too large"),
592 p - start, start);
594 else
596 /* characters XML permits */
597 if (l == 0x9 ||
598 l == 0xA ||
599 l == 0xD ||
600 (l >= 0x20 && l <= 0xD7FF) ||
601 (l >= 0xE000 && l <= 0xFFFD) ||
602 (l >= 0x10000 && l <= 0x10FFFF))
604 gchar buf[8];
605 g_string_append (ucontext->str, char_str (l, buf));
607 else
609 set_unescape_error (ucontext->context, error,
610 start, ucontext->text_end,
611 G_MARKUP_ERROR_PARSE,
612 _("Character reference '%-.*s' does not "
613 "encode a permitted character"),
614 p - start, start);
618 /* Move to next state */
619 p = g_utf8_next_char (p); /* past semicolon */
620 ucontext->state = USTATE_INSIDE_TEXT;
622 else
624 set_unescape_error (ucontext->context, error,
625 start, ucontext->text_end,
626 G_MARKUP_ERROR_PARSE,
627 _("Empty character reference; "
628 "should include a digit such as "
629 "&#454;"));
632 else
634 set_unescape_error (ucontext->context, error,
635 start, ucontext->text_end,
636 G_MARKUP_ERROR_PARSE,
637 _("Character reference did not end with a "
638 "semicolon; "
639 "most likely you used an ampersand "
640 "character without intending to start "
641 "an entity - escape ampersand as &amp;"));
644 return p;
647 static gboolean
648 unescape_text (GMarkupParseContext *context,
649 const gchar *text,
650 const gchar *text_end,
651 GString **unescaped,
652 GError **error)
654 UnescapeContext ucontext;
655 const gchar *p;
657 ucontext.context = context;
658 ucontext.text = text;
659 ucontext.text_end = text_end;
660 ucontext.entity_start = NULL;
662 ucontext.str = g_string_sized_new (text_end - text);
664 ucontext.state = USTATE_INSIDE_TEXT;
665 p = text;
667 while (p != text_end && context->state != STATE_ERROR)
669 g_assert (p < text_end);
671 switch (ucontext.state)
673 case USTATE_INSIDE_TEXT:
675 p = unescape_text_state_inside_text (&ucontext,
677 error);
679 break;
681 case USTATE_AFTER_AMPERSAND:
683 p = unescape_text_state_after_ampersand (&ucontext,
685 error);
687 break;
690 case USTATE_INSIDE_ENTITY_NAME:
692 p = unescape_text_state_inside_entity_name (&ucontext,
694 error);
696 break;
698 case USTATE_AFTER_CHARREF_HASH:
700 p = unescape_text_state_after_charref_hash (&ucontext,
702 error);
704 break;
706 default:
707 g_assert_not_reached ();
708 break;
712 if (context->state != STATE_ERROR)
714 switch (ucontext.state)
716 case USTATE_INSIDE_TEXT:
717 break;
718 case USTATE_AFTER_AMPERSAND:
719 case USTATE_INSIDE_ENTITY_NAME:
720 set_unescape_error (context, error,
721 NULL, NULL,
722 G_MARKUP_ERROR_PARSE,
723 _("Unfinished entity reference"));
724 break;
725 case USTATE_AFTER_CHARREF_HASH:
726 set_unescape_error (context, error,
727 NULL, NULL,
728 G_MARKUP_ERROR_PARSE,
729 _("Unfinished character reference"));
730 break;
734 if (context->state == STATE_ERROR)
736 g_string_free (ucontext.str, TRUE);
737 *unescaped = NULL;
738 return FALSE;
740 else
742 *unescaped = ucontext.str;
743 return TRUE;
747 static inline gboolean
748 advance_char (GMarkupParseContext *context)
750 context->iter = g_utf8_next_char (context->iter);
751 context->char_number += 1;
753 if (context->iter == context->current_text_end)
755 return FALSE;
757 else if (*context->iter == '\n')
759 context->line_number += 1;
760 context->char_number = 1;
763 return TRUE;
766 static inline gboolean
767 xml_isspace (char c)
769 return c == ' ' || c == '\t' || c == '\n' || c == '\r';
772 static void
773 skip_spaces (GMarkupParseContext *context)
777 if (!xml_isspace (*context->iter))
778 return;
780 while (advance_char (context));
783 static void
784 advance_to_name_end (GMarkupParseContext *context)
788 if (!is_name_char (context->iter))
789 return;
791 while (advance_char (context));
794 static void
795 add_to_partial (GMarkupParseContext *context,
796 const gchar *text_start,
797 const gchar *text_end)
799 if (context->partial_chunk == NULL)
800 context->partial_chunk = g_string_sized_new (text_end - text_start);
802 if (text_start != text_end)
803 g_string_append_len (context->partial_chunk, text_start,
804 text_end - text_start);
806 /* Invariant here that partial_chunk exists */
809 static void
810 truncate_partial (GMarkupParseContext *context)
812 if (context->partial_chunk != NULL)
814 context->partial_chunk = g_string_truncate (context->partial_chunk, 0);
818 static const gchar*
819 current_element (GMarkupParseContext *context)
821 return context->tag_stack->data;
824 static const gchar*
825 current_attribute (GMarkupParseContext *context)
827 g_assert (context->cur_attr >= 0);
828 return context->attr_names[context->cur_attr];
831 static void
832 find_current_text_end (GMarkupParseContext *context)
834 /* This function must be safe (non-segfaulting) on invalid UTF8.
835 * It assumes the string starts with a character start
837 const gchar *end = context->current_text + context->current_text_len;
838 const gchar *p;
839 const gchar *next;
841 g_assert (context->current_text_len > 0);
843 p = g_utf8_find_prev_char (context->current_text, end);
845 g_assert (p != NULL); /* since current_text was a char start */
847 /* p is now the start of the last character or character portion. */
848 g_assert (p != end);
849 next = g_utf8_next_char (p); /* this only touches *p, nothing beyond */
851 if (next == end)
853 /* whole character */
854 context->current_text_end = end;
856 else
858 /* portion */
859 context->leftover_char_portion = g_string_new_len (p, end - p);
860 context->current_text_len -= (end - p);
861 context->current_text_end = p;
866 static void
867 add_attribute (GMarkupParseContext *context, char *name)
869 if (context->cur_attr + 2 >= context->alloc_attrs)
871 context->alloc_attrs += 5; /* silly magic number */
872 context->attr_names = g_realloc (context->attr_names, sizeof(char*)*context->alloc_attrs);
873 context->attr_values = g_realloc (context->attr_values, sizeof(char*)*context->alloc_attrs);
875 context->cur_attr++;
876 context->attr_names[context->cur_attr] = name;
877 context->attr_values[context->cur_attr] = NULL;
878 context->attr_names[context->cur_attr+1] = NULL;
879 context->attr_values[context->cur_attr+1] = NULL;
883 * g_markup_parse_context_parse:
884 * @context: a #GMarkupParseContext
885 * @text: chunk of text to parse
886 * @text_len: length of @text in bytes
887 * @error: return location for a #GError
889 * Feed some data to the #GMarkupParseContext. The data need not
890 * be valid UTF-8; an error will be signaled if it's invalid.
891 * The data need not be an entire document; you can feed a document
892 * into the parser incrementally, via multiple calls to this function.
893 * Typically, as you receive data from a network connection or file,
894 * you feed each received chunk of data into this function, aborting
895 * the process if an error occurs. Once an error is reported, no further
896 * data may be fed to the #GMarkupParseContext; all errors are fatal.
898 * Return value: %FALSE if an error occurred, %TRUE on success
900 gboolean
901 g_markup_parse_context_parse (GMarkupParseContext *context,
902 const gchar *text,
903 gssize text_len,
904 GError **error)
906 const gchar *first_invalid;
908 g_return_val_if_fail (context != NULL, FALSE);
909 g_return_val_if_fail (text != NULL, FALSE);
910 g_return_val_if_fail (context->state != STATE_ERROR, FALSE);
911 g_return_val_if_fail (!context->parsing, FALSE);
913 if (text_len < 0)
914 text_len = strlen (text);
916 if (text_len == 0)
917 return TRUE;
919 context->parsing = TRUE;
921 if (context->leftover_char_portion)
923 const gchar *first_char;
925 if ((*text & 0xc0) != 0x80)
926 first_char = text;
927 else
928 first_char = g_utf8_find_next_char (text, text + text_len);
930 if (first_char)
932 /* leftover_char_portion was completed. Parse it. */
933 GString *portion = context->leftover_char_portion;
935 g_string_append_len (context->leftover_char_portion,
936 text, first_char - text);
938 /* hacks to allow recursion */
939 context->parsing = FALSE;
940 context->leftover_char_portion = NULL;
942 if (!g_markup_parse_context_parse (context,
943 portion->str, portion->len,
944 error))
946 g_assert (context->state == STATE_ERROR);
949 g_string_free (portion, TRUE);
950 context->parsing = TRUE;
952 /* Skip the fraction of char that was in this text */
953 text_len -= (first_char - text);
954 text = first_char;
956 else
958 /* another little chunk of the leftover char; geez
959 * someone is inefficient.
961 g_string_append_len (context->leftover_char_portion,
962 text, text_len);
964 if (context->leftover_char_portion->len > 7)
966 /* The leftover char portion is too big to be
967 * a UTF-8 character
969 set_error (context,
970 error,
971 G_MARKUP_ERROR_BAD_UTF8,
972 _("Invalid UTF-8 encoded text - overlong sequence"));
975 goto finished;
979 context->current_text = text;
980 context->current_text_len = text_len;
981 context->iter = context->current_text;
982 context->start = context->iter;
984 /* Nothing left after finishing the leftover char, or nothing
985 * passed in to begin with.
987 if (context->current_text_len == 0)
988 goto finished;
990 /* find_current_text_end () assumes the string starts at
991 * a character start, so we need to validate at least
992 * that much. It doesn't assume any following bytes
993 * are valid.
995 if ((*context->current_text & 0xc0) == 0x80) /* not a char start */
997 set_error (context,
998 error,
999 G_MARKUP_ERROR_BAD_UTF8,
1000 _("Invalid UTF-8 encoded text - not a start char"));
1001 goto finished;
1004 /* Initialize context->current_text_end, possibly adjusting
1005 * current_text_len, and add any leftover char portion
1007 find_current_text_end (context);
1009 /* Validate UTF8 (must be done after we find the end, since
1010 * we could have a trailing incomplete char)
1012 if (!g_utf8_validate (context->current_text,
1013 context->current_text_len,
1014 &first_invalid))
1016 gint newlines = 0;
1017 const gchar *p, *q;
1018 q = p = context->current_text;
1019 while (p != first_invalid)
1021 if (*p == '\n')
1023 ++newlines;
1024 q = p + 1;
1025 context->char_number = 1;
1027 ++p;
1030 context->line_number += newlines;
1031 context->char_number += g_utf8_strlen (q, first_invalid - q);
1033 set_error (context,
1034 error,
1035 G_MARKUP_ERROR_BAD_UTF8,
1036 _("Invalid UTF-8 encoded text - not valid '%s'"),
1037 g_strndup (context->current_text,
1038 context->current_text_len));
1039 goto finished;
1042 while (context->iter != context->current_text_end)
1044 switch (context->state)
1046 case STATE_START:
1047 /* Possible next state: AFTER_OPEN_ANGLE */
1049 g_assert (context->tag_stack == NULL);
1051 /* whitespace is ignored outside of any elements */
1052 skip_spaces (context);
1054 if (context->iter != context->current_text_end)
1056 if (*context->iter == '<')
1058 /* Move after the open angle */
1059 advance_char (context);
1061 context->state = STATE_AFTER_OPEN_ANGLE;
1063 /* this could start a passthrough */
1064 context->start = context->iter;
1066 /* document is now non-empty */
1067 context->document_empty = FALSE;
1069 else
1071 set_error (context,
1072 error,
1073 G_MARKUP_ERROR_PARSE,
1074 _("Document must begin with an element (e.g. <book>)"));
1077 break;
1079 case STATE_AFTER_OPEN_ANGLE:
1080 /* Possible next states: INSIDE_OPEN_TAG_NAME,
1081 * AFTER_CLOSE_TAG_SLASH, INSIDE_PASSTHROUGH
1083 if (*context->iter == '?' ||
1084 *context->iter == '!')
1086 /* include < in the passthrough */
1087 const gchar *openangle = "<";
1088 add_to_partial (context, openangle, openangle + 1);
1089 context->start = context->iter;
1090 context->balance = 1;
1091 context->state = STATE_INSIDE_PASSTHROUGH;
1093 else if (*context->iter == '/')
1095 /* move after it */
1096 advance_char (context);
1098 context->state = STATE_AFTER_CLOSE_TAG_SLASH;
1100 else if (is_name_start_char (context->iter))
1102 context->state = STATE_INSIDE_OPEN_TAG_NAME;
1104 /* start of tag name */
1105 context->start = context->iter;
1107 else
1109 gchar buf[8];
1111 set_error (context,
1112 error,
1113 G_MARKUP_ERROR_PARSE,
1114 _("'%s' is not a valid character following "
1115 "a '<' character; it may not begin an "
1116 "element name"),
1117 utf8_str (context->iter, buf));
1119 break;
1121 /* The AFTER_CLOSE_ANGLE state is actually sort of
1122 * broken, because it doesn't correspond to a range
1123 * of characters in the input stream as the others do,
1124 * and thus makes things harder to conceptualize
1126 case STATE_AFTER_CLOSE_ANGLE:
1127 /* Possible next states: INSIDE_TEXT, STATE_START */
1128 if (context->tag_stack == NULL)
1130 context->start = NULL;
1131 context->state = STATE_START;
1133 else
1135 context->start = context->iter;
1136 context->state = STATE_INSIDE_TEXT;
1138 break;
1140 case STATE_AFTER_ELISION_SLASH:
1141 /* Possible next state: AFTER_CLOSE_ANGLE */
1144 /* We need to pop the tag stack and call the end_element
1145 * function, since this is the close tag
1147 GError *tmp_error = NULL;
1149 g_assert (context->tag_stack != NULL);
1151 tmp_error = NULL;
1152 if (context->parser->end_element)
1153 (* context->parser->end_element) (context,
1154 context->tag_stack->data,
1155 context->user_data,
1156 &tmp_error);
1158 if (tmp_error)
1160 mark_error (context, tmp_error);
1161 g_propagate_error (error, tmp_error);
1163 else
1165 if (*context->iter == '>')
1167 /* move after the close angle */
1168 advance_char (context);
1169 context->state = STATE_AFTER_CLOSE_ANGLE;
1171 else
1173 gchar buf[8];
1175 set_error (context,
1176 error,
1177 G_MARKUP_ERROR_PARSE,
1178 _("Odd character '%s', expected a '>' character "
1179 "to end the start tag of element '%s'"),
1180 utf8_str (context->iter, buf),
1181 current_element (context));
1185 g_free (context->tag_stack->data);
1186 context->tag_stack = g_slist_delete_link (context->tag_stack,
1187 context->tag_stack);
1189 break;
1191 case STATE_INSIDE_OPEN_TAG_NAME:
1192 /* Possible next states: BETWEEN_ATTRIBUTES */
1194 /* if there's a partial chunk then it's the first part of the
1195 * tag name. If there's a context->start then it's the start
1196 * of the tag name in current_text, the partial chunk goes
1197 * before that start though.
1199 advance_to_name_end (context);
1201 if (context->iter == context->current_text_end)
1203 /* The name hasn't necessarily ended. Merge with
1204 * partial chunk, leave state unchanged.
1206 add_to_partial (context, context->start, context->iter);
1208 else
1210 /* The name has ended. Combine it with the partial chunk
1211 * if any; push it on the stack; enter next state.
1213 add_to_partial (context, context->start, context->iter);
1214 context->tag_stack =
1215 g_slist_prepend (context->tag_stack,
1216 g_string_free (context->partial_chunk,
1217 FALSE));
1219 context->partial_chunk = NULL;
1221 context->state = STATE_BETWEEN_ATTRIBUTES;
1222 context->start = NULL;
1224 break;
1226 case STATE_INSIDE_ATTRIBUTE_NAME:
1227 /* Possible next states: AFTER_ATTRIBUTE_NAME */
1229 advance_to_name_end (context);
1230 add_to_partial (context, context->start, context->iter);
1232 /* read the full name, if we enter the equals sign state
1233 * then add the attribute to the list (without the value),
1234 * otherwise store a partial chunk to be prepended later.
1236 if (context->iter != context->current_text_end)
1237 context->state = STATE_AFTER_ATTRIBUTE_NAME;
1238 break;
1240 case STATE_AFTER_ATTRIBUTE_NAME:
1241 /* Possible next states: AFTER_ATTRIBUTE_EQUALS_SIGN */
1243 skip_spaces (context);
1245 if (context->iter != context->current_text_end)
1247 /* The name has ended. Combine it with the partial chunk
1248 * if any; push it on the stack; enter next state.
1250 add_attribute (context, g_string_free (context->partial_chunk, FALSE));
1252 context->partial_chunk = NULL;
1253 context->start = NULL;
1255 if (*context->iter == '=')
1257 advance_char (context);
1258 context->state = STATE_AFTER_ATTRIBUTE_EQUALS_SIGN;
1260 else
1262 gchar buf[8];
1264 set_error (context,
1265 error,
1266 G_MARKUP_ERROR_PARSE,
1267 _("Odd character '%s', expected a '=' after "
1268 "attribute name '%s' of element '%s'"),
1269 utf8_str (context->iter, buf),
1270 current_attribute (context),
1271 current_element (context));
1275 break;
1277 case STATE_BETWEEN_ATTRIBUTES:
1278 /* Possible next states: AFTER_CLOSE_ANGLE,
1279 * AFTER_ELISION_SLASH, INSIDE_ATTRIBUTE_NAME
1281 skip_spaces (context);
1283 if (context->iter != context->current_text_end)
1285 if (*context->iter == '/')
1287 advance_char (context);
1288 context->state = STATE_AFTER_ELISION_SLASH;
1290 else if (*context->iter == '>')
1293 advance_char (context);
1294 context->state = STATE_AFTER_CLOSE_ANGLE;
1296 else if (is_name_start_char (context->iter))
1298 context->state = STATE_INSIDE_ATTRIBUTE_NAME;
1299 /* start of attribute name */
1300 context->start = context->iter;
1302 else
1304 gchar buf[8];
1306 set_error (context,
1307 error,
1308 G_MARKUP_ERROR_PARSE,
1309 _("Odd character '%s', expected a '>' or '/' "
1310 "character to end the start tag of "
1311 "element '%s', or optionally an attribute; "
1312 "perhaps you used an invalid character in "
1313 "an attribute name"),
1314 utf8_str (context->iter, buf),
1315 current_element (context));
1318 /* If we're done with attributes, invoke
1319 * the start_element callback
1321 if (context->state == STATE_AFTER_ELISION_SLASH ||
1322 context->state == STATE_AFTER_CLOSE_ANGLE)
1324 const gchar *start_name;
1325 /* Ugly, but the current code expects an empty array instead of NULL */
1326 const gchar *empty = NULL;
1327 const gchar **attr_names = &empty;
1328 const gchar **attr_values = &empty;
1329 GError *tmp_error;
1331 /* Call user callback for element start */
1332 start_name = current_element (context);
1334 if (context->cur_attr >= 0)
1336 attr_names = (const gchar**)context->attr_names;
1337 attr_values = (const gchar**)context->attr_values;
1340 tmp_error = NULL;
1341 if (context->parser->start_element)
1342 (* context->parser->start_element) (context,
1343 start_name,
1344 (const gchar **)attr_names,
1345 (const gchar **)attr_values,
1346 context->user_data,
1347 &tmp_error);
1349 /* Go ahead and free the attributes. */
1350 for (; context->cur_attr >= 0; context->cur_attr--)
1352 int pos = context->cur_attr;
1353 g_free (context->attr_names[pos]);
1354 g_free (context->attr_values[pos]);
1355 context->attr_names[pos] = context->attr_values[pos] = NULL;
1357 g_assert (context->cur_attr == -1);
1358 g_assert (context->attr_names == NULL ||
1359 context->attr_names[0] == NULL);
1360 g_assert (context->attr_values == NULL ||
1361 context->attr_values[0] == NULL);
1363 if (tmp_error != NULL)
1364 propagate_error (context, error, tmp_error);
1367 break;
1369 case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN:
1370 /* Possible next state: INSIDE_ATTRIBUTE_VALUE_[SQ/DQ] */
1372 skip_spaces (context);
1374 if (context->iter != context->current_text_end)
1376 if (*context->iter == '"')
1378 advance_char (context);
1379 context->state = STATE_INSIDE_ATTRIBUTE_VALUE_DQ;
1380 context->start = context->iter;
1382 else if (*context->iter == '\'')
1384 advance_char (context);
1385 context->state = STATE_INSIDE_ATTRIBUTE_VALUE_SQ;
1386 context->start = context->iter;
1388 else
1390 gchar buf[8];
1392 set_error (context,
1393 error,
1394 G_MARKUP_ERROR_PARSE,
1395 _("Odd character '%s', expected an open quote mark "
1396 "after the equals sign when giving value for "
1397 "attribute '%s' of element '%s'"),
1398 utf8_str (context->iter, buf),
1399 current_attribute (context),
1400 current_element (context));
1403 break;
1405 case STATE_INSIDE_ATTRIBUTE_VALUE_SQ:
1406 case STATE_INSIDE_ATTRIBUTE_VALUE_DQ:
1407 /* Possible next states: BETWEEN_ATTRIBUTES */
1409 gchar delim;
1411 if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ)
1413 delim = '\'';
1415 else
1417 delim = '"';
1422 if (*context->iter == delim)
1423 break;
1425 while (advance_char (context));
1427 if (context->iter == context->current_text_end)
1429 /* The value hasn't necessarily ended. Merge with
1430 * partial chunk, leave state unchanged.
1432 add_to_partial (context, context->start, context->iter);
1434 else
1436 /* The value has ended at the quote mark. Combine it
1437 * with the partial chunk if any; set it for the current
1438 * attribute.
1440 GString *unescaped;
1442 add_to_partial (context, context->start, context->iter);
1444 g_assert (context->cur_attr >= 0);
1446 if (unescape_text (context,
1447 context->partial_chunk->str,
1448 context->partial_chunk->str +
1449 context->partial_chunk->len,
1450 &unescaped,
1451 error))
1453 /* success, advance past quote and set state. */
1454 context->attr_values[context->cur_attr] = g_string_free (unescaped, FALSE);
1455 advance_char (context);
1456 context->state = STATE_BETWEEN_ATTRIBUTES;
1457 context->start = NULL;
1460 truncate_partial (context);
1462 break;
1464 case STATE_INSIDE_TEXT:
1465 /* Possible next states: AFTER_OPEN_ANGLE */
1468 if (*context->iter == '<')
1469 break;
1471 while (advance_char (context));
1473 /* The text hasn't necessarily ended. Merge with
1474 * partial chunk, leave state unchanged.
1477 add_to_partial (context, context->start, context->iter);
1479 if (context->iter != context->current_text_end)
1481 GString *unescaped = NULL;
1483 /* The text has ended at the open angle. Call the text
1484 * callback.
1487 if (unescape_text (context,
1488 context->partial_chunk->str,
1489 context->partial_chunk->str +
1490 context->partial_chunk->len,
1491 &unescaped,
1492 error))
1494 GError *tmp_error = NULL;
1496 if (context->parser->text)
1497 (*context->parser->text) (context,
1498 unescaped->str,
1499 unescaped->len,
1500 context->user_data,
1501 &tmp_error);
1503 g_string_free (unescaped, TRUE);
1505 if (tmp_error == NULL)
1507 /* advance past open angle and set state. */
1508 advance_char (context);
1509 context->state = STATE_AFTER_OPEN_ANGLE;
1510 /* could begin a passthrough */
1511 context->start = context->iter;
1513 else
1514 propagate_error (context, error, tmp_error);
1517 truncate_partial (context);
1519 break;
1521 case STATE_AFTER_CLOSE_TAG_SLASH:
1522 /* Possible next state: INSIDE_CLOSE_TAG_NAME */
1523 if (is_name_start_char (context->iter))
1525 context->state = STATE_INSIDE_CLOSE_TAG_NAME;
1527 /* start of tag name */
1528 context->start = context->iter;
1530 else
1532 gchar buf[8];
1534 set_error (context,
1535 error,
1536 G_MARKUP_ERROR_PARSE,
1537 _("'%s' is not a valid character following "
1538 "the characters '</'; '%s' may not begin an "
1539 "element name"),
1540 utf8_str (context->iter, buf),
1541 utf8_str (context->iter, buf));
1543 break;
1545 case STATE_INSIDE_CLOSE_TAG_NAME:
1546 /* Possible next state: AFTER_CLOSE_TAG_NAME */
1547 advance_to_name_end (context);
1548 add_to_partial (context, context->start, context->iter);
1550 if (context->iter != context->current_text_end)
1551 context->state = STATE_AFTER_CLOSE_TAG_NAME;
1552 break;
1554 case STATE_AFTER_CLOSE_TAG_NAME:
1555 /* Possible next state: AFTER_CLOSE_TAG_SLASH */
1557 skip_spaces (context);
1559 if (context->iter != context->current_text_end)
1561 gchar *close_name;
1563 /* The name has ended. Combine it with the partial chunk
1564 * if any; check that it matches stack top and pop
1565 * stack; invoke proper callback; enter next state.
1567 close_name = g_string_free (context->partial_chunk, FALSE);
1568 context->partial_chunk = NULL;
1570 if (*context->iter != '>')
1572 gchar buf[8];
1574 set_error (context,
1575 error,
1576 G_MARKUP_ERROR_PARSE,
1577 _("'%s' is not a valid character following "
1578 "the close element name '%s'; the allowed "
1579 "character is '>'"),
1580 utf8_str (context->iter, buf),
1581 close_name);
1583 else if (context->tag_stack == NULL)
1585 set_error (context,
1586 error,
1587 G_MARKUP_ERROR_PARSE,
1588 _("Element '%s' was closed, no element "
1589 "is currently open"),
1590 close_name);
1592 else if (strcmp (close_name, current_element (context)) != 0)
1594 set_error (context,
1595 error,
1596 G_MARKUP_ERROR_PARSE,
1597 _("Element '%s' was closed, but the currently "
1598 "open element is '%s'"),
1599 close_name,
1600 current_element (context));
1602 else
1604 GError *tmp_error;
1605 advance_char (context);
1606 context->state = STATE_AFTER_CLOSE_ANGLE;
1607 context->start = NULL;
1609 /* call the end_element callback */
1610 tmp_error = NULL;
1611 if (context->parser->end_element)
1612 (* context->parser->end_element) (context,
1613 close_name,
1614 context->user_data,
1615 &tmp_error);
1618 /* Pop the tag stack */
1619 g_free (context->tag_stack->data);
1620 context->tag_stack = g_slist_delete_link (context->tag_stack,
1621 context->tag_stack);
1623 if (tmp_error)
1624 propagate_error (context, error, tmp_error);
1627 g_free (close_name);
1629 break;
1631 case STATE_INSIDE_PASSTHROUGH:
1632 /* Possible next state: AFTER_CLOSE_ANGLE */
1635 if (*context->iter == '<')
1636 context->balance++;
1637 if (*context->iter == '>')
1639 gchar *str;
1640 gsize len;
1642 context->balance--;
1643 add_to_partial (context, context->start, context->iter);
1644 context->start = context->iter;
1646 str = context->partial_chunk->str;
1647 len = context->partial_chunk->len;
1649 if (str[1] == '?' && str[len - 1] == '?')
1650 break;
1651 if (strncmp (str, "<!--", 4) == 0 &&
1652 strcmp (str + len - 2, "--") == 0)
1653 break;
1654 if (strncmp (str, "<![CDATA[", 9) == 0 &&
1655 strcmp (str + len - 2, "]]") == 0)
1656 break;
1657 if (strncmp (str, "<!DOCTYPE", 9) == 0 &&
1658 context->balance == 0)
1659 break;
1662 while (advance_char (context));
1664 if (context->iter == context->current_text_end)
1666 /* The passthrough hasn't necessarily ended. Merge with
1667 * partial chunk, leave state unchanged.
1669 add_to_partial (context, context->start, context->iter);
1671 else
1673 /* The passthrough has ended at the close angle. Combine
1674 * it with the partial chunk if any. Call the passthrough
1675 * callback. Note that the open/close angles are
1676 * included in the text of the passthrough.
1678 GError *tmp_error = NULL;
1680 advance_char (context); /* advance past close angle */
1681 add_to_partial (context, context->start, context->iter);
1683 if (context->flags & G_MARKUP_TREAT_CDATA_AS_TEXT &&
1684 strncmp (context->partial_chunk->str, "<![CDATA[", 9) == 0)
1686 if (context->parser->text)
1687 (*context->parser->text) (context,
1688 context->partial_chunk->str + 9,
1689 context->partial_chunk->len - 12,
1690 context->user_data,
1691 &tmp_error);
1693 else if (context->parser->passthrough)
1694 (*context->parser->passthrough) (context,
1695 context->partial_chunk->str,
1696 context->partial_chunk->len,
1697 context->user_data,
1698 &tmp_error);
1700 truncate_partial (context);
1702 if (tmp_error == NULL)
1704 context->state = STATE_AFTER_CLOSE_ANGLE;
1705 context->start = context->iter; /* could begin text */
1707 else
1708 propagate_error (context, error, tmp_error);
1710 break;
1712 case STATE_ERROR:
1713 goto finished;
1714 break;
1716 default:
1717 g_assert_not_reached ();
1718 break;
1722 finished:
1723 context->parsing = FALSE;
1725 return context->state != STATE_ERROR;
1729 * g_markup_parse_context_end_parse:
1730 * @context: a #GMarkupParseContext
1731 * @error: return location for a #GError
1733 * Signals to the #GMarkupParseContext that all data has been
1734 * fed into the parse context with g_markup_parse_context_parse().
1735 * This function reports an error if the document isn't complete,
1736 * for example if elements are still open.
1738 * Return value: %TRUE on success, %FALSE if an error was set
1740 gboolean
1741 g_markup_parse_context_end_parse (GMarkupParseContext *context,
1742 GError **error)
1744 g_return_val_if_fail (context != NULL, FALSE);
1745 g_return_val_if_fail (!context->parsing, FALSE);
1746 g_return_val_if_fail (context->state != STATE_ERROR, FALSE);
1748 if (context->partial_chunk != NULL)
1750 g_string_free (context->partial_chunk, TRUE);
1751 context->partial_chunk = NULL;
1754 if (context->document_empty)
1756 set_error (context, error, G_MARKUP_ERROR_EMPTY,
1757 _("Document was empty or contained only whitespace"));
1758 return FALSE;
1761 context->parsing = TRUE;
1763 switch (context->state)
1765 case STATE_START:
1766 /* Nothing to do */
1767 break;
1769 case STATE_AFTER_OPEN_ANGLE:
1770 set_error (context, error, G_MARKUP_ERROR_PARSE,
1771 _("Document ended unexpectedly just after an open angle bracket '<'"));
1772 break;
1774 case STATE_AFTER_CLOSE_ANGLE:
1775 if (context->tag_stack != NULL)
1777 /* Error message the same as for INSIDE_TEXT */
1778 set_error (context, error, G_MARKUP_ERROR_PARSE,
1779 _("Document ended unexpectedly with elements still open - "
1780 "'%s' was the last element opened"),
1781 current_element (context));
1783 break;
1785 case STATE_AFTER_ELISION_SLASH:
1786 set_error (context, error, G_MARKUP_ERROR_PARSE,
1787 _("Document ended unexpectedly, expected to see a close angle "
1788 "bracket ending the tag <%s/>"), current_element (context));
1789 break;
1791 case STATE_INSIDE_OPEN_TAG_NAME:
1792 set_error (context, error, G_MARKUP_ERROR_PARSE,
1793 _("Document ended unexpectedly inside an element name"));
1794 break;
1796 case STATE_INSIDE_ATTRIBUTE_NAME:
1797 case STATE_AFTER_ATTRIBUTE_NAME:
1798 set_error (context, error, G_MARKUP_ERROR_PARSE,
1799 _("Document ended unexpectedly inside an attribute name"));
1800 break;
1802 case STATE_BETWEEN_ATTRIBUTES:
1803 set_error (context, error, G_MARKUP_ERROR_PARSE,
1804 _("Document ended unexpectedly inside an element-opening "
1805 "tag."));
1806 break;
1808 case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN:
1809 set_error (context, error, G_MARKUP_ERROR_PARSE,
1810 _("Document ended unexpectedly after the equals sign "
1811 "following an attribute name; no attribute value"));
1812 break;
1814 case STATE_INSIDE_ATTRIBUTE_VALUE_SQ:
1815 case STATE_INSIDE_ATTRIBUTE_VALUE_DQ:
1816 set_error (context, error, G_MARKUP_ERROR_PARSE,
1817 _("Document ended unexpectedly while inside an attribute "
1818 "value"));
1819 break;
1821 case STATE_INSIDE_TEXT:
1822 g_assert (context->tag_stack != NULL);
1823 set_error (context, error, G_MARKUP_ERROR_PARSE,
1824 _("Document ended unexpectedly with elements still open - "
1825 "'%s' was the last element opened"),
1826 current_element (context));
1827 break;
1829 case STATE_AFTER_CLOSE_TAG_SLASH:
1830 case STATE_INSIDE_CLOSE_TAG_NAME:
1831 case STATE_AFTER_CLOSE_TAG_NAME:
1832 set_error (context, error, G_MARKUP_ERROR_PARSE,
1833 _("Document ended unexpectedly inside the close tag for "
1834 "element '%s'"), current_element (context));
1835 break;
1837 case STATE_INSIDE_PASSTHROUGH:
1838 set_error (context, error, G_MARKUP_ERROR_PARSE,
1839 _("Document ended unexpectedly inside a comment or "
1840 "processing instruction"));
1841 break;
1843 case STATE_ERROR:
1844 default:
1845 g_assert_not_reached ();
1846 break;
1849 context->parsing = FALSE;
1851 return context->state != STATE_ERROR;
1855 * g_markup_parse_context_get_element:
1856 * @context: a #GMarkupParseContext
1857 * @returns: the name of the currently open element, or %NULL
1859 * Retrieves the name of the currently open element.
1861 * If called from the start_element or end_element handlers this will
1862 * give the element_name as passed to those functions. For the parent
1863 * elements, see g_markup_parse_context_get_element_stack().
1865 * Since: 2.2
1867 G_CONST_RETURN gchar *
1868 g_markup_parse_context_get_element (GMarkupParseContext *context)
1870 g_return_val_if_fail (context != NULL, NULL);
1872 if (context->tag_stack == NULL)
1873 return NULL;
1874 else
1875 return current_element (context);
1879 * g_markup_parse_context_get_element_stack:
1880 * @context: a #GMarkupParseContext
1882 * Retrieves the element stack from the internal state of the parser.
1883 * The returned #GSList is a list of strings where the first item is
1884 * the currently open tag (as would be returned by
1885 * g_markup_parse_context_get_element()) and the next item is its
1886 * immediate parent.
1888 * This function is intended to be used in the start_element and
1889 * end_element handlers where g_markup_parse_context_get_element()
1890 * would merely return the name of the element that is being
1891 * processed.
1893 * Returns: the element stack, which must not be modified
1895 * Since 2.16
1897 G_CONST_RETURN GSList *
1898 g_markup_parse_context_get_element_stack (GMarkupParseContext *context)
1900 g_return_val_if_fail (context != NULL, NULL);
1902 return context->tag_stack;
1906 * g_markup_parse_context_get_position:
1907 * @context: a #GMarkupParseContext
1908 * @line_number: return location for a line number, or %NULL
1909 * @char_number: return location for a char-on-line number, or %NULL
1911 * Retrieves the current line number and the number of the character on
1912 * that line. Intended for use in error messages; there are no strict
1913 * semantics for what constitutes the "current" line number other than
1914 * "the best number we could come up with for error messages."
1917 void
1918 g_markup_parse_context_get_position (GMarkupParseContext *context,
1919 gint *line_number,
1920 gint *char_number)
1922 g_return_if_fail (context != NULL);
1924 if (line_number)
1925 *line_number = context->line_number;
1927 if (char_number)
1928 *char_number = context->char_number;
1931 static void
1932 append_escaped_text (GString *str,
1933 const gchar *text,
1934 gssize length)
1936 const gchar *p;
1937 const gchar *end;
1938 gunichar c;
1940 p = text;
1941 end = text + length;
1943 while (p != end)
1945 const gchar *next;
1946 next = g_utf8_next_char (p);
1948 switch (*p)
1950 case '&':
1951 g_string_append (str, "&amp;");
1952 break;
1954 case '<':
1955 g_string_append (str, "&lt;");
1956 break;
1958 case '>':
1959 g_string_append (str, "&gt;");
1960 break;
1962 case '\'':
1963 g_string_append (str, "&apos;");
1964 break;
1966 case '"':
1967 g_string_append (str, "&quot;");
1968 break;
1970 default:
1971 c = g_utf8_get_char (p);
1972 if ((0x1 <= c && c <= 0x8) ||
1973 (0xb <= c && c <= 0xc) ||
1974 (0xe <= c && c <= 0x1f) ||
1975 (0x7f <= c && c <= 0x84) ||
1976 (0x86 <= c && c <= 0x9f))
1977 g_string_append_printf (str, "&#x%x;", c);
1978 else
1979 g_string_append_len (str, p, next - p);
1980 break;
1983 p = next;
1988 * g_markup_escape_text:
1989 * @text: some valid UTF-8 text
1990 * @length: length of @text in bytes, or -1 if the text is nul-terminated
1992 * Escapes text so that the markup parser will parse it verbatim.
1993 * Less than, greater than, ampersand, etc. are replaced with the
1994 * corresponding entities. This function would typically be used
1995 * when writing out a file to be parsed with the markup parser.
1997 * Note that this function doesn't protect whitespace and line endings
1998 * from being processed according to the XML rules for normalization
1999 * of line endings and attribute values.
2001 * Return value: a newly allocated string with the escaped text
2003 gchar*
2004 g_markup_escape_text (const gchar *text,
2005 gssize length)
2007 GString *str;
2009 g_return_val_if_fail (text != NULL, NULL);
2011 if (length < 0)
2012 length = strlen (text);
2014 /* prealloc at least as long as original text */
2015 str = g_string_sized_new (length);
2016 append_escaped_text (str, text, length);
2018 return g_string_free (str, FALSE);
/* If @cp points at a positional-argument index (one or more decimal
 * digits followed by '$'), returns a pointer just past the '$';
 * otherwise returns @cp unchanged.
 */
static const char *
skip_positional_index (const char *cp)
{
  const char *np = cp;

  while (*np >= '0' && *np <= '9')
    np++;

  /* At least one digit must precede the '$'. */
  return (np != cp && *np == '$') ? np + 1 : cp;
}

/**
 * find_conversion:
 * @format: a printf-style format string
 * @after: location to store a pointer to the character after
 *         the returned conversion. On a %NULL return, returns the
 *         pointer to the trailing NUL in the string
 *
 * Find the next conversion in a printf-style format string.
 * Partially based on code from printf-parser.c,
 * Copyright (C) 1999-2000, 2002-2003 Free Software Foundation, Inc.
 *
 * A "%%" sequence is reported as a conversion like any other.  A lone
 * '%' at the very end of @format is not a conversion.  If the string
 * ends in the middle of a conversion (for example "%5"), the partial
 * conversion is returned and @after is left on the trailing NUL.
 *
 * Return value: pointer to the next conversion in @format,
 *  or %NULL, if none.
 **/
static const char *
find_conversion (const char  *format,
                 const char **after)
{
  const char *start = format;
  const char *cp;

  while (*start != '\0' && *start != '%')
    start++;

  if (*start == '\0')
    {
      *after = start;
      return NULL;
    }

  cp = start + 1;

  if (*cp == '\0')
    {
      *after = cp;
      return NULL;
    }

  /* Test for positional argument.  */
  cp = skip_positional_index (cp);

  /* Skip the flags.  */
  while (*cp == '\'' ||
         *cp == '-' ||
         *cp == '+' ||
         *cp == ' ' ||
         *cp == '#' ||
         *cp == '0')
    cp++;

  /* Skip the field width.  */
  if (*cp == '*')
    {
      cp++;

      /* Test for positional argument.  */
      cp = skip_positional_index (cp);
    }
  else
    {
      while (*cp >= '0' && *cp <= '9')
        cp++;
    }

  /* Skip the precision.  */
  if (*cp == '.')
    {
      cp++;
      if (*cp == '*')
        {
          /* Step over the '*' itself; without this the '*' would be
           * mistaken for the conversion character of "%.*s".
           */
          cp++;

          /* Test for positional argument.  */
          cp = skip_positional_index (cp);
        }
      else
        {
          while (*cp >= '0' && *cp <= '9')
            cp++;
        }
    }

  /* Skip argument type/size specifiers.  */
  while (*cp == 'h' ||
         *cp == 'L' ||
         *cp == 'l' ||
         *cp == 'j' ||
         *cp == 'z' ||
         *cp == 'Z' ||
         *cp == 't')
    cp++;

  /* Skip the conversion character, but never walk past the end of a
   * format string that stops in the middle of a conversion.
   */
  if (*cp != '\0')
    cp++;

  *after = cp;
  return start;
}
2148 * g_markup_vprintf_escaped:
2149 * @format: printf() style format string
2150 * @args: variable argument list, similar to vprintf()
2152 * Formats the data in @args according to @format, escaping
2153 * all string and character arguments in the fashion
2154 * of g_markup_escape_text(). See g_markup_printf_escaped().
2156 * Return value: newly allocated result from formatting
2157 * operation. Free with g_free().
2159 * Since: 2.4
2161 char *
2162 g_markup_vprintf_escaped (const char *format,
2163 va_list args)
2165 GString *format1;
2166 GString *format2;
2167 GString *result = NULL;
2168 gchar *output1 = NULL;
2169 gchar *output2 = NULL;
2170 const char *p, *op1, *op2;
2171 va_list args2;
2173 /* The technique here, is that we make two format strings that
2174 * have the identical conversions in the identical order to the
2175 * original strings, but differ in the text in-between. We
2176 * then use the normal g_strdup_vprintf() to format the arguments
2177 * with the two new format strings. By comparing the results,
2178 * we can figure out what segments of the output come from
2179 * the the original format string, and what from the arguments,
2180 * and thus know what portions of the string to escape.
2182 * For instance, for:
2184 * g_markup_printf_escaped ("%s ate %d apples", "Susan & Fred", 5);
2186 * We form the two format strings "%sX%dX" and %sY%sY". The results
2187 * of formatting with those two strings are
2189 * "%sX%dX" => "Susan & FredX5X"
2190 * "%sY%dY" => "Susan & FredY5Y"
2192 * To find the span of the first argument, we find the first position
2193 * where the two arguments differ, which tells us that the first
2194 * argument formatted to "Susan & Fred". We then escape that
2195 * to "Susan &amp; Fred" and join up with the intermediate portions
2196 * of the format string and the second argument to get
2197 * "Susan &amp; Fred ate 5 apples".
2200 /* Create the two modified format strings
2202 format1 = g_string_new (NULL);
2203 format2 = g_string_new (NULL);
2204 p = format;
2205 while (TRUE)
2207 const char *after;
2208 const char *conv = find_conversion (p, &after);
2209 if (!conv)
2210 break;
2212 g_string_append_len (format1, conv, after - conv);
2213 g_string_append_c (format1, 'X');
2214 g_string_append_len (format2, conv, after - conv);
2215 g_string_append_c (format2, 'Y');
2217 p = after;
2220 /* Use them to format the arguments
2222 G_VA_COPY (args2, args);
2224 output1 = g_strdup_vprintf (format1->str, args);
2225 if (!output1)
2227 va_end (args2);
2228 goto cleanup;
2231 output2 = g_strdup_vprintf (format2->str, args2);
2232 va_end (args2);
2233 if (!output2)
2234 goto cleanup;
2236 result = g_string_new (NULL);
2238 /* Iterate through the original format string again,
2239 * copying the non-conversion portions and the escaped
2240 * converted arguments to the output string.
2242 op1 = output1;
2243 op2 = output2;
2244 p = format;
2245 while (TRUE)
2247 const char *after;
2248 const char *output_start;
2249 const char *conv = find_conversion (p, &after);
2250 char *escaped;
2252 if (!conv) /* The end, after points to the trailing \0 */
2254 g_string_append_len (result, p, after - p);
2255 break;
2258 g_string_append_len (result, p, conv - p);
2259 output_start = op1;
2260 while (*op1 == *op2)
2262 op1++;
2263 op2++;
2266 escaped = g_markup_escape_text (output_start, op1 - output_start);
2267 g_string_append (result, escaped);
2268 g_free (escaped);
2270 p = after;
2271 op1++;
2272 op2++;
2275 cleanup:
2276 g_string_free (format1, TRUE);
2277 g_string_free (format2, TRUE);
2278 g_free (output1);
2279 g_free (output2);
2281 if (result)
2282 return g_string_free (result, FALSE);
2283 else
2284 return NULL;
/**
 * g_markup_printf_escaped:
 * @format: printf() style format string
 * @Varargs: the arguments to insert in the format string
 *
 * Formats arguments according to @format, escaping
 * all string and character arguments in the fashion
 * of g_markup_escape_text(). This is useful when you
 * want to insert literal strings into XML-style markup
 * output, without having to worry that the strings
 * might themselves contain markup.
 *
 * |[
 * const char *store = "Fortnum &amp; Mason";
 * const char *item = "Tea";
 * char *output;
 * &nbsp;
 * output = g_markup_printf_escaped ("&lt;purchase&gt;"
 *                                   "&lt;store&gt;&percnt;s&lt;/store&gt;"
 *                                   "&lt;item&gt;&percnt;s&lt;/item&gt;"
 *                                   "&lt;/purchase&gt;",
 *                                   store, item);
 * ]|
 *
 * Return value: newly allocated result from formatting
 *  operation. Free with g_free().
 *
 * Since: 2.4
 **/
char *
g_markup_printf_escaped (const char *format, ...)
{
  va_list ap;
  char *escaped;

  /* Thin varargs front end: all the work is done by the va_list variant. */
  va_start (ap, format);
  escaped = g_markup_vprintf_escaped (format, ap);
  va_end (ap);

  return escaped;
}
2329 static gboolean
2330 g_markup_parse_boolean (const char *string,
2331 gboolean *value)
2333 char const * const falses[] = { "false", "f", "no", "n", "0" };
2334 char const * const trues[] = { "true", "t", "yes", "y", "1" };
2335 int i;
2337 for (i = 0; i < G_N_ELEMENTS (falses); i++)
2339 if (g_ascii_strcasecmp (string, falses[i]) == 0)
2341 if (value != NULL)
2342 *value = FALSE;
2344 return TRUE;
2348 for (i = 0; i < G_N_ELEMENTS (trues); i++)
2350 if (g_ascii_strcasecmp (string, trues[i]) == 0)
2352 if (value != NULL)
2353 *value = TRUE;
2355 return TRUE;
2359 return FALSE;
2363 * GMarkupCollectType:
2364 * @G_MARKUP_COLLECT_INVALID: used to terminate the list of attributes
2365 * to collect.
2366 * @G_MARKUP_COLLECT_STRING: collect the string pointer directly from
2367 * the attribute_values[] array. Expects a
2368 * parameter of type (const char **). If
2369 * %G_MARKUP_COLLECT_OPTIONAL is specified
2370 * and the attribute isn't present then the
2371 * pointer will be set to %NULL.
2372 * @G_MARKUP_COLLECT_STRDUP: as with %G_MARKUP_COLLECT_STRING, but
2373 * expects a parameter of type (char **) and
2374 * g_strdup()s the returned pointer. The
2375 * pointer must be freed with g_free().
2376 * @G_MARKUP_COLLECT_BOOLEAN: expects a parameter of type (gboolean *)
2377 * and parses the attribute value as a
2378 * boolean. Sets %FALSE if the attribute
2379 * isn't present. Valid boolean values
2380 * consist of (case insensitive) "false",
2381 * "f", "no", "n", "0" and "true", "t",
2382 * "yes", "y", "1".
2383 * @G_MARKUP_COLLECT_TRISTATE: as with %G_MARKUP_COLLECT_BOOLEAN, but
2384 * in the case of a missing attribute a
2385 * value is set that compares equal to
2386 * neither %FALSE nor %TRUE.
2387 * G_MARKUP_COLLECT_OPTIONAL is implied.
2388 * @G_MARKUP_COLLECT_OPTIONAL: can be bitwise ORed with the other
2389 * fields. If present, allows the
2390 * attribute not to appear. A default
2391 * value is set depending on what value
2392 * type is used.
2394 * A mixed enumerated type and flags field. You must specify one type
2395 * (string, strdup, boolean, tristate). Additionally, you may
2396 * optionally bitwise OR the type with the flag
2397 * %G_MARKUP_COLLECT_OPTIONAL.
2399 * It is likely that this enum will be extended in the future to
2400 * support other types.
2404 * g_markup_collect_attributes:
2405 * @element_name: the current tag name
2406 * @attribute_names: the attribute names
2407 * @attribute_values: the attribute values
2408 * @error: a pointer to a #GError or %NULL
2409 * @first_type: the #GMarkupCollectType of the
2410 * first attribute
2411 * @first_attr: the name of the first attribute
2412 * @...: a pointer to the storage location of the
2413 * first attribute (or %NULL), followed by
2414 * more types names and pointers, ending
2415 * with %G_MARKUP_COLLECT_INVALID.
2417 * Collects the attributes of the element from the
2418 * data passed to the #GMarkupParser start_element
2419 * function, dealing with common error conditions
2420 * and supporting boolean values.
2422 * This utility function is not required to write
2423 * a parser but can save a lot of typing.
2425 * The @element_name, @attribute_names,
2426 * @attribute_values and @error parameters passed
2427 * to the start_element callback should be passed
2428 * unmodified to this function.
2430 * Following these arguments is a list of
2431 * "supported" attributes to collect. It is an
2432 * error to specify multiple attributes with the
2433 * same name. If any attribute not in the list
2434 * appears in the @attribute_names array then an
2435 * unknown attribute error will result.
2437 * The #GMarkupCollectType field allows specifying
2438 * the type of collection to perform and if a
2439 * given attribute must appear or is optional.
2441 * The attribute name is simply the name of the
2442 * attribute to collect.
2444 * The pointer should be of the appropriate type
2445 * (see the descriptions under
2446 * #GMarkupCollectType) and may be %NULL in case a
2447 * particular attribute is to be allowed but
2448 * ignored.
2450 * This function deals with issuing errors for missing attributes
2451 * (of type %G_MARKUP_ERROR_MISSING_ATTRIBUTE), unknown attributes
2452 * (of type %G_MARKUP_ERROR_UNKNOWN_ATTRIBUTE) and duplicate
2453 * attributes (of type %G_MARKUP_ERROR_INVALID_CONTENT) as well
2454 * as parse errors for boolean-valued attributes (again of type
2455 * %G_MARKUP_ERROR_INVALID_CONTENT). In all of these cases %FALSE
2456 * will be returned and @error will be set as appropriate.
2458 * Return value: %TRUE if successful
2460 * Since: 2.16
2462 gboolean
2463 g_markup_collect_attributes (const gchar *element_name,
2464 const gchar **attribute_names,
2465 const gchar **attribute_values,
2466 GError **error,
2467 GMarkupCollectType first_type,
2468 const gchar *first_attr,
2469 ...)
2471 GMarkupCollectType type;
2472 const gchar *attr;
2473 guint64 collected;
2474 int written;
2475 va_list ap;
2476 int i;
2478 type = first_type;
2479 attr = first_attr;
2480 collected = 0;
2481 written = 0;
2483 va_start (ap, first_attr);
2484 while (type != G_MARKUP_COLLECT_INVALID)
2486 gboolean mandatory;
2487 const gchar *value;
2489 mandatory = !(type & G_MARKUP_COLLECT_OPTIONAL);
2490 type &= (G_MARKUP_COLLECT_OPTIONAL - 1);
2492 /* tristate records a value != TRUE and != FALSE
2493 * for the case where the attribute is missing
2495 if (type == G_MARKUP_COLLECT_TRISTATE)
2496 mandatory = FALSE;
2498 for (i = 0; attribute_names[i]; i++)
2499 if (i >= 40 || !(collected & (G_GUINT64_CONSTANT(1) << i)))
2500 if (!strcmp (attribute_names[i], attr))
2501 break;
2503 /* ISO C99 only promises that the user can pass up to 127 arguments.
2504 * Subtracting the first 4 arguments plus the final NULL and dividing
2505 * by 3 arguments per collected attribute, we are left with a maximum
2506 * number of supported attributes of (127 - 5) / 3 = 40.
2508 * In reality, nobody is ever going to call us with anywhere close to
2509 * 40 attributes to collect, so it is safe to assume that if i > 40
2510 * then the user has given some invalid or repeated arguments. These
2511 * problems will be caught and reported at the end of the function.
2513 * We know at this point that we have an error, but we don't know
2514 * what error it is, so just continue...
2516 if (i < 40)
2517 collected |= (G_GUINT64_CONSTANT(1) << i);
2519 value = attribute_values[i];
2521 if (value == NULL && mandatory)
2523 g_set_error (error, G_MARKUP_ERROR,
2524 G_MARKUP_ERROR_MISSING_ATTRIBUTE,
2525 "element '%s' requires attribute '%s'",
2526 element_name, attr);
2528 va_end (ap);
2529 goto failure;
2532 switch (type)
2534 case G_MARKUP_COLLECT_STRING:
2536 const char **str_ptr;
2538 str_ptr = va_arg (ap, const char **);
2540 if (str_ptr != NULL)
2541 *str_ptr = value;
2543 break;
2545 case G_MARKUP_COLLECT_STRDUP:
2547 char **str_ptr;
2549 str_ptr = va_arg (ap, char **);
2551 if (str_ptr != NULL)
2552 *str_ptr = g_strdup (value);
2554 break;
2556 case G_MARKUP_COLLECT_BOOLEAN:
2557 case G_MARKUP_COLLECT_TRISTATE:
2558 if (value == NULL)
2560 gboolean *bool_ptr;
2562 bool_ptr = va_arg (ap, gboolean *);
2564 if (bool_ptr != NULL)
2566 if (type == G_MARKUP_COLLECT_TRISTATE)
2567 /* constructivists rejoice!
2568 * neither false nor true...
2570 *bool_ptr = -1;
2572 else /* G_MARKUP_COLLECT_BOOLEAN */
2573 *bool_ptr = FALSE;
2576 else
2578 if (!g_markup_parse_boolean (value, va_arg (ap, gboolean *)))
2580 g_set_error (error, G_MARKUP_ERROR,
2581 G_MARKUP_ERROR_INVALID_CONTENT,
2582 "element '%s', attribute '%s', value '%s' "
2583 "cannot be parsed as a boolean value",
2584 element_name, attr, value);
2586 va_end (ap);
2587 goto failure;
2591 break;
2593 default:
2594 g_assert_not_reached ();
2597 type = va_arg (ap, GMarkupCollectType);
2598 attr = va_arg (ap, const char *);
2599 written++;
2601 va_end (ap);
2603 /* ensure we collected all the arguments */
2604 for (i = 0; attribute_names[i]; i++)
2605 if ((collected & (G_GUINT64_CONSTANT(1) << i)) == 0)
2607 /* attribute not collected: could be caused by two things.
2609 * 1) it doesn't exist in our list of attributes
2610 * 2) it existed but was matched by a duplicate attribute earlier
2612 * find out.
2614 int j;
2616 for (j = 0; j < i; j++)
2617 if (strcmp (attribute_names[i], attribute_names[j]) == 0)
2618 /* duplicate! */
2619 break;
2621 /* j is now the first occurance of attribute_names[i] */
2622 if (i == j)
2623 g_set_error (error, G_MARKUP_ERROR,
2624 G_MARKUP_ERROR_UNKNOWN_ATTRIBUTE,
2625 "attribute '%s' invalid for element '%s'",
2626 attribute_names[i], element_name);
2627 else
2628 g_set_error (error, G_MARKUP_ERROR,
2629 G_MARKUP_ERROR_INVALID_CONTENT,
2630 "attribute '%s' given multiple times for element '%s'",
2631 attribute_names[i], element_name);
2633 goto failure;
2636 return TRUE;
2638 failure:
2639 /* replay the above to free allocations */
2640 type = first_type;
2641 attr = first_attr;
2643 va_start (ap, first_attr);
2644 while (type != G_MARKUP_COLLECT_INVALID)
2646 gpointer ptr;
2648 ptr = va_arg (ap, gpointer);
2650 if (ptr == NULL)
2651 continue;
2653 switch (type & (G_MARKUP_COLLECT_OPTIONAL - 1))
2655 case G_MARKUP_COLLECT_STRDUP:
2656 if (written)
2657 g_free (*(char **) ptr);
2659 case G_MARKUP_COLLECT_STRING:
2660 *(char **) ptr = NULL;
2661 break;
2663 case G_MARKUP_COLLECT_BOOLEAN:
2664 *(gboolean *) ptr = FALSE;
2665 break;
2667 case G_MARKUP_COLLECT_TRISTATE:
2668 *(gboolean *) ptr = -1;
2669 break;
2672 type = va_arg (ap, GMarkupCollectType);
2673 attr = va_arg (ap, const char *);
2675 if (written)
2676 written--;
2678 va_end (ap);
2680 return FALSE;
2683 #define __G_MARKUP_C__
2684 #include "galiasdef.c"