iconv: Bail out of the loop when an illegal sequence of bytes occurs.
[elinks/elinks-j605.git] / src / dom / node.h
blob90dc3f73905219134cd649d4e91ca591b5cb914b
1 /** DOM node module
3 * @file dom/node.h
5 * This module defines the various node and node list data structures
6 * and functionality to modify and access them, such as adding a node as
7 * a child to a given node and getting the text string of a node as
8 * defined by the DOM specification.
10 * @par Node hierarchy
12 * DOM documents are represented as a collection of nodes arranged in a
13 * hierarchic structure. At the root is either a #DOM_NODE_DOCUMENT or
14 * #DOM_NODE_DOCUMENT_FRAGMENT node, each of which may have multiple
15 * child nodes. There is a well-defined order that dictates which child
16 * nodes may be descendants of a given type of node. For example, text
17 * and attribute nodes can have no children, while element nodes may
18 * have both attribute and element nodes as children but with each type
19 * in different node lists. The hierarchy is somewhat encoded in the
20 * type specific node data, however, certain node types also define
21 * "custom" node lists for conveniently storing additional "embedded"
22 * data, such as processing instruction nodes having an attribute node
23 * list for conveniently accessing variable-value pairs given for
24 * XML-specific processing instructions:
26 * @verbatim <?xml version="1.0"?> @endverbatim
28 * @par Node lists
30 * There are two types of list: unordered (the default) and
31 * alphabetically ordered (also called "maps"). Both types of list
32 * store all contained nodes in the index-oriented #dom_node_list data
33 * structure.
35 * When inserting a node into a list, first use either
36 * #get_dom_node_list_index or #get_dom_node_map_index (depending on
37 * whether the list is unordered or ordered respectively) to calculate
38 * the index at which to insert the new node. Then use
39 * #add_to_dom_node_list to insert the node in the list at the given
40 * position. Alternatively (and mostly preferred), simply use
41 * #add_dom_node to have all of the above done automatically plus some
42 * additional checks.
44 * A variety of node list accessors are defined. The node structure does
45 * not define any "next" or "previous" members to get siblings in order to
46 * reduce memory usage (this might have to change --jonas). Instead, use
47 * #get_dom_node_prev and #get_dom_node_next to access siblings. To
48 * lookup the existence of a node in a sorted node list (map) use
49 * #get_dom_node_map_entry. If a specific and unique node subtype should
50 * be found use #get_dom_node_child that given a parent node will find a
51 * child node based on a specific child node type and subtype. Finally,
52 * lists can be iterated in forward and reverse order using
53 * #foreach_dom_node and #foreachback_dom_node.
56 #ifndef EL_DOM_NODE_H
57 #define EL_DOM_NODE_H
59 #include "dom/string.h"
61 struct dom_node_list;
62 struct dom_document;
64 /** DOM node types */
65 enum dom_node_type {
66 DOM_NODE_UNKNOWN = 0, /**< Node type used internally. */
68 DOM_NODE_ELEMENT = 1, /**< Element node */
69 DOM_NODE_ATTRIBUTE = 2, /**< Attribute node */
70 DOM_NODE_TEXT = 3, /**< Text node */
71 DOM_NODE_CDATA_SECTION = 4, /**< CData section node */
72 DOM_NODE_ENTITY_REFERENCE = 5, /**< Entity reference node */
73 DOM_NODE_ENTITY = 6, /**< Entity node */
74 DOM_NODE_PROCESSING_INSTRUCTION = 7, /**< Processing instruction node */
75 DOM_NODE_COMMENT = 8, /**< Comment node */
76 DOM_NODE_DOCUMENT = 9, /**< Document root node */
77 DOM_NODE_DOCUMENT_TYPE = 10, /**< Document type (DTD) node */
78 DOM_NODE_DOCUMENT_FRAGMENT = 11, /**< Document fragment node */
79 DOM_NODE_NOTATION = 12, /**< Notation node */
81 DOM_NODES /**< The number of DOM nodes */
84 /* Following is the node specific data structures. They may contain no
85 * more than 4 pointers or something equivalent. */
87 /* The document URI is stored in the string / length members. */
88 struct dom_document_node {
89 /* The document. */
90 struct dom_document *document;
92 /* The child nodes. May be NULL. Ordered like they were inserted. */
93 /* FIXME: Should be just one element (root) node reference. */
94 struct dom_node_list *children;
97 struct dom_id {
98 struct dom_string public_id;
99 struct dom_string system_id;
102 struct dom_doctype_subset_info {
103 struct dom_string internal;
104 struct dom_id external;
/* Type specific data for #DOM_NODE_DOCUMENT_TYPE nodes. */
struct dom_document_type_node {
	/* These are really maps and should be sorted alphabetically. */
	struct dom_node_list *entities;
	struct dom_node_list *notations;

	/* The string/length members of dom_node hold the name of the document
	 * type "<!DOCTYPE {name} ...>". This holds the ids for the external
	 * subset and the string of the internal subset. */
	struct dom_doctype_subset_info *subset;
};
118 /* Element nodes are indexed nodes stored in node lists of either
119 * other child nodes or the root node. */
120 struct dom_element_node {
121 /* The child nodes. May be NULL. Ordered like they were inserted. */
122 struct dom_node_list *children;
124 /* Only element nodes can have attributes and element nodes can only be
125 * child nodes so the map is put here.
127 * The @map may be NULL if there are none. The @map nodes are sorted
128 * alphabetically according to the attributes name so it has fast
129 * lookup. */
130 struct dom_node_list *map;
132 /* For <xsl:stylesheet ...> elements this holds the offset of
133 * 'stylesheet' */
134 uint16_t namespace_offset;
136 /* Special implementation dependent type specifier for example
137 * containing an enum value representing the element to reduce string
138 * comparing and only do one fast find mapping. */
139 uint16_t type;
142 /* Attribute nodes are named nodes stored in a node map of an element node. */
143 struct dom_attribute_node {
144 /* The string that hold the attribute value. The @string / @length
145 * members of {struct dom_node} holds the name that identifies the node
146 * in the map. */
147 struct dom_string value;
149 /* For xml:lang="en" attributes this holds the offset of 'lang' */
150 uint16_t namespace_offset;
152 /* Special implementation dependent type specifier. For HTML it (will)
153 * contain an enum value representing the attribute HTML_CLASS, HTML_ID etc.
154 * to reduce string comparing and only do one fast find mapping. */
155 uint16_t type;
157 /* The attribute value is delimited by quotes. Can be NUL, ' or ". */
158 unsigned char quoted;
160 /* Was the attribute specified in the DTD as a default attribute or was
161 * it added from the document source. */
162 unsigned int specified:1;
164 /* Has the node->string been converted to internal charset. */
165 unsigned int converted:1;
167 /* Is the attribute a unique identifier. */
168 unsigned int id:1;
170 /* The attribute value references some other resource */
171 unsigned int reference:1;
174 struct dom_text_node {
175 /* The number of newlines the text string contains */
176 unsigned int newlines;
178 /* We will need to add text nodes even if they contain only whitespace.
179 * In order to quickly identify such nodes this member is used. */
180 unsigned int only_space:1;
182 /* Has the node->string been converted to internal charset. */
183 unsigned int converted:1;
186 enum dom_proc_instruction_type {
187 DOM_PROC_INSTRUCTION,
189 /* Keep this group sorted */
190 DOM_PROC_INSTRUCTION_XML, /* XML header */
191 DOM_PROC_INSTRUCTION_XML_STYLESHEET, /* XML stylesheet link */
193 DOM_PROC_INSTRUCTION_TYPES
196 struct dom_proc_instruction_node {
197 /* The target of the processing instruction (xml for '<?xml ... ?>')
198 * is in the @string / @length members. */
199 /* This holds the value to be processed */
200 struct dom_string instruction;
202 /* For fast checking of the target type */
203 uint16_t type; /* enum dom_proc_instruction_type */
205 /* For some processing instructions like xml the instructions contain
206 * attributes and those attribute can be collected in this @map. */
207 struct dom_node_list *map;
210 union dom_node_data {
211 struct dom_document_node document;
212 struct dom_document_type_node document_type;
213 struct dom_element_node element;
214 struct dom_attribute_node attribute;
215 struct dom_text_node text;
216 struct dom_id notation;
217 /* For entities string/length hold the notation name */
218 struct dom_id entity;
219 struct dom_proc_instruction_node proc_instruction;
221 /* Node types without a union member yet (mostly because it hasn't
222 * been necessary):
224 * DOM_NODE_CDATA_SECTION: Use dom_text_node?
225 * DOM_NODE_DOCUMENT_FRAGMENT: struct dom_node_list children;
226 * DOM_NODE_ENTITY_REFERENCE: unicode_val_T
227 * DOM_NODE_COMMENT
231 /** DOM node
233 * The node data structure is an abstract container that can be used to
234 * represent the hierarchic structure of a document, such as relation
235 * between elements, attributes, etc.
237 * @note This structure is size critical so keep ordering to make
238 * it easier to pack and avoid unneeded members.
240 struct dom_node {
241 /** The type of the node. Holds a #dom_node_type enum value. */
242 uint16_t type; /* -> enum dom_node_type */
244 /** Was the node string allocated? */
245 unsigned int allocated:1;
247 /** Type specific node string. Can contain either stuff like
248 * element name or for attributes the attribute name. */
249 struct dom_string string;
251 /** The parent node. The parent node is NULL for the root node. */
252 struct dom_node *parent;
254 /** Type specific node data. */
255 union dom_node_data data;
258 /** DOM node list
260 * A node list can be used for storing indexed nodes. If a node list
261 * should be sorted alphabetically use the #get_dom_node_map_index
262 * function to find the index of new nodes before inserting them. */
263 struct dom_node_list {
264 size_t size;
265 struct dom_node *entries[1];
268 #define foreach_dom_node(list, node, i) \
269 for ((i) = 0; (i) < (list)->size; (i)++) \
270 if (((node) = (list)->entries[(i)]))
/** Iterate over the non-NULL entries of @a list in reverse order.
 *
 * Visits every index from <code>size - 1</code> down to and including 0,
 * assigning each non-NULL entry to @a node. Inside the body @a i holds
 * the index of @a node.
 *
 * The index is decremented as part of the loop test, so the macro works
 * for both signed and unsigned index variables and does nothing for an
 * empty list. The value of @a i after the loop is unspecified.
 *
 * @note @a list is evaluated more than once and must not be NULL. */
#define foreachback_dom_node(list, node, i)		\
	for ((i) = (list)->size; (i)-- > 0; )		\
		if (((node) = (list)->entries[(i)]))
276 #define is_dom_node_list_member(list, member) \
277 ((list) && 0 <= (member) && (member) < (list)->size)
279 /* Adds @node to the list pointed to by @list_ptr at the given @position. If
280 * @position is -1 the node is added at the end. */
281 struct dom_node_list *
282 add_to_dom_node_list(struct dom_node_list **list_ptr,
283 struct dom_node *node, int position);
285 void done_dom_node_list(struct dom_node_list *list);
287 /* Returns the position or index where the @node has been inserted into the
288 * 'default' list of the @parent node. (Default means use get_dom_node_list()
289 * to acquire the list to search in.) Returns -1 if the node is not found. */
290 int get_dom_node_list_index(struct dom_node *parent, struct dom_node *node);
292 /* Returns the position or index where the @node should be inserted into the
293 * node @list in order for the list to stay alphabetically sorted. Assumes that
294 * @list is already sorted properly. */
295 int get_dom_node_map_index(struct dom_node_list *list, struct dom_node *node);
297 /* Returns the previous sibling to the node. */
298 struct dom_node *get_dom_node_prev(struct dom_node *node);
300 /* Returns the next sibling to the node. */
301 struct dom_node *get_dom_node_next(struct dom_node *node);
303 /* Returns the child of @node matching @child_type and @child_subtype, or NULL. */
304 struct dom_node *
305 get_dom_node_child(struct dom_node *node, enum dom_node_type child_type,
306 int16_t child_subtype);
308 /* Looks up the @node_map for a node matching the requested type and name.
309 * The @subtype may be 0, indicating an unknown subtype, in which case only the
310 * name is tested; otherwise it indicates either the element or attribute private
311 * subtype. */
312 struct dom_node *
313 get_dom_node_map_entry(struct dom_node_list *node_map,
314 enum dom_node_type type, uint16_t subtype,
315 struct dom_string *name);
317 /* Removes the node and all its children and free()s itself.
318 * A dom_stack_callback_T must not use this to free the node
319 * it gets as a parameter. */
320 void done_dom_node(struct dom_node *node);
322 #ifndef DEBUG_MEMLEAK
324 /* The allocated argument is used as the value of node->allocated if >= 0.
325 * Use -1 to default node->allocated to the value of parent->allocated. */
327 struct dom_node *
328 init_dom_node_at(struct dom_node *parent, enum dom_node_type type,
329 struct dom_string *string, int allocated);
331 #define init_dom_node(type, string, allocated) \
332 init_dom_node_at(NULL, type, string, allocated)
334 #define add_dom_node(parent, type, string) \
335 init_dom_node_at(parent, type, string, -1)
337 #else
338 struct dom_node *
339 init_dom_node_at(unsigned char *file, int line,
340 struct dom_node *parent, enum dom_node_type type,
341 struct dom_string *string, int allocated);
343 #define init_dom_node(type, string, allocated) \
344 init_dom_node_at(__FILE__, __LINE__, NULL, type, string, allocated)
346 #define add_dom_node(parent, type, string) \
347 init_dom_node_at(__FILE__, __LINE__, parent, type, string, -1)
349 #endif /* DEBUG_MEMLEAK */
351 #define add_dom_element(parent, string) \
352 add_dom_node(parent, DOM_NODE_ELEMENT, string)
354 static inline struct dom_node *
355 add_dom_attribute(struct dom_node *parent, struct dom_string *name,
356 struct dom_string *value)
358 struct dom_node *node = add_dom_node(parent, DOM_NODE_ATTRIBUTE, name);
360 if (node && value) {
361 struct dom_string *str = &node->data.attribute.value;
363 if (node->allocated) {
364 if (!init_dom_string(str, value->string, value->length)) {
365 done_dom_node(node);
366 return NULL;
368 } else {
369 copy_dom_string(str, value);
373 return node;
376 static inline struct dom_node *
377 add_dom_proc_instruction(struct dom_node *parent, struct dom_string *string,
378 struct dom_string *instruction)
380 struct dom_node *node = add_dom_node(parent, DOM_NODE_PROCESSING_INSTRUCTION, string);
382 if (node && instruction) {
383 struct dom_string *str = &node->data.proc_instruction.instruction;
385 if (node->allocated) {
386 if (!init_dom_string(str, instruction->string, instruction->length)) {
387 done_dom_node(node);
388 return NULL;
390 } else {
391 copy_dom_string(str, instruction);
395 return node;
398 /* Compare two nodes returning non-zero if they differ. */
399 int dom_node_casecmp(struct dom_node *node1, struct dom_node *node2);
401 /* Returns the name of the node in an allocated string. */
402 struct dom_string *get_dom_node_name(struct dom_node *node);
404 /* Returns the value of the node or NULL if no value is defined for the node
405 * type. */
406 struct dom_string *get_dom_node_value(struct dom_node *node);
408 /* Returns the name used for identifying the node type. */
409 struct dom_string *get_dom_node_type_name(enum dom_node_type type);
411 /** Based on the type of the @a parent and the node @a type return a
412 * proper list or NULL. This is useful when adding a node to a parent
413 * node.
415 * With a <code>struct dom_node_list **list</code> returned by this
416 * function, there are four possibilities:
418 * - <code>list == NULL</code>. This means @a parent does not support
419 * child nodes of the given @a type.
421 * - <code>*list == NULL</code>. This means @a parent does not yet
422 * have any child nodes of the given @a type and so no list has been
423 * allocated for them. Callers should treat the lack of a list in
424 * the same way as an empty list.
426 * - <code>(*list)->size == 0</code>. This is an empty list. It is
427 * unspecified whether the DOM code keeps such lists; it could
428 * instead change them back to NULL.
430 * - <code>(*list)->size != 0</code>. This is a nonempty list.
431 * However, the nodes in it might not actually be of the given
432 * @a type because some lists are used for multiple types. */
433 static inline struct dom_node_list **
434 get_dom_node_list_by_type(struct dom_node *parent, enum dom_node_type type)
436 switch (parent->type) {
437 case DOM_NODE_DOCUMENT:
438 return &parent->data.document.children;
440 case DOM_NODE_ELEMENT:
441 switch (type) {
442 case DOM_NODE_ATTRIBUTE:
443 return &parent->data.element.map;
445 default:
446 return &parent->data.element.children;
449 case DOM_NODE_DOCUMENT_TYPE:
450 switch (type) {
451 case DOM_NODE_ENTITY:
452 return &parent->data.document_type.entities;
454 case DOM_NODE_NOTATION:
455 return &parent->data.document_type.notations;
457 default:
458 return NULL;
461 case DOM_NODE_PROCESSING_INSTRUCTION:
462 switch (type) {
463 case DOM_NODE_ATTRIBUTE:
464 return &parent->data.proc_instruction.map;
466 default:
467 return NULL;
470 default:
471 return NULL;
475 #define get_dom_node_list(parent, node) \
476 get_dom_node_list_by_type(parent, (node)->type)
478 #endif