4 * Parser interface for DOM-based parser (libxml) rather than
5 * stream-based SAX-type parser
9 #include "executor/spi.h"
12 #include "lib/stringinfo.h"
13 #include "miscadmin.h"
14 #include "utils/builtins.h"
18 #include <libxml/xpath.h>
19 #include <libxml/tree.h>
20 #include <libxml/xmlmemory.h>
21 #include <libxml/xmlerror.h>
22 #include <libxml/parserInternals.h>
29 static void *pgxml_palloc(size_t size
);
30 static void *pgxml_repalloc(void *ptr
, size_t size
);
31 static void pgxml_pfree(void *ptr
);
32 static char *pgxml_pstrdup(const char *string
);
33 static void pgxml_errorHandler(void *ctxt
, const char *msg
,...);
35 void elog_error(int level
, char *explain
, int force
);
36 void pgxml_parser_init(void);
38 static xmlChar
*pgxmlNodeSetToText(xmlNodeSetPtr nodeset
,
39 xmlChar
*toptagname
, xmlChar
*septagname
,
42 text
*pgxml_result_to_text(xmlXPathObjectPtr res
, xmlChar
*toptag
,
43 xmlChar
*septag
, xmlChar
*plainsep
);
45 xmlChar
*pgxml_texttoxmlchar(text
*textstring
);
47 static xmlXPathObjectPtr
pgxml_xpath(text
*document
, xmlChar
*xpath
);
50 Datum
xml_is_well_formed(PG_FUNCTION_ARGS
);
51 Datum
xml_encode_special_chars(PG_FUNCTION_ARGS
);
52 Datum
xpath_nodeset(PG_FUNCTION_ARGS
);
53 Datum
xpath_string(PG_FUNCTION_ARGS
);
54 Datum
xpath_number(PG_FUNCTION_ARGS
);
55 Datum
xpath_bool(PG_FUNCTION_ARGS
);
56 Datum
xpath_list(PG_FUNCTION_ARGS
);
57 Datum
xpath_table(PG_FUNCTION_ARGS
);
59 /* Global variables */
60 char *errbuf
; /* per line error buffer */
61 char *pgxml_errorMsg
= NULL
; /* overall error message */
63 #define ERRBUF_SIZE 200
65 /* memory handling passthrough functions (e.g. palloc, pstrdup are
66 currently macros, and the others might become so...) */
/*
 * Allocation hook for libxml2 (installed through xmlMemSetup): obtains
 * "size" bytes from palloc() and hands the chunk straight back.
 */
static void *
pgxml_palloc(size_t size)
{
    void       *chunk = palloc(size);

    return chunk;
}
/*
 * Reallocation hook for libxml2 (installed through xmlMemSetup): resizes
 * "ptr" to "size" bytes via repalloc() and returns the resulting pointer.
 */
static void *
pgxml_repalloc(void *ptr, size_t size)
{
    void       *resized = repalloc(ptr, size);

    return resized;
}
/*
 * Release hook for libxml2 (installed through xmlMemSetup): gives "ptr"
 * back through pfree().
 */
static void
pgxml_pfree(void *ptr)
{
    pfree(ptr);
}
/*
 * String-duplication hook for libxml2 (installed through xmlMemSetup):
 * returns a pstrdup() copy of the NUL-terminated "string".
 */
static char *
pgxml_pstrdup(const char *string)
{
    char       *copy = pstrdup(string);

    return copy;
}
95 /* The error handling function. This formats an error message and sets
96 * a flag - an ereport will be issued prior to return
100 pgxml_errorHandler(void *ctxt
, const char *msg
,...)
105 vsnprintf(errbuf
, ERRBUF_SIZE
, msg
, args
);
107 /* Now copy the argument across */
108 if (pgxml_errorMsg
== NULL
)
109 pgxml_errorMsg
= pstrdup(errbuf
);
112 int32 xsize
= strlen(pgxml_errorMsg
);
114 pgxml_errorMsg
= repalloc(pgxml_errorMsg
,
115 (size_t) (xsize
+ strlen(errbuf
) + 1));
116 strncpy(&pgxml_errorMsg
[xsize
- 1], errbuf
, strlen(errbuf
));
117 pgxml_errorMsg
[xsize
+ strlen(errbuf
) - 1] = '\0';
120 memset(errbuf
, 0, ERRBUF_SIZE
);
123 /* This function reports the current message at the level specified */
125 elog_error(int level
, char *explain
, int force
)
127 if (force
|| (pgxml_errorMsg
!= NULL
))
129 if (pgxml_errorMsg
== NULL
)
131 ereport(level
, (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION
),
136 ereport(level
, (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION
),
137 errmsg("%s:%s", explain
, pgxml_errorMsg
)));
138 pfree(pgxml_errorMsg
);
147 * This code could also set parser settings from user-supplied info.
148 * Quite how these settings are made is another matter :)
151 xmlMemSetup(pgxml_pfree
, pgxml_palloc
, pgxml_repalloc
, pgxml_pstrdup
);
154 xmlSetGenericErrorFunc(NULL
, pgxml_errorHandler
);
156 xmlSubstituteEntitiesDefault(1);
157 xmlLoadExtDtdDefaultValue
= 1;
159 pgxml_errorMsg
= NULL
;
161 errbuf
= palloc(200);
162 memset(errbuf
, 0, 200);
167 /* Returns true if document is well-formed */
169 PG_FUNCTION_INFO_V1(xml_is_well_formed
);
172 xml_is_well_formed(PG_FUNCTION_ARGS
)
174 /* called as xml_is_well_formed(document) */
176 text
*t
= PG_GETARG_TEXT_P(0); /* document buffer */
177 int32 docsize
= VARSIZE(t
) - VARHDRSZ
;
181 doctree
= xmlParseMemory((char *) VARDATA(t
), docsize
);
185 PG_RETURN_BOOL(false); /* i.e. not well-formed */
189 PG_RETURN_BOOL(true);
193 /* Encodes special characters (<, >, &, " and \r) as XML entities */
195 PG_FUNCTION_INFO_V1(xml_encode_special_chars
);
198 xml_encode_special_chars(PG_FUNCTION_ARGS
)
200 text
*tin
= PG_GETARG_TEXT_P(0);
205 ts
= pgxml_texttoxmlchar(tin
);
207 tt
= xmlEncodeSpecialChars(NULL
, ts
);
211 tout
= cstring_to_text((char *) tt
);
215 PG_RETURN_TEXT_P(tout
);
220 pgxmlNodeSetToText(xmlNodeSetPtr nodeset
,
225 /* Function translates a nodeset into a text representation */
228 * iterates over each node in the set and calls xmlNodeDump to write it to
229 * an xmlBuffer -from which an xmlChar * string is returned.
232 /* each representation is surrounded by <tagname> ... </tagname> */
235 * plainsep is an ordinary (not tag) separator - if used, then nodes are
236 * cast to string as output method
244 buf
= xmlBufferCreate();
246 if ((toptagname
!= NULL
) && (xmlStrlen(toptagname
) > 0))
248 xmlBufferWriteChar(buf
, "<");
249 xmlBufferWriteCHAR(buf
, toptagname
);
250 xmlBufferWriteChar(buf
, ">");
254 for (i
= 0; i
< nodeset
->nodeNr
; i
++)
257 if (plainsep
!= NULL
)
259 xmlBufferWriteCHAR(buf
,
260 xmlXPathCastNodeToString(nodeset
->nodeTab
[i
]));
262 /* If this isn't the last entry, write the plain sep. */
263 if (i
< (nodeset
->nodeNr
) - 1)
264 xmlBufferWriteChar(buf
, (char *) plainsep
);
270 if ((septagname
!= NULL
) && (xmlStrlen(septagname
) > 0))
272 xmlBufferWriteChar(buf
, "<");
273 xmlBufferWriteCHAR(buf
, septagname
);
274 xmlBufferWriteChar(buf
, ">");
277 nodeset
->nodeTab
[i
]->doc
,
281 if ((septagname
!= NULL
) && (xmlStrlen(septagname
) > 0))
283 xmlBufferWriteChar(buf
, "</");
284 xmlBufferWriteCHAR(buf
, septagname
);
285 xmlBufferWriteChar(buf
, ">");
291 if ((toptagname
!= NULL
) && (xmlStrlen(toptagname
) > 0))
293 xmlBufferWriteChar(buf
, "</");
294 xmlBufferWriteCHAR(buf
, toptagname
);
295 xmlBufferWriteChar(buf
, ">");
297 result
= xmlStrdup(buf
->content
);
303 /* Translate a PostgreSQL "varlena" -i.e. a variable length parameter
304 * into the libxml2 representation
308 pgxml_texttoxmlchar(text
*textstring
)
310 return (xmlChar
*) text_to_cstring(textstring
);
313 /* Public visible XPath functions */
315 /* This is a "raw" xpath function. Check that it returns child elements
319 PG_FUNCTION_INFO_V1(xpath_nodeset
);
322 xpath_nodeset(PG_FUNCTION_ARGS
)
332 /* PG_GETARG_TEXT_P(0) is document buffer */
333 xpathsupp
= PG_GETARG_TEXT_P(1); /* XPath expression */
335 toptag
= pgxml_texttoxmlchar(PG_GETARG_TEXT_P(2));
336 septag
= pgxml_texttoxmlchar(PG_GETARG_TEXT_P(3));
338 pathsize
= VARSIZE(xpathsupp
) - VARHDRSZ
;
340 xpath
= pgxml_texttoxmlchar(xpathsupp
);
342 xpres
= pgxml_result_to_text(
343 pgxml_xpath(PG_GETARG_TEXT_P(0), xpath
),
344 toptag
, septag
, NULL
);
346 /* xmlCleanupParser(); done by result_to_text routine */
351 PG_RETURN_TEXT_P(xpres
);
354 /* The following function is almost identical, but returns the elements in */
357 PG_FUNCTION_INFO_V1(xpath_list
);
360 xpath_list(PG_FUNCTION_ARGS
)
369 /* PG_GETARG_TEXT_P(0) is document buffer */
370 xpathsupp
= PG_GETARG_TEXT_P(1); /* XPath expression */
372 plainsep
= pgxml_texttoxmlchar(PG_GETARG_TEXT_P(2));
374 pathsize
= VARSIZE(xpathsupp
) - VARHDRSZ
;
376 xpath
= pgxml_texttoxmlchar(xpathsupp
);
378 xpres
= pgxml_result_to_text(
379 pgxml_xpath(PG_GETARG_TEXT_P(0), xpath
),
380 NULL
, NULL
, plainsep
);
382 /* xmlCleanupParser(); done by result_to_text routine */
387 PG_RETURN_TEXT_P(xpres
);
391 PG_FUNCTION_INFO_V1(xpath_string
);
394 xpath_string(PG_FUNCTION_ARGS
)
402 /* PG_GETARG_TEXT_P(0) is document buffer */
403 xpathsupp
= PG_GETARG_TEXT_P(1); /* XPath expression */
405 pathsize
= VARSIZE(xpathsupp
) - VARHDRSZ
;
408 * We encapsulate the supplied path with "string()" = 8 chars + 1 for NUL
411 /* We could try casting to string using the libxml function? */
413 xpath
= (xmlChar
*) palloc(pathsize
+ 9);
414 memcpy((char *) (xpath
+ 7), VARDATA(xpathsupp
), pathsize
);
415 strncpy((char *) xpath
, "string(", 7);
416 xpath
[pathsize
+ 7] = ')';
417 xpath
[pathsize
+ 8] = '\0';
419 xpres
= pgxml_result_to_text(
420 pgxml_xpath(PG_GETARG_TEXT_P(0), xpath
),
428 PG_RETURN_TEXT_P(xpres
);
432 PG_FUNCTION_INFO_V1(xpath_number
);
435 xpath_number(PG_FUNCTION_ARGS
)
444 xmlXPathObjectPtr res
;
446 /* PG_GETARG_TEXT_P(0) is document buffer */
447 xpathsupp
= PG_GETARG_TEXT_P(1); /* XPath expression */
449 pathsize
= VARSIZE(xpathsupp
) - VARHDRSZ
;
451 xpath
= pgxml_texttoxmlchar(xpathsupp
);
453 res
= pgxml_xpath(PG_GETARG_TEXT_P(0), xpath
);
462 fRes
= xmlXPathCastToNumber(res
);
464 if (xmlXPathIsNaN(fRes
))
467 PG_RETURN_FLOAT4(fRes
);
472 PG_FUNCTION_INFO_V1(xpath_bool
);
475 xpath_bool(PG_FUNCTION_ARGS
)
484 xmlXPathObjectPtr res
;
486 /* PG_GETARG_TEXT_P(0) is document buffer */
487 xpathsupp
= PG_GETARG_TEXT_P(1); /* XPath expression */
489 pathsize
= VARSIZE(xpathsupp
) - VARHDRSZ
;
491 xpath
= pgxml_texttoxmlchar(xpathsupp
);
493 res
= pgxml_xpath(PG_GETARG_TEXT_P(0), xpath
);
499 PG_RETURN_BOOL(false);
502 bRes
= xmlXPathCastToBoolean(res
);
504 PG_RETURN_BOOL(bRes
);
510 /* Core function to evaluate XPath query */
513 pgxml_xpath(text
*document
, xmlChar
*xpath
)
517 xmlXPathContextPtr ctxt
;
518 xmlXPathObjectPtr res
;
520 xmlXPathCompExprPtr comppath
;
525 docsize
= VARSIZE(document
) - VARHDRSZ
;
529 doctree
= xmlParseMemory((char *) VARDATA(document
), docsize
);
531 { /* not well-formed */
535 ctxt
= xmlXPathNewContext(doctree
);
536 ctxt
->node
= xmlDocGetRootElement(doctree
);
539 /* compile the path */
540 comppath
= xmlXPathCompile(xpath
);
541 if (comppath
== NULL
)
545 elog_error(ERROR
, "XPath Syntax Error", 1);
550 /* Now evaluate the path expression. */
551 res
= xmlXPathCompiledEval(comppath
, ctxt
);
552 xmlXPathFreeCompExpr(comppath
);
556 xmlXPathFreeContext(ctxt
);
557 /* xmlCleanupParser(); */
562 /* xmlFreeDoc(doctree); */
568 pgxml_result_to_text(xmlXPathObjectPtr res
,
584 xpresstr
= pgxmlNodeSetToText(res
->nodesetval
,
590 xpresstr
= xmlStrdup(res
->stringval
);
594 elog(NOTICE
, "unsupported XQuery result: %d", res
->type
);
595 xpresstr
= xmlStrdup((const xmlChar
*) "<unsupported/>");
599 /* Now convert this result back to text */
600 xpres
= cstring_to_text((char *) xpresstr
);
602 /* Free various storage */
604 /* xmlFreeDoc(doctree); -- will die at end of tuple anyway */
608 elog_error(ERROR
, "XPath error", 0);
614 /* xpath_table is a table function. It needs some tidying (as do the
615 * other functions here!
618 PG_FUNCTION_INFO_V1(xpath_table
);
621 xpath_table(PG_FUNCTION_ARGS
)
623 /* SPI (input tuple) support */
624 SPITupleTable
*tuptable
;
626 TupleDesc spi_tupdesc
;
628 /* Output tuple (tuplestore) support */
629 Tuplestorestate
*tupstore
= NULL
;
630 TupleDesc ret_tupdesc
;
633 ReturnSetInfo
*rsinfo
= (ReturnSetInfo
*) fcinfo
->resultinfo
;
634 AttInMetadata
*attinmeta
;
635 MemoryContext per_query_ctx
;
636 MemoryContext oldcontext
;
638 /* Function parameters */
639 char *pkeyfield
= text_to_cstring(PG_GETARG_TEXT_PP(0));
640 char *xmlfield
= text_to_cstring(PG_GETARG_TEXT_PP(1));
641 char *relname
= text_to_cstring(PG_GETARG_TEXT_PP(2));
642 char *xpathset
= text_to_cstring(PG_GETARG_TEXT_PP(3));
643 char *condition
= text_to_cstring(PG_GETARG_TEXT_PP(4));
648 const char *pathsep
= "|";
655 int rownr
; /* For issuing multiple rows from one original
657 int had_values
; /* To determine end of nodeset results */
659 StringInfoData query_buf
;
661 /* We only have a valid tuple description in table function mode */
662 if (rsinfo
== NULL
|| !IsA(rsinfo
, ReturnSetInfo
))
664 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED
),
665 errmsg("set-valued function called in context that cannot accept a set")));
666 if (rsinfo
->expectedDesc
== NULL
)
668 (errcode(ERRCODE_SYNTAX_ERROR
),
669 errmsg("xpath_table must be called as a table function")));
672 * We want to materialise because it means that we don't have to carry
673 * libxml2 parser state between invocations of this function
675 if (!(rsinfo
->allowedModes
& SFRM_Materialize
))
677 (errcode(ERRCODE_SYNTAX_ERROR
),
678 errmsg("xpath_table requires Materialize mode, but it is not "
679 "allowed in this context")));
682 * The tuplestore must exist in a higher context than this function call
683 * (per_query_ctx is used)
686 per_query_ctx
= rsinfo
->econtext
->ecxt_per_query_memory
;
687 oldcontext
= MemoryContextSwitchTo(per_query_ctx
);
690 * Create the tuplestore - work_mem is the max in-memory size before a
691 * file is created on disk to hold it.
694 tuplestore_begin_heap(rsinfo
->allowedModes
& SFRM_Materialize_Random
,
697 MemoryContextSwitchTo(oldcontext
);
699 /* get the requested return tuple description */
700 ret_tupdesc
= CreateTupleDescCopy(rsinfo
->expectedDesc
);
703 * At the moment we assume that the returned attributes make sense for the
704 * XPath specified (i.e. we trust the caller). It's not fatal if they get
705 * it wrong - the input function for the column type will raise an error
706 * if the path result can't be converted into the correct binary
710 attinmeta
= TupleDescGetAttInMetadata(ret_tupdesc
);
712 /* Set return mode and allocate value space. */
713 rsinfo
->returnMode
= SFRM_Materialize
;
714 rsinfo
->setDesc
= ret_tupdesc
;
716 values
= (char **) palloc(ret_tupdesc
->natts
* sizeof(char *));
718 xpaths
= (xmlChar
**) palloc(ret_tupdesc
->natts
* sizeof(xmlChar
*));
720 /* Split XPaths. xpathset is a writable CString. */
722 /* Note that we stop splitting once we've done all needed for tupdesc */
728 xpaths
[numpaths
] = (xmlChar
*) pos
;
729 pos
= strstr(pos
, pathsep
);
736 } while ((pos
!= NULL
) && (numpaths
< (ret_tupdesc
->natts
- 1)));
738 /* Now build query */
739 initStringInfo(&query_buf
);
741 /* Build initial sql statement */
742 appendStringInfo(&query_buf
, "SELECT %s, %s FROM %s WHERE %s",
750 if ((ret
= SPI_connect()) < 0)
751 elog(ERROR
, "xpath_table: SPI_connect returned %d", ret
);
753 if ((ret
= SPI_exec(query_buf
.data
, 0)) != SPI_OK_SELECT
)
754 elog(ERROR
, "xpath_table: SPI execution failed for query %s", query_buf
.data
);
756 proc
= SPI_processed
;
757 /* elog(DEBUG1,"xpath_table: SPI returned %d rows",proc); */
758 tuptable
= SPI_tuptable
;
759 spi_tupdesc
= tuptable
->tupdesc
;
761 /* Switch out of SPI context */
762 MemoryContextSwitchTo(oldcontext
);
765 /* Check that SPI returned correct result. If you put a comma into one of
766 * the function parameters, this will catch it when the SPI query returns
770 if (spi_tupdesc
->natts
!= 2)
772 ereport(ERROR
, (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
773 errmsg("expression returning multiple columns is not valid in parameter list"),
774 errdetail("Expected two columns in SPI result, got %d.", spi_tupdesc
->natts
)));
777 /* Setup the parser. Beware that this must happen in the same context as the
778 * cleanup - which means that any error from here on must do cleanup to
779 * ensure that the entity table doesn't get freed by being out of context.
783 /* For each row i.e. document returned from SPI */
784 for (i
= 0; i
< proc
; i
++)
790 xmlXPathContextPtr ctxt
;
791 xmlXPathObjectPtr res
;
795 xmlXPathCompExprPtr comppath
;
797 /* Extract the row data as C Strings */
798 spi_tuple
= tuptable
->vals
[i
];
799 pkey
= SPI_getvalue(spi_tuple
, spi_tupdesc
, 1);
800 xmldoc
= SPI_getvalue(spi_tuple
, spi_tupdesc
, 2);
803 * Clear the values array, so that not-well-formed documents return
804 * NULL in all columns.
807 /* Note that this also means that spare columns will be NULL. */
808 for (j
= 0; j
< ret_tupdesc
->natts
; j
++)
811 /* Insert primary key */
814 /* Parse the document */
816 doctree
= xmlParseMemory(xmldoc
, strlen(xmldoc
));
817 else /* treat NULL as not well-formed */
822 /* not well-formed, so output all-NULL tuple */
823 ret_tuple
= BuildTupleFromCStrings(attinmeta
, values
);
824 oldcontext
= MemoryContextSwitchTo(per_query_ctx
);
825 tuplestore_puttuple(tupstore
, ret_tuple
);
826 MemoryContextSwitchTo(oldcontext
);
827 heap_freetuple(ret_tuple
);
831 /* New loop here - we have to deal with nodeset results */
836 /* Now evaluate the set of xpaths. */
838 for (j
= 0; j
< numpaths
; j
++)
841 ctxt
= xmlXPathNewContext(doctree
);
842 ctxt
->node
= xmlDocGetRootElement(doctree
);
843 xmlSetGenericErrorFunc(ctxt
, pgxml_errorHandler
);
845 /* compile the path */
846 comppath
= xmlXPathCompile(xpaths
[j
]);
847 if (comppath
== NULL
)
852 elog_error(ERROR
, "XPath Syntax Error", 1);
854 PG_RETURN_NULL(); /* Keep compiler happy */
857 /* Now evaluate the path expression. */
858 res
= xmlXPathCompiledEval(comppath
, ctxt
);
859 xmlXPathFreeCompExpr(comppath
);
866 /* We see if this nodeset has enough nodes */
867 if ((res
->nodesetval
!= NULL
) && (rownr
< res
->nodesetval
->nodeNr
))
870 xmlXPathCastNodeToString(res
->nodesetval
->nodeTab
[rownr
]);
879 resstr
= xmlStrdup(res
->stringval
);
883 elog(NOTICE
, "unsupported XQuery result: %d", res
->type
);
884 resstr
= xmlStrdup((const xmlChar
*) "<unsupported/>");
889 * Insert this into the appropriate column in the
892 values
[j
+ 1] = (char *) resstr
;
894 xmlXPathFreeContext(ctxt
);
896 /* Now add the tuple to the output, if there is one. */
899 ret_tuple
= BuildTupleFromCStrings(attinmeta
, values
);
900 oldcontext
= MemoryContextSwitchTo(per_query_ctx
);
901 tuplestore_puttuple(tupstore
, ret_tuple
);
902 MemoryContextSwitchTo(oldcontext
);
903 heap_freetuple(ret_tuple
);
908 } while (had_values
);
921 /* Needed to flag completeness in 7.3.1. 7.4 defines it as a no-op. */
922 tuplestore_donestoring(tupstore
);
926 rsinfo
->setResult
= tupstore
;
929 * SFRM_Materialize mode expects us to return a NULL Datum. The actual
930 * tuples are in our tuplestore and passed back through rsinfo->setResult.
931 * rsinfo->setDesc is set to the tuple description that we actually used
932 * to build our tuples with, so the caller can verify we did what it was