1 /*==========================================================================
2 * Copyright (c) 2004 University of Massachusetts. All Rights Reserved.
4 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
5 * is subject to the terms of the software license set forth in the LICENSE
6 * file included with this software, and also available at
7 * http://www.lemurproject.org/license.html
9 *==========================================================================
16 // 8 October 2003 - tds
19 #include "indri/XMLReader.hpp"
20 #include "lemur/Exception.hpp"
// _tryFindChar
//
// Takes a target character `ch`, a text `buffer`, and a `start`/`finish`
// index pair.  The visible loop header walks index i from `start` up to,
// but not including, `finish`.
//
// NOTE(review): the declaration of `i`, the loop body and every return
// statement are absent from this listing (the embedded line numbers jump
// from 22 to 25 and from 25 to 36).  Presumably the loop compares
// buffer[i] with `ch`; the value returned when nothing matches cannot be
// confirmed from here -- verify against the complete source.
22 int indri::xml::XMLReader::_tryFindChar( char ch
, const char* buffer
, int start
, int finish
) {
25 for( i
=start
; i
<finish
; i
++ ) {
// _findChar
//
// Forwards `ch`, `buffer`, `start` and `finish` unchanged to _tryFindChar
// and keeps the answer in `result`.  The body also contains a LEMUR_THROW
// of LEMUR_PARSE_ERROR naming the character that was being looked for.
//
// NOTE(review): the condition guarding the throw and the final return are
// absent from this listing (numbering jumps 37 -> 40 -> 45); presumably the
// throw fires when _tryFindChar reports failure -- confirm.
36 int indri::xml::XMLReader::_findChar( char ch
, const char* buffer
, int start
, int finish
) {
37 int result
= _tryFindChar( ch
, buffer
, start
, finish
);
// NOTE(review): the message adds a char to a string literal.  That is only
// string concatenation if LEMUR_THROW prefixes the expression with a
// std::string; otherwise it is pointer arithmetic.  Check the macro.
40 LEMUR_THROW( LEMUR_PARSE_ERROR
, "Was looking for '" + ch
+ "', but couldn't find it." );
45 int indri::xml::XMLReader::_tryFindBeginTag( const char* buffer
, int start
, int finish
) {
46 return _tryFindChar( '<', buffer
, start
, finish
);
// _findBeginTag
//
// Forwards `buffer`, `start` and `finish` to _tryFindBeginTag and keeps the
// answer in `result`.  The body also contains a LEMUR_THROW of
// LEMUR_PARSE_ERROR reporting that the buffer ran out while looking for a
// begin tag.
//
// NOTE(review): the condition guarding the throw and the final return are
// absent from this listing (numbering jumps 50 -> 53 -> 58); presumably the
// throw fires when _tryFindBeginTag reports failure -- confirm.
49 int indri::xml::XMLReader::_findBeginTag( const char* buffer
, int start
, int finish
) {
50 int result
= _tryFindBeginTag( buffer
, start
, finish
);
53 LEMUR_THROW( LEMUR_PARSE_ERROR
, "Ran off the end of a buffer while looking for a begin tag" );
58 int indri::xml::XMLReader::_findEndTag( const char* buffer
, int start
, int finish
) {
59 return _findChar( '>', buffer
, start
, finish
);
// _tryFindText
//
// Walks index i from `start` up to, but not including, `finish` and tests
// each buffer[i] with !isspace(), i.e. it is looking for a character that
// is not whitespace.
//
// NOTE(review): the declaration of `i`, the statement controlled by the
// `if`, and the return statement(s) are absent from this listing
// (numbering jumps 62 -> 65 and 66 -> 73), so what is returned when only
// whitespace is present cannot be confirmed from here.
62 int indri::xml::XMLReader::_tryFindText( const char* buffer
, int start
, int finish
) {
65 for( i
=start
; i
<finish
; i
++ ) {
// NOTE(review): isspace() receives a plain char.  Where char is signed, a
// byte >= 0x80 becomes a negative int, which the C library leaves
// undefined; casting to unsigned char first would avoid that.
66 if( !isspace(buffer
[i
]) )
// _findText
//
// Forwards `buffer`, `start` and `finish` to _tryFindText and keeps the
// answer in `result`.  The body also contains a LEMUR_THROW reporting that
// no text could be found.
//
// NOTE(review): the condition guarding the throw and the final return are
// absent from this listing (numbering jumps 74 -> 76 -> 81), so the value
// that counts as "not found" cannot be confirmed from here.
73 int indri::xml::XMLReader::_findText( const char* buffer
, int start
, int finish
) {
74 int result
= _tryFindText( buffer
, start
, finish
);
// NOTE(review): this throws LEMUR_GENERIC_ERROR, while the other find
// helpers visible in this file throw LEMUR_PARSE_ERROR.
76 LEMUR_THROW( LEMUR_GENERIC_ERROR
, "Was looking for text, but couldn't find any" );
// _findNotName
//
// Walks index i from `start` up to, but not including, `finish`, testing
// each buffer[i].  The visible part of the test requires the character to
// be neither alphabetic (isalpha) nor a digit (isdigit); further terms of
// the condition follow the trailing && but are not shown.  The body also
// contains a LEMUR_THROW of LEMUR_PARSE_ERROR reporting that the end of a
// tag name could not be found.
//
// NOTE(review): lines 88-96 of the original numbering are absent from this
// listing, so the rest of the condition, the action taken on a match and
// the return value cannot be confirmed from here.
81 int indri::xml::XMLReader::_findNotName( const char* buffer
, int start
, int finish
) {
84 for( i
=start
; i
<finish
; i
++ ) {
85 // this isn't unicode-safe, but it should be good for now
// NOTE(review): isalpha()/isdigit() receive a plain char; a negative value
// (byte >= 0x80 with signed char) is undefined for these functions.
86 if( !isalpha(buffer
[i
]) &&
87 !isdigit(buffer
[i
]) &&
97 LEMUR_THROW( LEMUR_PARSE_ERROR
, "Was looking for the end of a tag name, but couldn't find it." );
// _findSpace
//
// Free function (it is not qualified with indri::xml::XMLReader, and it
// takes a non-const char* unlike the member helpers in this file).  Walks
// index i from `start` up to, but not including, `finish`, testing each
// buffer[i] with isspace().  The body also contains a LEMUR_THROW of
// LEMUR_PARSE_ERROR reporting that no space could be found.
//
// NOTE(review): the declaration of `i`, the statement controlled by the
// `if` and the return are absent from this listing (numbering jumps
// 102 -> 105 and 106 -> 111); no caller of this function is visible in
// this file either.
102 int _findSpace( char* buffer
, int start
, int finish
) {
105 for( i
=start
; i
<finish
; i
++ ) {
// NOTE(review): isspace() receives a plain char; a negative value is
// undefined for it, so a cast to unsigned char would be safer.
106 if( isspace(buffer
[i
]) )
111 LEMUR_THROW( LEMUR_PARSE_ERROR
, "Was looking for a space, but couldn't find it." );
// _readTag
//
// Examines the tag that starts at `bufferStart`, classifies it through
// `tagType`, and optionally extracts its name and attributes.
//
//   buffer, bufferStart, bufferEnd - text and index range; the end of the
//       tag is located by calling _findEndTag over this range.
//   tagName    - receives the characters buffer[textBegin .. textEnd).
//       _findClosingTag in this file passes NULL here on some calls.
//   attributes - receives one (name, value) pair per attribute parsed by
//       the loop below.  _findClosingTag in this file passes NULL here.
//   tagType    - assigned TAG_CDATA_TYPE, TAG_CLOSE_TYPE,
//       TAG_OPEN_CLOSE_TYPE or TAG_OPEN_TYPE.
//
// Returns trueEndLocation, which is endLocation + 1 for ordinary tags and
// position + 7 when the CDATA test matches.
//
// NOTE(review): this listing has gaps in its embedded line numbering
// (e.g. 129, 136, 138-139, 155-156, 158-163, 185-188), so null checks on
// the output pointers, adjustments to `position`, and the exact brace
// structure cannot be confirmed from here.
116 int indri::xml::XMLReader::_readTag( const char* buffer
, int bufferStart
, int bufferEnd
, std::string
* tagName
, std::map
<std::string
, std::string
>* attributes
, int* tagType
) {
117 // skip opening whitespace
118 int startLocation
= bufferStart
;
119 int endLocation
= _findEndTag( buffer
, startLocation
, bufferEnd
);
// position is the index right after startLocation; trueEndLocation is the
// index right after whatever _findEndTag returned.
120 int position
= startLocation
+1;
121 int trueEndLocation
= endLocation
+1;
// reject a tag with nothing between position and endLocation
123 if( endLocation
- position
< 1 )
124 LEMUR_THROW( LEMUR_GENERIC_ERROR
, "Found a tag with no body" );
126 // is it a <!CDATA[ tag?
127 // expand this test for completeness
// Only the two characters "!C" are checked.
// NOTE(review): standard XML spells a CDATA section "<![CDATA["; this test
// and the "<!CDATA[" searches in _read use a different spelling -- confirm
// that is the intended dialect.
128 if ( buffer
[position
] == '!' && buffer
[position
+ 1] == 'C' ) {
130 *tagType
= TAG_CDATA_TYPE
;
// "!CDATA[" is 7 characters, so this returns the index just past the '['
131 trueEndLocation
= position
+ 7;
132 return trueEndLocation
;
134 // is it a closing tag? (a '/' at position marks one)
135 if( buffer
[position
] == '/' ) {
137 *tagType
= TAG_CLOSE_TYPE
;
140 if( position
>= endLocation
)
141 LEMUR_THROW( LEMUR_GENERIC_ERROR
, "Found a tag with no body" );
// a '/' immediately before endLocation marks a self-closing tag
143 if( buffer
[endLocation
-1] == '/' ) {
145 *tagType
= TAG_OPEN_CLOSE_TYPE
;
147 } else if( tagType
) {
148 *tagType
= TAG_OPEN_TYPE
;
// name/attribute extraction is skipped when the caller asked for neither
152 if( tagName
|| attributes
) {
// the name runs from the first non-whitespace character (textBegin) to the
// index returned by _findNotName (textEnd)
153 int textBegin
= _findText( buffer
, position
, endLocation
);
154 int textEnd
= _findNotName( buffer
, textBegin
, endLocation
+1 );
157 tagName
->assign( &buffer
[textBegin
], &buffer
[textEnd
] );
164 textBegin
= _findText( buffer
, position
, endLocation
+1 );
165 position
= textBegin
;
// attribute loop: each pass parses one  name = <quote>value<quote>  group
// and then advances position with _tryFindText
167 for( ; position
!= endLocation
; position
= _tryFindText( buffer
, position
, endLocation
) ) {
// textEnd: where the attribute name stops
168 textEnd
= _findNotName( buffer
, position
, endLocation
);
169 int equalsPosition
= _findChar( '=', buffer
, textEnd
, endLocation
);
// quotePosition: first non-whitespace character after the '='
170 int quotePosition
= _findText( buffer
, equalsPosition
+1, endLocation
);
// the closing quote is the next occurrence of the same character found at
// quotePosition, so either quote style works as long as it is balanced
171 int endQuotePosition
= _findChar( buffer
[quotePosition
], buffer
, quotePosition
+1, endLocation
);
173 std::string attributeName
;
174 std::string valueText
;
176 assert( position
<= textEnd
);
177 assert( quotePosition
+1 <= endQuotePosition
);
178 assert( textEnd
< quotePosition
+1 );
180 attributeName
.assign( &buffer
[position
], &buffer
[textEnd
] );
// the value excludes both quote characters
181 valueText
.assign( &buffer
[quotePosition
+1], &buffer
[endQuotePosition
] );
// std::map::insert leaves an existing entry untouched, so the first value
// seen for a repeated attribute name is the one kept
183 attributes
->insert( std::make_pair( attributeName
, valueText
) );
184 position
= endQuotePosition
+1;
189 return trueEndLocation
;
// _findClosingTag
//
// Scans forward from `start` for the tag that closes `openingTagName`,
// reading tags one at a time with _findBeginTag and _readTag.
//
//   buffer, start, finish - text and index range to scan.
//   openingTagName - name of the tag being closed; compared with the name
//       read from a candidate tag and quoted in the rethrown error message.
//   tagsBetween    - set to false near the top of the function.
//       NOTE(review): the place where it becomes true is not visible here.
//
// Exceptions of type lemur::api::Exception raised while scanning are caught
// and passed to LEMUR_RETHROW with a message naming `openingTagName`.
//
// NOTE(review): the declarations of tagName, tagType, openingTags,
// closingTags and match, the `try`, the counter updates and every return
// statement are absent from this listing (gaps in the embedded numbering).
// _read in this file compares this function's result with -1, but where
// -1 is produced cannot be confirmed from here.
192 int indri::xml::XMLReader::_findClosingTag( const char* buffer
, int start
, int finish
, std::string
& openingTagName
, bool* tagsBetween
) {
195 int position
= start
;
201 *tagsBetween
= false;
205 position
= _findBeginTag( buffer
, position
, finish
);
206 int end
= _readTag( buffer
, position
, finish
, &tagName
, NULL
, &tagType
);
// CDATA marker: jump ahead to the "]]>" terminator.
// NOTE(review): `cdata` is built from a bare char pointer, so it copies
// everything up to the next NUL regardless of `finish`, and dataEnd is
// used in arithmetic without a visible check against std::string::npos.
208 if( tagType
== TAG_CDATA_TYPE
) {
209 std::string cdata
= &buffer
[end
];
210 std::string::size_type dataEnd
= cdata
.find("]]>");
211 position
= end
+ dataEnd
+ 1;
212 } else if( tagType
!= TAG_CLOSE_TYPE
) {
216 if( tagType
== TAG_OPEN_TYPE
)
// keep reading tags while more have been opened than closed
220 while( openingTags
> closingTags
) {
221 // don't need to check for matching tags here, we just need to
222 // count open and closed tags
223 position
= _findBeginTag( buffer
, position
, finish
);
224 end
= _readTag( buffer
, position
, finish
, NULL
, NULL
, &tagType
);
// same CDATA skip as above, with the same unchecked-npos caveat
227 if( tagType
== TAG_CDATA_TYPE
) {
228 std::string cdata
= &buffer
[end
];
229 std::string::size_type dataEnd
= cdata
.find("]]>");
230 position
= end
+ dataEnd
+ 1;
231 } else if( tagType
== TAG_OPEN_TYPE
) {
233 } else if( tagType
== TAG_CLOSE_TYPE
) {
// a candidate matches when its name equals the opening tag's name
238 match
= (tagName
== openingTagName
);
242 } catch( lemur::api::Exception
& e
) {
243 LEMUR_RETHROW( e
, std::string() + "Caught an error while looking for an end tag for '" + openingTagName
+ "'" );
// _read
//
// Builds XMLNode objects for the tags found in buffer[start .. end) and
// attaches them through `parent`.
//
//   parent - address of the node pointer that new nodes are added to with
//       addChild().  read() in this file passes the address of a pointer
//       that is initially NULL.
//       NOTE(review): the branch that handles a NULL *parent is not visible
//       in this listing.
//   buffer, start, end - text and index range to parse.
//
// Nodes are allocated with `new`; no matching delete is visible in this
// function, so ownership is presumably handed to the tree -- confirm.
//
// NOTE(review): the loop condition, the declarations of tagName, tagType,
// tagsBetween and endLevel, the condition that chooses between the
// recursive branch and the text-value branch, and several closing braces
// are absent from this listing (gaps in the embedded numbering).
253 void indri::xml::XMLReader::_read( indri::xml::XMLNode
** parent
, const char* buffer
, int start
, int end
) {
// one pass per tag located by _tryFindBeginTag
256 for( int current
= _tryFindBeginTag( buffer
, start
, end
);
258 current
= _tryFindBeginTag( buffer
, current
, end
) ) {
259 indri::xml::XMLNode
* node
;
261 std::map
<std::string
, std::string
> attributes
;
265 int endTag
= _readTag( buffer
, current
, end
, &tagName
, &attributes
, &tagType
);
// a close tag here has no open tag to pair with
267 if( tagType
== TAG_CLOSE_TYPE
)
268 LEMUR_THROW( LEMUR_GENERIC_ERROR
, "Found a close tag for '" + tagName
+ "' while looking for an open tag." );
270 if( tagType
== TAG_OPEN_TYPE
) {
271 int closingTag
= _findClosingTag( buffer
, endTag
, end
, tagName
, &tagsBetween
);
272 if( closingTag
== -1 )
273 LEMUR_THROW( LEMUR_GENERIC_ERROR
, "Could not find a close tag for '" + tagName
+ "'");
// recursive branch: create the node, then parse buffer[endTag .. closingTag)
// with the new node as parent
276 node
= new indri::xml::XMLNode( tagName
, attributes
);
277 _read( &node
, buffer
, endTag
, closingTag
);
// text branch: the node's value is the raw text between the open tag and
// its closing tag, with CDATA markers removed
279 std::string nodeValue
;
280 nodeValue
.assign( &buffer
[endTag
], &buffer
[closingTag
] );
281 std::string::size_type dataStart
= nodeValue
.find("<!CDATA[");
282 while (dataStart
!= std::string::npos
) {
283 // munch any CDATA tags in the element's value.
// 8 is the length of "<!CDATA["
284 nodeValue
.erase(dataStart
, 8);
// NOTE(review): this searches for "]]>" from the start of nodeValue rather
// than from dataStart
285 std::string::size_type dataEnd
= nodeValue
.find("]]>");
286 if (dataEnd
!= std::string::npos
)
287 nodeValue
.erase(dataEnd
, 3);
288 // else bad things here, should throw.
289 dataStart
= nodeValue
.find("<!CDATA[");
291 node
= new indri::xml::XMLNode( tagName
, attributes
, nodeValue
);
// NOTE(review): endLevel's declaration and any later use are not visible
// in this listing
294 endLevel
= _findEndTag( buffer
, closingTag
, end
)+1;
// remaining case: a self-closing tag becomes a node with no value
296 assert( tagType
== TAG_OPEN_CLOSE_TYPE
);
297 node
= new indri::xml::XMLNode( tagName
, attributes
);
302 (*parent
)->addChild( node
);
312 indri::xml::XMLNode
* indri::xml::XMLReader::read( const char* buffer
, size_t length
) {
313 indri::xml::XMLNode
* result
= NULL
;
314 std::string s
= buffer
;
315 std::string::size_type commentstart
= s
.find("<!--",0);
316 while (commentstart
!= std::string::npos
) {
317 std::string::size_type commentend
= s
.find("-->",0);
318 s
.erase(commentstart
, (commentend
+ 3) - commentstart
);
319 commentstart
= s
.find("<!--",0);
321 std::string::size_type xmlDeclStart
= s
.find("<?xml");
322 while (xmlDeclStart
!= std::string::npos
) {
323 std::string::size_type end
= s
.find("?>");
324 s
.erase(xmlDeclStart
, (end
+ 2) - xmlDeclStart
);
325 xmlDeclStart
= s
.find("<?xml");
327 _read( &result
, s
.c_str(), 0, int(s
.length()) );
331 indri::xml::XMLNode
* indri::xml::XMLReader::read( const std::string
& str
) {
332 return read( str
.c_str(), str
.length() );