// Initial import into git.
// [galago.git] / cpp / galago / contrib / indri / src / XMLReader.cpp
// blob 593ae0247c4223a8df4f302940e136ceb4089435
/*==========================================================================
 * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
 *
 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
 * is subject to the terms of the software license set forth in the LICENSE
 * file included with this software, and also available at
 * http://www.lemurproject.org/license.html
 *
 *==========================================================================
 */

//
// XMLReader
//
// 8 October 2003 - tds
//
19 #include "indri/XMLReader.hpp"
20 #include "lemur/Exception.hpp"
22 int indri::xml::XMLReader::_tryFindChar( char ch, const char* buffer, int start, int finish ) {
23 int i;
25 for( i=start; i<finish; i++ ) {
26 if( buffer[i] == ch )
27 break;
30 if( i==finish )
31 return -1;
33 return i;
36 int indri::xml::XMLReader::_findChar( char ch, const char* buffer, int start, int finish ) {
37 int result = _tryFindChar( ch, buffer, start, finish );
39 if( result == -1 )
40 LEMUR_THROW( LEMUR_PARSE_ERROR, "Was looking for '" + ch + "', but couldn't find it." );
42 return result;
45 int indri::xml::XMLReader::_tryFindBeginTag( const char* buffer, int start, int finish ) {
46 return _tryFindChar( '<', buffer, start, finish );
49 int indri::xml::XMLReader::_findBeginTag( const char* buffer, int start, int finish ) {
50 int result = _tryFindBeginTag( buffer, start, finish );
52 if( result == -1 )
53 LEMUR_THROW( LEMUR_PARSE_ERROR, "Ran off the end of a buffer while looking for a begin tag" );
55 return result;
58 int indri::xml::XMLReader::_findEndTag( const char* buffer, int start, int finish ) {
59 return _findChar( '>', buffer, start, finish );
62 int indri::xml::XMLReader::_tryFindText( const char* buffer, int start, int finish ) {
63 int i;
65 for( i=start; i<finish; i++ ) {
66 if( !isspace(buffer[i]) )
67 break;
70 return i;
73 int indri::xml::XMLReader::_findText( const char* buffer, int start, int finish ) {
74 int result = _tryFindText( buffer, start, finish );
75 if( result==finish )
76 LEMUR_THROW( LEMUR_GENERIC_ERROR, "Was looking for text, but couldn't find any" );
78 return result;
81 int indri::xml::XMLReader::_findNotName( const char* buffer, int start, int finish ) {
82 int i;
84 for( i=start; i<finish; i++ ) {
85 // this isn't unicode-safe, but it should be good for now
86 if( !isalpha(buffer[i]) &&
87 !isdigit(buffer[i]) &&
88 buffer[i] != '-' &&
89 buffer[i] != '_' &&
90 buffer[i] != ':' &&
91 buffer[i] != '.' ) {
92 break;
96 if( i==finish )
97 LEMUR_THROW( LEMUR_PARSE_ERROR, "Was looking for the end of a tag name, but couldn't find it." );
99 return i;
102 int _findSpace( char* buffer, int start, int finish ) {
103 int i;
105 for( i=start; i<finish; i++ ) {
106 if( isspace(buffer[i]) )
107 break;
110 if( i==finish )
111 LEMUR_THROW( LEMUR_PARSE_ERROR, "Was looking for a space, but couldn't find it." );
113 return i;
116 int indri::xml::XMLReader::_readTag( const char* buffer, int bufferStart, int bufferEnd, std::string* tagName, std::map<std::string, std::string>* attributes, int* tagType ) {
117 // skip opening whitespace
118 int startLocation = bufferStart;
119 int endLocation = _findEndTag( buffer, startLocation, bufferEnd );
120 int position = startLocation+1;
121 int trueEndLocation = endLocation+1;
123 if( endLocation - position < 1 )
124 LEMUR_THROW( LEMUR_GENERIC_ERROR, "Found a tag with no body" );
126 // is it a <!CDATA[ tag?
127 // expand this test for completeness
128 if ( buffer[position] == '!' && buffer[position + 1] == 'C' ) {
129 if( tagType )
130 *tagType = TAG_CDATA_TYPE;
131 trueEndLocation = position + 7;
132 return trueEndLocation;
134 // is it an opening tag?
135 if( buffer[position] == '/' ) {
136 if( tagType )
137 *tagType = TAG_CLOSE_TYPE;
138 position++;
140 if( position >= endLocation )
141 LEMUR_THROW( LEMUR_GENERIC_ERROR, "Found a tag with no body" );
142 } else {
143 if( buffer[endLocation-1] == '/' ) {
144 if( tagType )
145 *tagType = TAG_OPEN_CLOSE_TYPE;
146 endLocation--;
147 } else if( tagType ) {
148 *tagType = TAG_OPEN_TYPE;
152 if( tagName || attributes ) {
153 int textBegin = _findText( buffer, position, endLocation );
154 int textEnd = _findNotName( buffer, textBegin, endLocation+1 );
156 if( tagName )
157 tagName->assign( &buffer[textBegin], &buffer[textEnd] );
159 position = textEnd;
161 if( attributes ) {
162 attributes->clear();
164 textBegin = _findText( buffer, position, endLocation+1 );
165 position = textBegin;
167 for( ; position != endLocation; position = _tryFindText( buffer, position, endLocation ) ) {
168 textEnd = _findNotName( buffer, position, endLocation );
169 int equalsPosition = _findChar( '=', buffer, textEnd, endLocation );
170 int quotePosition = _findText( buffer, equalsPosition+1, endLocation );
171 int endQuotePosition = _findChar( buffer[quotePosition], buffer, quotePosition+1, endLocation );
173 std::string attributeName;
174 std::string valueText;
176 assert( position <= textEnd );
177 assert( quotePosition+1 <= endQuotePosition );
178 assert( textEnd < quotePosition+1 );
180 attributeName.assign( &buffer[position], &buffer[textEnd] );
181 valueText.assign( &buffer[quotePosition+1], &buffer[endQuotePosition] );
183 attributes->insert( std::make_pair( attributeName, valueText ) );
184 position = endQuotePosition+1;
189 return trueEndLocation;
192 int indri::xml::XMLReader::_findClosingTag( const char* buffer, int start, int finish, std::string& openingTagName, bool* tagsBetween ) {
193 int openingTags = 0;
194 int closingTags = 0;
195 int position = start;
196 bool done = false;
197 bool match = false;
198 int tagType;
200 if( tagsBetween )
201 *tagsBetween = false;
202 try {
203 while( !done ) {
204 std::string tagName;
205 position = _findBeginTag( buffer, position, finish );
206 int end = _readTag( buffer, position, finish, &tagName, NULL, &tagType );
208 if( tagType == TAG_CDATA_TYPE ) {
209 std::string cdata = &buffer[end];
210 std::string::size_type dataEnd = cdata.find("]]>");
211 position = end + dataEnd + 1;
212 } else if( tagType != TAG_CLOSE_TYPE ) {
213 if( tagsBetween )
214 *tagsBetween = true;
216 if( tagType == TAG_OPEN_TYPE )
217 openingTags++;
218 position = end;
220 while( openingTags > closingTags ) {
221 // don't need to check for matching tags here, we just need to
222 // count open and closed tags
223 position = _findBeginTag( buffer, position, finish );
224 end = _readTag( buffer, position, finish, NULL, NULL, &tagType );
225 position = end;
227 if( tagType == TAG_CDATA_TYPE ) {
228 std::string cdata = &buffer[end];
229 std::string::size_type dataEnd = cdata.find("]]>");
230 position = end + dataEnd + 1;
231 } else if( tagType == TAG_OPEN_TYPE ) {
232 openingTags++;
233 } else if( tagType == TAG_CLOSE_TYPE ) {
234 closingTags++;
237 } else {
238 match = (tagName == openingTagName);
239 done = true;
242 } catch( lemur::api::Exception& e ) {
243 LEMUR_RETHROW( e, std::string() + "Caught an error while looking for an end tag for '" + openingTagName + "'" );
246 if( match ) {
247 return position;
248 } else {
249 return -1;
253 void indri::xml::XMLReader::_read( indri::xml::XMLNode** parent, const char* buffer, int start, int end ) {
254 int tagType;
256 for( int current = _tryFindBeginTag( buffer, start, end );
257 current >= 0;
258 current = _tryFindBeginTag( buffer, current, end ) ) {
259 indri::xml::XMLNode* node;
260 std::string tagName;
261 std::map<std::string, std::string> attributes;
262 bool tagsBetween;
264 int endLevel;
265 int endTag = _readTag( buffer, current, end, &tagName, &attributes, &tagType );
267 if( tagType == TAG_CLOSE_TYPE )
268 LEMUR_THROW( LEMUR_GENERIC_ERROR, "Found a close tag for '" + tagName + "' while looking for an open tag." );
270 if( tagType == TAG_OPEN_TYPE ) {
271 int closingTag = _findClosingTag( buffer, endTag, end, tagName, &tagsBetween );
272 if( closingTag == -1 )
273 LEMUR_THROW( LEMUR_GENERIC_ERROR, "Could not find a close tag for '" + tagName + "'");
275 if( tagsBetween ) {
276 node = new indri::xml::XMLNode( tagName, attributes );
277 _read( &node, buffer, endTag, closingTag );
278 } else {
279 std::string nodeValue;
280 nodeValue.assign( &buffer[endTag], &buffer[closingTag] );
281 std::string::size_type dataStart = nodeValue.find("<!CDATA[");
282 while (dataStart != std::string::npos) {
283 // munch any CDATA tags in the element's value.
284 nodeValue.erase(dataStart, 8);
285 std::string::size_type dataEnd = nodeValue.find("]]>");
286 if (dataEnd != std::string::npos)
287 nodeValue.erase(dataEnd, 3);
288 // else bad things here, should throw.
289 dataStart = nodeValue.find("<!CDATA[");
291 node = new indri::xml::XMLNode( tagName, attributes, nodeValue );
294 endLevel = _findEndTag( buffer, closingTag, end )+1;
295 } else {
296 assert( tagType == TAG_OPEN_CLOSE_TYPE );
297 node = new indri::xml::XMLNode( tagName, attributes );
298 endLevel = endTag;
301 if( *parent ) {
302 (*parent)->addChild( node );
303 } else {
304 *parent = node;
305 break;
308 current = endLevel;
312 indri::xml::XMLNode* indri::xml::XMLReader::read( const char* buffer, size_t length ) {
313 indri::xml::XMLNode* result = NULL;
314 std::string s = buffer;
315 std::string::size_type commentstart = s.find("<!--",0);
316 while (commentstart != std::string::npos) {
317 std::string::size_type commentend = s.find("-->",0);
318 s.erase(commentstart, (commentend + 3) - commentstart);
319 commentstart = s.find("<!--",0);
321 std::string::size_type xmlDeclStart = s.find("<?xml");
322 while (xmlDeclStart != std::string::npos) {
323 std::string::size_type end = s.find("?>");
324 s.erase(xmlDeclStart, (end + 2) - xmlDeclStart);
325 xmlDeclStart = s.find("<?xml");
327 _read( &result, s.c_str(), 0, int(s.length()) );
328 return result;
331 indri::xml::XMLNode* indri::xml::XMLReader::read( const std::string& str ) {
332 return read( str.c_str(), str.length() );