Don't throw EncodingFoundException unless asked to. Should remove the occasional...
[beagle.git] / Filters / HtmlAgilityPack / MixedCodeDocument.cs
blob84597316cc21fda55265da1a6be601c01c99955e
1 // HtmlAgilityPack V1.0
3 /*
4 Copyright (C) 2003 Simon Mourier <simonm@microsoft.com>
5 All rights reserved.
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10 1. Redistributions of source code must retain the above copyright
11 notice, this list of conditions and the following disclaimer.
12 2. Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15 3. The name of the author may not be used to endorse or promote products
16 derived from this software without specific prior written permission.
18 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 using System;
31 using System.IO;
32 using System.Text;
33 using System.Collections;
35 namespace HtmlAgilityPack
/// <summary>
/// Identifies the kind of content held by a fragment of a mixed code document.
/// </summary>
public enum MixedCodeDocumentFragmentType
{
    /// <summary>
    /// The fragment holds code.
    /// </summary>
    Code = 0,

    /// <summary>
    /// The fragment holds plain text.
    /// </summary>
    Text = 1,
}
/// <summary>
/// Represents a fragment of code in a mixed code document.
/// </summary>
public class MixedCodeDocumentCodeFragment: MixedCodeDocumentFragment
{
    // Cached code text; null until first computed by the Code getter or assigned by its setter.
    internal string _code;

    internal MixedCodeDocumentCodeFragment(MixedCodeDocument doc):
        base(doc, MixedCodeDocumentFragmentType.Code)
    {
    }

    /// <summary>
    /// Gets or sets the fragment code text.
    /// </summary>
    /// <remarks>
    /// When no value has been assigned, the code is derived from <see cref="MixedCodeDocumentFragment.FragmentText"/>
    /// by removing the document's code start and end tokens and trimming white space. A leading "="
    /// is replaced by the document's <c>TokenResponseWrite</c>. The result is cached.
    /// </remarks>
    public string Code
    {
        get
        {
            if (_code == null)
            {
                _code = ExtractCode(FragmentText);
                if (_code.StartsWith("="))
                {
                    _code = _doc.TokenResponseWrite + _code.Substring(1, _code.Length-1);
                }
            }
            return _code;
        }
        set
        {
            _code = value;
        }
    }

    // Returns the trimmed text found between the code start token and the last code end token.
    // The tokens are located rather than assumed to sit at fixed offsets, so the result is the same
    // whether or not the raw fragment text carries characters after the end token, and a fragment
    // that is too short or has no end token yields what is available instead of throwing.
    private string ExtractCode(string fragment)
    {
        string startToken = _doc.TokenCodeStart;
        string endToken = _doc.TokenCodeEnd;

        int start = 0;
        if (fragment.StartsWith(startToken, StringComparison.Ordinal))
        {
            start = startToken.Length;
        }

        int end = fragment.LastIndexOf(endToken, StringComparison.Ordinal);
        if (end < 0)
        {
            // No end token: the fragment ran to the end of the input.
            end = fragment.Length;
        }

        // end < start happens when the two tokens overlap (e.g. "<%>"); there is no code in that case.
        int length = Math.Max(0, end - start);
        return fragment.Substring(start, length).Trim();
    }
}
/// <summary>
/// Represents a fragment of plain text in a mixed code document.
/// </summary>
public class MixedCodeDocumentTextFragment: MixedCodeDocumentFragment
{
    internal MixedCodeDocumentTextFragment(MixedCodeDocument doc)
        : base(doc, MixedCodeDocumentFragmentType.Text)
    {
    }

    /// <summary>
    /// Gets or sets the fragment text. Reading returns the fragment's text;
    /// writing replaces the stored fragment text with the given value.
    /// </summary>
    public string Text
    {
        get { return FragmentText; }
        set { _fragmenttext = value; }
    }
}
/// <summary>
/// Represents a base class for fragments in a mixed code document.
/// </summary>
public abstract class MixedCodeDocumentFragment
{
    internal MixedCodeDocumentFragmentType _type;
    internal MixedCodeDocument _doc;
    // Offset and length of the fragment inside the owning document's text.
    internal int _index;
    internal int _length;
    internal int _line;
    internal int _lineposition;
    // Cached (or explicitly assigned) fragment text; null until first needed.
    internal string _fragmenttext;

    /// <summary>
    /// Creates a fragment and registers it with the owning document: it is appended to the
    /// document's text or code fragment list according to <paramref name="type"/>, and then
    /// to the list of all fragments.
    /// </summary>
    /// <param name="doc">The owning document.</param>
    /// <param name="type">The kind of fragment being created.</param>
    internal MixedCodeDocumentFragment(MixedCodeDocument doc, MixedCodeDocumentFragmentType type)
    {
        _doc = doc;
        _type = type;
        switch(type)
        {
            case MixedCodeDocumentFragmentType.Text:
                _doc._textfragments.Append(this);
                break;

            case MixedCodeDocumentFragmentType.Code:
                _doc._codefragments.Append(this);
                break;
        }
        _doc._fragments.Append(this);
    }

    /// <summary>
    /// Gets the type of fragment.
    /// </summary>
    public MixedCodeDocumentFragmentType FragmentType
    {
        get
        {
            return _type;
        }
    }

    /// <summary>
    /// Gets the fragment position in the document's stream.
    /// </summary>
    public int StreamPosition
    {
        get
        {
            return _index;
        }
    }

    /// <summary>
    /// Gets the line number of the fragment.
    /// </summary>
    public int Line
    {
        get
        {
            return _line;
        }
    }

    /// <summary>
    /// Gets the line position (column) of the fragment.
    /// </summary>
    public int LinePosition
    {
        get
        {
            return _lineposition;
        }
    }

    /// <summary>
    /// Gets the fragment text.
    /// </summary>
    /// <remarks>
    /// The text is cut out of the owning document's text on first access and cached.
    /// If the owning document holds no text yet, an empty string is returned and nothing
    /// is cached, so the real text is still picked up once the document has been loaded.
    /// </remarks>
    public string FragmentText
    {
        get
        {
            if (_fragmenttext == null)
            {
                string source = _doc._text;
                if (source == null)
                {
                    // Fragment created on a document that has not been loaded: there is nothing to slice.
                    return string.Empty;
                }
                _fragmenttext = source.Substring(_index, _length);
            }
            return _fragmenttext;
        }
    }
}
/// <summary>
/// Represents an ordered list of mixed code document fragments.
/// </summary>
public class MixedCodeDocumentFragmentList: IEnumerable
{
    private MixedCodeDocument _doc;
    private ArrayList _items = new ArrayList();

    internal MixedCodeDocumentFragmentList(MixedCodeDocument doc)
    {
        _doc = doc;
    }

    /// <summary>
    /// Adds a fragment at the end of the list.
    /// </summary>
    /// <param name="newFragment">The fragment to append. May not be null.</param>
    public void Append(MixedCodeDocumentFragment newFragment)
    {
        if (newFragment != null)
        {
            _items.Add(newFragment);
            return;
        }
        throw new ArgumentNullException("newFragment");
    }

    /// <summary>
    /// Adds a fragment at the beginning of the list.
    /// </summary>
    /// <param name="newFragment">The fragment to prepend. May not be null.</param>
    public void Prepend(MixedCodeDocumentFragment newFragment)
    {
        if (newFragment != null)
        {
            _items.Insert(0, newFragment);
            return;
        }
        throw new ArgumentNullException("newFragment");
    }

    /// <summary>
    /// Removes a fragment from the list. An IndexOutOfRangeException is thrown
    /// if the fragment is not in the list.
    /// </summary>
    /// <param name="fragment">The fragment to remove. May not be null.</param>
    public void Remove(MixedCodeDocumentFragment fragment)
    {
        if (fragment == null)
        {
            throw new ArgumentNullException("fragment");
        }

        int position = GetFragmentIndex(fragment);
        if (position < 0)
        {
            throw new IndexOutOfRangeException();
        }
        RemoveAt(position);
    }

    /// <summary>
    /// Removes the fragment stored at the given position in the list.
    /// </summary>
    /// <param name="index">The index of the fragment to remove.</param>
    public void RemoveAt(int index)
    {
        _items.RemoveAt(index);
    }

    /// <summary>
    /// Removes every fragment from the list.
    /// </summary>
    public void RemoveAll()
    {
        _items.Clear();
    }

    /// <summary>
    /// Gets the number of fragments contained in the list.
    /// </summary>
    public int Count
    {
        get { return _items.Count; }
    }

    // Returns the position of the given fragment instance (compared by reference), or -1 if absent.
    internal int GetFragmentIndex(MixedCodeDocumentFragment fragment)
    {
        if (fragment == null)
        {
            throw new ArgumentNullException("fragment");
        }

        int position = 0;
        foreach (object candidate in _items)
        {
            if (object.ReferenceEquals(candidate, fragment))
            {
                return position;
            }
            position++;
        }
        return -1;
    }

    /// <summary>
    /// Gets the fragment stored at the given position in the list.
    /// </summary>
    public MixedCodeDocumentFragment this[int index]
    {
        get
        {
            object entry = _items[index];
            return entry as MixedCodeDocumentFragment;
        }
    }

    // Empties the list.
    internal void Clear()
    {
        _items.Clear();
    }

    /// <summary>
    /// Gets a strongly typed enumerator that iterates through the fragment list.
    /// </summary>
    public MixedCodeDocumentFragmentEnumerator GetEnumerator()
    {
        MixedCodeDocumentFragmentEnumerator enumerator = new MixedCodeDocumentFragmentEnumerator(_items);
        return enumerator;
    }

    /// <summary>
    /// Gets an enumerator that iterates through the fragment list.
    /// </summary>
    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }
}
/// <summary>
/// Enumerates the fragments of a fragment list.
/// </summary>
public class MixedCodeDocumentFragmentEnumerator: IEnumerator
{
    int _index;
    ArrayList _items;

    internal MixedCodeDocumentFragmentEnumerator(ArrayList items)
    {
        _index = -1;
        _items = items;
    }

    /// <summary>
    /// Moves the enumerator back to its initial position, which is before the first element in the collection.
    /// </summary>
    public void Reset()
    {
        _index = -1;
    }

    /// <summary>
    /// Advances the enumerator to the next element of the collection.
    /// </summary>
    /// <returns>true if the enumerator now points at an element; false if it has passed the end of the collection.</returns>
    public bool MoveNext()
    {
        return ++_index < _items.Count;
    }

    /// <summary>
    /// Gets the element the enumerator currently points at.
    /// </summary>
    public MixedCodeDocumentFragment Current
    {
        get
        {
            object entry = _items[_index];
            return (MixedCodeDocumentFragment)entry;
        }
    }

    /// <summary>
    /// Gets the element the enumerator currently points at.
    /// </summary>
    object IEnumerator.Current
    {
        get { return Current; }
    }
}
/// <summary>
/// Represents a document with mixed code and text. ASP, ASPX and JSP are good examples of such documents.
/// </summary>
public class MixedCodeDocument
{
    private System.Text.Encoding _streamencoding = null;
    internal string _text;
    internal MixedCodeDocumentFragmentList _fragments;
    internal MixedCodeDocumentFragmentList _codefragments;
    internal MixedCodeDocumentFragmentList _textfragments;
    // Parser state; only meaningful while Parse() is running.
    private ParseState _state;
    private int _index;
    private int _c;
    private int _line;
    private int _lineposition;
    private MixedCodeDocumentFragment _currentfragment;

    /// <summary>
    /// Gets or sets the token representing code start.
    /// </summary>
    public string TokenCodeStart = "<%";

    /// <summary>
    /// Gets or sets the token representing code end.
    /// </summary>
    public string TokenCodeEnd = "%>";

    /// <summary>
    /// Gets or sets the token representing code directive.
    /// </summary>
    public string TokenDirective = "@";

    /// <summary>
    /// Gets or sets the token representing response write directive.
    /// </summary>
    public string TokenResponseWrite = "Response.Write ";

    // Format used by the Code property to stand in for the n-th text fragment.
    private string TokenTextBlock = "TextBlock({0})";

    /// <summary>
    /// Creates a mixed code document instance.
    /// </summary>
    public MixedCodeDocument()
    {
        _codefragments = new MixedCodeDocumentFragmentList(this);
        _textfragments = new MixedCodeDocumentFragmentList(this);
        _fragments = new MixedCodeDocumentFragmentList(this);
    }

    /// <summary>
    /// Loads a mixed code document from a stream.
    /// </summary>
    /// <param name="stream">The input stream.</param>
    public void Load(Stream stream)
    {
        Load(new StreamReader(stream));
    }

    /// <summary>
    /// Loads a mixed code document from a stream.
    /// </summary>
    /// <param name="stream">The input stream.</param>
    /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
    public void Load(Stream stream, bool detectEncodingFromByteOrderMarks)
    {
        Load(new StreamReader(stream, detectEncodingFromByteOrderMarks));
    }

    /// <summary>
    /// Loads a mixed code document from a stream.
    /// </summary>
    /// <param name="stream">The input stream.</param>
    /// <param name="encoding">The character encoding to use.</param>
    public void Load(Stream stream, Encoding encoding)
    {
        Load(new StreamReader(stream, encoding));
    }

    /// <summary>
    /// Loads a mixed code document from a stream.
    /// </summary>
    /// <param name="stream">The input stream.</param>
    /// <param name="encoding">The character encoding to use.</param>
    /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
    public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
    {
        Load(new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks));
    }

    /// <summary>
    /// Loads a mixed code document from a stream.
    /// </summary>
    /// <param name="stream">The input stream.</param>
    /// <param name="encoding">The character encoding to use.</param>
    /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
    /// <param name="buffersize">The minimum buffer size.</param>
    public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
    {
        Load(new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks, buffersize));
    }

    /// <summary>
    /// Loads a mixed code document from a file.
    /// </summary>
    /// <param name="path">The complete file path to be read.</param>
    public void Load(string path)
    {
        Load(new StreamReader(path));
    }

    /// <summary>
    /// Loads a mixed code document from a file.
    /// </summary>
    /// <param name="path">The complete file path to be read.</param>
    /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
    public void Load(string path, bool detectEncodingFromByteOrderMarks)
    {
        Load(new StreamReader(path, detectEncodingFromByteOrderMarks));
    }

    /// <summary>
    /// Loads a mixed code document from a file.
    /// </summary>
    /// <param name="path">The complete file path to be read.</param>
    /// <param name="encoding">The character encoding to use.</param>
    public void Load(string path, Encoding encoding)
    {
        Load(new StreamReader(path, encoding));
    }

    /// <summary>
    /// Loads a mixed code document from a file.
    /// </summary>
    /// <param name="path">The complete file path to be read.</param>
    /// <param name="encoding">The character encoding to use.</param>
    /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
    public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
    {
        Load(new StreamReader(path, encoding, detectEncodingFromByteOrderMarks));
    }

    /// <summary>
    /// Loads a mixed code document from a file.
    /// </summary>
    /// <param name="path">The complete file path to be read.</param>
    /// <param name="encoding">The character encoding to use.</param>
    /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
    /// <param name="buffersize">The minimum buffer size.</param>
    public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
    {
        Load(new StreamReader(path, encoding, detectEncodingFromByteOrderMarks, buffersize));
    }

    /// <summary>
    /// Loads a mixed document from a text.
    /// </summary>
    /// <param name="html">The text to load.</param>
    public void LoadHtml(string html)
    {
        Load(new StringReader(html));
    }

    /// <summary>
    /// Loads the mixed code document from the specified TextReader. Every other Load overload ends up here.
    /// </summary>
    /// <remarks>
    /// Any fragments from a previous load are discarded. The reader is read to its end and then
    /// closed, even if reading fails.
    /// </remarks>
    /// <param name="reader">The TextReader used to feed the HTML data into the document. May not be null.</param>
    public void Load(TextReader reader)
    {
        if (reader == null)
        {
            throw new ArgumentNullException("reader");
        }

        // Drop everything from a previous load, including the combined list, so that
        // reloading does not leave stale fragments behind.
        _codefragments.Clear();
        _textfragments.Clear();
        _fragments.Clear();

        try
        {
            _text = reader.ReadToEnd();

            // StreamReader only performs encoding detection on its first read, so the
            // encoding must be sampled after reading. A non-stream reader has no encoding.
            StreamReader sr = reader as StreamReader;
            _streamencoding = (sr != null) ? sr.CurrentEncoding : null;
        }
        finally
        {
            reader.Close();
        }

        Parse();
    }

    // Encoding used by the Save overloads that take no explicit encoding:
    // the one recorded at load time, or the system default when none was recorded.
    internal System.Text.Encoding GetOutEncoding()
    {
        if (_streamencoding != null)
        {
            return _streamencoding;
        }
        return System.Text.Encoding.Default;
    }

    /// <summary>
    /// Gets the encoding of the stream used to read the document, or null if the document was not read from a stream.
    /// </summary>
    public System.Text.Encoding StreamEncoding
    {
        get
        {
            return _streamencoding;
        }
    }

    /// <summary>
    /// Gets the list of code fragments in the document.
    /// </summary>
    public MixedCodeDocumentFragmentList CodeFragments
    {
        get
        {
            return _codefragments;
        }
    }

    /// <summary>
    /// Gets the list of text fragments in the document.
    /// </summary>
    public MixedCodeDocumentFragmentList TextFragments
    {
        get
        {
            return _textfragments;
        }
    }

    /// <summary>
    /// Gets the list of all fragments in the document.
    /// </summary>
    public MixedCodeDocumentFragmentList Fragments
    {
        get
        {
            return _fragments;
        }
    }

    /// <summary>
    /// Saves the mixed document to the specified stream. The stream is left open.
    /// </summary>
    /// <param name="outStream">The stream to which you want to save.</param>
    public void Save(Stream outStream)
    {
        // The writer is deliberately not disposed: that would close the caller's stream.
        StreamWriter sw = new StreamWriter(outStream, GetOutEncoding());
        Save(sw);
    }

    /// <summary>
    /// Saves the mixed document to the specified stream. The stream is left open.
    /// </summary>
    /// <param name="outStream">The stream to which you want to save.</param>
    /// <param name="encoding">The character encoding to use.</param>
    public void Save(Stream outStream, System.Text.Encoding encoding)
    {
        // The writer is deliberately not disposed: that would close the caller's stream.
        StreamWriter sw = new StreamWriter(outStream, encoding);
        Save(sw);
    }

    /// <summary>
    /// Saves the mixed document to the specified file.
    /// </summary>
    /// <param name="filename">The location of the file where you want to save the document.</param>
    public void Save(string filename)
    {
        // This method owns the writer, so it must release the file handle when done.
        using (StreamWriter sw = new StreamWriter(filename, false, GetOutEncoding()))
        {
            Save(sw);
        }
    }

    /// <summary>
    /// Saves the mixed document to the specified file.
    /// </summary>
    /// <param name="filename">The location of the file where you want to save the document.</param>
    /// <param name="encoding">The character encoding to use.</param>
    public void Save(string filename, System.Text.Encoding encoding)
    {
        // This method owns the writer, so it must release the file handle when done.
        using (StreamWriter sw = new StreamWriter(filename, false, encoding))
        {
            Save(sw);
        }
    }

    /// <summary>
    /// Saves the mixed document to the specified StreamWriter.
    /// </summary>
    /// <param name="writer">The StreamWriter to which you want to save.</param>
    public void Save(StreamWriter writer)
    {
        Save((TextWriter)writer);
    }

    /// <summary>
    /// Saves the mixed document to the specified TextWriter.
    /// </summary>
    /// <param name="writer">The TextWriter to which you want to save.</param>
    public void Save(TextWriter writer)
    {
        // NOTE(review): this only flushes the writer; no document content is written here.
        // Confirm whether writing the fragments was intended before relying on Save.
        writer.Flush();
    }

    /// <summary>
    /// Gets the code represented by the mixed code document seen as a template.
    /// </summary>
    /// <remarks>
    /// Fragments are visited in document order. Each text fragment contributes a response-write
    /// call on a numbered text block placeholder; each code fragment contributes its code.
    /// Every contribution is followed by a line feed.
    /// </remarks>
    public string Code
    {
        get
        {
            StringBuilder code = new StringBuilder();
            int textBlockIndex = 0;
            foreach(MixedCodeDocumentFragment frag in _fragments)
            {
                switch(frag._type)
                {
                    case MixedCodeDocumentFragmentType.Text:
                        code.Append(TokenResponseWrite);
                        code.Append(string.Format(TokenTextBlock, textBlockIndex));
                        code.Append("\n");
                        textBlockIndex++;
                        break;

                    case MixedCodeDocumentFragmentType.Code:
                        code.Append(((MixedCodeDocumentCodeFragment)frag).Code);
                        code.Append("\n");
                        break;
                }
            }
            return code.ToString();
        }
    }

    /// <summary>
    /// Creates a text fragment instance. The new fragment is registered with this document.
    /// </summary>
    /// <returns>The newly created text fragment instance.</returns>
    public MixedCodeDocumentTextFragment CreateTextFragment()
    {
        return (MixedCodeDocumentTextFragment)CreateFragment(MixedCodeDocumentFragmentType.Text);
    }

    /// <summary>
    /// Creates a code fragment instance. The new fragment is registered with this document.
    /// </summary>
    /// <returns>The newly created code fragment instance.</returns>
    public MixedCodeDocumentCodeFragment CreateCodeFragment()
    {
        return (MixedCodeDocumentCodeFragment)CreateFragment(MixedCodeDocumentFragmentType.Code);
    }

    // Creates a fragment of the requested type; the fragment constructor adds it to this document's lists.
    internal MixedCodeDocumentFragment CreateFragment(MixedCodeDocumentFragmentType type)
    {
        switch(type)
        {
            case MixedCodeDocumentFragmentType.Text:
                return new MixedCodeDocumentTextFragment(this);

            case MixedCodeDocumentFragmentType.Code:
                return new MixedCodeDocumentCodeFragment(this);

            default:
                throw new NotSupportedException();
        }
    }

    // Stamps the current fragment with the parser's position. _index has already been
    // advanced past the character being examined, hence the "- 1".
    private void SetPosition()
    {
        _currentfragment._line = _line;
        _currentfragment._lineposition = _lineposition;
        _currentfragment._index = _index - 1;
        _currentfragment._length = 0;
    }

    // Advances the parser by one character, starting a new line after a line feed (character 10).
    private void IncrementPosition()
    {
        _index++;
        if (_c == 10)
        {
            _lineposition = 1;
            _line++;
        }
        else
        {
            _lineposition++;
        }
    }

    private enum ParseState
    {
        Text,
        Code
    }

    // Splits _text into alternating text and code fragments using TokenCodeStart / TokenCodeEnd.
    // The scanning rules below are intentionally unchanged, because code that reads fragment
    // text may depend on the exact fragment extents they produce. In particular:
    //  - the length given to a terminated code fragment reaches one character past the end token;
    //  - a token is only recognised when the bounds test (_index + token length < text length)
    //    holds, so a token at or very near the end of the text is not recognised, and the current
    //    fragment then simply runs to the end of the text.
    private void Parse()
    {
        _state = ParseState.Text;
        _index = 0;
        // Restart position tracking so a reload does not keep counting from the previous document.
        _line = 0;
        _lineposition = 0;
        _currentfragment = CreateFragment(MixedCodeDocumentFragmentType.Text);

        while (_index<_text.Length)
        {
            _c = _text[_index];
            IncrementPosition();
            // From here on, _index - 1 is the position of the character just read.

            switch(_state)
            {
                case ParseState.Text:
                    if (_index+TokenCodeStart.Length<_text.Length)
                    {
                        if (_text.Substring(_index-1, TokenCodeStart.Length) == TokenCodeStart)
                        {
                            // Close the text fragment just before the start token and open a code fragment on it.
                            _state = ParseState.Code;
                            _currentfragment._length = _index -1 - _currentfragment._index;
                            _currentfragment = CreateFragment(MixedCodeDocumentFragmentType.Code);
                            SetPosition();
                            continue;
                        }
                    }
                    break;

                case ParseState.Code:
                    if (_index+TokenCodeEnd.Length<_text.Length)
                    {
                        if (_text.Substring(_index-1, TokenCodeEnd.Length) == TokenCodeEnd)
                        {
                            // Close the code fragment and resume text after the end token.
                            _state = ParseState.Text;
                            _currentfragment._length = _index + TokenCodeEnd.Length - _currentfragment._index;
                            _index += TokenCodeEnd.Length;
                            _lineposition += TokenCodeEnd.Length;
                            _currentfragment = CreateFragment(MixedCodeDocumentFragmentType.Text);
                            SetPosition();
                            continue;
                        }
                    }
                    break;
            }
        }

        // Whatever fragment is still open runs to the end of the text.
        _currentfragment._length = _index - _currentfragment._index;
    }
}