Compute lucene-style scores for our hits.
[beagle.git] / Filters / HtmlAgilityPack / MixedCodeDocument.cs
blobb99ce004a6ce7053dbaf656570cc6acb5e8970af
1 // HtmlAgilityPack V1.0
3 /*
4 Copyright (C) 2003 Simon Mourier <simonm@microsoft.com>
5 All rights reserved.
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10 1. Redistributions of source code must retain the above copyright
11 notice, this list of conditions and the following disclaimer.
12 2. Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15 3. The name of the author may not be used to endorse or promote products
16 derived from this software without specific prior written permission.
18 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 using System;
31 using System.IO;
32 using System.Text;
33 using System.Collections;
35 namespace HtmlAgilityPack
/// <summary>
/// Identifies the kind of content held by a fragment of a mixed code document.
/// </summary>
public enum MixedCodeDocumentFragmentType
{
    /// <summary>
    /// The fragment holds code.
    /// </summary>
    Code = 0,

    /// <summary>
    /// The fragment holds text.
    /// </summary>
    Text = 1,
}
/// <summary>
/// Represents a fragment of code in a mixed code document.
/// </summary>
public class MixedCodeDocumentCodeFragment : MixedCodeDocumentFragment
{
    // Cached code text; stays null until Code is first read or explicitly assigned.
    internal string _code;

    internal MixedCodeDocumentCodeFragment(MixedCodeDocument doc)
        : base(doc, MixedCodeDocumentFragmentType.Code)
    {
    }

    /// <summary>
    /// Gets or sets the fragment code text.
    /// </summary>
    /// <remarks>
    /// On first read the value is derived from <c>FragmentText</c>: the start token,
    /// the end token and one trailing character are stripped, the remainder is trimmed,
    /// and a leading "=" is replaced by the document's <c>TokenResponseWrite</c>.
    /// A fragment too short to hold both tokens yields an empty string.
    /// </remarks>
    public string Code
    {
        get
        {
            if (_code == null)
            {
                string raw = FragmentText;
                int start = _doc.TokenCodeStart.Length;

                // The extra "- 1" mirrors the original computation: the fragment length
                // recorded in MixedCodeDocument.Parse reaches one character past the end token.
                int length = raw.Length - _doc.TokenCodeEnd.Length - start - 1;

                // A negative length (short or unterminated fragment) used to make
                // Substring throw ArgumentOutOfRangeException; treat it as "no code".
                if (length > 0)
                {
                    _code = raw.Substring(start, length).Trim();
                }
                else
                {
                    _code = string.Empty;
                }

                // Ordinal check on the first character instead of the culture-sensitive
                // StartsWith("="), so exactly the '=' character is what gets replaced.
                if (_code.Length > 0 && _code[0] == '=')
                {
                    _code = _doc.TokenResponseWrite + _code.Substring(1);
                }
            }
            return _code;
        }
        set
        {
            _code = value;
        }
    }
}
/// <summary>
/// A fragment of a mixed code document that holds plain text rather than code.
/// </summary>
public class MixedCodeDocumentTextFragment : MixedCodeDocumentFragment
{
    internal MixedCodeDocumentTextFragment(MixedCodeDocument doc)
        : base(doc, MixedCodeDocumentFragmentType.Text)
    {
    }

    /// <summary>
    /// Gets or sets the fragment text. Reading returns <c>FragmentText</c>;
    /// writing replaces the cached fragment text.
    /// </summary>
    public string Text
    {
        get { return FragmentText; }
        set { _fragmenttext = value; }
    }
}
/// <summary>
/// Base class shared by the text and code fragments of a mixed code document.
/// </summary>
public abstract class MixedCodeDocumentFragment
{
    internal MixedCodeDocumentFragmentType _type;
    internal MixedCodeDocument _doc;
    internal int _index;          // offset of the fragment inside the document text
    internal int _length;         // number of characters covered by the fragment
    internal int _line;
    internal int _lineposition;
    internal string _fragmenttext; // lazily filled cache used by FragmentText

    /// <summary>
    /// Creates the fragment and registers it with the owning document: first in the
    /// list matching its type, then in the document's list of all fragments.
    /// </summary>
    internal MixedCodeDocumentFragment(MixedCodeDocument doc, MixedCodeDocumentFragmentType type)
    {
        _doc = doc;
        _type = type;

        if (type == MixedCodeDocumentFragmentType.Text)
        {
            _doc._textfragments.Append(this);
        }
        else if (type == MixedCodeDocumentFragmentType.Code)
        {
            _doc._codefragments.Append(this);
        }

        _doc._fragments.Append(this);
    }

    /// <summary>
    /// Gets the type of fragment.
    /// </summary>
    public MixedCodeDocumentFragmentType FragmentType
    {
        get { return _type; }
    }

    /// <summary>
    /// Gets the fragment position in the document's stream.
    /// </summary>
    public int StreamPosition
    {
        get { return _index; }
    }

    /// <summary>
    /// Gets the line number of the fragment.
    /// </summary>
    public int Line
    {
        get { return _line; }
    }

    /// <summary>
    /// Gets the line position (column) of the fragment.
    /// </summary>
    public int LinePosition
    {
        get { return _lineposition; }
    }

    /// <summary>
    /// Gets the fragment text. The text is cut out of the document text on first
    /// access and cached afterwards.
    /// </summary>
    public string FragmentText
    {
        get
        {
            if (_fragmenttext != null)
            {
                return _fragmenttext;
            }

            _fragmenttext = _doc._text.Substring(_index, _length);
            return _fragmenttext;
        }
    }
}
/// <summary>
/// An ordered list of fragments belonging to a mixed code document.
/// </summary>
public class MixedCodeDocumentFragmentList : IEnumerable
{
    private MixedCodeDocument _doc;
    private ArrayList _items = new ArrayList();

    internal MixedCodeDocumentFragmentList(MixedCodeDocument doc)
    {
        _doc = doc;
    }

    /// <summary>
    /// Adds a fragment at the end of the list.
    /// </summary>
    /// <param name="newFragment">The fragment to add. May not be null.</param>
    public void Append(MixedCodeDocumentFragment newFragment)
    {
        if (newFragment == null)
        {
            throw new ArgumentNullException("newFragment");
        }

        _items.Add(newFragment);
    }

    /// <summary>
    /// Inserts a fragment at the beginning of the list.
    /// </summary>
    /// <param name="newFragment">The fragment to insert. May not be null.</param>
    public void Prepend(MixedCodeDocumentFragment newFragment)
    {
        if (newFragment == null)
        {
            throw new ArgumentNullException("newFragment");
        }

        _items.Insert(0, newFragment);
    }

    /// <summary>
    /// Removes a fragment from the list. An IndexOutOfRangeException is thrown
    /// when the fragment is not part of the list.
    /// </summary>
    /// <param name="fragment">The fragment to remove. May not be null.</param>
    public void Remove(MixedCodeDocumentFragment fragment)
    {
        if (fragment == null)
        {
            throw new ArgumentNullException("fragment");
        }

        int position = GetFragmentIndex(fragment);
        if (position == -1)
        {
            throw new IndexOutOfRangeException();
        }

        RemoveAt(position);
    }

    /// <summary>
    /// Removes the fragment stored at the given position in the list.
    /// </summary>
    /// <param name="index">The index of the fragment to remove.</param>
    public void RemoveAt(int index)
    {
        // The element is read before the removal, exactly as the original code did.
        MixedCodeDocumentFragment removed = (MixedCodeDocumentFragment)_items[index];
        _items.RemoveAt(index);
    }

    /// <summary>
    /// Empties the list.
    /// </summary>
    public void RemoveAll()
    {
        _items.Clear();
    }

    /// <summary>
    /// Gets the number of fragments contained in the list.
    /// </summary>
    public int Count
    {
        get { return _items.Count; }
    }

    // Returns the position of the given fragment instance, or -1 when it is not in the list.
    internal int GetFragmentIndex(MixedCodeDocumentFragment fragment)
    {
        if (fragment == null)
        {
            throw new ArgumentNullException("fragment");
        }

        int position = 0;
        while (position < _items.Count)
        {
            if ((MixedCodeDocumentFragment)_items[position] == fragment)
            {
                return position;
            }
            position++;
        }

        return -1;
    }

    /// <summary>
    /// Gets the fragment stored at the given position in the list.
    /// </summary>
    public MixedCodeDocumentFragment this[int index]
    {
        get { return _items[index] as MixedCodeDocumentFragment; }
    }

    internal void Clear()
    {
        _items.Clear();
    }

    /// <summary>
    /// Gets a strongly typed enumerator over the fragments in the list.
    /// </summary>
    public MixedCodeDocumentFragmentEnumerator GetEnumerator()
    {
        return new MixedCodeDocumentFragmentEnumerator(_items);
    }

    /// <summary>
    /// Gets an enumerator over the fragments in the list.
    /// </summary>
    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }

    /// <summary>
    /// Enumerates the fragments of a fragment list.
    /// </summary>
    public class MixedCodeDocumentFragmentEnumerator : IEnumerator
    {
        int _index;
        ArrayList _items;

        internal MixedCodeDocumentFragmentEnumerator(ArrayList items)
        {
            _items = items;
            _index = -1;
        }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _index = -1;
        }

        /// <summary>
        /// Advances the enumerator to the next element of the collection.
        /// </summary>
        /// <returns>true if the enumerator now points at an element; false if it has passed the end of the collection.</returns>
        public bool MoveNext()
        {
            return ++_index < _items.Count;
        }

        /// <summary>
        /// Gets the element the enumerator currently points at.
        /// </summary>
        public MixedCodeDocumentFragment Current
        {
            get { return (MixedCodeDocumentFragment)_items[_index]; }
        }

        /// <summary>
        /// Gets the element the enumerator currently points at.
        /// </summary>
        object IEnumerator.Current
        {
            get { return Current; }
        }
    }
}
396 /// <summary>
397 /// Represents a document that mixes code and text. ASP, ASPX and JSP files are good examples of such documents.
398 /// </summary>
399 public class MixedCodeDocument
// Document text and the fragment lists built from it.
internal string _text;
internal MixedCodeDocumentFragmentList _fragments;
internal MixedCodeDocumentFragmentList _codefragments;
internal MixedCodeDocumentFragmentList _textfragments;

// Encoding reported by the StreamReader used for loading, when there was one.
private Encoding _streamencoding;

// Parser working state.
private ParseState _state;
private int _index;
private int _c;
private int _line;
private int _lineposition;
private MixedCodeDocumentFragment _currentfragment;

/// <summary>
/// Gets or sets the token that opens a code fragment.
/// </summary>
public string TokenCodeStart = "<%";

/// <summary>
/// Gets or sets the token that closes a code fragment.
/// </summary>
public string TokenCodeEnd = "%>";

/// <summary>
/// Gets or sets the token representing a code directive.
/// </summary>
public string TokenDirective = "@";

/// <summary>
/// Gets or sets the token used for response write statements.
/// </summary>
public string TokenResponseWrite = "Response.Write ";

// Format string used by the Code property to reference a text fragment by its ordinal.
private string TokenTextBlock = "TextBlock({0})";
/// <summary>
/// Initializes an empty mixed code document with empty fragment lists.
/// </summary>
public MixedCodeDocument()
{
    _fragments = new MixedCodeDocumentFragmentList(this);
    _textfragments = new MixedCodeDocumentFragmentList(this);
    _codefragments = new MixedCodeDocumentFragmentList(this);
}
/// <summary>
/// Loads a mixed code document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
public void Load(Stream stream)
{
    StreamReader source = new StreamReader(stream);
    Load(source);
}

/// <summary>
/// Loads a mixed code document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
public void Load(Stream stream, bool detectEncodingFromByteOrderMarks)
{
    StreamReader source = new StreamReader(stream, detectEncodingFromByteOrderMarks);
    Load(source);
}

/// <summary>
/// Loads a mixed code document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
public void Load(Stream stream, Encoding encoding)
{
    StreamReader source = new StreamReader(stream, encoding);
    Load(source);
}

/// <summary>
/// Loads a mixed code document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    StreamReader source = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks);
    Load(source);
}

/// <summary>
/// Loads a mixed code document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <param name="buffersize">The minimum buffer size.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    StreamReader source = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks, buffersize);
    Load(source);
}
/// <summary>
/// Loads a mixed code document from a file.
/// </summary>
/// <param name="path">The complete file path to be read.</param>
public void Load(string path)
{
    StreamReader source = new StreamReader(path);
    Load(source);
}

/// <summary>
/// Loads a mixed code document from a file.
/// </summary>
/// <param name="path">The complete file path to be read.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
public void Load(string path, bool detectEncodingFromByteOrderMarks)
{
    StreamReader source = new StreamReader(path, detectEncodingFromByteOrderMarks);
    Load(source);
}

/// <summary>
/// Loads a mixed code document from a file.
/// </summary>
/// <param name="path">The complete file path to be read.</param>
/// <param name="encoding">The character encoding to use.</param>
public void Load(string path, Encoding encoding)
{
    StreamReader source = new StreamReader(path, encoding);
    Load(source);
}

/// <summary>
/// Loads a mixed code document from a file.
/// </summary>
/// <param name="path">The complete file path to be read.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    StreamReader source = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks);
    Load(source);
}

/// <summary>
/// Loads a mixed code document from a file.
/// </summary>
/// <param name="path">The complete file path to be read.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <param name="buffersize">The minimum buffer size.</param>
public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    StreamReader source = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks, buffersize);
    Load(source);
}
/// <summary>
/// Loads a mixed code document from an in-memory string.
/// </summary>
/// <param name="html">The text to load.</param>
public void LoadHtml(string html)
{
    StringReader source = new StringReader(html);
    Load(source);
}
/// <summary>
/// Loads the mixed code document from the specified TextReader.
/// </summary>
/// <remarks>
/// All other Load overloads and LoadHtml end up here. The reader is read to its end
/// and then closed, and the resulting text is parsed into fragments. Fragments left
/// over from a previous load are discarded first.
/// </remarks>
/// <param name="reader">The TextReader used to feed the HTML data into the document.</param>
public void Load(TextReader reader)
{
    _codefragments.Clear();
    _textfragments.Clear();
    // The combined list must be emptied too; otherwise a second Load keeps fragments
    // whose offsets refer to the previous document text.
    _fragments.Clear();

    try
    {
        _text = reader.ReadToEnd();

        // Capture the encoding only after reading: a StreamReader performs encoding
        // detection on its first read, so CurrentEncoding may change at that point.
        StreamReader sr = reader as StreamReader;
        if (sr != null)
        {
            _streamencoding = sr.CurrentEncoding;
        }
    }
    finally
    {
        // Close the reader even when reading fails.
        reader.Close();
    }

    Parse();
}
// Encoding used by the Save overloads that take no explicit encoding:
// the encoding recorded while loading, or the system default when none was recorded.
internal System.Text.Encoding GetOutEncoding()
{
    return (_streamencoding != null) ? _streamencoding : System.Text.Encoding.Default;
}
/// <summary>
/// Gets the encoding recorded from the stream reader used to load the document.
/// </summary>
public System.Text.Encoding StreamEncoding
{
    get { return _streamencoding; }
}

/// <summary>
/// Gets the document's code fragments.
/// </summary>
public MixedCodeDocumentFragmentList CodeFragments
{
    get { return _codefragments; }
}

/// <summary>
/// Gets the document's text fragments.
/// </summary>
public MixedCodeDocumentFragmentList TextFragments
{
    get { return _textfragments; }
}

/// <summary>
/// Gets every fragment of the document, code and text alike.
/// </summary>
public MixedCodeDocumentFragmentList Fragments
{
    get { return _fragments; }
}
/// <summary>
/// Saves the mixed document to the specified stream.
/// </summary>
/// <remarks>The stream belongs to the caller and is left open.</remarks>
/// <param name="outStream">The stream to which you want to save.</param>
public void Save(Stream outStream)
{
    StreamWriter sw = new StreamWriter(outStream, GetOutEncoding());
    Save(sw);
}

/// <summary>
/// Saves the mixed document to the specified stream.
/// </summary>
/// <remarks>The stream belongs to the caller and is left open.</remarks>
/// <param name="outStream">The stream to which you want to save.</param>
/// <param name="encoding">The character encoding to use.</param>
public void Save(Stream outStream, System.Text.Encoding encoding)
{
    StreamWriter sw = new StreamWriter(outStream, encoding);
    Save(sw);
}

/// <summary>
/// Saves the mixed document to the specified file.
/// </summary>
/// <param name="filename">The location of the file where you want to save the document.</param>
public void Save(string filename)
{
    // This method opens the file itself, so it must also release it; previously the
    // writer was never disposed and the file handle stayed open.
    using (StreamWriter sw = new StreamWriter(filename, false, GetOutEncoding()))
    {
        Save(sw);
    }
}

/// <summary>
/// Saves the mixed document to the specified file.
/// </summary>
/// <param name="filename">The location of the file where you want to save the document.</param>
/// <param name="encoding">The character encoding to use.</param>
public void Save(string filename, System.Text.Encoding encoding)
{
    // Dispose the writer so the file handle opened here is released.
    using (StreamWriter sw = new StreamWriter(filename, false, encoding))
    {
        Save(sw);
    }
}

/// <summary>
/// Saves the mixed document to the specified StreamWriter.
/// </summary>
/// <param name="writer">The StreamWriter to which you want to save.</param>
public void Save(StreamWriter writer)
{
    Save((TextWriter)writer);
}

/// <summary>
/// Saves the mixed document to the specified TextWriter.
/// </summary>
/// <remarks>
/// NOTE(review): the only statement visible in this method flushes the writer;
/// no document content is written here.
/// </remarks>
/// <param name="writer">The TextWriter to which you want to save.</param>
public void Save(TextWriter writer)
{
    writer.Flush();
}
/// <summary>
/// Gets the code represented by the mixed code document seen as a template.
/// </summary>
/// <remarks>
/// Fragments are visited in document order and each contributes one line ending in "\n".
/// A text fragment becomes a response-write statement that references the text block
/// by its ordinal among the text fragments. A code fragment contributes its code text.
/// </remarks>
public string Code
{
    get
    {
        // StringBuilder avoids re-copying the whole result for every fragment,
        // which repeated string concatenation would do.
        StringBuilder code = new StringBuilder();
        int textBlockIndex = 0;

        foreach (MixedCodeDocumentFragment frag in _fragments)
        {
            switch (frag._type)
            {
                case MixedCodeDocumentFragmentType.Text:
                    code.Append(TokenResponseWrite);
                    code.Append(string.Format(TokenTextBlock, textBlockIndex));
                    code.Append("\n");
                    textBlockIndex++;
                    break;

                case MixedCodeDocumentFragmentType.Code:
                    code.Append(((MixedCodeDocumentCodeFragment)frag).Code);
                    code.Append("\n");
                    break;
            }
        }

        return code.ToString();
    }
}
/// <summary>
/// Creates a new text fragment owned by this document.
/// </summary>
/// <returns>The newly created text fragment instance.</returns>
public MixedCodeDocumentTextFragment CreateTextFragment()
{
    MixedCodeDocumentFragment created = CreateFragment(MixedCodeDocumentFragmentType.Text);
    return (MixedCodeDocumentTextFragment)created;
}
/// <summary>
/// Creates a new code fragment owned by this document.
/// </summary>
/// <returns>The newly created code fragment instance.</returns>
public MixedCodeDocumentCodeFragment CreateCodeFragment()
{
    MixedCodeDocumentFragment created = CreateFragment(MixedCodeDocumentFragmentType.Code);
    return (MixedCodeDocumentCodeFragment)created;
}
// Creates a fragment of the requested type bound to this document.
// Throws NotSupportedException for any other fragment type.
internal MixedCodeDocumentFragment CreateFragment(MixedCodeDocumentFragmentType type)
{
    if (type == MixedCodeDocumentFragmentType.Text)
    {
        return new MixedCodeDocumentTextFragment(this);
    }

    if (type == MixedCodeDocumentFragmentType.Code)
    {
        return new MixedCodeDocumentCodeFragment(this);
    }

    throw new NotSupportedException();
}
// Stamps the current fragment with the parser's position: it starts at the character
// just consumed (_index - 1) and has no length yet.
private void SetPosition()
{
    _currentfragment._index = _index - 1;
    _currentfragment._length = 0;
    _currentfragment._line = _line;
    _currentfragment._lineposition = _lineposition;
}
// Moves past the character held in _c and updates line/column tracking:
// a line feed starts a new line at column 1, anything else advances the column.
private void IncrementPosition()
{
    _index++;

    if (_c != '\n')
    {
        _lineposition++;
        return;
    }

    _lineposition = 1;
    _line++;
}
// Parser modes: outside a code block (Text) or inside one (Code).
private enum ParseState
{
    Text = 0,
    Code = 1
}
// Splits _text into alternating text and code fragments by scanning for
// TokenCodeStart and TokenCodeEnd. Fragments register themselves with the
// document's lists when they are created.
private void Parse()
{
    _state = ParseState.Text;
    _index = 0;
    // Start line/column tracking from the same values a new instance has, so a
    // second Load does not continue counting from the previous document.
    _line = 0;
    _lineposition = 0;
    _currentfragment = CreateFragment(MixedCodeDocumentFragmentType.Text);

    while (_index < _text.Length)
    {
        _c = _text[_index];
        IncrementPosition();   // from here on, the character just read sits at _index - 1

        switch (_state)
        {
            case ParseState.Text:
                if (IsTokenAtCurrentChar(TokenCodeStart))
                {
                    _state = ParseState.Code;
                    // Close the text fragment right before the start token.
                    _currentfragment._length = _index - 1 - _currentfragment._index;
                    _currentfragment = CreateFragment(MixedCodeDocumentFragmentType.Code);
                    SetPosition();
                    continue;
                }
                break;

            case ParseState.Code:
                if (IsTokenAtCurrentChar(TokenCodeEnd))
                {
                    _state = ParseState.Text;
                    // _index is already one past the first character of the end token,
                    // so this length reaches one character beyond the token. The
                    // MixedCodeDocumentCodeFragment.Code getter subtracts that extra
                    // character, so the computation is kept as it was.
                    _currentfragment._length = _index + TokenCodeEnd.Length - _currentfragment._index;
                    _index += TokenCodeEnd.Length;
                    _lineposition += TokenCodeEnd.Length;
                    _currentfragment = CreateFragment(MixedCodeDocumentFragmentType.Text);
                    SetPosition();
                    continue;
                }
                break;
        }
    }

    // Whatever fragment is still open runs to the end of the text.
    _currentfragment._length = _index - _currentfragment._index;
}

// Returns true when the given token occurs in _text starting at the character just
// consumed (_index - 1). Keeps the original bounds guard, under which a token too
// close to the end of the text is not recognized. The ordinal comparison gives the
// same result as comparing a substring with ==, without allocating one per character.
private bool IsTokenAtCurrentChar(string token)
{
    if (_index + token.Length < _text.Length)
    {
        return string.CompareOrdinal(_text, _index - 1, token, 0, token.Length) == 0;
    }
    return false;
}