1 // HtmlAgilityPack V1.0
4 Copyright (C) 2003 Simon Mourier <simonm@microsoft.com>
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
10 1. Redistributions of source code must retain the above copyright
11 notice, this list of conditions and the following disclaimer.
12 2. Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15 3. The name of the author may not be used to endorse or promote products
16 derived from this software without specific prior written permission.
18 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 using System
.Collections
;
35 namespace HtmlAgilityPack
38 /// Represents the type of fragment in a mixed code document.
40 public enum MixedCodeDocumentFragmentType
43 /// The fragment contains code.
48 /// The fragment contains text.
54 /// Represents a fragment of code in a mixed code document.
56 public class MixedCodeDocumentCodeFragment
: MixedCodeDocumentFragment
58 internal string _code
;
60 internal MixedCodeDocumentCodeFragment(MixedCodeDocument doc
):
61 base(doc
, MixedCodeDocumentFragmentType
.Code
)
66 /// Gets the fragment code text.
74 _code
= FragmentText
.Substring(_doc
.TokenCodeStart
.Length
,
75 FragmentText
.Length
- _doc
.TokenCodeEnd
.Length
- _doc
.TokenCodeStart
.Length
-1).Trim();
76 if (_code
.StartsWith("="))
78 _code
= _doc
.TokenResponseWrite
+ _code
.Substring(1, _code
.Length
-1);
91 /// Represents a fragment of text in a mixed code document.
93 public class MixedCodeDocumentTextFragment
: MixedCodeDocumentFragment
95 internal MixedCodeDocumentTextFragment(MixedCodeDocument doc
):
96 base(doc
, MixedCodeDocumentFragmentType
.Text
)
101 /// Gets the fragment text.
111 base._fragmenttext
= value;
117 /// Represents a base class for fragments in a mixed code document.
119 public abstract class MixedCodeDocumentFragment
121 internal MixedCodeDocumentFragmentType _type
;
122 internal MixedCodeDocument _doc
;
124 internal int _length
;
126 internal int _lineposition
;
127 internal string _fragmenttext
;
129 internal MixedCodeDocumentFragment(MixedCodeDocument doc
, MixedCodeDocumentFragmentType type
)
135 case MixedCodeDocumentFragmentType
.Text
:
136 _doc
._textfragments
.Append(this);
139 case MixedCodeDocumentFragmentType
.Code
:
140 _doc
._codefragments
.Append(this);
143 _doc
._fragments
.Append(this);
147 /// Gets the type of fragment.
149 public MixedCodeDocumentFragmentType FragmentType
158 /// Gets the fragment position in the document's stream.
160 public int StreamPosition
169 /// Gets the line number of the fragment.
/// <summary>
/// Gets the line position (column) of the fragment.
/// </summary>
/// <value>The value currently stored in the fragment's <c>_lineposition</c> field.</value>
public int LinePosition
{
    get { return _lineposition; }
}
/// <summary>
/// Gets the fragment text.
/// </summary>
/// <remarks>
/// On first access the text is sliced out of the owning document's text using
/// this fragment's index and length, then cached in <c>_fragmenttext</c> so
/// later reads return the cached string.
/// </remarks>
public string FragmentText
{
    get
    {
        // A cached value wins: the document text is only sliced once.
        if (_fragmenttext != null)
        {
            return _fragmenttext;
        }

        string sliced = _doc._text.Substring(_index, _length);
        _fragmenttext = sliced;
        return sliced;
    }
}
207 /// Represents a list of mixed code fragments.
209 public class MixedCodeDocumentFragmentList
: IEnumerable
211 private MixedCodeDocument _doc
;
212 private ArrayList _items
= new ArrayList();
214 internal MixedCodeDocumentFragmentList(MixedCodeDocument doc
)
/// <summary>
/// Appends a fragment to the end of the list of fragments.
/// </summary>
/// <param name="newFragment">The fragment to append. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="newFragment"/> is null.</exception>
public void Append(MixedCodeDocumentFragment newFragment)
{
    if (newFragment != null)
    {
        _items.Add(newFragment);
        return;
    }

    // Only reached for a null argument.
    throw new ArgumentNullException("newFragment");
}
/// <summary>
/// Prepends a fragment to the list of fragments.
/// </summary>
/// <param name="newFragment">The fragment to prepend. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="newFragment"/> is null.</exception>
public void Prepend(MixedCodeDocumentFragment newFragment)
{
    if (newFragment != null)
    {
        // Index 0 places the fragment ahead of every existing entry.
        _items.Insert(0, newFragment);
        return;
    }

    // Only reached for a null argument.
    throw new ArgumentNullException("newFragment");
}
246 /// Remove a fragment from the list of fragments. If this fragment was not in the list, an exception will be raised.
248 /// <param name="fragment">The fragment to remove. May not be null.</param>
249 public void Remove(MixedCodeDocumentFragment fragment
)
251 if (fragment
== null)
253 throw new ArgumentNullException("fragment");
255 int index
= GetFragmentIndex(fragment
);
258 throw new IndexOutOfRangeException();
/// <summary>
/// Removes a fragment from the list of fragments, using its index in the list.
/// </summary>
/// <param name="index">The index of the fragment to remove.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="index"/> is outside the bounds of the list (raised by the
/// underlying <see cref="ArrayList"/>).
/// </exception>
public void RemoveAt(int index)
{
    // No pre-fetch of the element is needed: the fetched value was never
    // used, and ArrayList.RemoveAt validates the index on its own.
    _items.RemoveAt(index);
}
274 /// Remove all fragments from the list.
276 public void RemoveAll()
282 /// Gets the number of fragments contained in the list.
292 internal int GetFragmentIndex(MixedCodeDocumentFragment fragment
)
294 if (fragment
== null)
296 throw new ArgumentNullException("fragment");
298 for(int i
=0;i
<_items
.Count
;i
++)
300 if (((MixedCodeDocumentFragment
)_items
[i
])==fragment
)
/// <summary>
/// Gets a fragment from the list using its index.
/// </summary>
/// <param name="index">The position of the fragment in the list.</param>
public MixedCodeDocumentFragment this[int index]
{
    get
    {
        object entry = _items[index];

        // 'as' is kept deliberately: an entry of another type yields null
        // instead of throwing.
        return entry as MixedCodeDocumentFragment;
    }
}
319 internal void Clear()
/// <summary>
/// Gets an enumerator that can iterate through the fragment list.
/// </summary>
/// <returns>A new strongly typed enumerator built over the list's items.</returns>
public MixedCodeDocumentFragmentEnumerator GetEnumerator()
{
    MixedCodeDocumentFragmentEnumerator walker = new MixedCodeDocumentFragmentEnumerator(_items);
    return walker;
}
/// <summary>
/// Gets an enumerator that can iterate through the fragment list.
/// </summary>
/// <returns>The strongly typed enumerator, exposed through <see cref="IEnumerator"/>.</returns>
IEnumerator IEnumerable.GetEnumerator()
{
    // Explicit interface implementation: forward to the typed overload.
    IEnumerator untyped = GetEnumerator();
    return untyped;
}
341 /// Represents a fragment enumerator.
343 public class MixedCodeDocumentFragmentEnumerator
: IEnumerator
348 internal MixedCodeDocumentFragmentEnumerator(ArrayList items
)
355 /// Sets the enumerator to its initial position, which is before the first element in the collection.
363 /// Advances the enumerator to the next element of the collection.
365 /// <returns>true if the enumerator was successfully advanced to the next element; false if the enumerator has passed the end of the collection.</returns>
366 public bool MoveNext()
369 return (_index
<_items
.Count
);
/// <summary>
/// Gets the current element in the collection.
/// </summary>
/// <value>The item at the enumerator's current index, cast to a fragment.</value>
public MixedCodeDocumentFragment Current
{
    get
    {
        object raw = _items[_index];
        return (MixedCodeDocumentFragment)raw;
    }
}
384 /// Gets the current element in the collection.
386 object IEnumerator
.Current
397 /// Represents a document with mixed code and text. ASP, ASPX, and JSP are good examples of such documents.
399 public class MixedCodeDocument
401 private System
.Text
.Encoding _streamencoding
= null;
402 internal string _text
;
403 internal MixedCodeDocumentFragmentList _fragments
;
404 internal MixedCodeDocumentFragmentList _codefragments
;
405 internal MixedCodeDocumentFragmentList _textfragments
;
406 private ParseState _state
;
410 private int _lineposition
;
411 private MixedCodeDocumentFragment _currentfragment
;
414 /// Gets or sets the token representing code start.
416 public string TokenCodeStart
= "<%";
419 /// Gets or sets the token representing code end.
421 public string TokenCodeEnd
= "%>";
424 /// Gets or sets the token representing code directive.
426 public string TokenDirective
= "@";
429 /// Gets or sets the token representing response write directive.
431 public string TokenResponseWrite
= "Response.Write ";
434 private string TokenTextBlock
= "TextBlock({0})";
/// <summary>
/// Creates a mixed code document instance.
/// </summary>
/// <remarks>
/// Allocates the three fragment lists (code, text, and all fragments), each
/// constructed with this document as its argument.
/// </remarks>
public MixedCodeDocument()
{
    this._codefragments = new MixedCodeDocumentFragmentList(this);
    this._textfragments = new MixedCodeDocumentFragmentList(this);
    this._fragments = new MixedCodeDocumentFragmentList(this);
}
/// <summary>
/// Loads a mixed code document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
public void Load(Stream stream)
{
    // Wrap the stream in a reader and hand it to the TextReader overload.
    StreamReader reader = new StreamReader(stream);
    Load(reader);
}
/// <summary>
/// Loads a mixed code document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
public void Load(Stream stream, bool detectEncodingFromByteOrderMarks)
{
    // Wrap the stream in a reader and hand it to the TextReader overload.
    StreamReader reader = new StreamReader(stream, detectEncodingFromByteOrderMarks);
    Load(reader);
}
/// <summary>
/// Loads a mixed code document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
public void Load(Stream stream, Encoding encoding)
{
    // Wrap the stream in a reader and hand it to the TextReader overload.
    StreamReader reader = new StreamReader(stream, encoding);
    Load(reader);
}
/// <summary>
/// Loads a mixed code document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    // Wrap the stream in a reader and hand it to the TextReader overload.
    StreamReader reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks);
    Load(reader);
}
/// <summary>
/// Loads a mixed code document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <param name="buffersize">The minimum buffer size.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    // Wrap the stream in a reader and hand it to the TextReader overload.
    StreamReader reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks, buffersize);
    Load(reader);
}
/// <summary>
/// Loads a mixed code document from a file.
/// </summary>
/// <param name="path">The complete file path to be read.</param>
public void Load(string path)
{
    // The reader created here owns the underlying file handle, so release it
    // deterministically, including when loading throws part-way through.
    using (StreamReader reader = new StreamReader(path))
    {
        Load(reader);
    }
}
/// <summary>
/// Loads a mixed code document from a file.
/// </summary>
/// <param name="path">The complete file path to be read.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
public void Load(string path, bool detectEncodingFromByteOrderMarks)
{
    // The reader created here owns the underlying file handle, so release it
    // deterministically, including when loading throws part-way through.
    using (StreamReader reader = new StreamReader(path, detectEncodingFromByteOrderMarks))
    {
        Load(reader);
    }
}
/// <summary>
/// Loads a mixed code document from a file.
/// </summary>
/// <param name="path">The complete file path to be read.</param>
/// <param name="encoding">The character encoding to use.</param>
public void Load(string path, Encoding encoding)
{
    // The reader created here owns the underlying file handle, so release it
    // deterministically, including when loading throws part-way through.
    using (StreamReader reader = new StreamReader(path, encoding))
    {
        Load(reader);
    }
}
/// <summary>
/// Loads a mixed code document from a file.
/// </summary>
/// <param name="path">The complete file path to be read.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    // The reader created here owns the underlying file handle, so release it
    // deterministically, including when loading throws part-way through.
    using (StreamReader reader = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks))
    {
        Load(reader);
    }
}
/// <summary>
/// Loads a mixed code document from a file.
/// </summary>
/// <param name="path">The complete file path to be read.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <param name="buffersize">The minimum buffer size.</param>
public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    // The reader created here owns the underlying file handle, so release it
    // deterministically, including when loading throws part-way through.
    using (StreamReader reader = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks, buffersize))
    {
        Load(reader);
    }
}
/// <summary>
/// Loads a mixed code document from a string.
/// </summary>
/// <param name="html">The text to load.</param>
public void LoadHtml(string html)
{
    // Expose the string through a reader and hand it to the TextReader overload.
    StringReader source = new StringReader(html);
    Load(source);
}
560 /// Loads the mixed code document from the specified TextReader.
562 /// <param name="reader">The TextReader used to feed the HTML data into the document.</param>
563 public void Load(TextReader reader
)
565 _codefragments
.Clear();
566 _textfragments
.Clear();
568 // all pseudo constructors get down to this one
569 StreamReader sr
= reader
as StreamReader
;
572 _streamencoding
= sr
.CurrentEncoding
;
575 _text
= reader
.ReadToEnd();
/// <summary>
/// Chooses the encoding used when the document is written out.
/// </summary>
/// <returns>
/// The recorded stream encoding when one is set; otherwise
/// <see cref="System.Text.Encoding.Default"/>.
/// </returns>
internal System.Text.Encoding GetOutEncoding()
{
    return (_streamencoding == null)
        ? System.Text.Encoding.Default
        : _streamencoding;
}
/// <summary>
/// Gets the encoding of the stream used to read the document.
/// </summary>
/// <value>
/// The recorded encoding; the backing field starts out null, so this may be
/// null when no encoding has been captured.
/// </value>
public System.Text.Encoding StreamEncoding
{
    get { return _streamencoding; }
}
/// <summary>
/// Gets the list of code fragments in the document.
/// </summary>
public MixedCodeDocumentFragmentList CodeFragments
{
    get { return _codefragments; }
}
/// <summary>
/// Gets the list of text fragments in the document.
/// </summary>
public MixedCodeDocumentFragmentList TextFragments
{
    get { return _textfragments; }
}
621 /// Gets the list of all fragments in the document.
623 public MixedCodeDocumentFragmentList Fragments
632 /// Saves the mixed document to the specified stream.
634 /// <param name="outStream">The stream to which you want to save.</param>
635 public void Save(Stream outStream
)
637 StreamWriter sw
= new StreamWriter(outStream
, GetOutEncoding());
642 /// Saves the mixed document to the specified stream.
644 /// <param name="outStream">The stream to which you want to save.</param>
645 /// <param name="encoding">The character encoding to use.</param>
646 public void Save(Stream outStream
, System
.Text
.Encoding encoding
)
648 StreamWriter sw
= new StreamWriter(outStream
, encoding
);
653 /// Saves the mixed document to the specified file.
655 /// <param name="filename">The location of the file where you want to save the document.</param>
656 public void Save(string filename
)
658 StreamWriter sw
= new StreamWriter(filename
, false, GetOutEncoding());
663 /// Saves the mixed document to the specified file.
665 /// <param name="filename">The location of the file where you want to save the document.</param>
666 /// <param name="encoding">The character encoding to use.</param>
667 public void Save(string filename
, System
.Text
.Encoding encoding
)
669 StreamWriter sw
= new StreamWriter(filename
, false, encoding
);
/// <summary>
/// Saves the mixed document to the specified StreamWriter.
/// </summary>
/// <param name="writer">The StreamWriter to which you want to save.</param>
public void Save(StreamWriter writer)
{
    // Widen to TextWriter so the call binds to the TextWriter overload
    // rather than recursing into this one.
    TextWriter target = writer;
    Save(target);
}
683 /// Saves the mixed document to the specified TextWriter.
685 /// <param name="writer">The TextWriter to which you want to save.</param>
686 public void Save(TextWriter writer
)
693 /// Gets the code represented by the mixed code document seen as a template.
701 foreach(MixedCodeDocumentFragment frag
in _fragments
)
705 case MixedCodeDocumentFragmentType
.Text
:
706 s
+= TokenResponseWrite
+ string.Format(TokenTextBlock
, i
) + "\n";
710 case MixedCodeDocumentFragmentType
.Code
:
711 s
+= ((MixedCodeDocumentCodeFragment
)frag
).Code
+ "\n";
/// <summary>
/// Creates a text fragment instance.
/// </summary>
/// <returns>The newly created text fragment instance.</returns>
public MixedCodeDocumentTextFragment CreateTextFragment()
{
    MixedCodeDocumentFragment created = CreateFragment(MixedCodeDocumentFragmentType.Text);
    return (MixedCodeDocumentTextFragment)created;
}
/// <summary>
/// Creates a code fragment instance.
/// </summary>
/// <returns>The newly created code fragment instance.</returns>
public MixedCodeDocumentCodeFragment CreateCodeFragment()
{
    MixedCodeDocumentFragment created = CreateFragment(MixedCodeDocumentFragmentType.Code);
    return (MixedCodeDocumentCodeFragment)created;
}
/// <summary>
/// Creates a fragment of the requested type, bound to this document.
/// </summary>
/// <param name="type">The kind of fragment to create.</param>
/// <returns>A new text or code fragment constructed with this document.</returns>
/// <exception cref="NotSupportedException">
/// <paramref name="type"/> is neither <c>Text</c> nor <c>Code</c>.
/// </exception>
internal MixedCodeDocumentFragment CreateFragment(MixedCodeDocumentFragmentType type)
{
    if (type == MixedCodeDocumentFragmentType.Text)
    {
        return new MixedCodeDocumentTextFragment(this);
    }

    if (type == MixedCodeDocumentFragmentType.Code)
    {
        return new MixedCodeDocumentCodeFragment(this);
    }

    // Any other value has no fragment class associated with it.
    throw new NotSupportedException();
}
/// <summary>
/// Stamps the current fragment with the document's current line, line
/// position, and index, and resets its length to zero.
/// </summary>
private void SetPosition()
{
    MixedCodeDocumentFragment target = _currentfragment;

    target._line = _line;
    target._lineposition = _lineposition;
    // NOTE(review): the index is recorded one behind the document's _index,
    // presumably because _index has already been advanced past the current
    // character - confirm against IncrementPosition.
    target._index = _index - 1;
    target._length = 0;
}
760 private void IncrementPosition()
772 private enum ParseState
780 _state
= ParseState
.Text
;
782 _currentfragment
= CreateFragment(MixedCodeDocumentFragmentType
.Text
);
784 while (_index
<_text
.Length
)
791 case ParseState
.Text
:
792 if (_index
+TokenCodeStart
.Length
<_text
.Length
)
794 if (_text
.Substring(_index
-1, TokenCodeStart
.Length
) == TokenCodeStart
)
796 _state
= ParseState
.Code
;
797 _currentfragment
._length
= _index
-1 - _currentfragment
._index
;
798 _currentfragment
= CreateFragment(MixedCodeDocumentFragmentType
.Code
);
805 case ParseState
.Code
:
806 if (_index
+TokenCodeEnd
.Length
<_text
.Length
)
808 if (_text
.Substring(_index
-1, TokenCodeEnd
.Length
) == TokenCodeEnd
)
810 _state
= ParseState
.Text
;
811 _currentfragment
._length
= _index
+ TokenCodeEnd
.Length
- _currentfragment
._index
;
812 _index
+= TokenCodeEnd
.Length
;
813 _lineposition
+= TokenCodeEnd
.Length
;
814 _currentfragment
= CreateFragment(MixedCodeDocumentFragmentType
.Text
);
823 _currentfragment
._length
= _index
- _currentfragment
._index
;