Fix strikethough XHTML-IM.
[thrasher.git] / perl / lib / Thrasher / HTMLNormalize.pm
blobab5ec10bf765474c11e25f51a4551d5a5aeaf218
1 package Thrasher::HTMLNormalize;
2 use strict;
3 use warnings;
5 use Thrasher::HTMLNormalize::Entities qw(html_entity);
7 use base 'Exporter';
9 our @EXPORT_OK = qw(&escape $CONSERVATIVE_PERMISSIONS &normalize
10 &escape_quote $CONSERVATIVE_PERMISSIONS_NO_LINKS);
12 our %EXPORT_TAGS = (all => \@EXPORT_OK);
14 my $VALIDATE_AS_XML = 0;
16 ###
17 ### PLEASE NOTE: This module has unit tests. Changing this module
18 ### without verifying them is a bad idea.
19 ###
21 # Also note that this file isn't as dirty as it looks; that's
22 # just the natural result of sticking an entire compiler in one
23 # file.
25 ###
26 ### CONSTANT AND DATA DECLARATIONS
27 ###
28 ### Recall "my" is actually file scoped, not package scoped.
29 ### use constant, however, *is* package scoped, so it's darned inconvenient.
31 # A set-like hash indicating which entities are allowed
32 # to exist as text-named entities, and not converted into numeric
33 # entities.
34 my %LEGAL_ENTITIES = map { $_ => 1 } qw(amp lt gt quot apos);
36 my %ESCAPE_ENTITIES = ('&' => '&',
37 '>' => '>',
38 '<' => '&lt;' );
39 my %QUOTE_ENTITIES = (%ESCAPE_ENTITIES,
40 "'" => '&apos;',
41 '"' => '&quot;');
43 # Tags that sort of close themselves
44 my %SELF_CLOSERS = map { $_ => 1 } qw(li p);
46 # Token types
47 my $TEXT = 1;
48 my $TAG = 2;
49 my $TAGCLOSE = 3;
50 my $SELFCLOSEDTAG = 4;
51 my $ENTITY = 5;
52 my $NEWLINE = 6;
53 my $DOUBLENEWLINE = 7;
54 my $CDATA = 8;
56 # Tag types. These are used for formatting the resulting text.
57 # A block tag, as in CSS
58 my $BLOCK = 10;
59 # An inline tag, as in CSS
60 my $INLINE = 11;
61 # A 'block' tag that should be converted to a self-closing
62 # tag if an empty instance is found (i.e., <br></br> => <br />)
63 my $BLOCK_SELF_CLOSE = 12;
65 my %TAG_TYPES =
67 p => $BLOCK,
68 div => $BLOCK,
69 span => $INLINE,
70 br => $BLOCK_SELF_CLOSE,
71 a => $INLINE,
72 li => $BLOCK,
73 dl => $BLOCK,
74 dt => $BLOCK,
75 dd => $BLOCK,
76 ul => $BLOCK,
77 font => $INLINE,
78 abbr => $INLINE,
79 acronym => $INLINE,
80 b => $INLINE,
81 bgsound => $INLINE,
82 big => $INLINE,
83 caption => $INLINE,
84 img => $INLINE,
85 blockquote => $BLOCK,
86 h1 => $BLOCK,
87 h2 => $BLOCK,
88 h3 => $BLOCK,
89 h4 => $BLOCK,
90 h5 => $BLOCK,
91 h6 => $BLOCK,
92 em => $INLINE,
93 strong => $INLINE,
94 dfn => $INLINE,
95 code => $INLINE,
96 samp => $INLINE,
97 kbd => $INLINE,
98 var => $INLINE,
99 cite => $INLINE,
100 hr => $BLOCK_SELF_CLOSE,
101 i => $INLINE,
102 ol => $BLOCK,
103 pre => $BLOCK,
104 small => $INLINE,
105 s => $INLINE,
106 strike => $INLINE,
107 'sub' => $INLINE,
108 sup => $INLINE,
109 tt => $INLINE,
110 table => $BLOCK,
111 tr => $BLOCK,
112 td => $BLOCK,
113 u => $INLINE,
114 html => $BLOCK,
115 body => $BLOCK
118 # A reasonable pre-defined behavior for URLs. If it has a protocol
119 # in the front, it *must* be http or https. Anything else gets
120 # destroyed. If it doesn't have a protocol in the front, we'll
121 # at least take a shot and prepend http://.
122 sub fixURL {
123 my $url = shift;
124 if ($url !~ /^https?:\/\//) {
125 # has non-http protocol. Evil. Die now.
126 # or maybe an incorrectly-specified port. Die now anyhow.
127 if ($url =~ /^[^:]+:/) {
128 return undef;
131 return 'http://' . $url;
133 return $url;
136 # A reasonable pre-defined conservative set. Allows a reasonable
137 # range of simple formatting tags, but the only attributes allowed
138 # are URLS (and those only with http or https protocols), and
139 # titles for abbr and acronyms. Certainly no code execution allowed.
140 our $CONSERVATIVE_PERMISSIONS = {
141 p => {},
142 br => {},
143 a => {href => \&fixURL},
144 ul => {},
145 li => {},
146 abbr => {title => 1},
147 acronym => {title => 1},
148 b => {},
149 big => {},
150 blockquote => {},
151 em => {},
152 strong => {},
153 dfn => {},
154 code => {},
155 samp => {},
156 kbd => {},
157 var => {},
158 cite => {},
159 hr => {},
160 i => {},
161 ol => {},
162 pre => {},
163 small => {},
164 strike => {},
165 sub => {},
166 sup => {},
167 tt => {},
168 u => {}
171 my %CONSERVATIVE_PERMISSIONS_NO_LINKS = %$CONSERVATIVE_PERMISSIONS;
172 our $CONSERVATIVE_PERMISSIONS_NO_LINKS = \%CONSERVATIVE_PERMISSIONS_NO_LINKS;
173 delete $CONSERVATIVE_PERMISSIONS_NO_LINKS->{a};
176 ### UTILITY FUNCTIONS
179 # Return whether $s is the name of a valid entity
180 sub is_valid_entity {
181 my $s = shift;
182 return !!($s =~ /\#\d{1,5}/) || $LEGAL_ENTITIES{$s};
185 # Given an entity name, either return that entity or an empty
186 # string to elide it.
187 sub make_entity {
188 my $s = shift;
189 if (is_valid_entity($s)) {
190 return "&$s;";
191 } else {
192 return '';
196 sub escape {
197 my $s = shift;
198 $s =~ s/([&<>])/$ESCAPE_ENTITIES{$1}/ge;
199 return $s;
201 sub escape_quote {
202 my $s = shift;
203 $s =~ s/([&<>'"])/$QUOTE_ENTITIES{$1}/ge; #'])/;
204 return $s;
207 # Nicer interface to create tokens; just specify the last bit.
208 sub token {
209 my $token_type = shift;
210 $token_type = 'Thrasher::HTMLNormalize::Token::' . $token_type;
211 return $token_type->new(@_);
214 # Tokenize a tag or entity. We know it's one of them because of
215 # how it was parsed. (Pulled out for simple testing.)
216 sub tokenize_tag_or_entity {
217 my $tag_or_entity = shift;
218 my $allow_taglike = shift;
219 my $tag_translation = shift;
221 # \# is to convince emacs syntax highlighting that # is not a comment
222 if ($tag_or_entity !~ /((<(\/)?(\w+)([^>]*)>)|(&(\#?\w+);))/) {
223 die "tokenize_tag_or_entity must be passed a tag or entity: "
224 .$tag_or_entity;
227 # Convert that mass of matches into something comprehensible
228 my $groups = {tag => $2, # the whole of the tag
229 slash => $3, # if it has a slash to close the tag
230 tagname => $4,# the tag name
231 tagremains => $5, # the rest of the tag (atts)
232 entity => $6, # the whole of the entity
233 entityname => $7 };
235 $groups->{tagname} = lc $groups->{tagname};
237 if (defined(my $tag_translator =
238 $tag_translation->{lc $groups->{tagname}})) {
239 # use Data::Dumper;
240 #print "Initial: " . Dumper($groups);
241 if (ref($tag_translator) eq '') {
242 $groups->{tagname} = $tag_translator;
243 } elsif (ref($tag_translator) eq 'CODE') {
244 $tag_translator->($groups);
246 #print "Post-translation: " . Dumper($groups);
249 if ($groups->{tag} && $allow_taglike &&
250 !$TAG_TYPES{lc $groups->{tagname}}) {
251 return token('Text', $tag_or_entity);
252 } elsif ($groups->{tag}) {
253 # handle tokenizing a tag
254 if ($groups->{slash}) {
255 return token('TagClose', $groups->{tagname});
258 my $token_type = 'Tag';
260 my $atts = $groups->{tagremains};
261 if ($atts && substr($atts, -1) eq '/') {
262 $token_type = 'SelfClosedTag';
263 $atts = substr($atts, 0, length($atts) - 1); # pop off /
266 my $atts_hash = {};
268 # Pull out the attributes
269 while ($atts =~ /([a-zA-Z]\w+)(?:=(?:(?:\"([^\">]*)\")|(?:'([^'>]*)\')|(?:([^ \n\t>]*))))?/g) {
270 my $name = $1;
271 # one value delimited by ", the other by ',
272 # the final by whitespace, the final case
273 # covers those few atts in HTML that can
274 # show up with no value at all
275 my $value = $2 || $3 || $4 || 1;
277 $atts_hash->{lc $name} = $value;
280 return token($token_type, $groups->{tagname}, $atts_hash);
281 } elsif ($groups->{entity}) {
282 return token('Entity', $groups->{entityname});
285 # shouldn't be able to get here.
286 die "tokenize_tag_or_entity didn't get anything it expected.";
289 # Given something known to be character-based text with no
290 # (X)HTML in it, tokenize it.
291 sub tokenize_text {
292 my $s = shift;
293 my @tokens;
295 my @texthunks = split(/(\n+)/, $s);
296 my $i = 0;
298 for my $fragment (@texthunks) {
299 if ($i %2) {
300 my $newlines = length($fragment); # all newlines
301 if ($newlines == 1) {
302 push @tokens, token('Newline');
303 } else {
304 push @tokens, token('DoubleNewline');
306 } else {
307 if ($fragment) { # skip empty matches
308 push @tokens, token('Text', $fragment);
312 $i++;
315 return \@tokens;
318 # Given a tag and our validation criteria, verify the tag is
319 # allowed. If it is not allowed, this returns 0. It may also
320 # destructively manipulate the tag's attributes to make them valid.
321 sub validate_tag {
322 my $tag = shift;
323 my $allowed_elements = shift;
325 # It's all allowed.
326 if (!defined($allowed_elements)) {
327 return $tag;
330 my $legal_attributes = $allowed_elements->{$tag->{name}};
332 # Is this allowed at all?
333 if (!$legal_attributes) {
334 return 0;
337 # If this is a close tag, pass it through.
338 if ($tag->{type} == $TAGCLOSE) {
339 return $tag;
342 # Now, actually validate the attributes;
343 while (my ($name, $value) = each %{$tag->{atts}}) {
344 my $attribute_validation = $legal_attributes->{$name};
346 # Is the attribute even allowed?
347 if (!$attribute_validation) {
348 # Nope, toast it.
349 delete($tag->{atts}->{$name});
350 next;
353 if ($attribute_validation eq '1') { # note string compare
354 # everything is permitted here, pass through
355 next;
358 my $new_value;
359 if (ref($attribute_validation) eq 'CODE') {
360 $new_value = $attribute_validation->($value);
361 if (!defined($new_value)) {
362 # This attribute is so evil that the entire tag
363 # has to go away!
364 return 0;
367 if (!$new_value) { # either false or empty string
368 delete($tag->{atts}->{$name});
369 next;
371 } elsif (ref($attribute_validation) eq 'Regexp') {
372 my $match = $value =~ /$attribute_validation/;
373 if (!$match) {
374 return 0;
376 # Otherwise pass through; for better control, use a sub
377 $new_value = $value;
378 } elsif ($attribute_validation) { # any true value
379 $new_value = $value;
380 } else {
381 # any false value; redundent since unspecified = false
382 delete($tag->{atts}->{$name});
383 next;
386 $tag->{atts}->{$name} = $new_value;
389 return $tag;
392 # The money function at last.
393 sub normalize {
394 my $html = shift;
395 my $allowed_elements = shift; # default - allow everything
396 my $allow_cdata = shift; # default - no
397 my $allow_taglike = shift;
398 my $tag_translation = shift || {};
400 my $text_only = shift; # don't automatically slap in <p>s
402 # Whitespace trimmer
404 # Apparently, applying this to something that consists
405 # entirely of whitespace gets a warning about using
406 # an uninitialized value. Probably just the implementation
407 # poking out and doesn't seem to be worth worrying about.
408 no warnings 'uninitialized';
409 $html =~ s/(^\s+)|(\s+$ )//gx;
412 # FIXME: Verify in utf-8 text encoding
414 # If we elimated the HTML in that step, return nothing
415 if (!$html) {
416 return '';
419 my @tokens;
421 if ($allow_cdata) {
422 my @cdata_fragments = split(/<!\[CDATA\[(.*?)\]\]>/,
423 $html);
424 my $i = 0;
426 for my $fragment (@cdata_fragments) {
427 if ($i % 2) {
428 push @tokens, token('CData', $fragment);
429 } else {
430 tokenize_fragment(\@tokens, $fragment,
431 $allowed_elements, $allow_taglike,
432 $tag_translation);
434 $i++;
436 } else {
437 tokenize_fragment(\@tokens, $html, $allowed_elements,
438 $allow_taglike, $tag_translation);
441 # Push a p on front if the first token is not a block
442 # tag. Note that <html> is a block tag, so complete
443 # documents don't get this done, only fragments. This work
444 # because the TagStack will normalize the p correctly.
445 if (!$text_only &&
446 @tokens && ($tokens[0]->{type} != $TAG ||
447 ($TAG_TYPES{$tokens[0]->{name}} || $BLOCK)
448 != $BLOCK)) {
449 unshift @tokens, token('Tag', 'p');
452 my @new_tokens;
453 # we abusively share modification between this func
454 # and the tag stack :(
455 my $tag_stack = new Thrasher::HTMLNormalize::TagStack(\@new_tokens);
457 for (my $i = 0; $i < @tokens; $i++) {
458 my $token = $tokens[$i];
460 my $type = $token->{type};
461 if ($type == $TAG) {
462 $tag_stack->push($token);
464 if ($type == $TAGCLOSE) {
465 $tag_stack->pop($token);
467 if ($type == $TEXT || $type == $ENTITY ||
468 $type == $SELFCLOSEDTAG || $type == $CDATA) {
469 push @new_tokens, $token
472 # If a newline is already preceded by a self-closed
473 # br, pass it through, otherwise, prepend a self-closed br
474 if ($type == $NEWLINE) {
475 # Well, actually, if we're in a "pre", skip all fancy
476 # processing
477 if ($tag_stack->in_tag('pre')) {
478 push @new_tokens, token('Text', "\n");
479 next;
482 if (!@new_tokens) {
483 next;
486 my $previous_token = $tokens[$i - 1];
487 if ($previous_token->{type} == $SELFCLOSEDTAG &&
488 $previous_token->{name} eq 'br') {
489 push @new_tokens, $token;
490 } else {
491 # add a br somehowe
493 # we're not at the end of the tokens.
494 if ($i + 1 < @tokens) {
495 my $next_token = $tokens[$i + 1];
496 if ($next_token->{type} == $TAG) {
497 my $tag_type = $TAG_TYPES{$next_token->{name}}
498 || $BLOCK;
499 if ($tag_type != $BLOCK &&
500 $tag_type != $BLOCK_SELF_CLOSE) {
501 # this automatically gets eaten if there
502 # was no previous <p>
503 if (!$text_only) {
504 push @new_tokens,
505 token('SelfClosedTag', 'br');
507 push @new_tokens, $token;
508 } else {
509 push @new_tokens, $token;
511 } else {
512 if (!$text_only) {
513 push @new_tokens, token('SelfClosedTag', 'br');
515 push @new_tokens, $token;
521 if ($type == $DOUBLENEWLINE) {
522 if ($tag_stack->in_tag('pre')) {
523 push @new_tokens, token('Text', "\n\n");
524 next;
527 # Always assume we're closing a <p>. It gets eaten later
528 # by the TagStack if this turns out to be false.
529 if (!$text_only) {
530 $tag_stack->pop(token('TagClose', 'p'));
533 # If this is followed by a non-block tag, add a <p>.
534 if ($i < @tokens - 1) {
535 my $next_token = $tokens[$i + 1];
536 if ($next_token->{type} == $TAG) {
537 my $tag_type = $TAG_TYPES{$next_token->{name}} || $BLOCK;
538 if ($tag_type != $BLOCK &&
539 $tag_type != $BLOCK_SELF_CLOSE) {
540 push @new_tokens, $token;
541 if (!$text_only) {
542 $tag_stack->push(token('Tag', 'p'));
544 } else {
545 push @new_tokens, $token;
547 } else {
548 push @new_tokens, $token;
549 if (!$text_only) {
550 $tag_stack->push(token('Tag', 'p'));
557 $tag_stack->terminate();
559 my $new_html = join '', map { $_->render } @new_tokens;
561 if ($VALIDATE_AS_XML) {
562 # Do a simple validation pass.
563 die "XML validation not implemented yet.";
566 return $new_html;
569 # Performs low-level tokenization on a bit of text known not to have
570 # (valid) CDATA in it.
571 sub tokenize_fragment {
572 my $tokens = shift;
573 my $html = shift;
574 my $allowed_elements = shift;
575 my $allow_taglike = shift;
576 my $tag_translation = shift;
578 # Same as the validation re in tokenize_tag_or_entity, but
579 # with all but the outer parens turned into non-capturing.
580 my @fragments = split (/((?:<(?:\/)?(?:\w+)(?:[^>]*)>)|(?:&(?:\#?\w+);))/,
581 $html);
583 for (my $i = 0; $i < @fragments; $i++) {
584 my $text = $fragments[$i];
586 if ($i % 2) {
587 my $token = tokenize_tag_or_entity($text, $allow_taglike,
588 $tag_translation);
589 if ($token->{type} == $TAG) {
590 $token = validate_tag($token, $allowed_elements);
592 if ($token) {
593 push @$tokens, $token;
595 } else {
596 push @$tokens, @{tokenize_text($text)};
602 ### TOKEN CLASSES
605 package Thrasher::HTMLNormalize::Token;
606 # A base class for tokens, used for 'isa' checks.
608 # This is only used by children; do not call directly.
609 sub _new {
610 my $class = shift;
611 my %values = @_;
612 my $self = \%values;
613 bless $self, $class;
614 return $self;
618 package Thrasher::HTMLNormalize::Token::Text;
619 use base 'Thrasher::HTMLNormalize::Token';
621 sub new { shift()->SUPER::_new(text => shift(), type => $TEXT) }
623 sub render {
624 return Thrasher::HTMLNormalize::escape($_[0]->{text});
627 package Thrasher::HTMLNormalize::TagBase;
628 use base 'Thrasher::HTMLNormalize::Token';
629 # Contains the similarities between Tags and SelfClosedTags.
631 # process an attribute list into proper HTML; this assumes that
632 # the attributes has been checked for valid attributes already,
633 # but this also checks for permitted entities. Returns an arrayref
634 # containing strings ready-to-be-output.
635 sub att_list {
636 my $self = shift;
637 my @atts;
638 while (my ($name, $value) = each %{$self->{atts}}) {
639 my @fragments = split (/&(\w+);/, $value);
640 my $i = 0;
641 my @new_value;
642 for my $text (@fragments) {
643 if ($i % 2) {
644 # This is an entity body.
645 push @new_value, Thrasher::HTMLNormalize::make_entity($text);
646 } else {
647 # normal text
648 push(@new_value,
649 Thrasher::HTMLNormalize::escape_quote($text));
651 $i ++;
653 my $value = join '', @new_value;
654 # note $name was checked for validity implicitly by the
655 # regex that pulled it out in the first place.
656 push @atts, "$name='$value'";
658 return \@atts;
661 package Thrasher::HTMLNormalize::Token::Tag;
662 use base 'Thrasher::HTMLNormalize::TagBase';
663 # Represents a Tag, which may have attributes.
664 # Takes: A name, and a hashref of atts.
665 # Note that name is checked implicitly by the regex that pulls it out.
667 use Carp qw(confess);
669 sub new {
670 my $self = shift()->SUPER::_new(name => shift(),
671 atts => shift() || {},
672 type => $TAG);
673 confess "Hash" if ref($self->{name}) eq 'HASH';
674 return $self;
677 sub render {
678 my $self = shift;
679 my $atts = $self->att_list;
681 if (@$atts) {
682 return "<$self->{name} " . join(' ', @$atts) . ">";
683 } else {
684 return "<$self->{name}>";
688 package Thrasher::HTMLNormalize::Token::SelfClosedTag;
689 use base 'Thrasher::HTMLNormalize::TagBase';
690 # Represents a self-closing tag, which may have attributes.
692 sub new { shift()->SUPER::_new(name => shift(),
693 atts => shift() || {},
694 type => $SELFCLOSEDTAG); }
696 sub render {
697 my $self = shift;
698 my $atts = $self->att_list;
700 if (@$atts) {
701 return "<$self->{name} " . join (' ', @$atts) . " />";
702 } else {
703 return "<$self->{name} />";
707 package Thrasher::HTMLNormalize::Token::TagClose;
708 use base 'Thrasher::HTMLNormalize::Token';
709 # Represents a close tag. Has only a name.
711 sub new { shift()->SUPER::_new(name => shift(),
712 type => $TAGCLOSE); }
714 sub render {
715 return "</$_[0]->{name}>";
718 package Thrasher::HTMLNormalize::Token::Entity;
719 use base 'Thrasher::HTMLNormalize::Token';
720 use Thrasher::HTMLNormalize::Entities qw(html_entity);
721 # Represents an entity. Has only a name. Performs numberization of
722 # HTML entities
724 sub new {
725 my $class = shift;
727 my $name = shift;
728 if (!$LEGAL_ENTITIES{$name} && (my $number = html_entity($name))) {
729 $name = '#' . $number;
731 # make_entity takes care of nulling out illegal entities
732 # that pass this test
734 return $class->SUPER::_new(name => $name, type => $ENTITY);
737 sub render {
738 return Thrasher::HTMLNormalize::make_entity($_[0]->{name});
741 package Thrasher::HTMLNormalize::Token::Newline;
742 use base 'Thrasher::HTMLNormalize::Token';
743 # Represents a single newline, suitable for <br />
745 sub new { shift()->SUPER::_new(type => $NEWLINE); }
747 sub render { "\n"; }
749 package Thrasher::HTMLNormalize::Token::DoubleNewline;
750 use base 'Thrasher::HTMLNormalize::Token';
751 # Double new line, suitable for <p>-ification
753 sub new { shift()->SUPER::_new(type => $DOUBLENEWLINE); }
755 sub render { "\n\n"; }
757 package Thrasher::HTMLNormalize::Token::CData;
758 use base 'Thrasher::HTMLNormalize::Token';
760 sub new { shift()->SUPER::_new(rawtext => shift(), type => $CDATA); }
762 sub render { "<![CDATA[" . $_[0]->{rawtext} . "]]>"; }
765 ### TAG STACK
768 package Thrasher::HTMLNormalize::TagStack;
769 use strict;
770 use warnings;
772 use Data::Dumper;
774 *{token} = *Thrasher::HTMLNormalize::token;
776 # This is what it says it is; the "stack" of tags that has led
777 # us to this point in the HTML. It is a "smart" stack; it can
778 # silently refuse to accept tag tokens, resulting in the removal
779 # of that tag from the resulting HTML. If you need to implement
780 # the real nesting rules for (X)HTML tags, this is where you'd
781 # put that logic.
783 sub new {
784 my $class = shift;
785 my $self = {stack => [],
786 # we abusively modify this in the normalize function :(
787 tokens => shift};
788 bless $self, $class;
789 return $self;
792 # last element on the stack
793 sub laststack { $_[0]->{stack}->[-1]; }
794 sub lasttoken { $_[0]->{tokens}->[-1]; }
796 # Push a new tag token onto the stack.
797 sub push {
798 my $self = shift;
799 my $tag = shift;
801 # Assume certain tags close themselves if they appear immediately
802 # after themselves, i.e., <li>stuff<li>morestuff.
803 # In particular, this if statement is asking "Is the previous
804 # element the same as this element and a SELF_CLOSER like p?"
805 if (@{$self->{stack}} && $self->laststack->{name} eq $tag->{name}
806 && $SELF_CLOSERS{$tag->{name}}) {
807 # If so, 'pop' the tag, which causes the emission of the
808 # appropriate end tag (see pop)
809 $self->pop(token('TagClose', $tag->{name}));
812 # If this tag should always self close, convert it to such
813 # unconditionally
814 if ($TAG_TYPES{$tag->{name}} == $BLOCK_SELF_CLOSE) {
815 push @{$self->{tokens}}, token('SelfClosedTag',
816 $tag->{name},
817 $tag->{atts});
818 return;
821 # otherwise, a normal push
822 push @{$self->{stack}}, $tag;
823 push @{$self->{tokens}}, $tag;
826 # A lot of the normalization magic takes place here.
827 sub pop {
828 my $self = shift;
829 my $tag = shift;
831 # We want a chance to manipulate the tokens we push back on
832 # a bit, before losing track of the tokens this func
833 # generated by tossing them onto the pile
834 my @tokens;
836 # If a newline or doublenewline is currently at the end of the
837 # stack, pop it off for a moment and put it back later. This has
838 # the effect of binding close paragraph tags to the paragraph they
839 # were used in. Without this,
841 # <p>Hello.
843 # <p>There.
845 # (complete with those newlines) gets converted to
847 # <p>Hello.
849 # </p><p>There.</p>
851 # which really isn't the intent, even if it is valid HTML.
852 my $popped_newline;
853 if (@{$self->{tokens}} &&
854 ($self->lasttoken->{type} == $NEWLINE ||
855 $self->lasttoken->{type} == $DOUBLENEWLINE)) {
856 $popped_newline = pop @{$self->{tokens}};
859 # Scan backwards for the matching start tag, emit
860 # any missing close tags.
861 my $i = scalar(@{$self->{stack}}) - 1;
862 while ($i >= 0) {
863 my $open_tag = $self->{stack}->[$i];
864 if ($open_tag->{name} eq $tag->{name}) {
865 while ($i + 1 < @{$self->{stack}}) {
866 my $tag_to_close = pop @{$self->{stack}};
867 CORE::push(@tokens,
868 token('TagClose', $tag_to_close->{name}));
870 CORE::pop @{$self->{stack}};
871 CORE::push @tokens, token('TagClose', $tag->{name});
872 last;
874 $i--;
877 # If we're closing off the previous *token* (not stack
878 # element), and there is therefore no content for that
879 # token, either elide it if it is an empty block token,
880 # or convert it to a self-closed tag
881 if (@tokens && $self->lasttoken->{type} == $TAG &&
882 $self->lasttoken->{name} eq $tokens[0]->{name}) {
883 my $name = $tokens[0]->{name};
884 my $tag_type = $TAG_TYPES{$name};
885 if ($tag_type == $BLOCK) {
886 # elide the token
887 pop @tokens;
888 pop @{$self->{tokens}};
890 if ($tag_type == $BLOCK_SELF_CLOSE) {
891 pop @tokens;
892 my $old_tag = pop @{$self->tokens};
893 unshift @tokens, token('SelfClosedTag',
894 $old_tag->{name},
895 $old_tag->{atts});
899 # If there was no corresponding open tag, just discard it by
900 # doing nothing on purpose.
902 # do_nothing();
904 # If we popped off a newline, now is when it goes back on
905 CORE::push @tokens, $popped_newline if $popped_newline;
907 CORE::push @{$self->{tokens}}, @tokens;
910 # Returns true if we are "in" the given tag, false otherwise. This
911 # is used to implement special treatment for <pre>, for instance
912 sub in_tag {
913 my $self = shift;
914 my $tag_name = shift;
916 for my $tag (@{$self->{stack}}) {
917 return 1 if $tag->{name} eq $tag_name;
919 return 0;
922 # Terminates the TagStack by popping all current elements. Used
923 # at the end of the input string. Despite the name, not the same
924 # as a DESTROY function.
925 sub terminate {
926 my $self = shift;
928 while (@{$self->{stack}}) {
929 my $tag_still_open = CORE::pop @{$self->{stack}};
930 CORE::push @{$self->{tokens}}, token('TagClose',
931 $tag_still_open->{name});
936 __END__
938 =head1 NAME
940 Thrasher::HTMLNormalize - a module for normalizing (X)HTML for safe
941 display
943 =head1 SYNOPSIS
945 use Thrasher::HTMLNormalize qw(:all);
947 # Take arbitrary HTML and make it syntactically correct
948 my $normalizedHTML = normalize($html);
950 # Take arbitrary maybe-HTML and make it correct, and
951 # use a chosen sub-set of HTML suitable for simple rich-text-like
952 # HTML.
953 my $comment_like_html = normalize($html, $CONSERVATIVE_PERMISSIONS);
955 # Take arbitrary maybe-HTML and make it correct, use a
956 # a chosen sub-set of HTML suitable for simple rich-text-like
957 # HTML, and if you see a "tag" that isn't actually HTML,
958 # escape it as if it was text.
959 my $message_log_like_html = normalize($html,
960 $CONSERVATIVE_PERMISSIONS,
961 undef, 1);
964 =head1 MOTIVATION
966 This predated HTML::Normalize on CPAN. I just had it lying around, and
967 I understand it, so I used it. I recommend checking out
968 HTML::Normalize if you think you might want to use this. (On the
969 other hand, I've had to implement features here that it doesn't have.)
971 This is a utility module; Thrasher::XHTMLNormalize specifically
972 defines the normalization for XHTML-IM.
974 This is a utility module; Thrasher::XHTMLNormalize specifically
975 defines the normalization for XHTML-IM.
977 =cut