Change perl-side callback name to match thperl name.
[thrasher.git] / perl / lib / Thrasher / HTMLNormalize.pm
bloba59a027f9c223cac69fc70ae634011de5a7646ed
1 package Thrasher::HTMLNormalize;
2 use strict;
3 use warnings;
5 use Thrasher::HTMLNormalize::Entities qw(html_entity);
7 use base 'Exporter';
9 our @EXPORT_OK = qw(&escape $CONSERVATIVE_PERMISSIONS &normalize
10 &escape_quote $CONSERVATIVE_PERMISSIONS_NO_LINKS);
12 our %EXPORT_TAGS = (all => \@EXPORT_OK);
14 my $VALIDATE_AS_XML = 0;
16 ###
17 ### PLEASE NOTE: This module has unit tests. Changing this module
18 ### without verifying them is a bad idea.
19 ###
21 # Also note that this file isn't as dirty as it looks; that's
22 # just the natural result of sticking an entire compiler in one
23 # file.
25 ###
26 ### CONSTANT AND DATA DECLARATIONS
27 ###
28 ### Recall "my" is actually file scoped, not package scoped.
29 ### use constant, however, *is* package scoped, so it's darned inconvenient.
31 # A set-like hash indicating which entities are allowed
32 # to exist as text-named entities, and not converted into numeric
33 # entities.
34 my %LEGAL_ENTITIES = map { $_ => 1 } qw(amp lt gt quot apos);
36 my %ESCAPE_ENTITIES = ('&' => '&',
37 '>' => '>',
38 '<' => '&lt;' );
39 my %QUOTE_ENTITIES = (%ESCAPE_ENTITIES,
40 "'" => '&apos;',
41 '"' => '&quot;');
43 # Tags that sort of close themselves
44 my %SELF_CLOSERS = map { $_ => 1 } qw(li p);
46 # Token types
47 my $TEXT = 1;
48 my $TAG = 2;
49 my $TAGCLOSE = 3;
50 my $SELFCLOSEDTAG = 4;
51 my $ENTITY = 5;
52 my $NEWLINE = 6;
53 my $DOUBLENEWLINE = 7;
54 my $CDATA = 8;
56 # Tag types. These are used for formatting the resulting text.
57 # A block tag, as in CSS
58 my $BLOCK = 10;
59 # An inline tag, as in CSS
60 my $INLINE = 11;
61 # A 'block' tag that should be converted to a self-closing
62 # tag if an empty instance is found (i.e., <br></br> => <br />)
63 my $BLOCK_SELF_CLOSE = 12;
65 my %TAG_TYPES =
67 p => $BLOCK,
68 div => $BLOCK,
69 span => $INLINE,
70 br => $BLOCK_SELF_CLOSE,
71 a => $INLINE,
72 li => $BLOCK,
73 dl => $BLOCK,
74 dt => $BLOCK,
75 dd => $BLOCK,
76 ul => $BLOCK,
77 font => $INLINE,
78 abbr => $INLINE,
79 acronym => $INLINE,
80 b => $INLINE,
81 bgsound => $INLINE,
82 big => $INLINE,
83 caption => $INLINE,
84 img => $INLINE,
85 blockquote => $BLOCK,
86 h1 => $BLOCK,
87 h2 => $BLOCK,
88 h3 => $BLOCK,
89 h4 => $BLOCK,
90 h5 => $BLOCK,
91 h6 => $BLOCK,
92 em => $INLINE,
93 strong => $INLINE,
94 dfn => $INLINE,
95 code => $INLINE,
96 samp => $INLINE,
97 kbd => $INLINE,
98 var => $INLINE,
99 cite => $INLINE,
100 hr => $BLOCK_SELF_CLOSE,
101 i => $INLINE,
102 ol => $BLOCK,
103 pre => $BLOCK,
104 small => $INLINE,
105 strike => $INLINE,
106 'sub' => $INLINE,
107 sup => $INLINE,
108 tt => $INLINE,
109 table => $BLOCK,
110 tr => $BLOCK,
111 td => $BLOCK,
112 u => $INLINE,
113 html => $BLOCK,
114 body => $BLOCK
117 # A reasonable pre-defined behavior for URLs. If it has a protocol
118 # in the front, it *must* be http or https. Anything else gets
119 # destroyed. If it doesn't have a protocol in the front, we'll
120 # at least take a shot and prepend http://.
121 sub fixURL {
122 my $url = shift;
123 if ($url !~ /^https?:\/\//) {
124 # has non-http protocol. Evil. Die now.
125 # or maybe an incorrectly-specified port. Die now anyhow.
126 if ($url =~ /^[^:]+:/) {
127 return undef;
130 return 'http://' . $url;
132 return $url;
135 # A reasonable pre-defined conservative set. Allows a reasonable
136 # range of simple formatting tags, but the only attributes allowed
137 # are URLS (and those only with http or https protocols), and
138 # titles for abbr and acronyms. Certainly no code execution allowed.
139 our $CONSERVATIVE_PERMISSIONS = {
140 p => {},
141 br => {},
142 a => {href => \&fixURL},
143 ul => {},
144 li => {},
145 abbr => {title => 1},
146 acronym => {title => 1},
147 b => {},
148 big => {},
149 blockquote => {},
150 em => {},
151 strong => {},
152 dfn => {},
153 code => {},
154 samp => {},
155 kbd => {},
156 var => {},
157 cite => {},
158 hr => {},
159 i => {},
160 ol => {},
161 pre => {},
162 small => {},
163 strike => {},
164 sub => {},
165 sup => {},
166 tt => {},
167 u => {}
170 my %CONSERVATIVE_PERMISSIONS_NO_LINKS = %$CONSERVATIVE_PERMISSIONS;
171 our $CONSERVATIVE_PERMISSIONS_NO_LINKS = \%CONSERVATIVE_PERMISSIONS_NO_LINKS;
172 delete $CONSERVATIVE_PERMISSIONS_NO_LINKS->{a};
175 ### UTILITY FUNCTIONS
178 # Return whether $s is the name of a valid entity
179 sub is_valid_entity {
180 my $s = shift;
181 return !!($s =~ /\#\d{1,5}/) || $LEGAL_ENTITIES{$s};
184 # Given an entity name, either return that entity or an empty
185 # string to elide it.
186 sub make_entity {
187 my $s = shift;
188 if (is_valid_entity($s)) {
189 return "&$s;";
190 } else {
191 return '';
195 sub escape {
196 my $s = shift;
197 $s =~ s/([&<>])/$ESCAPE_ENTITIES{$1}/ge;
198 return $s;
200 sub escape_quote {
201 my $s = shift;
202 $s =~ s/([&<>'"])/$QUOTE_ENTITIES{$1}/ge; #'])/;
203 return $s;
206 # Nicer interface to create tokens; just specify the last bit.
207 sub token {
208 my $token_type = shift;
209 $token_type = 'Thrasher::HTMLNormalize::Token::' . $token_type;
210 return $token_type->new(@_);
213 # Tokenize a tag or entity. We know it's one of them because of
214 # how it was parsed. (Pulled out for simple testing.)
215 sub tokenize_tag_or_entity {
216 my $tag_or_entity = shift;
217 my $allow_taglike = shift;
218 my $tag_translation = shift;
220 # \# is to convince emacs syntax highlighting that # is not a comment
221 if ($tag_or_entity !~ /((<(\/)?(\w+)([^>]*)>)|(&(\#?\w+);))/) {
222 die "tokenize_tag_or_entity must be passed a tag or entity: "
223 .$tag_or_entity;
226 # Convert that mass of matches into something comprehensible
227 my $groups = {tag => $2, # the whole of the tag
228 slash => $3, # if it has a slash to close the tag
229 tagname => $4,# the tag name
230 tagremains => $5, # the rest of the tag (atts)
231 entity => $6, # the whole of the entity
232 entityname => $7 };
234 $groups->{tagname} = lc $groups->{tagname};
236 if (defined(my $tag_translator =
237 $tag_translation->{lc $groups->{tagname}})) {
238 # use Data::Dumper;
239 #print "Initial: " . Dumper($groups);
240 if (ref($tag_translator) eq '') {
241 $groups->{tagname} = $tag_translator;
242 } elsif (ref($tag_translator) eq 'CODE') {
243 $tag_translator->($groups);
245 #print "Post-translation: " . Dumper($groups);
248 if ($groups->{tag} && $allow_taglike &&
249 !$TAG_TYPES{lc $groups->{tagname}}) {
250 return token('Text', $tag_or_entity);
251 } elsif ($groups->{tag}) {
252 # handle tokenizing a tag
253 if ($groups->{slash}) {
254 return token('TagClose', $groups->{tagname});
257 my $token_type = 'Tag';
259 my $atts = $groups->{tagremains};
260 if ($atts && substr($atts, -1) eq '/') {
261 $token_type = 'SelfClosedTag';
262 $atts = substr($atts, 0, length($atts) - 1); # pop off /
265 my $atts_hash = {};
267 # Pull out the attributes
268 while ($atts =~ /([a-zA-Z]\w+)(?:=(?:(?:\"([^\">]*)\")|(?:'([^'>]*)\')|(?:([^ \n\t>]*))))?/g) {
269 my $name = $1;
270 # one value delimited by ", the other by ',
271 # the final by whitespace, the final case
272 # covers those few atts in HTML that can
273 # show up with no value at all
274 my $value = $2 || $3 || $4 || 1;
276 $atts_hash->{lc $name} = $value;
279 return token($token_type, $groups->{tagname}, $atts_hash);
280 } elsif ($groups->{entity}) {
281 return token('Entity', $groups->{entityname});
284 # shouldn't be able to get here.
285 die "tokenize_tag_or_entity didn't get anything it expected.";
288 # Given something known to be character-based text with no
289 # (X)HTML in it, tokenize it.
290 sub tokenize_text {
291 my $s = shift;
292 my @tokens;
294 my @texthunks = split(/(\n+)/, $s);
295 my $i = 0;
297 for my $fragment (@texthunks) {
298 if ($i %2) {
299 my $newlines = length($fragment); # all newlines
300 if ($newlines == 1) {
301 push @tokens, token('Newline');
302 } else {
303 push @tokens, token('DoubleNewline');
305 } else {
306 if ($fragment) { # skip empty matches
307 push @tokens, token('Text', $fragment);
311 $i++;
314 return \@tokens;
317 # Given a tag and our validation criteria, verify the tag is
318 # allowed. If it is not allowed, this returns 0. It may also
319 # destructively manipulate the tag's attributes to make them valid.
320 sub validate_tag {
321 my $tag = shift;
322 my $allowed_elements = shift;
324 # It's all allowed.
325 if (!defined($allowed_elements)) {
326 return $tag;
329 my $legal_attributes = $allowed_elements->{$tag->{name}};
331 # Is this allowed at all?
332 if (!$legal_attributes) {
333 return 0;
336 # If this is a close tag, pass it through.
337 if ($tag->{type} == $TAGCLOSE) {
338 return $tag;
341 # Now, actually validate the attributes;
342 while (my ($name, $value) = each %{$tag->{atts}}) {
343 my $attribute_validation = $legal_attributes->{$name};
345 # Is the attribute even allowed?
346 if (!$attribute_validation) {
347 # Nope, toast it.
348 delete($tag->{atts}->{$name});
349 next;
352 if ($attribute_validation eq '1') { # note string compare
353 # everything is permitted here, pass through
354 next;
357 my $new_value;
358 if (ref($attribute_validation) eq 'CODE') {
359 $new_value = $attribute_validation->($value);
360 if (!defined($new_value)) {
361 # This attribute is so evil that the entire tag
362 # has to go away!
363 return 0;
366 if (!$new_value) { # either false or empty string
367 delete($tag->{atts}->{$name});
368 next;
370 } elsif (ref($attribute_validation) eq 'Regexp') {
371 my $match = $value =~ /$attribute_validation/;
372 if (!$match) {
373 return 0;
375 # Otherwise pass through; for better control, use a sub
376 $new_value = $value;
377 } elsif ($attribute_validation) { # any true value
378 $new_value = $value;
379 } else {
380 # any false value; redundent since unspecified = false
381 delete($tag->{atts}->{$name});
382 next;
385 $tag->{atts}->{$name} = $new_value;
388 return $tag;
391 # The money function at last.
392 sub normalize {
393 my $html = shift;
394 my $allowed_elements = shift; # default - allow everything
395 my $allow_cdata = shift; # default - no
396 my $allow_taglike = shift;
397 my $tag_translation = shift || {};
399 my $text_only = shift; # don't automatically slap in <p>s
401 # Whitespace trimmer
403 # Apparently, applying this to something that consists
404 # entirely of whitespace gets a warning about using
405 # an uninitialized value. Probably just the implementation
406 # poking out and doesn't seem to be worth worrying about.
407 no warnings 'uninitialized';
408 $html =~ s/(^\s+)|(\s+$ )//gx;
411 # FIXME: Verify in utf-8 text encoding
413 # If we elimated the HTML in that step, return nothing
414 if (!$html) {
415 return '';
418 my @tokens;
420 if ($allow_cdata) {
421 my @cdata_fragments = split(/<!\[CDATA\[(.*?)\]\]>/,
422 $html);
423 my $i = 0;
425 for my $fragment (@cdata_fragments) {
426 if ($i % 2) {
427 push @tokens, token('CData', $fragment);
428 } else {
429 tokenize_fragment(\@tokens, $fragment,
430 $allowed_elements, $allow_taglike,
431 $tag_translation);
433 $i++;
435 } else {
436 tokenize_fragment(\@tokens, $html, $allowed_elements,
437 $allow_taglike, $tag_translation);
440 # Push a p on front if the first token is not a block
441 # tag. Note that <html> is a block tag, so complete
442 # documents don't get this done, only fragments. This work
443 # because the TagStack will normalize the p correctly.
444 if (!$text_only &&
445 @tokens && ($tokens[0]->{type} != $TAG ||
446 ($TAG_TYPES{$tokens[0]->{name}} || $BLOCK)
447 != $BLOCK)) {
448 unshift @tokens, token('Tag', 'p');
451 my @new_tokens;
452 # we abusively share modification between this func
453 # and the tag stack :(
454 my $tag_stack = new Thrasher::HTMLNormalize::TagStack(\@new_tokens);
456 for (my $i = 0; $i < @tokens; $i++) {
457 my $token = $tokens[$i];
459 my $type = $token->{type};
460 if ($type == $TAG) {
461 $tag_stack->push($token);
463 if ($type == $TAGCLOSE) {
464 $tag_stack->pop($token);
466 if ($type == $TEXT || $type == $ENTITY ||
467 $type == $SELFCLOSEDTAG || $type == $CDATA) {
468 push @new_tokens, $token
471 # If a newline is already preceded by a self-closed
472 # br, pass it through, otherwise, prepend a self-closed br
473 if ($type == $NEWLINE) {
474 # Well, actually, if we're in a "pre", skip all fancy
475 # processing
476 if ($tag_stack->in_tag('pre')) {
477 push @new_tokens, token('Text', "\n");
478 next;
481 if (!@new_tokens) {
482 next;
485 my $previous_token = $tokens[$i - 1];
486 if ($previous_token->{type} == $SELFCLOSEDTAG &&
487 $previous_token->{name} eq 'br') {
488 push @new_tokens, $token;
489 } else {
490 # add a br somehowe
492 # we're not at the end of the tokens.
493 if ($i + 1 < @tokens) {
494 my $next_token = $tokens[$i + 1];
495 if ($next_token->{type} == $TAG) {
496 my $tag_type = $TAG_TYPES{$next_token->{name}}
497 || $BLOCK;
498 if ($tag_type != $BLOCK &&
499 $tag_type != $BLOCK_SELF_CLOSE) {
500 # this automatically gets eaten if there
501 # was no previous <p>
502 if (!$text_only) {
503 push @new_tokens,
504 token('SelfClosedTag', 'br');
506 push @new_tokens, $token;
507 } else {
508 push @new_tokens, $token;
510 } else {
511 if (!$text_only) {
512 push @new_tokens, token('SelfClosedTag', 'br');
514 push @new_tokens, $token;
520 if ($type == $DOUBLENEWLINE) {
521 if ($tag_stack->in_tag('pre')) {
522 push @new_tokens, token('Text', "\n\n");
523 next;
526 # Always assume we're closing a <p>. It gets eaten later
527 # by the TagStack if this turns out to be false.
528 if (!$text_only) {
529 $tag_stack->pop(token('TagClose', 'p'));
532 # If this is followed by a non-block tag, add a <p>.
533 if ($i < @tokens - 1) {
534 my $next_token = $tokens[$i + 1];
535 if ($next_token->{type} == $TAG) {
536 my $tag_type = $TAG_TYPES{$next_token->{name}} || $BLOCK;
537 if ($tag_type != $BLOCK &&
538 $tag_type != $BLOCK_SELF_CLOSE) {
539 push @new_tokens, $token;
540 if (!$text_only) {
541 $tag_stack->push(token('Tag', 'p'));
543 } else {
544 push @new_tokens, $token;
546 } else {
547 push @new_tokens, $token;
548 if (!$text_only) {
549 $tag_stack->push(token('Tag', 'p'));
556 $tag_stack->terminate();
558 my $new_html = join '', map { $_->render } @new_tokens;
560 if ($VALIDATE_AS_XML) {
561 # Do a simple validation pass.
562 die "XML validation not implemented yet.";
565 return $new_html;
568 # Performs low-level tokenization on a bit of text known not to have
569 # (valid) CDATA in it.
570 sub tokenize_fragment {
571 my $tokens = shift;
572 my $html = shift;
573 my $allowed_elements = shift;
574 my $allow_taglike = shift;
575 my $tag_translation = shift;
577 # Same as the validation re in tokenize_tag_or_entity, but
578 # with all but the outer parens turned into non-capturing.
579 my @fragments = split (/((?:<(?:\/)?(?:\w+)(?:[^>]*)>)|(?:&(?:\#?\w+);))/,
580 $html);
582 for (my $i = 0; $i < @fragments; $i++) {
583 my $text = $fragments[$i];
585 if ($i % 2) {
586 my $token = tokenize_tag_or_entity($text, $allow_taglike,
587 $tag_translation);
588 if ($token->{type} == $TAG) {
589 $token = validate_tag($token, $allowed_elements);
591 if ($token) {
592 push @$tokens, $token;
594 } else {
595 push @$tokens, @{tokenize_text($text)};
601 ### TOKEN CLASSES
604 package Thrasher::HTMLNormalize::Token;
605 # A base class for tokens, used for 'isa' checks.
607 # This is only used by children; do not call directly.
608 sub _new {
609 my $class = shift;
610 my %values = @_;
611 my $self = \%values;
612 bless $self, $class;
613 return $self;
617 package Thrasher::HTMLNormalize::Token::Text;
618 use base 'Thrasher::HTMLNormalize::Token';
620 sub new { shift()->SUPER::_new(text => shift(), type => $TEXT) }
622 sub render {
623 return Thrasher::HTMLNormalize::escape($_[0]->{text});
626 package Thrasher::HTMLNormalize::TagBase;
627 use base 'Thrasher::HTMLNormalize::Token';
628 # Contains the similarities between Tags and SelfClosedTags.
630 # process an attribute list into proper HTML; this assumes that
631 # the attributes has been checked for valid attributes already,
632 # but this also checks for permitted entities. Returns an arrayref
633 # containing strings ready-to-be-output.
634 sub att_list {
635 my $self = shift;
636 my @atts;
637 while (my ($name, $value) = each %{$self->{atts}}) {
638 my @fragments = split (/&(\w+);/, $value);
639 my $i = 0;
640 my @new_value;
641 for my $text (@fragments) {
642 if ($i % 2) {
643 # This is an entity body.
644 push @new_value, Thrasher::HTMLNormalize::make_entity($text);
645 } else {
646 # normal text
647 push(@new_value,
648 Thrasher::HTMLNormalize::escape_quote($text));
650 $i ++;
652 my $value = join '', @new_value;
653 # note $name was checked for validity implicitly by the
654 # regex that pulled it out in the first place.
655 push @atts, "$name='$value'";
657 return \@atts;
660 package Thrasher::HTMLNormalize::Token::Tag;
661 use base 'Thrasher::HTMLNormalize::TagBase';
662 # Represents a Tag, which may have attributes.
663 # Takes: A name, and a hashref of atts.
664 # Note that name is checked implicitly by the regex that pulls it out.
666 use Carp qw(confess);
668 sub new {
669 my $self = shift()->SUPER::_new(name => shift(),
670 atts => shift() || {},
671 type => $TAG);
672 confess "Hash" if ref($self->{name}) eq 'HASH';
673 return $self;
676 sub render {
677 my $self = shift;
678 my $atts = $self->att_list;
680 if (@$atts) {
681 return "<$self->{name} " . join(' ', @$atts) . ">";
682 } else {
683 return "<$self->{name}>";
687 package Thrasher::HTMLNormalize::Token::SelfClosedTag;
688 use base 'Thrasher::HTMLNormalize::TagBase';
689 # Represents a self-closing tag, which may have attributes.
691 sub new { shift()->SUPER::_new(name => shift(),
692 atts => shift() || {},
693 type => $SELFCLOSEDTAG); }
695 sub render {
696 my $self = shift;
697 my $atts = $self->att_list;
699 if (@$atts) {
700 return "<$self->{name} " . join (' ', @$atts) . " />";
701 } else {
702 return "<$self->{name} />";
706 package Thrasher::HTMLNormalize::Token::TagClose;
707 use base 'Thrasher::HTMLNormalize::Token';
708 # Represents a close tag. Has only a name.
710 sub new { shift()->SUPER::_new(name => shift(),
711 type => $TAGCLOSE); }
713 sub render {
714 return "</$_[0]->{name}>";
717 package Thrasher::HTMLNormalize::Token::Entity;
718 use base 'Thrasher::HTMLNormalize::Token';
719 use Thrasher::HTMLNormalize::Entities qw(html_entity);
720 # Represents an entity. Has only a name. Performs numberization of
721 # HTML entities
723 sub new {
724 my $class = shift;
726 my $name = shift;
727 if (!$LEGAL_ENTITIES{$name} && (my $number = html_entity($name))) {
728 $name = '#' . $number;
730 # make_entity takes care of nulling out illegal entities
731 # that pass this test
733 return $class->SUPER::_new(name => $name, type => $ENTITY);
736 sub render {
737 return Thrasher::HTMLNormalize::make_entity($_[0]->{name});
740 package Thrasher::HTMLNormalize::Token::Newline;
741 use base 'Thrasher::HTMLNormalize::Token';
742 # Represents a single newline, suitable for <br />
744 sub new { shift()->SUPER::_new(type => $NEWLINE); }
746 sub render { "\n"; }
748 package Thrasher::HTMLNormalize::Token::DoubleNewline;
749 use base 'Thrasher::HTMLNormalize::Token';
750 # Double new line, suitable for <p>-ification
752 sub new { shift()->SUPER::_new(type => $DOUBLENEWLINE); }
754 sub render { "\n\n"; }
756 package Thrasher::HTMLNormalize::Token::CData;
757 use base 'Thrasher::HTMLNormalize::Token';
759 sub new { shift()->SUPER::_new(rawtext => shift(), type => $CDATA); }
761 sub render { "<![CDATA[" . $_[0]->{rawtext} . "]]>"; }
764 ### TAG STACK
767 package Thrasher::HTMLNormalize::TagStack;
768 use strict;
769 use warnings;
771 use Data::Dumper;
773 *{token} = *Thrasher::HTMLNormalize::token;
775 # This is what it says it is; the "stack" of tags that has led
776 # us to this point in the HTML. It is a "smart" stack; it can
777 # silently refuse to accept tag tokens, resulting in the removal
778 # of that tag from the resulting HTML. If you need to implement
779 # the real nesting rules for (X)HTML tags, this is where you'd
780 # put that logic.
782 sub new {
783 my $class = shift;
784 my $self = {stack => [],
785 # we abusively modify this in the normalize function :(
786 tokens => shift};
787 bless $self, $class;
788 return $self;
791 # last element on the stack
792 sub laststack { $_[0]->{stack}->[-1]; }
793 sub lasttoken { $_[0]->{tokens}->[-1]; }
795 # Push a new tag token onto the stack.
796 sub push {
797 my $self = shift;
798 my $tag = shift;
800 # Assume certain tags close themselves if they appear immediately
801 # after themselves, i.e., <li>stuff<li>morestuff.
802 # In particular, this if statement is asking "Is the previous
803 # element the same as this element and a SELF_CLOSER like p?"
804 if (@{$self->{stack}} && $self->laststack->{name} eq $tag->{name}
805 && $SELF_CLOSERS{$tag->{name}}) {
806 # If so, 'pop' the tag, which causes the emission of the
807 # appropriate end tag (see pop)
808 $self->pop(token('TagClose', $tag->{name}));
811 # If this tag should always self close, convert it to such
812 # unconditionally
813 if ($TAG_TYPES{$tag->{name}} == $BLOCK_SELF_CLOSE) {
814 push @{$self->{tokens}}, token('SelfClosedTag',
815 $tag->{name},
816 $tag->{atts});
817 return;
820 # otherwise, a normal push
821 push @{$self->{stack}}, $tag;
822 push @{$self->{tokens}}, $tag;
825 # A lot of the normalization magic takes place here.
826 sub pop {
827 my $self = shift;
828 my $tag = shift;
830 # We want a chance to manipulate the tokens we push back on
831 # a bit, before losing track of the tokens this func
832 # generated by tossing them onto the pile
833 my @tokens;
835 # If a newline or doublenewline is currently at the end of the
836 # stack, pop it off for a moment and put it back later. This has
837 # the effect of binding close paragraph tags to the paragraph they
838 # were used in. Without this,
840 # <p>Hello.
842 # <p>There.
844 # (complete with those newlines) gets converted to
846 # <p>Hello.
848 # </p><p>There.</p>
850 # which really isn't the intent, even if it is valid HTML.
851 my $popped_newline;
852 if (@{$self->{tokens}} &&
853 ($self->lasttoken->{type} == $NEWLINE ||
854 $self->lasttoken->{type} == $DOUBLENEWLINE)) {
855 $popped_newline = pop @{$self->{tokens}};
858 # Scan backwards for the matching start tag, emit
859 # any missing close tags.
860 my $i = scalar(@{$self->{stack}}) - 1;
861 while ($i >= 0) {
862 my $open_tag = $self->{stack}->[$i];
863 if ($open_tag->{name} eq $tag->{name}) {
864 while ($i + 1 < @{$self->{stack}}) {
865 my $tag_to_close = pop @{$self->{stack}};
866 CORE::push(@tokens,
867 token('TagClose', $tag_to_close->{name}));
869 CORE::pop @{$self->{stack}};
870 CORE::push @tokens, token('TagClose', $tag->{name});
871 last;
873 $i--;
876 # If we're closing off the previous *token* (not stack
877 # element), and there is therefore no content for that
878 # token, either elide it if it is an empty block token,
879 # or convert it to a self-closed tag
880 if (@tokens && $self->lasttoken->{type} == $TAG &&
881 $self->lasttoken->{name} eq $tokens[0]->{name}) {
882 my $name = $tokens[0]->{name};
883 my $tag_type = $TAG_TYPES{$name};
884 if ($tag_type == $BLOCK) {
885 # elide the token
886 pop @tokens;
887 pop @{$self->{tokens}};
889 if ($tag_type == $BLOCK_SELF_CLOSE) {
890 pop @tokens;
891 my $old_tag = pop @{$self->tokens};
892 unshift @tokens, token('SelfClosedTag',
893 $old_tag->{name},
894 $old_tag->{atts});
898 # If there was no corresponding open tag, just discard it by
899 # doing nothing on purpose.
901 # do_nothing();
903 # If we popped off a newline, now is when it goes back on
904 CORE::push @tokens, $popped_newline if $popped_newline;
906 CORE::push @{$self->{tokens}}, @tokens;
909 # Returns true if we are "in" the given tag, false otherwise. This
910 # is used to implement special treatment for <pre>, for instance
911 sub in_tag {
912 my $self = shift;
913 my $tag_name = shift;
915 for my $tag (@{$self->{stack}}) {
916 return 1 if $tag->{name} eq $tag_name;
918 return 0;
921 # Terminates the TagStack by popping all current elements. Used
922 # at the end of the input string. Despite the name, not the same
923 # as a DESTROY function.
924 sub terminate {
925 my $self = shift;
927 while (@{$self->{stack}}) {
928 my $tag_still_open = CORE::pop @{$self->{stack}};
929 CORE::push @{$self->{tokens}}, token('TagClose',
930 $tag_still_open->{name});
935 __END__
937 =head1 NAME
939 Thrasher::HTMLNormalize - a module for normalizing (X)HTML for safe
940 display
942 =head1 SYNOPSIS
944 use Thrasher::HTMLNormalize qw(:all);
946 # Take arbitrary HTML and make it syntactically correct
947 my $normalizedHTML = normalize($html);
949 # Take arbitrary maybe-HTML and make it correct, and
950 # use a chosen sub-set of HTML suitable for simple rich-text-like
951 # HTML.
952 my $comment_like_html = normalize($html, $CONSERVATIVE_PERMISSIONS);
954 # Take arbitrary maybe-HTML and make it correct, use a
955 # a chosen sub-set of HTML suitable for simple rich-text-like
956 # HTML, and if you see a "tag" that isn't actually HTML,
957 # escape it as if it was text.
958 my $message_log_like_html = normalize($html,
959 $CONSERVATIVE_PERMISSIONS,
960 undef, 1);
963 =head1 MOTIVATION
965 This predated HTML::Normalize on CPAN. I just had it lying around, and
966 I understand it, so I used it. I recommend checking out
967 HTML::Normalize if you think you might want to use this. (On the
968 other hand, I've had to implement features here that it doesn't have.)
970 This is a utility module; Thrasher::XHTMLNormalize specifically
971 defines the normalization for XHTML-IM.
973 This is a utility module; Thrasher::XHTMLNormalize specifically
974 defines the normalization for XHTML-IM.
976 =cut