1 package Thrasher
::HTMLNormalize
;
5 use Thrasher
::HTMLNormalize
::Entities
qw(html_entity);
9 our @EXPORT_OK = qw(&escape $CONSERVATIVE_PERMISSIONS &normalize
10 &escape_quote $CONSERVATIVE_PERMISSIONS_NO_LINKS);
12 our %EXPORT_TAGS = (all => \@EXPORT_OK);
14 my $VALIDATE_AS_XML = 0;
17 ### PLEASE NOTE: This module has unit tests. Changing this module
18 ### without verifying them is a bad idea.
21 # Also note that this file isn't as dirty as it looks; that's
22 # just the natural result of sticking an entire compiler in one
26 ### CONSTANT AND DATA DECLARATIONS
28 ### Recall "my" is actually file scoped, not package scoped.
29 ### use constant, however, *is* package scoped, so it's darned inconvenient.
31 # A set-like hash indicating which entities are allowed
32 # to exist as text-named entities, and not converted into numeric
34 my %LEGAL_ENTITIES = map { $_ => 1 } qw(amp lt gt quot apos);
36 my %ESCAPE_ENTITIES = ('&' => '&',
39 my %QUOTE_ENTITIES = (%ESCAPE_ENTITIES,
43 # Tags that sort of close themselves
44 my %SELF_CLOSERS = map { $_ => 1 } qw(li p);
50 my $SELFCLOSEDTAG = 4;
53 my $DOUBLENEWLINE = 7;
56 # Tag types. These are used for formatting the resulting text.
57 # A block tag, as in CSS
59 # An inline tag, as in CSS
61 # A 'block' tag that should be converted to a self-closing
62 # tag if an empty instance is found (i.e., <br></br> => <br />)
63 my $BLOCK_SELF_CLOSE = 12;
70 br
=> $BLOCK_SELF_CLOSE,
100 hr
=> $BLOCK_SELF_CLOSE,
118 # A reasonable pre-defined behavior for URLs. If it has a protocol
119 # in the front, it *must* be http or https. Anything else gets
120 # destroyed. If it doesn't have a protocol in the front, we'll
121 # at least take a shot and prepend http://.
124 if ($url !~ /^https?:\/\
//) {
125 # has non-http protocol. Evil. Die now.
126 # or maybe an incorrectly-specified port. Die now anyhow.
127 if ($url =~ /^[^:]+:/) {
131 return 'http://' . $url;
136 # A reasonable pre-defined conservative set. Allows a reasonable
137 # range of simple formatting tags, but the only attributes allowed
138 # are URLS (and those only with http or https protocols), and
139 # titles for abbr and acronyms. Certainly no code execution allowed.
140 our $CONSERVATIVE_PERMISSIONS = {
143 a
=> {href
=> \
&fixURL
},
146 abbr
=> {title
=> 1},
147 acronym
=> {title
=> 1},
171 my %CONSERVATIVE_PERMISSIONS_NO_LINKS = %$CONSERVATIVE_PERMISSIONS;
172 our $CONSERVATIVE_PERMISSIONS_NO_LINKS = \
%CONSERVATIVE_PERMISSIONS_NO_LINKS;
173 delete $CONSERVATIVE_PERMISSIONS_NO_LINKS->{a
};
176 ### UTILITY FUNCTIONS
179 # Return whether $s is the name of a valid entity
180 sub is_valid_entity
{
182 return !!($s =~ /\#\d{1,5}/) || $LEGAL_ENTITIES{$s};
185 # Given an entity name, either return that entity or an empty
186 # string to elide it.
189 if (is_valid_entity
($s)) {
198 $s =~ s/([&<>])/$ESCAPE_ENTITIES{$1}/ge;
203 $s =~ s/([&<>'"])/$QUOTE_ENTITIES{$1}/ge; #'])/;
207 # Nicer interface to create tokens; just specify the last bit.
209 my $token_type = shift;
210 $token_type = 'Thrasher::HTMLNormalize::Token::' . $token_type;
211 return $token_type->new(@_);
214 # Tokenize a tag or entity. We know it's one of them because of
215 # how it was parsed. (Pulled out for simple testing.)
216 sub tokenize_tag_or_entity
{
217 my $tag_or_entity = shift;
218 my $allow_taglike = shift;
219 my $tag_translation = shift;
221 # \# is to convince emacs syntax highlighting that # is not a comment
222 if ($tag_or_entity !~ /((<(\/)?
(\w
+)([^>]*)>)|(&(\#?\w
+);))/) {
223 die "tokenize_tag_or_entity must be passed a tag or entity: "
227 # Convert that mass of matches into something comprehensible
228 my $groups = {tag
=> $2, # the whole of the tag
229 slash
=> $3, # if it has a slash to close the tag
230 tagname
=> $4,# the tag name
231 tagremains
=> $5, # the rest of the tag (atts)
232 entity
=> $6, # the whole of the entity
235 $groups->{tagname
} = lc $groups->{tagname
};
237 if (defined(my $tag_translator =
238 $tag_translation->{lc $groups->{tagname
}})) {
240 #print "Initial: " . Dumper($groups);
241 if (ref($tag_translator) eq '') {
242 $groups->{tagname
} = $tag_translator;
243 } elsif (ref($tag_translator) eq 'CODE') {
244 $tag_translator->($groups);
246 #print "Post-translation: " . Dumper($groups);
249 if ($groups->{tag
} && $allow_taglike &&
250 !$TAG_TYPES{lc $groups->{tagname
}}) {
251 return token
('Text', $tag_or_entity);
252 } elsif ($groups->{tag
}) {
253 # handle tokenizing a tag
254 if ($groups->{slash
}) {
255 return token
('TagClose', $groups->{tagname
});
258 my $token_type = 'Tag';
260 my $atts = $groups->{tagremains
};
261 if ($atts && substr($atts, -1) eq '/') {
262 $token_type = 'SelfClosedTag';
263 $atts = substr($atts, 0, length($atts) - 1); # pop off /
268 # Pull out the attributes
269 while ($atts =~ /([a-zA-Z]\w+)(?:=(?:(?:\"([^\">]*)\")|(?:'([^'>]*)\')|(?:([^ \n\t>]*))))?/g) {
271 # one value delimited by ", the other by ',
272 # the final by whitespace, the final case
273 # covers those few atts in HTML that can
274 # show up with no value at all
275 my $value = $2 || $3 || $4 || 1;
277 $atts_hash->{lc $name} = $value;
280 return token
($token_type, $groups->{tagname
}, $atts_hash);
281 } elsif ($groups->{entity
}) {
282 return token
('Entity', $groups->{entityname
});
285 # shouldn't be able to get here.
286 die "tokenize_tag_or_entity didn't get anything it expected.";
289 # Given something known to be character-based text with no
290 # (X)HTML in it, tokenize it.
295 my @texthunks = split(/(\n+)/, $s);
298 for my $fragment (@texthunks) {
300 my $newlines = length($fragment); # all newlines
301 if ($newlines == 1) {
302 push @tokens, token
('Newline');
304 push @tokens, token
('DoubleNewline');
307 if ($fragment) { # skip empty matches
308 push @tokens, token
('Text', $fragment);
318 # Given a tag and our validation criteria, verify the tag is
319 # allowed. If it is not allowed, this returns 0. It may also
320 # destructively manipulate the tag's attributes to make them valid.
323 my $allowed_elements = shift;
326 if (!defined($allowed_elements)) {
330 my $legal_attributes = $allowed_elements->{$tag->{name
}};
332 # Is this allowed at all?
333 if (!$legal_attributes) {
337 # If this is a close tag, pass it through.
338 if ($tag->{type
} == $TAGCLOSE) {
342 # Now, actually validate the attributes;
343 while (my ($name, $value) = each %{$tag->{atts
}}) {
344 my $attribute_validation = $legal_attributes->{$name};
346 # Is the attribute even allowed?
347 if (!$attribute_validation) {
349 delete($tag->{atts
}->{$name});
353 if ($attribute_validation eq '1') { # note string compare
354 # everything is permitted here, pass through
359 if (ref($attribute_validation) eq 'CODE') {
360 $new_value = $attribute_validation->($value);
361 if (!defined($new_value)) {
362 # This attribute is so evil that the entire tag
367 if (!$new_value) { # either false or empty string
368 delete($tag->{atts
}->{$name});
371 } elsif (ref($attribute_validation) eq 'Regexp') {
372 my $match = $value =~ /$attribute_validation/;
376 # Otherwise pass through; for better control, use a sub
378 } elsif ($attribute_validation) { # any true value
381 # any false value; redundent since unspecified = false
382 delete($tag->{atts
}->{$name});
386 $tag->{atts
}->{$name} = $new_value;
392 # The money function at last.
395 my $allowed_elements = shift; # default - allow everything
396 my $allow_cdata = shift; # default - no
397 my $allow_taglike = shift;
398 my $tag_translation = shift || {};
400 my $text_only = shift; # don't automatically slap in <p>s
404 # Apparently, applying this to something that consists
405 # entirely of whitespace gets a warning about using
406 # an uninitialized value. Probably just the implementation
407 # poking out and doesn't seem to be worth worrying about.
408 no warnings
'uninitialized';
409 $html =~ s/(^\s+)|(\s+$ )//gx;
412 # FIXME: Verify in utf-8 text encoding
414 # If we elimated the HTML in that step, return nothing
422 my @cdata_fragments = split(/<!\[CDATA\[(.*?)\]\]>/,
426 for my $fragment (@cdata_fragments) {
428 push @tokens, token
('CData', $fragment);
430 tokenize_fragment
(\
@tokens, $fragment,
431 $allowed_elements, $allow_taglike,
437 tokenize_fragment
(\
@tokens, $html, $allowed_elements,
438 $allow_taglike, $tag_translation);
441 # Push a p on front if the first token is not a block
442 # tag. Note that <html> is a block tag, so complete
443 # documents don't get this done, only fragments. This work
444 # because the TagStack will normalize the p correctly.
446 @tokens && ($tokens[0]->{type
} != $TAG ||
447 ($TAG_TYPES{$tokens[0]->{name
}} || $BLOCK)
449 unshift @tokens, token
('Tag', 'p');
453 # we abusively share modification between this func
454 # and the tag stack :(
455 my $tag_stack = new Thrasher
::HTMLNormalize
::TagStack
(\
@new_tokens);
457 for (my $i = 0; $i < @tokens; $i++) {
458 my $token = $tokens[$i];
460 my $type = $token->{type
};
462 $tag_stack->push($token);
464 if ($type == $TAGCLOSE) {
465 $tag_stack->pop($token);
467 if ($type == $TEXT || $type == $ENTITY ||
468 $type == $SELFCLOSEDTAG || $type == $CDATA) {
469 push @new_tokens, $token
472 # If a newline is already preceded by a self-closed
473 # br, pass it through, otherwise, prepend a self-closed br
474 if ($type == $NEWLINE) {
475 # Well, actually, if we're in a "pre", skip all fancy
477 if ($tag_stack->in_tag('pre')) {
478 push @new_tokens, token
('Text', "\n");
486 my $previous_token = $tokens[$i - 1];
487 if ($previous_token->{type
} == $SELFCLOSEDTAG &&
488 $previous_token->{name
} eq 'br') {
489 push @new_tokens, $token;
493 # we're not at the end of the tokens.
494 if ($i + 1 < @tokens) {
495 my $next_token = $tokens[$i + 1];
496 if ($next_token->{type
} == $TAG) {
497 my $tag_type = $TAG_TYPES{$next_token->{name
}}
499 if ($tag_type != $BLOCK &&
500 $tag_type != $BLOCK_SELF_CLOSE) {
501 # this automatically gets eaten if there
502 # was no previous <p>
505 token
('SelfClosedTag', 'br');
507 push @new_tokens, $token;
509 push @new_tokens, $token;
513 push @new_tokens, token
('SelfClosedTag', 'br');
515 push @new_tokens, $token;
521 if ($type == $DOUBLENEWLINE) {
522 if ($tag_stack->in_tag('pre')) {
523 push @new_tokens, token
('Text', "\n\n");
527 # Always assume we're closing a <p>. It gets eaten later
528 # by the TagStack if this turns out to be false.
530 $tag_stack->pop(token
('TagClose', 'p'));
533 # If this is followed by a non-block tag, add a <p>.
534 if ($i < @tokens - 1) {
535 my $next_token = $tokens[$i + 1];
536 if ($next_token->{type
} == $TAG) {
537 my $tag_type = $TAG_TYPES{$next_token->{name
}} || $BLOCK;
538 if ($tag_type != $BLOCK &&
539 $tag_type != $BLOCK_SELF_CLOSE) {
540 push @new_tokens, $token;
542 $tag_stack->push(token
('Tag', 'p'));
545 push @new_tokens, $token;
548 push @new_tokens, $token;
550 $tag_stack->push(token
('Tag', 'p'));
557 $tag_stack->terminate();
559 my $new_html = join '', map { $_->render } @new_tokens;
561 if ($VALIDATE_AS_XML) {
562 # Do a simple validation pass.
563 die "XML validation not implemented yet.";
569 # Performs low-level tokenization on a bit of text known not to have
570 # (valid) CDATA in it.
571 sub tokenize_fragment
{
574 my $allowed_elements = shift;
575 my $allow_taglike = shift;
576 my $tag_translation = shift;
578 # Same as the validation re in tokenize_tag_or_entity, but
579 # with all but the outer parens turned into non-capturing.
580 my @fragments = split (/((?:<(?:\/)?
(?
:\w
+)(?
:[^>]*)>)|(?
:&(?
:\#?\w
+);))/,
583 for (my $i = 0; $i < @fragments; $i++) {
584 my $text = $fragments[$i];
587 my $token = tokenize_tag_or_entity
($text, $allow_taglike,
589 if ($token->{type
} == $TAG) {
590 $token = validate_tag
($token, $allowed_elements);
593 push @
$tokens, $token;
596 push @
$tokens, @
{tokenize_text
($text)};
605 package Thrasher
::HTMLNormalize
::Token
;
606 # A base class for tokens, used for 'isa' checks.
608 # This is only used by children; do not call directly.
618 package Thrasher
::HTMLNormalize
::Token
::Text
;
619 use base
'Thrasher::HTMLNormalize::Token';
621 sub new
{ shift()->SUPER::_new
(text
=> shift(), type
=> $TEXT) }
624 return Thrasher
::HTMLNormalize
::escape
($_[0]->{text
});
627 package Thrasher
::HTMLNormalize
::TagBase
;
628 use base
'Thrasher::HTMLNormalize::Token';
629 # Contains the similarities between Tags and SelfClosedTags.
631 # process an attribute list into proper HTML; this assumes that
632 # the attributes has been checked for valid attributes already,
633 # but this also checks for permitted entities. Returns an arrayref
634 # containing strings ready-to-be-output.
638 while (my ($name, $value) = each %{$self->{atts
}}) {
639 my @fragments = split (/&(\w+);/, $value);
642 for my $text (@fragments) {
644 # This is an entity body.
645 push @new_value, Thrasher
::HTMLNormalize
::make_entity
($text);
649 Thrasher
::HTMLNormalize
::escape_quote
($text));
653 my $value = join '', @new_value;
654 # note $name was checked for validity implicitly by the
655 # regex that pulled it out in the first place.
656 push @atts, "$name='$value'";
661 package Thrasher
::HTMLNormalize
::Token
::Tag
;
662 use base
'Thrasher::HTMLNormalize::TagBase';
663 # Represents a Tag, which may have attributes.
664 # Takes: A name, and a hashref of atts.
665 # Note that name is checked implicitly by the regex that pulls it out.
667 use Carp
qw(confess);
670 my $self = shift()->SUPER::_new
(name
=> shift(),
671 atts
=> shift() || {},
673 confess
"Hash" if ref($self->{name
}) eq 'HASH';
679 my $atts = $self->att_list;
682 return "<$self->{name} " . join(' ', @
$atts) . ">";
684 return "<$self->{name}>";
688 package Thrasher
::HTMLNormalize
::Token
::SelfClosedTag
;
689 use base
'Thrasher::HTMLNormalize::TagBase';
690 # Represents a self-closing tag, which may have attributes.
692 sub new
{ shift()->SUPER::_new
(name
=> shift(),
693 atts
=> shift() || {},
694 type
=> $SELFCLOSEDTAG); }
698 my $atts = $self->att_list;
701 return "<$self->{name} " . join (' ', @
$atts) . " />";
703 return "<$self->{name} />";
707 package Thrasher
::HTMLNormalize
::Token
::TagClose
;
708 use base
'Thrasher::HTMLNormalize::Token';
709 # Represents a close tag. Has only a name.
711 sub new
{ shift()->SUPER::_new
(name
=> shift(),
712 type
=> $TAGCLOSE); }
715 return "</$_[0]->{name}>";
718 package Thrasher
::HTMLNormalize
::Token
::Entity
;
719 use base
'Thrasher::HTMLNormalize::Token';
720 use Thrasher
::HTMLNormalize
::Entities
qw(html_entity);
721 # Represents an entity. Has only a name. Performs numberization of
728 if (!$LEGAL_ENTITIES{$name} && (my $number = html_entity
($name))) {
729 $name = '#' . $number;
731 # make_entity takes care of nulling out illegal entities
732 # that pass this test
734 return $class->SUPER::_new
(name
=> $name, type
=> $ENTITY);
738 return Thrasher
::HTMLNormalize
::make_entity
($_[0]->{name
});
741 package Thrasher
::HTMLNormalize
::Token
::Newline
;
742 use base
'Thrasher::HTMLNormalize::Token';
743 # Represents a single newline, suitable for <br />
745 sub new
{ shift()->SUPER::_new
(type
=> $NEWLINE); }
749 package Thrasher
::HTMLNormalize
::Token
::DoubleNewline
;
750 use base
'Thrasher::HTMLNormalize::Token';
751 # Double new line, suitable for <p>-ification
753 sub new
{ shift()->SUPER::_new
(type
=> $DOUBLENEWLINE); }
755 sub render
{ "\n\n"; }
757 package Thrasher
::HTMLNormalize
::Token
::CData
;
758 use base
'Thrasher::HTMLNormalize::Token';
760 sub new
{ shift()->SUPER::_new
(rawtext
=> shift(), type
=> $CDATA); }
762 sub render
{ "<![CDATA[" . $_[0]->{rawtext
} . "]]>"; }
768 package Thrasher
::HTMLNormalize
::TagStack
;
774 *{token
} = *Thrasher
::HTMLNormalize
::token
;
776 # This is what it says it is; the "stack" of tags that has led
777 # us to this point in the HTML. It is a "smart" stack; it can
778 # silently refuse to accept tag tokens, resulting in the removal
779 # of that tag from the resulting HTML. If you need to implement
780 # the real nesting rules for (X)HTML tags, this is where you'd
785 my $self = {stack
=> [],
786 # we abusively modify this in the normalize function :(
792 # last element on the stack
793 sub laststack
{ $_[0]->{stack
}->[-1]; }
794 sub lasttoken
{ $_[0]->{tokens
}->[-1]; }
796 # Push a new tag token onto the stack.
801 # Assume certain tags close themselves if they appear immediately
802 # after themselves, i.e., <li>stuff<li>morestuff.
803 # In particular, this if statement is asking "Is the previous
804 # element the same as this element and a SELF_CLOSER like p?"
805 if (@
{$self->{stack
}} && $self->laststack->{name
} eq $tag->{name
}
806 && $SELF_CLOSERS{$tag->{name
}}) {
807 # If so, 'pop' the tag, which causes the emission of the
808 # appropriate end tag (see pop)
809 $self->pop(token
('TagClose', $tag->{name
}));
812 # If this tag should always self close, convert it to such
814 if ($TAG_TYPES{$tag->{name
}} == $BLOCK_SELF_CLOSE) {
815 push @
{$self->{tokens
}}, token
('SelfClosedTag',
821 # otherwise, a normal push
822 push @
{$self->{stack
}}, $tag;
823 push @
{$self->{tokens
}}, $tag;
826 # A lot of the normalization magic takes place here.
831 # We want a chance to manipulate the tokens we push back on
832 # a bit, before losing track of the tokens this func
833 # generated by tossing them onto the pile
836 # If a newline or doublenewline is currently at the end of the
837 # stack, pop it off for a moment and put it back later. This has
838 # the effect of binding close paragraph tags to the paragraph they
839 # were used in. Without this,
845 # (complete with those newlines) gets converted to
851 # which really isn't the intent, even if it is valid HTML.
853 if (@
{$self->{tokens
}} &&
854 ($self->lasttoken->{type
} == $NEWLINE ||
855 $self->lasttoken->{type
} == $DOUBLENEWLINE)) {
856 $popped_newline = pop @
{$self->{tokens
}};
859 # Scan backwards for the matching start tag, emit
860 # any missing close tags.
861 my $i = scalar(@
{$self->{stack
}}) - 1;
863 my $open_tag = $self->{stack
}->[$i];
864 if ($open_tag->{name
} eq $tag->{name
}) {
865 while ($i + 1 < @
{$self->{stack
}}) {
866 my $tag_to_close = pop @
{$self->{stack
}};
868 token
('TagClose', $tag_to_close->{name
}));
870 CORE
::pop @
{$self->{stack
}};
871 CORE
::push @tokens, token
('TagClose', $tag->{name
});
877 # If we're closing off the previous *token* (not stack
878 # element), and there is therefore no content for that
879 # token, either elide it if it is an empty block token,
880 # or convert it to a self-closed tag
881 if (@tokens && $self->lasttoken->{type
} == $TAG &&
882 $self->lasttoken->{name
} eq $tokens[0]->{name
}) {
883 my $name = $tokens[0]->{name
};
884 my $tag_type = $TAG_TYPES{$name};
885 if ($tag_type == $BLOCK) {
888 pop @
{$self->{tokens
}};
890 if ($tag_type == $BLOCK_SELF_CLOSE) {
892 my $old_tag = pop @
{$self->tokens};
893 unshift @tokens, token
('SelfClosedTag',
899 # If there was no corresponding open tag, just discard it by
900 # doing nothing on purpose.
904 # If we popped off a newline, now is when it goes back on
905 CORE
::push @tokens, $popped_newline if $popped_newline;
907 CORE
::push @
{$self->{tokens
}}, @tokens;
910 # Returns true if we are "in" the given tag, false otherwise. This
911 # is used to implement special treatment for <pre>, for instance
914 my $tag_name = shift;
916 for my $tag (@
{$self->{stack
}}) {
917 return 1 if $tag->{name
} eq $tag_name;
922 # Terminates the TagStack by popping all current elements. Used
923 # at the end of the input string. Despite the name, not the same
924 # as a DESTROY function.
928 while (@
{$self->{stack
}}) {
929 my $tag_still_open = CORE
::pop @
{$self->{stack
}};
930 CORE
::push @
{$self->{tokens
}}, token
('TagClose',
931 $tag_still_open->{name
});
940 Thrasher::HTMLNormalize - a module for normalizing (X)HTML for safe
945 use Thrasher::HTMLNormalize qw(:all);
947 # Take arbitrary HTML and make it syntactically correct
948 my $normalizedHTML = normalize($html);
950 # Take arbitrary maybe-HTML and make it correct, and
951 # use a chosen sub-set of HTML suitable for simple rich-text-like
953 my $comment_like_html = normalize($html, $CONSERVATIVE_PERMISSIONS);
955 # Take arbitrary maybe-HTML and make it correct, use a
956 # a chosen sub-set of HTML suitable for simple rich-text-like
957 # HTML, and if you see a "tag" that isn't actually HTML,
958 # escape it as if it was text.
959 my $message_log_like_html = normalize($html,
960 $CONSERVATIVE_PERMISSIONS,
966 This predated HTML::Normalize on CPAN. I just had it lying around, and
967 I understand it, so I used it. I recommend checking out
968 HTML::Normalize if you think you might want to use this. (On the
969 other hand, I've had to implement features here that it doesn't have.)
971 This is a utility module; Thrasher::XHTMLNormalize specifically
972 defines the normalization for XHTML-IM.
974 This is a utility module; Thrasher::XHTMLNormalize specifically
975 defines the normalization for XHTML-IM.