1 package Thrasher
::HTMLNormalize
;
5 use Thrasher
::HTMLNormalize
::Entities
qw(html_entity);
9 our @EXPORT_OK = qw(&escape $CONSERVATIVE_PERMISSIONS &normalize
10 &escape_quote $CONSERVATIVE_PERMISSIONS_NO_LINKS);
12 our %EXPORT_TAGS = (all => \@EXPORT_OK);
14 my $VALIDATE_AS_XML = 0;
17 ### PLEASE NOTE: This module has unit tests. Changing this module
18 ### without verifying them is a bad idea.
21 # Also note that this file isn't as dirty as it looks; that's
22 # just the natural result of sticking an entire compiler in one
26 ### CONSTANT AND DATA DECLARATIONS
28 ### Recall "my" is actually file scoped, not package scoped.
29 ### use constant, however, *is* package scoped, so it's darned inconvenient.
31 # A set-like hash indicating which entities are allowed
32 # to exist as text-named entities, and not converted into numeric
34 my %LEGAL_ENTITIES = map { $_ => 1 } qw(amp lt gt quot apos);
36 my %ESCAPE_ENTITIES = ('&' => '&',
39 my %QUOTE_ENTITIES = (%ESCAPE_ENTITIES,
43 # Tags that sort of close themselves
44 my %SELF_CLOSERS = map { $_ => 1 } qw(li p);
50 my $SELFCLOSEDTAG = 4;
53 my $DOUBLENEWLINE = 7;
56 # Tag types. These are used for formatting the resulting text.
57 # A block tag, as in CSS
59 # An inline tag, as in CSS
61 # A 'block' tag that should be converted to a self-closing
62 # tag if an empty instance is found (i.e., <br></br> => <br />)
63 my $BLOCK_SELF_CLOSE = 12;
70 br
=> $BLOCK_SELF_CLOSE,
100 hr
=> $BLOCK_SELF_CLOSE,
117 # A reasonable pre-defined behavior for URLs. If it has a protocol
118 # in the front, it *must* be http or https. Anything else gets
119 # destroyed. If it doesn't have a protocol in the front, we'll
120 # at least take a shot and prepend http://.
123 if ($url !~ /^https?:\/\
//) {
124 # has non-http protocol. Evil. Die now.
125 # or maybe an incorrectly-specified port. Die now anyhow.
126 if ($url =~ /^[^:]+:/) {
130 return 'http://' . $url;
135 # A reasonable pre-defined conservative set. Allows a reasonable
136 # range of simple formatting tags, but the only attributes allowed
137 # are URLS (and those only with http or https protocols), and
138 # titles for abbr and acronyms. Certainly no code execution allowed.
139 our $CONSERVATIVE_PERMISSIONS = {
142 a
=> {href
=> \
&fixURL
},
145 abbr
=> {title
=> 1},
146 acronym
=> {title
=> 1},
170 my %CONSERVATIVE_PERMISSIONS_NO_LINKS = %$CONSERVATIVE_PERMISSIONS;
171 our $CONSERVATIVE_PERMISSIONS_NO_LINKS = \
%CONSERVATIVE_PERMISSIONS_NO_LINKS;
172 delete $CONSERVATIVE_PERMISSIONS_NO_LINKS->{a
};
175 ### UTILITY FUNCTIONS
178 # Return whether $s is the name of a valid entity
179 sub is_valid_entity
{
181 return !!($s =~ /\#\d{1,5}/) || $LEGAL_ENTITIES{$s};
184 # Given an entity name, either return that entity or an empty
185 # string to elide it.
188 if (is_valid_entity
($s)) {
197 $s =~ s/([&<>])/$ESCAPE_ENTITIES{$1}/ge;
202 $s =~ s/([&<>'"])/$QUOTE_ENTITIES{$1}/ge; #'])/;
206 # Nicer interface to create tokens; just specify the last bit.
208 my $token_type = shift;
209 $token_type = 'Thrasher::HTMLNormalize::Token::' . $token_type;
210 return $token_type->new(@_);
213 # Tokenize a tag or entity. We know it's one of them because of
214 # how it was parsed. (Pulled out for simple testing.)
215 sub tokenize_tag_or_entity
{
216 my $tag_or_entity = shift;
217 my $allow_taglike = shift;
218 my $tag_translation = shift;
220 # \# is to convince emacs syntax highlighting that # is not a comment
221 if ($tag_or_entity !~ /((<(\/)?
(\w
+)([^>]*)>)|(&(\#?\w
+);))/) {
222 die "tokenize_tag_or_entity must be passed a tag or entity: "
226 # Convert that mass of matches into something comprehensible
227 my $groups = {tag
=> $2, # the whole of the tag
228 slash
=> $3, # if it has a slash to close the tag
229 tagname
=> $4,# the tag name
230 tagremains
=> $5, # the rest of the tag (atts)
231 entity
=> $6, # the whole of the entity
234 $groups->{tagname
} = lc $groups->{tagname
};
236 if (defined(my $tag_translator =
237 $tag_translation->{lc $groups->{tagname
}})) {
239 #print "Initial: " . Dumper($groups);
240 if (ref($tag_translator) eq '') {
241 $groups->{tagname
} = $tag_translator;
242 } elsif (ref($tag_translator) eq 'CODE') {
243 $tag_translator->($groups);
245 #print "Post-translation: " . Dumper($groups);
248 if ($groups->{tag
} && $allow_taglike &&
249 !$TAG_TYPES{lc $groups->{tagname
}}) {
250 return token
('Text', $tag_or_entity);
251 } elsif ($groups->{tag
}) {
252 # handle tokenizing a tag
253 if ($groups->{slash
}) {
254 return token
('TagClose', $groups->{tagname
});
257 my $token_type = 'Tag';
259 my $atts = $groups->{tagremains
};
260 if ($atts && substr($atts, -1) eq '/') {
261 $token_type = 'SelfClosedTag';
262 $atts = substr($atts, 0, length($atts) - 1); # pop off /
267 # Pull out the attributes
268 while ($atts =~ /([a-zA-Z]\w+)(?:=(?:(?:\"([^\">]*)\")|(?:'([^'>]*)\')|(?:([^ \n\t>]*))))?/g) {
270 # one value delimited by ", the other by ',
271 # the final by whitespace, the final case
272 # covers those few atts in HTML that can
273 # show up with no value at all
274 my $value = $2 || $3 || $4 || 1;
276 $atts_hash->{lc $name} = $value;
279 return token
($token_type, $groups->{tagname
}, $atts_hash);
280 } elsif ($groups->{entity
}) {
281 return token
('Entity', $groups->{entityname
});
284 # shouldn't be able to get here.
285 die "tokenize_tag_or_entity didn't get anything it expected.";
288 # Given something known to be character-based text with no
289 # (X)HTML in it, tokenize it.
294 my @texthunks = split(/(\n+)/, $s);
297 for my $fragment (@texthunks) {
299 my $newlines = length($fragment); # all newlines
300 if ($newlines == 1) {
301 push @tokens, token
('Newline');
303 push @tokens, token
('DoubleNewline');
306 if ($fragment) { # skip empty matches
307 push @tokens, token
('Text', $fragment);
317 # Given a tag and our validation criteria, verify the tag is
318 # allowed. If it is not allowed, this returns 0. It may also
319 # destructively manipulate the tag's attributes to make them valid.
322 my $allowed_elements = shift;
325 if (!defined($allowed_elements)) {
329 my $legal_attributes = $allowed_elements->{$tag->{name
}};
331 # Is this allowed at all?
332 if (!$legal_attributes) {
336 # If this is a close tag, pass it through.
337 if ($tag->{type
} == $TAGCLOSE) {
341 # Now, actually validate the attributes;
342 while (my ($name, $value) = each %{$tag->{atts
}}) {
343 my $attribute_validation = $legal_attributes->{$name};
345 # Is the attribute even allowed?
346 if (!$attribute_validation) {
348 delete($tag->{atts
}->{$name});
352 if ($attribute_validation eq '1') { # note string compare
353 # everything is permitted here, pass through
358 if (ref($attribute_validation) eq 'CODE') {
359 $new_value = $attribute_validation->($value);
360 if (!defined($new_value)) {
361 # This attribute is so evil that the entire tag
366 if (!$new_value) { # either false or empty string
367 delete($tag->{atts
}->{$name});
370 } elsif (ref($attribute_validation) eq 'Regexp') {
371 my $match = $value =~ /$attribute_validation/;
375 # Otherwise pass through; for better control, use a sub
377 } elsif ($attribute_validation) { # any true value
380 # any false value; redundent since unspecified = false
381 delete($tag->{atts
}->{$name});
385 $tag->{atts
}->{$name} = $new_value;
391 # The money function at last.
394 my $allowed_elements = shift; # default - allow everything
395 my $allow_cdata = shift; # default - no
396 my $allow_taglike = shift;
397 my $tag_translation = shift || {};
399 my $text_only = shift; # don't automatically slap in <p>s
403 # Apparently, applying this to something that consists
404 # entirely of whitespace gets a warning about using
405 # an uninitialized value. Probably just the implementation
406 # poking out and doesn't seem to be worth worrying about.
407 no warnings
'uninitialized';
408 $html =~ s/(^\s+)|(\s+$ )//gx;
411 # FIXME: Verify in utf-8 text encoding
413 # If we elimated the HTML in that step, return nothing
421 my @cdata_fragments = split(/<!\[CDATA\[(.*?)\]\]>/,
425 for my $fragment (@cdata_fragments) {
427 push @tokens, token
('CData', $fragment);
429 tokenize_fragment
(\
@tokens, $fragment,
430 $allowed_elements, $allow_taglike,
436 tokenize_fragment
(\
@tokens, $html, $allowed_elements,
437 $allow_taglike, $tag_translation);
440 # Push a p on front if the first token is not a block
441 # tag. Note that <html> is a block tag, so complete
442 # documents don't get this done, only fragments. This work
443 # because the TagStack will normalize the p correctly.
445 @tokens && ($tokens[0]->{type
} != $TAG ||
446 ($TAG_TYPES{$tokens[0]->{name
}} || $BLOCK)
448 unshift @tokens, token
('Tag', 'p');
452 # we abusively share modification between this func
453 # and the tag stack :(
454 my $tag_stack = new Thrasher
::HTMLNormalize
::TagStack
(\
@new_tokens);
456 for (my $i = 0; $i < @tokens; $i++) {
457 my $token = $tokens[$i];
459 my $type = $token->{type
};
461 $tag_stack->push($token);
463 if ($type == $TAGCLOSE) {
464 $tag_stack->pop($token);
466 if ($type == $TEXT || $type == $ENTITY ||
467 $type == $SELFCLOSEDTAG || $type == $CDATA) {
468 push @new_tokens, $token
471 # If a newline is already preceded by a self-closed
472 # br, pass it through, otherwise, prepend a self-closed br
473 if ($type == $NEWLINE) {
474 # Well, actually, if we're in a "pre", skip all fancy
476 if ($tag_stack->in_tag('pre')) {
477 push @new_tokens, token
('Text', "\n");
485 my $previous_token = $tokens[$i - 1];
486 if ($previous_token->{type
} == $SELFCLOSEDTAG &&
487 $previous_token->{name
} eq 'br') {
488 push @new_tokens, $token;
492 # we're not at the end of the tokens.
493 if ($i + 1 < @tokens) {
494 my $next_token = $tokens[$i + 1];
495 if ($next_token->{type
} == $TAG) {
496 my $tag_type = $TAG_TYPES{$next_token->{name
}}
498 if ($tag_type != $BLOCK &&
499 $tag_type != $BLOCK_SELF_CLOSE) {
500 # this automatically gets eaten if there
501 # was no previous <p>
504 token
('SelfClosedTag', 'br');
506 push @new_tokens, $token;
508 push @new_tokens, $token;
512 push @new_tokens, token
('SelfClosedTag', 'br');
514 push @new_tokens, $token;
520 if ($type == $DOUBLENEWLINE) {
521 if ($tag_stack->in_tag('pre')) {
522 push @new_tokens, token
('Text', "\n\n");
526 # Always assume we're closing a <p>. It gets eaten later
527 # by the TagStack if this turns out to be false.
529 $tag_stack->pop(token
('TagClose', 'p'));
532 # If this is followed by a non-block tag, add a <p>.
533 if ($i < @tokens - 1) {
534 my $next_token = $tokens[$i + 1];
535 if ($next_token->{type
} == $TAG) {
536 my $tag_type = $TAG_TYPES{$next_token->{name
}} || $BLOCK;
537 if ($tag_type != $BLOCK &&
538 $tag_type != $BLOCK_SELF_CLOSE) {
539 push @new_tokens, $token;
541 $tag_stack->push(token
('Tag', 'p'));
544 push @new_tokens, $token;
547 push @new_tokens, $token;
549 $tag_stack->push(token
('Tag', 'p'));
556 $tag_stack->terminate();
558 my $new_html = join '', map { $_->render } @new_tokens;
560 if ($VALIDATE_AS_XML) {
561 # Do a simple validation pass.
562 die "XML validation not implemented yet.";
568 # Performs low-level tokenization on a bit of text known not to have
569 # (valid) CDATA in it.
570 sub tokenize_fragment
{
573 my $allowed_elements = shift;
574 my $allow_taglike = shift;
575 my $tag_translation = shift;
577 # Same as the validation re in tokenize_tag_or_entity, but
578 # with all but the outer parens turned into non-capturing.
579 my @fragments = split (/((?:<(?:\/)?
(?
:\w
+)(?
:[^>]*)>)|(?
:&(?
:\#?\w
+);))/,
582 for (my $i = 0; $i < @fragments; $i++) {
583 my $text = $fragments[$i];
586 my $token = tokenize_tag_or_entity
($text, $allow_taglike,
588 if ($token->{type
} == $TAG) {
589 $token = validate_tag
($token, $allowed_elements);
592 push @
$tokens, $token;
595 push @
$tokens, @
{tokenize_text
($text)};
604 package Thrasher
::HTMLNormalize
::Token
;
605 # A base class for tokens, used for 'isa' checks.
607 # This is only used by children; do not call directly.
617 package Thrasher
::HTMLNormalize
::Token
::Text
;
618 use base
'Thrasher::HTMLNormalize::Token';
620 sub new
{ shift()->SUPER::_new
(text
=> shift(), type
=> $TEXT) }
623 return Thrasher
::HTMLNormalize
::escape
($_[0]->{text
});
626 package Thrasher
::HTMLNormalize
::TagBase
;
627 use base
'Thrasher::HTMLNormalize::Token';
628 # Contains the similarities between Tags and SelfClosedTags.
630 # process an attribute list into proper HTML; this assumes that
631 # the attributes has been checked for valid attributes already,
632 # but this also checks for permitted entities. Returns an arrayref
633 # containing strings ready-to-be-output.
637 while (my ($name, $value) = each %{$self->{atts
}}) {
638 my @fragments = split (/&(\w+);/, $value);
641 for my $text (@fragments) {
643 # This is an entity body.
644 push @new_value, Thrasher
::HTMLNormalize
::make_entity
($text);
648 Thrasher
::HTMLNormalize
::escape_quote
($text));
652 my $value = join '', @new_value;
653 # note $name was checked for validity implicitly by the
654 # regex that pulled it out in the first place.
655 push @atts, "$name='$value'";
660 package Thrasher
::HTMLNormalize
::Token
::Tag
;
661 use base
'Thrasher::HTMLNormalize::TagBase';
662 # Represents a Tag, which may have attributes.
663 # Takes: A name, and a hashref of atts.
664 # Note that name is checked implicitly by the regex that pulls it out.
666 use Carp
qw(confess);
669 my $self = shift()->SUPER::_new
(name
=> shift(),
670 atts
=> shift() || {},
672 confess
"Hash" if ref($self->{name
}) eq 'HASH';
678 my $atts = $self->att_list;
681 return "<$self->{name} " . join(' ', @
$atts) . ">";
683 return "<$self->{name}>";
687 package Thrasher
::HTMLNormalize
::Token
::SelfClosedTag
;
688 use base
'Thrasher::HTMLNormalize::TagBase';
689 # Represents a self-closing tag, which may have attributes.
691 sub new
{ shift()->SUPER::_new
(name
=> shift(),
692 atts
=> shift() || {},
693 type
=> $SELFCLOSEDTAG); }
697 my $atts = $self->att_list;
700 return "<$self->{name} " . join (' ', @
$atts) . " />";
702 return "<$self->{name} />";
706 package Thrasher
::HTMLNormalize
::Token
::TagClose
;
707 use base
'Thrasher::HTMLNormalize::Token';
708 # Represents a close tag. Has only a name.
710 sub new
{ shift()->SUPER::_new
(name
=> shift(),
711 type
=> $TAGCLOSE); }
714 return "</$_[0]->{name}>";
717 package Thrasher
::HTMLNormalize
::Token
::Entity
;
718 use base
'Thrasher::HTMLNormalize::Token';
719 use Thrasher
::HTMLNormalize
::Entities
qw(html_entity);
720 # Represents an entity. Has only a name. Performs numberization of
727 if (!$LEGAL_ENTITIES{$name} && (my $number = html_entity
($name))) {
728 $name = '#' . $number;
730 # make_entity takes care of nulling out illegal entities
731 # that pass this test
733 return $class->SUPER::_new
(name
=> $name, type
=> $ENTITY);
737 return Thrasher
::HTMLNormalize
::make_entity
($_[0]->{name
});
740 package Thrasher
::HTMLNormalize
::Token
::Newline
;
741 use base
'Thrasher::HTMLNormalize::Token';
742 # Represents a single newline, suitable for <br />
744 sub new
{ shift()->SUPER::_new
(type
=> $NEWLINE); }
748 package Thrasher
::HTMLNormalize
::Token
::DoubleNewline
;
749 use base
'Thrasher::HTMLNormalize::Token';
750 # Double new line, suitable for <p>-ification
752 sub new
{ shift()->SUPER::_new
(type
=> $DOUBLENEWLINE); }
754 sub render
{ "\n\n"; }
756 package Thrasher
::HTMLNormalize
::Token
::CData
;
757 use base
'Thrasher::HTMLNormalize::Token';
759 sub new
{ shift()->SUPER::_new
(rawtext
=> shift(), type
=> $CDATA); }
761 sub render
{ "<![CDATA[" . $_[0]->{rawtext
} . "]]>"; }
767 package Thrasher
::HTMLNormalize
::TagStack
;
773 *{token
} = *Thrasher
::HTMLNormalize
::token
;
775 # This is what it says it is; the "stack" of tags that has led
776 # us to this point in the HTML. It is a "smart" stack; it can
777 # silently refuse to accept tag tokens, resulting in the removal
778 # of that tag from the resulting HTML. If you need to implement
779 # the real nesting rules for (X)HTML tags, this is where you'd
784 my $self = {stack
=> [],
785 # we abusively modify this in the normalize function :(
791 # last element on the stack
792 sub laststack
{ $_[0]->{stack
}->[-1]; }
793 sub lasttoken
{ $_[0]->{tokens
}->[-1]; }
795 # Push a new tag token onto the stack.
800 # Assume certain tags close themselves if they appear immediately
801 # after themselves, i.e., <li>stuff<li>morestuff.
802 # In particular, this if statement is asking "Is the previous
803 # element the same as this element and a SELF_CLOSER like p?"
804 if (@
{$self->{stack
}} && $self->laststack->{name
} eq $tag->{name
}
805 && $SELF_CLOSERS{$tag->{name
}}) {
806 # If so, 'pop' the tag, which causes the emission of the
807 # appropriate end tag (see pop)
808 $self->pop(token
('TagClose', $tag->{name
}));
811 # If this tag should always self close, convert it to such
813 if ($TAG_TYPES{$tag->{name
}} == $BLOCK_SELF_CLOSE) {
814 push @
{$self->{tokens
}}, token
('SelfClosedTag',
820 # otherwise, a normal push
821 push @
{$self->{stack
}}, $tag;
822 push @
{$self->{tokens
}}, $tag;
825 # A lot of the normalization magic takes place here.
830 # We want a chance to manipulate the tokens we push back on
831 # a bit, before losing track of the tokens this func
832 # generated by tossing them onto the pile
835 # If a newline or doublenewline is currently at the end of the
836 # stack, pop it off for a moment and put it back later. This has
837 # the effect of binding close paragraph tags to the paragraph they
838 # were used in. Without this,
844 # (complete with those newlines) gets converted to
850 # which really isn't the intent, even if it is valid HTML.
852 if (@
{$self->{tokens
}} &&
853 ($self->lasttoken->{type
} == $NEWLINE ||
854 $self->lasttoken->{type
} == $DOUBLENEWLINE)) {
855 $popped_newline = pop @
{$self->{tokens
}};
858 # Scan backwards for the matching start tag, emit
859 # any missing close tags.
860 my $i = scalar(@
{$self->{stack
}}) - 1;
862 my $open_tag = $self->{stack
}->[$i];
863 if ($open_tag->{name
} eq $tag->{name
}) {
864 while ($i + 1 < @
{$self->{stack
}}) {
865 my $tag_to_close = pop @
{$self->{stack
}};
867 token
('TagClose', $tag_to_close->{name
}));
869 CORE
::pop @
{$self->{stack
}};
870 CORE
::push @tokens, token
('TagClose', $tag->{name
});
876 # If we're closing off the previous *token* (not stack
877 # element), and there is therefore no content for that
878 # token, either elide it if it is an empty block token,
879 # or convert it to a self-closed tag
880 if (@tokens && $self->lasttoken->{type
} == $TAG &&
881 $self->lasttoken->{name
} eq $tokens[0]->{name
}) {
882 my $name = $tokens[0]->{name
};
883 my $tag_type = $TAG_TYPES{$name};
884 if ($tag_type == $BLOCK) {
887 pop @
{$self->{tokens
}};
889 if ($tag_type == $BLOCK_SELF_CLOSE) {
891 my $old_tag = pop @
{$self->tokens};
892 unshift @tokens, token
('SelfClosedTag',
898 # If there was no corresponding open tag, just discard it by
899 # doing nothing on purpose.
903 # If we popped off a newline, now is when it goes back on
904 CORE
::push @tokens, $popped_newline if $popped_newline;
906 CORE
::push @
{$self->{tokens
}}, @tokens;
909 # Returns true if we are "in" the given tag, false otherwise. This
910 # is used to implement special treatment for <pre>, for instance
913 my $tag_name = shift;
915 for my $tag (@
{$self->{stack
}}) {
916 return 1 if $tag->{name
} eq $tag_name;
921 # Terminates the TagStack by popping all current elements. Used
922 # at the end of the input string. Despite the name, not the same
923 # as a DESTROY function.
927 while (@
{$self->{stack
}}) {
928 my $tag_still_open = CORE
::pop @
{$self->{stack
}};
929 CORE
::push @
{$self->{tokens
}}, token
('TagClose',
930 $tag_still_open->{name
});
939 Thrasher::HTMLNormalize - a module for normalizing (X)HTML for safe
944 use Thrasher::HTMLNormalize qw(:all);
946 # Take arbitrary HTML and make it syntactically correct
947 my $normalizedHTML = normalize($html);
949 # Take arbitrary maybe-HTML and make it correct, and
950 # use a chosen sub-set of HTML suitable for simple rich-text-like
952 my $comment_like_html = normalize($html, $CONSERVATIVE_PERMISSIONS);
954 # Take arbitrary maybe-HTML and make it correct, use a
955 # a chosen sub-set of HTML suitable for simple rich-text-like
956 # HTML, and if you see a "tag" that isn't actually HTML,
957 # escape it as if it was text.
958 my $message_log_like_html = normalize($html,
959 $CONSERVATIVE_PERMISSIONS,
965 This predated HTML::Normalize on CPAN. I just had it lying around, and
966 I understand it, so I used it. I recommend checking out
967 HTML::Normalize if you think you might want to use this. (On the
968 other hand, I've had to implement features here that it doesn't have.)
970 This is a utility module; Thrasher::XHTMLNormalize specifically
971 defines the normalization for XHTML-IM.
973 This is a utility module; Thrasher::XHTMLNormalize specifically
974 defines the normalization for XHTML-IM.