3 # This file is provided under the GNU General Public License.
4 # A copy of that license can be found in the LICENSE-LiveJournal.txt file included as
5 # part of this distribution.
7 # Original code related to the 'cut_retrieve' option of the 'clean' method by Afuna in Dreamwidth (http://www.dreamwidth.org/)
10 use Class
::Autouse
qw(
31 # name: LJ::strip_bad_code
33 # des: Removes malicious/annoying HTML.
34 # info: This is just a wrapper function around [func[LJ::CleanHTML::clean]].
36 # des-textref: Scalar reference to text to be cleaned.
42 LJ
::CleanHTML
::clean
($data, {
43 'eat' => [qw
[layer script object embed
]],
45 'keepcomments' => 1, # Allows CSS to work
49 package LJ
::CleanHTML
;
50 # LJ::CleanHTML::clean(\$u->{'bio'}, {
51 # 'wordlength' => 100, # maximum length of an unbroken "word"
52 # 'addbreaks' => 1, # insert <br/> after newlines where appropriate
53 # 'tablecheck' => 1, # make sure they aren't closing </td> that weren't opened.
54 # 'eat' => [qw(head title style layer iframe)],
56 # 'deny' => [qw(marquee)],
58 # 'maximgwidth' => 100,
59 # 'maximgheight' => 100,
60 # 'keepcomments' => 1,
61 # 'cuturl' => 'http://www.domain.com/full_item_view.ext',
62 # 'ljcut_disable' => 1, # stops the cleaner from using the lj-cut tag
64 # 'extractlinks' => 1, # remove a hrefs; implies noautolinks
65 # 'noautolinks' => 1, # do not auto linkify
66 # 'extractimages' => 1, # placeholder images
67 # 'transform_embed_nocheck' => 1, # do not do checks on object/embed tag transforming
68 # 'transform_embed_wmode' => <value>, # define a wmode value for videos (usually 'transparent' is the value you want)
69 # 'blocked_links' => [ qr/evil\.com/, qw/spammer\.com/ ], # list of sites whose URLs will be blocked
70 # 'blocked_link_substitute' => 'http://domain.com/error.html' # blocked links will be replaced by this URL
71 # 'allowed_img_attrs' => hashref of allowed img attributes, other attrs are removed.
72 # 'remove_all_attribs' => 1, # remove all attributes from html tags
73 # 'remove_attribs' => [qw/id class style/], # remove specified attributes only
78 my $p = HTML
::TokeParser
->new("");
79 eval {$p->DESTROY(); };
83 # this treats normal characters and &entities; as single characters
84 # also treats UTF-8 chars as single characters if $LJ::UNICODE
87 my $utf_longchar = '[\xc2-\xdf][\x80-\xbf]|\xe0[\xa0-\xbf][\x80-\xbf]|[\xe1-\xef][\x80-\xbf][\x80-\xbf]|\xf0[\x90-\xbf][\x80-\xbf][\x80-\xbf]|[\xf1-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]';
89 if (not $LJ::UNICODE
) {
90 $match = '[^&\s]|(&\#?\w{1,7};)';
92 $match = $utf_longchar . '|[^&\s\x80-\xff]|(?:&\#?\w{1,7};)';
94 $onechar = qr/$match/o;
97 # Some browsers, such as Internet Explorer, have decided to allow
98 # certain HTML tags to be an alias of another. This has manifested
99 # itself into a problem, as these aliases act in the browser in the
100 # same manner as the original tag, but are not treated the same by
103 my %tag_substitute = (
107 # In XHTML you can close a tag in the same opening tag like <br />,
108 # but some browsers still will interpret it as an opening only tag.
109 # This is a list of tags which you can actually close with a trailing
110 # slash and get the proper behavior from a browser.
111 my $slashclose_tags = qr/^(?:area|base|basefont|br|col|embed|frame|hr|img|input|isindex|link|meta|param|lj-embed)$/i;
113 our $EnableDynamicElements = undef;
116 # name: LJ::CleanHTML::clean
118 # des: Multi-faceted HTML parse function
121 # des-data: A reference to HTML to parse to output, or HTML if modified in-place.
122 # des-opts: A hash of options to pass to the parser.
130 # remove the auth portion of any see_request.bml links
131 $$data =~ s/(see_request\.bml\S+?)auth=\w+/$1/ig;
132 $$data =~ s/(<lj\-random\s*\/?>)/int
(rand(10_000_000
))/gie
;
133 $$data =~ s/(\<\;lj\-random\s*\/?\>\;)/int
(rand(10_000_000
))/gie
;
135 # decode escapes to get a valid unicode string
136 # we encode it back before return
137 $$data = Encode
::decode_utf8
($$data);
139 my $p = HTML
::TokeParser
->new($data);
140 my $wordlength = $opts->{'wordlength'};
141 my $addbreaks = $opts->{'addbreaks'};
142 my $keepcomments = $opts->{'keepcomments'};
143 my $mode = $opts->{'mode'};
144 my $undefined_tags = $opts->{undefined_tags
} || '';
145 my $cut = $opts->{'cuturl'} || $opts->{'cutpreview'};
146 my $ljcut_disable = $opts->{'ljcut_disable'};
147 my $s1var = $opts->{'s1var'};
148 my $extractlinks = 0 || $opts->{'extractlinks'};
149 my $noautolinks = $extractlinks || $opts->{'noautolinks'};
150 my $noexpand_embedded = $opts->{'noexpandembedded'} || $opts->{'textonly'} || 0;
151 my $transform_embed_nocheck = $opts->{'transform_embed_nocheck'} || 0;
152 my $transform_embed_wmode = $opts->{'transform_embed_wmode'};
153 my $remove_colors = $opts->{'remove_colors'} || 0;
154 my $remove_sizes = $opts->{'remove_sizes'} || 0;
155 my $remove_fonts = $opts->{'remove_fonts'} || 0;
156 my $blocked_links = (exists $opts->{'blocked_links'}) ?
$opts->{'blocked_links'} : \
@LJ::BLOCKED_LINKS
;
157 my $blocked_link_substitute =
158 (exists $opts->{'blocked_link_substitute'}) ?
$opts->{'blocked_link_substitute'} :
159 ($LJ::BLOCKED_LINK_SUBSTITUTE
) ?
$LJ::BLOCKED_LINK_SUBSTITUTE
: '#';
160 my $suspend_msg = $opts->{'suspend_msg'} || 0;
161 my $unsuspend_supportid = $opts->{'unsuspend_supportid'} || 0;
162 my $remove_all_attribs = $opts->{'remove_all_attribs'} || 0;
163 my %remove_attribs = ($opts->{'remove_attribs'}) ?
164 (map {$_ => 1} @
{ $opts->{'remove_attribs'} }) : ();
165 my $remove_positioning = $opts->{'remove_positioning'} || 0;
166 my $placeholders = $opts->{'placeholders'} || 0;
167 my $target = $opts->{'target'} || '';
168 my $ljrepost_allowed = ($opts->{ljrepost_allowed
} && ! $opts->{'textonly'}) || 0;
169 my $cut_retrieve = $opts->{cut_retrieve
} || 0;
170 my $expand_lj_user_tag = $opts->{'expand_lj_user_tag'} || 0;
171 my $skip_lj_user_tag = $opts->{'skip_lj_user_tag'} || 0;
173 my $enable_dynamic_elements = $EnableDynamicElements;
174 unless ( defined $enable_dynamic_elements ) {
175 $enable_dynamic_elements = LJ
::is_web_context
();
177 $enable_dynamic_elements = 0 if $opts->{'textonly'};
179 my $ljspoiler_allowed = $enable_dynamic_elements;
181 my $poster = $opts->{poster
} || LJ
::load_userid
($opts->{posterid
});
182 my $put_nofollow = not ($poster and $poster->get_cap('paid') and not $poster->get_cap('trynbuy'));
184 my $viewer_lang = $opts->{'viewer_lang'};
185 unless ($viewer_lang) {
186 $viewer_lang = LJ
::Lang
::get_remote_lang
();
189 # cuturl or entry_url tells about context and texts address,
190 # Expand or close lj-cut tag should be switched directly by special flag
192 $cut = '' if $opts->{expand_cut
};
194 my @canonical_urls; # extracted links
197 if (ref $opts->{'eat'} eq "ARRAY") {
198 foreach (@
{$opts->{'eat'}}) { $action{$_} = "eat"; }
200 if (ref $opts->{'allow'} eq "ARRAY") {
201 foreach (@
{$opts->{'allow'}}) { $action{$_} = "allow"; }
203 if (ref $opts->{'deny'} eq "ARRAY") {
204 foreach (@
{$opts->{'deny'}}) { $action{$_} = "deny"; }
206 if (ref $opts->{'remove'} eq "ARRAY") {
207 foreach (@
{$opts->{'remove'}}) { $action{$_} = "deny"; $remove{$_} = 1; }
210 $action{'script'} = "eat";
212 # if removing sizes, remove heading tags
214 foreach my $tag (qw( h1 h2 h3 h4 h5 h6 )) {
215 $action{$tag} = "deny";
220 if ($opts->{'strongcleancss'}) {
221 $opts->{'cleancss'} = 1;
224 my @attrstrip = qw();
225 # cleancss means clean annoying css
226 # clean_js_css means clean javascript from css
227 if ($opts->{'cleancss'}) {
228 push @attrstrip, 'id';
229 $opts->{'clean_js_css'} = 1;
232 if ($opts->{'nocss'}) {
233 push @attrstrip, 'style';
236 if (ref $opts->{'attrstrip'} eq "ARRAY") {
237 foreach (@
{$opts->{'attrstrip'}}) { push @attrstrip, $_; }
245 # bytes known good. set this BEFORE we start parsing any new
246 # start tag, where most evil is (because where attributes can be)
247 # then, if we have to totally fail, we can cut stuff off after this.
250 # then, if we decide that part of an entry has invalid content, we'll
251 # escape that part and stuff it in here. this lets us finish cleaning
252 # the "good" part of the entry (since some tags might not get closed
253 # till after $good_until bytes into the text).
255 my $total_fail = sub {
256 my $tag = LJ
::ehtml
(@_);
258 my $edata = LJ
::ehtml
($$data);
259 $edata =~ s/\r?\n/<br \/>/g
if $addbreaks;
261 $extra_text = "<div class='ljparseerror'>[<b>Error:</b> Irreparable invalid markup ('<$tag>') in entry. ".
262 "Owner must fix manually. Raw contents below.]<br /><br />" .
263 '<div style="width: 95%; overflow: auto">' . $edata . '</div></div>';
266 ## We do not need to eat the 'iframe' tag if it is enabled here.
267 my $htmlcleaner = HTMLCleaner
->new(
268 valid_stylesheet
=> \
&LJ
::valid_stylesheet_url
,
269 enable_iframe
=> (grep { $_ eq 'iframe' && $action{$_} == "allow" ?
1 : 0 } keys %action) ?
1 : 0
272 my $eating_ljuser_span = 0; # bool, if we're eating an ljuser span
273 my $ljuser_text_node = ""; # the last text node we saw while eating ljuser tags
274 my @eatuntil = (); # if non-empty, we're eating everything. thing at end is thing
275 # we're looking to open again or close again.
277 my $capturing_during_eat; # if we save all tokens that happen inside the eating.
278 my @capture = (); # if so, they go here
286 my $start_capture = sub {
287 next if $capturing_during_eat;
289 my ($tag, $first_token, $cb) = @_;
290 push @eatuntil, $tag;
291 @capture = ($first_token);
292 $capturing_during_eat = $cb || sub {};
295 my $finish_capture = sub {
297 $capturing_during_eat = undef;
300 # this is the stack that holds information about state of parsing
301 # <lj-lang> tags; the syntax of these is as follows:
303 # <lj-lang-container>
304 # <lj-lang include="en"> English text </lj-lang>
305 # <lj-lang include="de"> German text </lj-lang>
306 # <lj-lang include="en,de"> Text that displays in both
307 # English and German </lj-lang>
308 # <lj-lang otherwise> In case all above failed, this is
309 # the text </lj-lang>
310 # </lj-lang-container>
312 # it is pretty trivial to implement the 'include' versions of
313 # tags, and for the 'otherwise' version, we have a state variable
314 # indicating that we haven't yet seen an 'include' tag that had
315 # its language matching the remote's language. so when we encounter
316 # an 'otherwise' tag, we figure whether to display its body using
319 # as for the stack, it allows us to make it so that:
320 # 1). container tags may be nested
321 # 2). lj-lang doesn't actually need to be inside of a container
323 # opening <lj-lang-container> unshifts the stack
324 # closing <lj-lang-container> shifts it
325 # when we need to access a 'variable', $lj_lang_otherwise[0] will do
327 # TODO: this comment indicates that the code is less than easy to
328 # understand and it would benefit from a refactor, i.e. encapsulating
329 # handling specific tags in some set of classes, or something.
330 # - ailyin, Nov 15, 2010
331 my @lj_lang_otherwise = ( 1 );
333 my %vkontakte_like_js;
336 my $href_b_link = '';
340 my $ljspoilers_open = 0;
342 # if we're retrieving a cut tag, then we want to eat everything
343 # until we hit the first cut tag.
344 my @cuttag_stack = ();
345 my $eatall = $cut_retrieve ?
1 : 0;
348 while (my $token = $p->get_token) {
349 my $type = $token->[0];
351 # See if this tag should be treated as an alias
353 if ( ($type eq 'S' || $type eq 'E') ) {
354 $token->[1] = $tag_substitute{$token->[1]} if defined $tag_substitute{$token->[1]};
359 my $tag = $token->[1];
360 my $attr = $token->[2]; # hashref
362 $good_until = length $newdata;
364 # stupid hack to remove the class='ljcut' from divs when we're
365 # disabling them, so we account for the open div normally later.
366 my $ljcut_div = $tag eq "div" && lc $attr->{class} eq "ljcut";
367 if ($ljcut_div && $ljcut_disable) {
371 if (LJ
::is_enabled
('remove_allowscriptaccess')) {
372 ## TODO: remove closing </param> tag,
373 ## don't strip 'allowscriptaccess' from YouTube and other trusted sites
374 if ($tag eq 'param' && $attr->{name
} eq 'allowscriptaccess') {
377 if ($tag eq 'embed' && keys %$attr) {
378 # LJSUP-15368: don't delete allowScriptAccess from trusted sites
379 # this probably should be placed in the transform_embed hook...
380 my $site = $attr->{src
};
381 $site =~ m{(?:https?:)?//(?:[\w\-]+\.)*([\w\-]+\.\w*)}; #get site url from src
383 unless ( grep($_ eq 'allowScriptAccess', @
{$LJ::WHITELIST_VIDEO_HOSTS
{$site}->{'other_whitelist'}}) ) {
384 delete $attr->{allowscriptaccess
};
390 push @capture, $token if $capturing_during_eat;
391 if ($tag eq $eatuntil[-1]) {
392 push @eatuntil, $tag;
397 # if we're looking for cut tags, ignore everything that's
399 if ( $eatall && $tag ne "lj-cut" && !$ljcut_div ) {
404 if ( $opts->{'img_placeholders'} ) {
407 $href_b_link = $attr->{href
};
411 if ( $tag eq 'img' && $in_link ) {
417 if ($tag eq "lj-template" && ! $noexpand_embedded) {
418 my $name = $attr->{name
} || "";
421 my $run_template_hook = sub {
422 # can pass in tokens to override passing the hook the @capture array
423 my ($token, $override_capture) = @_;
424 my $capture = $override_capture ?
[$token] : \
@capture;
426 # $expanded must hold a valid Unicode string.
427 my $expanded = ($name =~ /^\w+$/) ?
428 Encode
::decode_utf8
(LJ
::run_hook
("expand_template_$name", $capture, remove_video_sizes
=> $opts->{remove_video_sizes
})) : "";
429 $newdata .= $expanded || "<b>[Error: unknown template '" . LJ
::ehtml
($name) . "']</b>";
433 # template is self-closing, no need to do capture
434 $run_template_hook->($token, 1);
436 # capture and send content to hook
437 $start_capture->("lj-template", $token, $run_template_hook);
442 if ($tag eq "lj-replace") {
443 my $name = $attr->{name
} || "";
444 my $replace = ($name =~ /^\w+$/) ? Encode
::decode_utf8
(LJ
::lj_replace
($name, $attr)) : undef;
445 $newdata .= defined $replace ?
$replace : "<b>[Error: unknown lj-replace key '" . LJ
::ehtml
($name) . "']</b>";
450 if ($tag eq 'lj-map') {
451 $newdata .= LJ
::Maps
->expand_ljmap_tag($attr);
456 # lj-repost tag adds button that allows easily post text in remote user's blog.
460 # 2. <lj-repost button="post this" />
461 # 3. <lj-repost>some text</lj-repost>
462 # 4. <lj-repost button="re-post to your journal" subject="WOW">
466 if ($tag eq "lj-repost" and $ljrepost_allowed){
467 next TOKEN
if ref $opencount{$tag}; # no support for nested <lj-repost> tags
468 my $button = LJ
::ehtml
($attr->{button
}) ||
469 Encode
::decode_utf8
(LJ
::Lang
::ml
("repost.default_button"));
471 # short <lj-repost /> form of tag
472 $newdata .= qq[<form action
="http://www.$LJ::DOMAIN/update.bml" method
="GET">]
473 . qq[<input type
="hidden" name
="repost" value
="$opts->{cuturl}" />]
474 . qq[<input type
="hidden" name
="repost_type" value
="a" />]
475 . qq[<input type
="submit" value
="$button" /> ]
480 subject
=> $attr->{subject
},
481 offset
=> length $newdata,
487 # LJSUP-11810: Change the widget trava.ru
488 # bypass S2 "print safe" function.
489 # <lj-music> must be expanded last
490 if ( $tag eq 'lj-music' && ! $opts->{'ignore_lj_music'} ) {
491 $newdata .= LJ
::Setting
::Music
::format_ljmusic
( $attr->{'provider'}, $attr->{'id'} );
495 elsif ( $tag eq 'lj-music' ) {
496 $newdata .= $token->[4];
500 ## <lj-userpic> - current journal's default userpic
501 ## <lj-userpic remote> - remote user's default userpic
502 ## <lj-userpic user="test"> - test's default userpic
503 if ($tag eq "lj-userpic" and !$opts->{'textonly'} and $action{$tag} ne 'deny') {
506 $u = LJ
::load_user
($attr->{user
});
507 } elsif ($attr->{remote
}){
508 $u = LJ
::get_remote
();
510 my $cur_journal = LJ
::Session
->domain_journal;
511 $u = LJ
::load_user
($cur_journal) if $cur_journal;
514 my $upic = ref $u ?
$u->userpic : '';
516 $newdata .= $upic->imgtag;
518 $newdata .= qq|<img src
="http://wh.livejournal.ru/icons/nouserpic.png" width
="100" height
="100" class="userpic-img" />|;
523 if ($tag eq "lj-wishlist") {
524 my $wishid = $attr->{wishid
};
525 my $userid = $attr->{userid
};
526 $newdata .= Encode
::decode_utf8
(LJ
::WishElement
->check_and_expand_entry($userid, $wishid));
529 if ( $tag eq 'lj-spoiler' ) {
530 next TOKEN
unless $ljspoiler_allowed;
532 my $title = exists $attr->{'title'} && length $attr->{'title'}
534 : $attr->{'text'} || Encode
::decode_utf8
( LJ
::Lang
::ml
('fcklang.ljspoiler.prompt.text') );
536 $title = LJ
::ehtml
($title);
538 $newdata .= qq{<div
class="lj-spoiler"><div
class="lj-spoiler-head">[<b
><a href
="#">$title</a></b
>]</div
><div
class="lj-spoiler-body">};
543 # Capture object and embed tags to possibly transform them into something else.
544 if ($tag eq "object" || $tag eq "embed") {
545 if (LJ
::are_hooks
("transform_embed") && !$noexpand_embedded) {
546 # XHTML style open/close tags done as a singleton shouldn't actually
547 # start a capture loop, because there won't be a close tag.
549 $newdata .= LJ
::run_hook
(
552 nocheck
=> $transform_embed_nocheck,
553 wmode
=> $transform_embed_wmode,
554 video_placeholders
=> $opts->{video_placeholders
},
555 remove_video_sizes
=> $opts->{remove_video_sizes
},
561 $start_capture->($tag, $token, sub {
562 my $expanded = LJ
::run_hook
(
565 nocheck
=> $transform_embed_nocheck,
566 wmode
=> $transform_embed_wmode,
567 video_placeholders
=> $opts->{video_placeholders
},
568 remove_video_sizes
=> $opts->{remove_video_sizes
},
571 $newdata .= $expanded || "";
577 if ($tag eq "span" && lc $attr->{class} eq "ljuser" && ! $noexpand_embedded) {
578 $eating_ljuser_span = 1;
579 $ljuser_text_node = "";
582 if ($eating_ljuser_span) {
586 if (($tag eq "div" || $tag eq "span") && lc $attr->{class} eq "ljvideo") {
587 $start_capture->($tag, $token, sub {
588 my $expanded = LJ
::run_hook
("expand_template_video", \
@capture);
589 $newdata .= $expanded || "<b>[Error: unknown template 'video']</b>";
594 # do some quick checking to see if this is an email address/URL, and if so, just
595 # escape it and ignore it
596 if ($tag =~ m!(?:\@|://)!) {
597 $newdata .= LJ
::ehtml
("<$tag>");
601 if ($form_tag->{$tag}) {
602 if (! $opencount{form
}) {
603 $newdata .= "<$tag ... >";
607 if ($tag eq "input") {
608 if ($attr->{type
} !~ /^\w+$/ || lc $attr->{type
} eq "password") {
609 delete $attr->{type
};
614 my $slashclose = 0; # If set to 1, use XML-style empty tag marker
615 # for tags like <name/>, pretend it's <name> and reinsert the slash later
616 $slashclose = 1 if ($tag =~ s!/$!!);
618 unless ($tag =~ /^\w([\w\-:_]*\w)?$/) {
623 # for incorrect tags like <name/attrib=val> (note the lack of a space)
624 # delete everything after 'name' to prevent a security loophole which happens
625 # because IE understands them.
628 # Try to execute default action on undefined tags
629 next if (!$action{$tag} && $undefined_tags eq "eat");
631 if ( $action{$tag} eq "eat" || $tag =~ /^fb|g:/ ) {
632 $p->unget_token($token);
633 $p->get_tag("/$tag");
637 if ($tag eq 'iframe' || $tag eq 'video' || $tag eq 'audio' || $tag eq 'source') {
639 ## Remove all autoplay tags
640 delete $attr->{'autoplay'};
642 ## Allow some iframes from trusted sources (if they are not eaten already)
643 ## YouTube (http://apiblog.youtube.com/2010/07/new-way-to-embed-youtube-videos.html),
644 ## Vimeo, VKontakte, Google Calendar, Google Docs, VK.com, etc.
645 ## see @LJ::EMBED_IFRAME_WHITELIST in lj-disabled-conf
648 if (my $src = $attr->{'src'}) {
649 foreach my $wl ( @LJ::EMBED_IFRAME_WHITELIST
) {
650 if ($src =~ $wl->{re
}) {
651 if ($wl->{personal_posts_only
}) {
652 last unless $opts->{journalid
};
653 my $u = LJ
::load_userid
($opts->{journalid
});
654 last unless $u && $u->is_personal;
660 ## tags video and audio may have no attribute 'src'
661 ## and using special tag <source>
662 } elsif ($tag =~ /^(?:video|audio)$/) {
666 unless ($src_allowed) {
669 ## if not autoclosed tag (<iframe />),
670 ## then skip everything till the closing tag
671 $p->get_tag("/iframe");
677 # try to call HTMLCleaner's element-specific cleaner on this open tag
678 my $clean_res = eval {
680 $cleantag =~ s/^.*://s;
681 $cleantag =~ s/[^\w]//go;
683 my $meth = "CLEAN_$cleantag";
684 my $seq = $token->[3]; # attribute names, listref
685 my $code = $htmlcleaner->can($meth)
687 return $code->($htmlcleaner, $seq, $attr);
690 next if !$@
&& !$clean_res;
692 # this is so the rte converts its source to the standard ljuser html
693 my $ljuser_div = $tag eq "div" && $attr->{class} eq "ljuser";
696 my $href = $p->get_tag("a");
697 my $href_attr = $href->[1]->{"href"};
698 my $username = LJ
::get_user_by_url
( $href_attr );
699 $attr->{'user'} = $username ?
$username : '';
701 my $ljuser_text = $p->get_text("/b");
703 $ljuser_text =~ s/\[info\]//;
705 $attr->{'title'} = $ljuser_text;
709 # no cut URL, record the anchor, but then fall through
710 if (0 && $ljcut_div && !$cut) {
712 $newdata .= "<a name=\"cutid$cutcount\"></a>";
716 if ( $tag eq 'lj-lang' ) {
717 # extract a "standard" type of lang here;
718 # also, it's a weird way to convert en_LJ -> en
719 my $lang = LJ
::lang_to_locale
($viewer_lang);
722 if ($attr->{'include'}) {
723 my @include = split /[,;\s]+/, $attr->{'include'};
724 if ( grep { $_ eq $lang } @include ) {
725 $lj_lang_otherwise[0] = 0;
730 if ( $attr->{'otherwise'} || $attr->{'default'} ) {
731 next TOKEN
if ($lj_lang_otherwise[0]);
734 push @eatuntil, $tag;
737 if ( $tag eq 'lj-lang-container' ) {
738 unshift @lj_lang_otherwise, 1;
741 if (($tag eq "lj-cut" || $ljcut_div)) {
742 next TOKEN
if $ljcut_disable;
745 # if this is the cut tag we're looking for, then push it
746 # onto the stack (in case there are nested cut tags) and
747 # start including the content.
749 if ( $cutcount == $cut_retrieve ) {
751 push @cuttag_stack, $tag;
756 my $link_text = sub {
757 my $text = LJ
::Lang
::ml
('fcklang.readmore');
758 $text = Encode
::decode_utf8
($text) if $text;
759 if (exists $attr->{'text'} && length $attr->{'text'}) {
760 $text = $attr->{'text'};
767 my $etext = $link_text->();
768 my $url = LJ
::ehtml
($cut);
769 $newdata .= "<div>" if $tag eq "div";
771 if ($opts->{entry_url
} && $opts->{entry_url
} ne '#') {
772 my $entry = LJ
::Entry
->new_from_url($opts->{entry_url
});
774 my $journalid = $entry->journalid;
775 if ($entry && $entry->valid) {
776 $ditemid = $entry->ditemid;
778 $data_ids = qq(data
-widget
='ljcut' data
-widget
-options
='{ "journalid": "$journalid", "ditemid": "$ditemid", "cutid": "$cutcount", "placeholders" : $placeholders }');
780 $newdata .= "<b $data_ids class=\"ljcut-link lj-widget\"><span class='ljcut-brace'>( </span><span class=\"ljcut-decor\"><a href=\"$url#cutid$cutcount\" class=\"ljcut-link-expand\">$etext</a>";
781 $newdata .= "<a href=\"$url#cutid$cutcount\" class=\"ljcut-link-collapse\">".Encode
::decode_utf8
(LJ
::Lang
::ml
("ljcut.collapse"))."</a>" unless $opts->{no_ljcut_collapse
};
782 $newdata .= "</span><span class='ljcut-brace'> )</span></b>";
783 $newdata .= "</div>" if $tag eq "div";
784 unless ($opts->{'cutpreview'}) {
785 push @eatuntil, $tag;
789 $newdata .= "<a name=\"cutid$cutcount\"></a>" unless $opts->{'textonly'};
790 if ($tag eq "div" && !$opts->{'textonly'}) {
792 my $etext = $link_text->();
793 $newdata .= "<div class=\"ljcut\" text=\"$etext\">";
798 elsif ($tag eq "style") {
799 my $style = $p->get_text("/style");
800 $p->get_tag("/style");
801 unless ($LJ::DISABLED
{'css_cleaner'}) {
802 my $cleaner = LJ
::CSS
::Cleaner
->new;
803 $style = $cleaner->clean($style);
804 LJ
::run_hook
('css_cleaner_transform', \
$style);
805 if ($LJ::IS_DEV_SERVER
) {
806 $style = "/* cleaned */\n" . $style;
809 $newdata .= "\n<style>\n$style</style>\n";
812 elsif ( ($tag eq "lj-app") || ($tag eq "lj-widget") )
814 next TOKEN
unless LJ
::is_enabled
('userapps');
815 my %app_attr = map { $_ => Encode
::encode_utf8
($attr->{$_}) } keys %$attr;
817 if ($tag eq "lj-widget") {
818 $app_attr{type
} = 'widget';
819 $app_attr{key
} = delete $app_attr{name
};
822 my $app = LJ
::UserApps
->get_application( id
=> delete $app_attr{id
}, key
=> delete $app_attr{key
} );
823 next TOKEN
unless $app && $app->can_show_restricted;
825 # Gain all context data
827 $context{posterid
} = $opts->{posterid
} if($opts->{posterid
});
828 $context{journalid
} = $opts->{journalid
} if($opts->{journalid
});
829 if($opts->{entry_url
}) {
830 my $entry = LJ
::Entry
->new_from_url($opts->{entry_url
});
831 if ($entry && $entry->valid) {
832 $context{ditemid
} = $entry->ditemid;
836 $newdata .= Encode
::decode_utf8
($app->ljapp_display(viewer
=> LJ
::get_remote
(), owner
=> $poster, attrs
=> \
%app_attr, context
=> \
%context), Encode
::FB_QUIET
);
839 elsif ($tag eq "lj" && !$skip_lj_user_tag)
841 # keep <lj comm> working for backwards compatibility, but pretend
842 # it was <lj user> so we don't have to account for it below.
843 my $user = $attr->{'user'} = exists $attr->{'user'} ?
$attr->{'user'} :
844 exists $attr->{'comm'} ?
$attr->{'comm'} : undef;
847 my $orig_user = $user; # save for later, in case
848 $user = LJ
::canonical_username
($user);
850 $newdata .= "%%ljuser:$1%%" if $attr->{'user'} =~ /^\%\%([\w\-\']+)\%\%$/;
851 } elsif (length $user) {
852 if ($opts->{'textonly'} && !$expand_lj_user_tag) {
855 my $title = Encode
::encode_utf8
($attr->{title
});
856 my $ljuser = LJ
::ljuser
($user, { title
=> $title, target
=> $target } );
857 $newdata .= Encode
::decode_utf8
($ljuser);
860 $orig_user = LJ
::no_utf8_flag
($orig_user);
861 $newdata .= "<b>[Bad username: " . LJ
::ehtml
($orig_user) . "]</b>";
864 $newdata .= "<b>[Unknown LJ tag]</b>";
867 elsif ($tag eq "lj-raw") {
868 # Strip it out, but still register it as being open
871 elsif ($tag eq "lj-cvk-poll") {
872 $newdata .= Encode
::decode_utf8
(LJ
::Widget
::CVK
->render_body());
874 elsif ( $tag eq 'lj-like' ) {
875 next TOKEN
if $opts->{'textonly'};
877 unless ( exists $opts->{'entry_url'} && $opts->{'entry_url'} ) {
878 $newdata .= '<b>[lj-like in invalid context]</b>';
883 my $like = LJ
::CleanHtml
::Like
->new({ 'entry_url' => $opts->{'entry_url'},
884 'buttons' => $attr->{'buttons'} ,
887 $newdata .= $like->html({ 'vkontakte_like_js' => \
%vkontakte_like_js});
889 elsif ( $tag eq 'lj-lead' ) {
890 next TOKEN
if $opencount{'lj-lead'};
892 $newdata .= qq{<div
class="b-journalpreamble">};
893 $opencount{'lj-lead'}++;
895 elsif ( $tag eq 'lj-quote' ) {
896 $newdata .= qq{<div
class="b-journalblockquote">};
897 $opencount{'lj-quote'}++;
899 elsif ( $tag eq 'lj-quote-cite' ) {
900 next TOKEN
if !$opencount{'lj-quote'} || $opencount{'lj-quote-cite'};
902 $newdata .= qq{<cite
class="b-journalblockquote-author">};
903 $opencount{'lj-quote-cite'}++;
905 elsif ( $tag eq 'lj-gallery' ) {
906 next TOKEN
if $opencount{'lj-gallery'};
907 $opencount{'lj-gallery'}->{width
} = $attr->{width
};
908 $opencount{'lj-gallery'}->{height
} = $attr->{height
};
909 $newdata .= $token->[4];
911 elsif ( $tag eq 'lj-gallery-item' ) {
912 next TOKEN
unless $opencount{'lj-gallery'};
914 my $src = $attr->{src
};
915 my $width = $opencount{'lj-gallery'}->{width
} ?
qq{width
="$opencount{'lj-gallery'}->{width}"} : '';
916 my $height = $opencount{'lj-gallery'}->{height
} ?
qq{height
="$opencount{'lj-gallery'}->{height}"} : '';
918 $newdata .= qq{<lj
-gallery
-item
><img src
="$src" $width $height><lj
-gallery
-item
-capture
>};
920 elsif ( $tag eq 'lj-image' ) {
921 $opencount{'lj-image-a'} = 0;
923 my $src = $attr->{src
};
924 my $href = $attr->{href
};
926 my $height = $attr->{height
};
927 my $width = $attr->{width
};
929 my $center = defined $attr->{center
} ?
'b-journalpicture-alignment' : '';
933 if ($width || $height) {
934 $width = "width: ${width}px;" if $width;
935 $height = "height: ${height}px;" if $height;
936 $style = qq{style
="$width $height"};
939 my $img = qq{<img
$style class="b-journalpicture-image" src
="$src">};
941 $img = qq{<a href
="$href">$img</a
>};
944 $img = qq{<figure
class="b-journalpicture b-journalpicture-alignment">$img<figcaption
class="b-journalpicture-caption">};
946 $opencount{'lj-image'}++;
950 # Don't allow any tag with the "set" attribute
951 elsif ($tag =~ m/:set$/) {
958 my $hash = $token->[2];
959 my $attrs = $token->[3]; # attribute names, in original order
961 $slashclose = 1 if delete $hash->{'/'};
963 foreach (@attrstrip) {
964 # maybe there's a better place for this?
965 next if (lc $tag eq 'lj-embed' && lc $_ eq 'id');
969 if ($tag eq "form") {
970 my $action = lc($hash->{'action'});
972 if ($action =~ m!^https?://?([^/]+)!) {
975 $host =~ /[%\@\s]/ ||
976 $LJ::FORM_DOMAIN_BANNED
{$host};
980 delete $hash->{'action'} if $deny;
984 foreach my $attr (keys %$hash) {
985 if ( $remove_all_attribs || $remove_attribs{$attr} ) {
986 delete $hash->{$attr};
990 if ($attr =~ /^(?:on|dynsrc)/) {
991 delete $hash->{$attr};
995 if ($attr eq "data") {
996 delete $hash->{$attr} unless $tag eq "object";
1000 unless ($opts->{entry_url
}) {
1001 if ($attr eq 'width' || $attr eq 'height' ) {
1002 if ($hash->{$attr} > 1024*2) {
1003 $hash->{$attr} = 1024*2;
1008 ## warning: in comments left by anonymous users, <img src="something">
1009 ## is replaced by <a href="something"> (see 'extractimages' param)
1010 ## If "something" is "data:<script ...", we'll get a vulnerability
1011 if (($attr eq "href" || $attr eq 'src') && $hash->{$attr} =~ /^data/) {
1012 delete $hash->{$attr};
1016 if ($attr =~ /(?:^=)|[\x0b\x0d]/) {
1017 # Cleaner attack: <p ='>' onmouseover="javascript:alert(document/**/.cookie)" >
1018 # is returned by HTML::Parser as P_tag("='" => "='") Text( onmouseover...)
1019 # which leads to reconstruction of valid HTML. Clever!
1020 # detect this, and fail.
1021 $total_fail->("$tag $attr");
1025 # ignore attributes that do not fit this strict scheme
1026 unless ($attr =~ /^[\w_:-]+$/) {
1027 $total_fail->("$tag " . (%$hash > 1 ?
"[...] " : "") . "$attr");
1031 $hash->{$attr} =~ s/[\t\n]//g;
1033 # IE ignores the null character, so strip it out
1034 $hash->{$attr} =~ s/\x0//g;
1037 my $nowhite = $hash->{$attr};
1038 $nowhite =~ s/[\s\x0b]+//go;
1039 if ($nowhite =~ /(?:jscript|livescript|javascript|vbscript|about):/ix) {
1040 delete $hash->{$attr};
1044 if ($attr eq 'style') {
1045 if ($opts->{'cleancss'}) {
1046 # css2 spec, section 4.1.3
1047 # position === p\osition :(
1048 # strip all slashes no matter what.
1049 $hash->{style
} =~ s/\\//g;
1051 # and catch the obvious ones ("[" is for things like document["coo"+"kie"]
1052 foreach my $css ("/*", "[", qw(absolute fixed expression eval behavior cookie document window javascript -moz-binding)) {
1053 if ($hash->{style
} =~ /\Q$css\E/i) {
1054 delete $hash->{style
};
1059 if ($opts->{'strongcleancss'}) {
1060 if ($hash->{style
} =~ /-moz-|absolute|relative|outline|z-index|(?<!-)(?:top|left|right|bottom)\s*:|filter|-webkit-/io) {
1061 delete $hash->{style
};
1066 # remove specific CSS definitions
1067 if ($remove_colors) {
1068 $hash->{style
} =~ s/(?:background-)?color:.*?(?:;|$)//gi;
1071 if ($remove_sizes) {
1072 $hash->{style
} =~ s/font-size:.*?(?:;|$)//gi;
1075 if ($remove_fonts) {
1076 $hash->{style
} =~ s/font-family:.*?(?:;|$)//gi;
1079 if ($remove_positioning) {
1080 $hash->{style
} =~ s/margin.*?(?:;|$)//gi;
1081 $hash->{style
} =~ s/height\s*?:.*?(?:;|$)//gi;
1083 # strip excessive padding
1084 $hash->{style
} =~ s/padding[^:]*?:\D*\d{3,}[^;]*(?:;|$)//gi;
1088 if ($opts->{'clean_js_css'} && ! $LJ::DISABLED
{'css_cleaner'}) {
1089 # and then run it through a harder CSS cleaner that does a full parse
1090 my $css = LJ
::CSS
::Cleaner
->new;
1091 $hash->{style
} = $css->clean_property($hash->{style
});
1096 lc $tag ne 'lj-embed' &&
1097 ( $attr eq 'class' || $attr eq 'id' ) &&
1098 $opts->{'strongcleancss'} )
1100 unless (exists $LJ::CLASSNAME_WHITELIST
{$hash->{$attr}}) {
1101 delete $hash->{$attr};
1106 # reserve ljs_* ids for divs, etc so users can't override them to replace content
1107 if ($attr eq 'id' && $hash->{$attr} =~ /^ljs_/i) {
1108 delete $hash->{$attr};
1113 if ($attr =~ /%%/) {
1114 delete $hash->{$attr};
1118 my $props = $LJ::S1
::PROPS
->{$s1var};
1120 if ($hash->{$attr} =~ /^%%([\w:]+:)?(\S+?)%%$/ && $props->{$2} =~ /[aud]/) {
1122 } elsif ($hash->{$attr} =~ /^%%cons:\w+%%[^\%]*$/) {
1123 # a site constant with something appended is also fine.
1124 } elsif ($hash->{$attr} =~ /%%/) {
1125 my $clean_var = sub {
1126 my ($mods, $prop) = @_;
1128 # HTML escape and kill line breaks
1129 $mods = "attr:$mods" unless
1130 $mods =~ /^(color|cons|siteroot|sitename|img):/ ||
1131 $props->{$prop} =~ /[ud]/;
1132 return '%%' . $mods . $prop . '%%';
1135 $hash->{$attr} =~ s/[\n\r]//g;
1136 $hash->{$attr} =~ s/%%([\w:]+:)?(\S+?)%%/$clean_var->(lc($1), $2)/eg;
1138 if ($attr =~ /^(href|src|lowsrc|style)$/) {
1139 $hash->{$attr} = "\%\%[attr[$hash->{$attr}]]\%\%";
1144 # remove specific attributes
1145 if (($remove_colors && ($attr eq "color" || $attr eq "bgcolor" || $attr eq "fgcolor" || $attr eq "text")) ||
1146 ($remove_sizes && $attr eq "size") ||
1147 ($remove_fonts && $attr eq "face")) {
1148 delete $hash->{$attr};
1153 ## attribute lj-sys-message-close is used in SiteMessage's only
1154 if (exists $hash->{'lj-sys-message-close'}) {
1155 delete $hash->{'lj-sys-message-close'};
1156 if (my $mid = $opts->{'lj_sys_message_id'}) {
1157 $hash->{'onclick'} = "LiveJournal.closeSiteMessage(this, event, $mid)";
1158 push @
$attrs, 'onclick';
1162 if (exists $hash->{href
}) {
1163 ## links to some resources will be completely blocked
1164 ## and replaced by value of 'blocked_link_substitute' param
1165 if ($blocked_links) {
1166 foreach my $re (@
$blocked_links) {
1167 if ($hash->{href
} =~ $re) {
1168 $hash->{href
} = sprintf($blocked_link_substitute, LJ
::eurl
($hash->{href
}));
1174 unless ($hash->{href
} =~ s/^lj:(?:\/\/)?
(.*)$/ExpandLJURL($1)/ei
) {
1175 $hash->{href
} = canonical_url
($hash->{href
}, 1);
1179 if ($tag eq "img") {
1182 if ($opts->{'extractimages'}) { $img_bad = 1; }
1184 if ( my $maxwidth = $opts->{'maximgwidth'} ) {
1185 my $width = $hash->{'width'};
1186 if ( $width && $width !~ /\%$/ ) {
1187 $width =~ s/[^\d.]//g;
1188 if ( int $width > $maxwidth ) {
1189 delete $hash->{'width'};
1190 delete $hash->{'height'};
1195 # don't use placeholders for small images
1196 if ( $opts->{'img_placeholders'} ) {
1197 if ( exists $hash->{style
} ) {
1198 if ( $hash->{style
} =~ /[^\-]width\:\s*(\d+)(?:px)?\;/i ) {
1199 $hash->{width
} = $1;
1202 if ( $hash->{style
} =~ /[^\-]height\:\s*(\d+)(?:px)?\;/i ) {
1203 $hash->{height
} = $1;
1207 if ( exists $hash->{width
} && $hash->{width
} =~ /^[\d.]+$/ && exists $hash->{height
} && $hash->{height
} =~ /^[\d.]+$/ ) {
1208 if ( $hash->{'width'} > 140 && $hash->{'height'} > 37 ) {
1216 delete $hash->{width
} if exists $hash->{width
};
1217 delete $hash->{height
} if exists $hash->{height
};
1224 ## Option 'allowed_img_attrs' provides a list of allowed attributes
1225 if (my $allowed = $opts->{'allowed_img_attrs'}){
1226 while (my ($attr, undef) = each %$hash){
1227 delete $hash->{$attr} unless $allowed->{$attr};
1231 ## TODO: a better check of $hash->{src} is needed,
1232 ## known (fixed) vulnerability is src="data:..."
1233 $hash->{src
} = canonical_url
($hash->{src
}, 1);
1235 ## Ratings can be cheated by commenting a popular post with
1236 ## <img src="http://my-journal.livejournal.com/12345.html">
1237 if ($hash->{src
} =~ m!/\d+\.html$!) {
1242 ## http://pics.livejournal.com/<certain-journal>/pic/000fbt9x* -> l-pics.livejournal.com
1243 ## TODO: make it work for communities too
1244 if ($hash->{'src'} =~ m!^http://(?:l-)?pics.livejournal.com/(\w+)/pic/(.*)$!i) {
1245 my ($journal, $rest) = ($1, $2);
1246 my $host = (!$LJ::DISABLED
{'pics_via_cdn'} && $LJ::USE_CDN_FOR_PICS
{$journal})
1247 ?
"l-pics.livejournal.com" : "pics.livejournal.com";
1248 $hash->{'src'} = "http://$host/$journal/pic/$rest";
1252 $newdata .= qq~<a
class="b-mediaplaceholder b-mediaplaceholder-photo ~ . ( $opts->{'remove_img_sizes'} ? '"' : qq~ b-mediaplaceholder-good" style="width:$hash->{'width
'}px;height:$hash->{'height
'}px;"~ ) . ( $hash->{'width
'} ? qq~ data-width="$hash->{'width
'}"~ : '' ) . ( $hash->{'height
'} ? qq~ data-height="$hash->{'height
'}"~: '' ) . qq~data-href="$href_b_link" href="~ .
1253 LJ::ehtml($hash->{'src
'}) . '" onclick="return LiveJournal
.placeholderClick
(this
, \'image
\')">' .
1254 '<span class="b
-mediaplaceholder
-outer
">' .
1255 '<span class="b
-mediaplaceholder
-inner
">' .
1256 '<i class="b
-mediaplaceholder
-pic
"></i>' .
1257 '<span class="b
-mediaplaceholder
-label b
-mediaplaceholder
-view
">' . Encode::decode_utf8(LJ::Lang::ml("mediaplaceholder
.viewimage
")) . '</span>'.
1258 '<span class="b
-mediaplaceholder
-label b
-mediaplaceholder
-loading
">' . Encode::decode_utf8(LJ::Lang::ml("mediaplaceholder
.loading
")) . '</span>'.
1262 $newdata .= $href_b_link ?
1263 '<a href="' . $href_b_link .'" class="b
-mediaplaceholder
-external
" title="' . Encode::decode_utf8(LJ::Lang::ml("mediaplaceholder.link")) . '">' .
1264 '<i class="b
-mediaplaceholder
-bg
"></i>' .
1265 '<i class="b
-mediaplaceholder
-pic
"></i>' .
1266 '<span class="b
-mediaplaceholder
-inner
">' . Encode::decode_utf8(LJ::Lang::ml("mediaplaceholder
.link")) . '</span>' .
1269 $opencount{"img
"}++;
1273 if ($tag eq "a
" && $extractlinks)
1275 push @canonical_urls, canonical_url($attr->{href}, 1);
1280 if ($tag eq "a
" and $hash->{href} and $put_nofollow) {
1281 if ($hash->{href} =~ m!^(https?://)?([^/]+?)(/.*)?$!) {
1283 unless ($host =~ /\Q$LJ::DOMAIN\E$/i) {
1284 $hash->{rel} = "nofollow
";
1285 push @$attrs, 'rel';
1290 ## LJSUP-10811: due to security issue only Flash is allowed
1291 ## LJSV-1995: Embedded video from http://video.yandex.ru isn't shown
1292 if ($tag eq 'embed'){
1293 $hash->{type} = 'application/x-shockwave-flash';
1294 push @$attrs => 'type' unless grep { $_ eq 'type' } @$attrs;
1296 if ($tag eq 'object' and ($hash->{data} || $hash->{src})){
1297 $hash->{type} = 'application/x-shockwave-flash';
1298 push @$attrs => 'type' unless grep { $_ eq 'type' } @$attrs;
1301 # LJSV-2152: When comment has embed in it - bubbles should be above buttons
1302 if ( $tag eq 'iframe' and $hash->{'src'} ) {
1303 foreach my $host (keys %LJ::WHITELIST_VIDEO_HOSTS) {
1304 if ( index ($hash->{'src'}, $host) != -1) {
1306 # Youtube accepts escaped parameters in form "%61utoplay=1"
1307 $hash->{'src'} = LJ::durl($hash->{'src'});
1309 # LJSUP-17010: For all links with media parameter "autoplay
" must be deleted or = 0
1310 $hash->{'src'} =~ s/autoplay=1/autoplay=0/gi;
1312 # LJSUP-17018: Replace autoplay=true with autoplay=false
1313 $hash->{'src'} =~ s/autoplay=true/autoplay=false/gi;
1315 if ( $hash->{'src'} !~ m!player\.seemedia\.pro! && $hash->{'src'} !~ m!wmode=opaque!i ) {
1316 if ( $hash->{'src'} =~ m!\?! ) {
1317 $hash->{'src'} .= '&wmode=opaque';
1319 $hash->{'src'} .= '?wmode=opaque';
1328 # Through the xsl namespace in XML, it is possible to embed scripting languages
1329 # as elements which will then be executed by the browser. Combining this with
1330 # customview.cgi makes it very easy for someone to replace their entire journal
1331 # in S1 with a page that embeds scripting as well. An example being an AJAX
1332 # six degrees tool, while cool it should not be allowed.
1335 # <xsl:element name="script
">
1336 # <xsl:attribute name="type
">text/javascript</xsl:attribute>
1337 if ($tag eq 'xsl:attribute')
1339 $alt_output = 1; # We'll always deal with output for this token
1341 my $orig_value = $p->get_text; # Get the value of this element
1342 my $value = $orig_value; # Make a copy if this turns out to be alright
1343 $value =~ s/\s+//g; # Remove any whitespace
1345 # See if they are trying to output scripting, if so eat the xsl:attribute
1346 # container and its value
1347 if ($value =~ /(javascript|vbscript)/i) {
1349 # Remove the closing tag from the tree
1352 # Remove the value itself from the tree
1355 # No harm, no foul...Write back out the original
1357 $newdata .= "$token->[4]$orig_value";
1361 unless ($alt_output) {
1364 if ($mode eq "allow
") {
1366 if ($action{$tag} eq "deny
") { $allow = 0; }
1369 if ($action{$tag} eq "allow
") { $allow = 1; }
1374 if ($allow && ! $remove{$tag}) {
1375 if ($opts->{'tablecheck'}) {
1379 # can't open table elements from outside a table
1380 ($tag =~ /^(?:tbody|thead|tfoot|tr|td|th|caption|colgroup|col)$/ && ! @tablescope) ||
1382 # can't open td or th if not inside tr
1383 ($tag =~ /^(?:td|th)$/ && ! $tablescope[-1]->{'tr'}) ||
1385 # can't open a table unless inside a td or th
1386 ($tag eq 'table' && @tablescope && ! grep { $tablescope[-1]->{$_} } qw(td th));
1389 if ($allow) { $newtag .= "<$tag"; }
1390 else { $newtag .= "<$tag"; }
1392 # output attributes in original order, but only those
1393 # that are allowed (by still being in %$hash after cleaning)
1395 $newtag .= " $_=\"" . LJ
::ehtml
($hash->{$_}) . "\""
1396 if exists $hash->{$_};
1399 # ignore the effects of slashclose unless we're dealing with a tag that can
1400 # actually close itself. Otherwise, a tag like <em /> can pass through as valid
1401 # even though some browsers just render it as an opening tag
1402 if ($slashclose && $tag =~ $slashclose_tags) {
1405 $tablescope[-1]->{$tag}-- if $opts->{'tablecheck'} && @tablescope;
1411 # maintain current table scope
1412 if ($opts->{'tablecheck'}) {
1415 if ($tag eq 'table') {
1416 push @tablescope, {};
1418 # new tag within current table
1419 } elsif (@tablescope) {
1420 $tablescope[-1]->{$tag}++;
1425 else { $newtag .= ">"; }
1427 # change iframe with video to placeholder according to user settings
1428 if ( lc $tag eq 'iframe' && $opts->{video_placeholders
} ) {
1429 my $width = $hash->{width
};
1430 my $height = $hash->{height
};
1433 $width = 960 if $width > 960;
1434 $height = 750 if $height > 750;
1436 $width = $width =~ /^\d+$/ ?
$width : 320;
1437 $height = $height =~ /^\d+$/ ?
$height : 240;
1439 $newdata .= LJ
::placeholder_link
(
1440 placeholder_html
=> $newtag,
1443 img
=> "$LJ::IMGPREFIX/videoplaceholder.png",
1444 remove_video_sizes
=> $opts->{remove_video_sizes
},
1449 $newdata .= $newtag;
1456 elsif ($type eq "E")
1458 my $tag = $token->[1];
1459 next TOKEN
if $tag =~ /[^\w\-:]/;
1461 push @capture, $token if $capturing_during_eat;
1463 if ($eatuntil[-1] eq $tag) {
1465 if (my $cb = $capturing_during_eat) {
1467 $finish_capture->();
1472 next TOKEN
if @eatuntil;
1475 # if we're just getting the contents of a cut tag, then pop the
1476 # tag off the stack. if this is the last tag on the stack, then
1477 # go back to eating the rest of the content.
1478 if ( @cuttag_stack ) {
1479 if ( $cuttag_stack[-1] eq $tag ) {
1481 last TOKEN
unless ( @cuttag_stack );
1489 if ($eating_ljuser_span && $tag eq "span") {
1490 $eating_ljuser_span = 0;
1491 $newdata .= $opts->{'textonly'} ?
$ljuser_text_node : LJ
::ljuser
($ljuser_text_node);
1495 if ( $opts->{'img_placeholders'} ) {
1496 if ( $tag eq 'a' && $in_link ) {
1503 next TOKEN
if $text_a_link;
1507 if ($tag eq "lj-raw") {
1509 $tablescope[-1]->{$tag}-- if $opts->{'tablecheck'} && @tablescope;
1511 elsif ($tag eq "lj-cut") {
1512 if ($opts->{'cutpreview'}) {
1513 $newdata .= "<b></lj-cut></b>";
1515 $newdata .= "<a name='cutid$cutcount-end'></a>"
1518 elsif ($tag eq "lj-repost" and $ljrepost_allowed and ref $opencount{$tag}) {
1519 ## Add repost button
1520 ## If there was an opening <lj-repost> tag, then $opencount{$tag} exists.
1522 my $button = LJ
::ehtml
($opencount{$tag}->{button
}) || LJ
::Lang
::ml
("repost.default_button");
1523 my $subject = LJ
::ehtml
($opencount{$tag}->{subject
});
1524 my $captured = substr $newdata => $opencount{$tag}->{offset
};
1526 if ($captured and my $entry = LJ
::Entry
->new_from_url($opts->{cuturl
})){
1527 # !!! avoid calling any 'text' methods on $entry,
1528 # it can produce an infinite loop of cleanhtml calls.
1531 $subject = LJ
::ehtml
($entry->subject_raw || LJ
::Lang
::ml
("repost.default_subject"));
1534 if ($subject && Encode
::is_utf8
($subject)) {
1535 $subject = Encode
::encode_utf8
($subject);
1538 ## 'posterid' property of a removed (is_valid eq 'false') entry is empty.
1539 my $poster_username = $entry->poster
1540 ?
$entry->poster->username
1543 LJ
::EmbedModule
->add_user_to_embed($poster_username, \
$captured);
1544 $captured = LJ
::Lang
::ml
("repost.wrapper", {
1545 username
=> $poster_username,
1547 subject
=> $subject,
1548 text
=> Encode
::encode_utf8
($captured),
1551 $captured = Encode
::decode_utf8
($captured);
1552 $subject = Encode
::decode_utf8
($subject) if $subject;
1555 $captured = LJ
::ehtml
($captured);
1557 # add <form> with invisible fields and visible submit button
1560 <form action
="http://www.$LJ::DOMAIN/update.bml" method
="POST">
1561 <div style
="display:none;visible:false">
1562 <input type
="text" name
="subject" value
="$subject" />
1563 <textarea name
="event">$captured</textarea
>
1564 <input type
="hidden" name
="repost" value
="$opts->{cuturl}" />
1565 <input type
="hidden" name
="repost_type" value
="a" />
1567 <input type
="submit" value
="$button" />
1570 ## treat <lj-repost></lj-repost> as <lj-repost />
1571 $newdata .= qq[<form action
="http://www.$LJ::DOMAIN/update.bml" method
="GET">]
1572 . qq[<input type
="hidden" name
="repost" value
="$opts->{cuturl}" />]
1573 . qq[<input type
="hidden" name
="repost_type" value
="a" />]
1574 . qq(<input type
="submit" value
="$button" /> )
1578 delete $opencount{$tag};
1580 } elsif ( $tag eq 'lj-lang' ) {
1582 } elsif ( $tag eq 'lj-lang-container' ) {
1583 shift @lj_lang_otherwise;
1584 } elsif ( $tag eq 'lj-spoiler' ) {
1585 if ($ljspoiler_allowed && $ljspoilers_open) {
1586 $newdata .= qq{</div></div
>};
1589 } elsif ( $tag eq 'lj-quote' ) {
1590 next TOKEN
unless $opencount{'lj-quote'};
1592 if ($opencount{'lj-quote-block'}) {
1593 $newdata .= qq{</blockquote
>};
1594 $opencount{'lj-quote-block'}--;
1596 $newdata .= qq{</div
>};
1597 $opencount{'lj-quote'}--;
1598 } elsif ( $tag eq 'lj-quote-cite' ) {
1599 next TOKEN
unless $opencount{'lj-quote-cite'};
1601 $newdata .= qq{</cite
>};
1602 $opencount{'lj-quote-cite'}--;
1604 $newdata .= qq{<blockquote
class="b-journalblockquote-quote">};
1605 $opencount{'lj-quote-block'}++;
1606 } elsif ( $tag eq 'lj-lead' ) {
1607 next TOKEN
unless $opencount{'lj-lead'};
1609 $newdata .= qq{</div
>};
1610 $opencount{'lj-lead'}--;
1611 } elsif ( $tag eq 'lj-gallery' ) {
1612 next TOKEN
unless $opencount{'lj-gallery'};
1613 undef $opencount{'lj-gallery'};
1614 $newdata .= qq{</lj
-gallery
>};
1615 } elsif ( $tag eq 'lj-gallery-item' ) {
1616 $newdata .= qq{</lj-gallery-item-capture></lj
-gallery
-item
>};
1617 } elsif ( $tag eq 'lj-image' ) {
1618 $newdata .= qq{</figcaption></figure
>};
1620 if ($mode eq "allow") {
1622 if ($action{$tag} eq "deny") { $allow = 0; }
1625 if ($action{$tag} eq "allow") { $allow = 1; }
1628 if ($extractlinks && $tag eq "a") {
1629 if (@canonical_urls) {
1630 my $url = LJ
::ehtml
(pop @canonical_urls);
1631 $newdata .= "</b> ($url)";
1636 if ($allow && ! $remove{$tag})
1639 if ($opts->{'tablecheck'}) {
1643 # can't close table elements from outside a table
1644 ($tag =~ /^(?:table|tbody|thead|tfoot|tr|td|th|caption|colgroup|col)$/ && ! @tablescope) ||
1646 # can't close td or th unless open tr
1647 ($tag =~ /^(?:td|th)$/ && ! $tablescope[-1]->{'tr'});
1650 if ($allow && ! ($opts->{'noearlyclose'} && ! $opencount{$tag})) {
1652 # maintain current table scope
1653 if ($opts->{'tablecheck'}) {
1656 if ($tag eq 'table') {
1659 # closing tag within current table
1660 } elsif (@tablescope) {
1661 $tablescope[-1]->{$tag}--;
1665 $newdata .= "</$tag>";
1668 $newdata .= "</$tag>";
1673 elsif ($type eq "D") {
1674 # remove everything past first closing tag
1675 $token->[1] =~ s/>.+/>/s;
1676 # kill any opening tag except the starting one
1677 $token->[1] =~ s/.<//sg;
1678 $newdata .= $token->[1];
1680 elsif ($type eq "T") {
1686 push @capture, $token if $capturing_during_eat;
1694 if ( $opts->{'img_placeholders'} ) {
1695 if ( $in_link && $img_link ) {
1696 $newdata .= qq~<a href
="$href_b_link">~
1704 if ($eating_ljuser_span) {
1705 $ljuser_text_node = $token->[1];
1709 if ($opencount{'style'} && $LJ::DEBUG
{'s1_style_textnode'}) {
1710 my $uri = LJ
::Request
->uri;
1711 my $host = LJ
::Request
->header_in("Host");
1712 warn "Got text node while style elements open. Shouldn't happen anymore. ($host$uri)\n";
1715 my $auto_format = $addbreaks &&
1716 ($opencount{'table'} <= ($opencount{'td'} + $opencount{'th'})) &&
1717 ! $opencount{'pre'} &&
1718 ! $opencount{'lj-raw'};
1720 if ($auto_format && ! $noautolinks && ! $opencount{'a'} && ! $opencount{'textarea'}) {
1724 if ($str =~ /^(.*?)(&(#39|quot|lt|gt)(;.*)?)$/) {
1725 $url{++$urlcount} = $1;
1728 $url{++$urlcount} = $str;
1730 $nofollow{$urlcount} = 0;
1731 if ($put_nofollow and $url{$urlcount} =~ m!^https?://([^/]+?)(/.*)?$!) {
1733 unless ($host =~ /\Q$LJ::DOMAIN\E$/i) {
1734 $nofollow{$urlcount} = 1;
1737 return "&url$urlcount;$url{$urlcount}&urlend;$end";
1739 ## URL is http://anything-here-but-space-and-quotes/and-last-symbol-isn't-space-comma-period-etc
1740 ## like this (http://example.com) and these: http://foo.bar, http://bar.baz.
1741 $token->[1] =~ s!(https?://[^\s\'\"\<\>]+[^\s\'\"\<\>\.\,\?\:\)])! $match->($1); !ge;
1744 # escape tags in text tokens. shouldn't belong here!
1745 # especially because the parser returns things it's
1746 # confused about (broken, ill-formed HTML) as text.
1747 $token->[1] =~ s/</</g;
1748 $token->[1] =~ s/>/>/g;
1750 # put <wbr> tags into long words, except inside <pre> and <textarea>.
1751 if ($wordlength && !$opencount{'pre'} && !$opencount{'textarea'}) {
1752 $token->[1] =~ s/(\S{$wordlength,})/break_word($1,$wordlength)/eg;
1755 if ($auto_format && ! $noautolinks && ! $opencount{'a'} && ! $opencount{'textarea'}) {
1756 ## Convert %username%.жж.рф and %username%.живойжурнал.рф to urls
1757 $token->[1] =~ s/(?<!http:\/\/)\b([\w
]+\
.\x
{0436}\x
{0436}\
.\x
{0440}\x
{0444})/<a href="http:\/\
/$1">$1<\/a>/g
;
1758 $token->[1] =~ s/(?<!http:\/\/)\b([\w
]+\
.\x
{0436}\x
{0438}\x
{0432}\x
{043E
}\x
{0439}\x
{0436}\x
{0443}\x
{0440}\x
{043D
}\x
{0430}\x
{043B
}\
.\x
{0440}\x
{0444})/<a href="http:\/\
/$1">$1<\/a>/g
;
1761 # auto-format things, unless we're in a textarea, when it doesn't make sense
1762 if ($auto_format && !$opencount{'textarea'}) {
1763 $token->[1] =~ s/\r?\n/<br \/>/g
;
1764 if (! $opencount{'a'}) {
1766 my ($key, $title) = @_;
1767 my $nofollow = $nofollow{$key} ?
" rel='nofollow'" : "";
1768 return "<a href='$url{$key}'$nofollow>$title</a>";
1770 $token->[1] =~ s
|&url
(\d
+);(.*?
)&urlend
;|$tag_a->($1,$2)|ge;
1774 $newdata .= $token->[1];
1776 elsif ($type eq "C") {
1778 # probably a malformed tag rather than a comment, so escape it
1779 # -- ehtml things like "<3", "<--->", "<>", etc
1780 # -- comments must start with <! to be eaten
1781 if ($token->[1] =~ /^<[^!]/) {
1782 $newdata .= LJ
::ehtml
($token->[1]);
1784 # by default, ditch comments
1785 } elsif ($keepcomments) {
1786 my $com = $token->[1];
1787 $com =~ s/^<!--\s*//;
1788 $com =~ s/\s*--!>$//;
1791 $newdata .= "<!-- $com -->";
1794 elsif ($type eq "PI") {
1795 my $tok = $token->[1];
1798 $newdata .= "<?$tok>";
1801 $newdata .= "<!-- OTHER: " . $type . "-->\n";
1805 # finish up open links if we're extracting them
1806 if ($extractlinks && @canonical_urls) {
1807 while (my $url = LJ
::ehtml
(pop @canonical_urls)) {
1808 $newdata .= "</b> ($url)";
1813 # close any tags that were opened and not closed
1814 # don't close tags that don't need a closing tag -- otherwise,
1815 # we output the closing tags in the wrong place (eg, a </td>
1816 # after the <table> was closed) causing unnecessary problems
1817 if (ref $opts->{'autoclose'} eq "ARRAY") {
1818 foreach my $tag (@
{$opts->{'autoclose'}}) {
1819 next if $tag =~ /^(?:tr|td|th|tbody|thead|tfoot|li)$/o;
1820 if ($opencount{$tag}) {
1821 $newdata .= "</$tag>" x
$opencount{$tag};
1826 if ($ljspoilers_open) {
1827 $newdata .= qq{</div></div
>} x
$ljspoilers_open;
1830 if ($opencount{'lj-quote-cite'}) {
1831 $newdata .= qq{</cite
>} x
$opencount{'lj-quote-cite'};
1834 if ($opencount{'lj-quote-block'}) {
1835 $newdata .= qq{</blockquote
>} x
$opencount{'lj-quote-block'};
1838 if ($opencount{'lj-quote'}) {
1839 $newdata .= qq{</div
>} x
$opencount{'lj-quote'};
1842 if ($opencount{'lj-lead'}) {
1843 $newdata .= qq{</div
>} x
$opencount{'lj-lead'};
1846 # extra-paranoid check
1847 1 while $newdata =~ s/<script\b//ig;
1849 $newdata =~ s/<x-vk-like id="(\d+)">/$vkontakte_like_js{$1}/eg;
1852 $$data .= $extra_text if $extra_text; # invalid markup error
1854 # encode data back to utf8 before return
1855 $$data = Encode
::encode_utf8
($$data);
1858 my $msg = qq{<div style
="color: #000; font: 12px Verdana, Arial, Sans-Serif; background-color: #ffeeee; background-repeat: repeat-x; border: 1px solid #ff9999; padding: 8px; margin: 5px auto; width: auto; text-align: left; background-image: url('$LJ::IMGPREFIX/message-error.gif?v=4888');">};
1859 my $link_style = "color: #00c; text-decoration: underline; background: transparent; border: 0;";
1861 if ($unsuspend_supportid) {
1862 $msg .= LJ
::Lang
::ml
('cleanhtml.suspend_msg_with_supportid', { aopts
=> "href='$LJ::SITEROOT/support/see_request.bml?id=$unsuspend_supportid' style='$link_style'" });
1864 $msg .= LJ
::Lang
::ml
('cleanhtml.suspend_msg', { aopts
=> "href='$LJ::SITEROOT/abuse/report.bml' style='$link_style'" });
1869 $$data = $msg . $$data;
1876 # takes a reference to HTML and a base URL, and modifies HTML in place to use absolute URLs from the given base
1877 sub resolve_relative_urls
{
1878 my ($data, $base) = @_;
1879 my $p = HTML
::TokeParser
->new($data);
1881 # where we look for relative URLs
1891 my $global_did_mod = 0;
1892 my $base_uri = undef; # until needed
1896 while (my $token = $p->get_token)
1898 my $type = $token->[0];
1900 if ($type eq "S") # start tag
1902 my $tag = $token->[1];
1903 my $hash = $token->[2]; # attribute hashref
1904 my $attrs = $token->[3]; # attribute names, in original order
1907 # see if this is a tag that could contain relative URLs we fix up.
1908 if (my $relats = $rel_source->{$tag}) {
1909 while (my $k = each %$relats) {
1910 next unless defined $hash->{$k} && $hash->{$k} !~ /^[a-z]+:/;
1911 my $rel_url = $hash->{$k};
1912 $global_did_mod = $did_mod = 1;
1914 $base_uri ||= URI
->new($base);
1915 $hash->{$k} = URI
->new_abs($rel_url, $base_uri)->as_string;
1919 # if no change was necessary
1921 $newdata .= $token->[4];
1925 # otherwise, rebuild the opening tag
1927 # for tags like <name/>, pretend it's <name> and reinsert the slash later
1928 my $slashclose = 0; # If set to 1, use XML-style empty tag marker
1929 $slashclose = 1 if $tag =~ s!/$!!;
1930 $slashclose = 1 if delete $hash->{'/'};
1933 $newdata .= "<$tag";
1934 # output attributes in original order
1936 $newdata .= " $_=\"" . LJ
::ehtml
($hash->{$_}) . "\""
1937 if exists $hash->{$_};
1939 $newdata .= " /" if $slashclose;
1942 elsif ($type eq "E") {
1943 $newdata .= $token->[2];
1945 elsif ($type eq "D") {
1946 $newdata .= $token->[1];
1948 elsif ($type eq "T") {
1949 $newdata .= $token->[1];
1951 elsif ($type eq "C") {
1952 $newdata .= $token->[1];
1954 elsif ($type eq "PI") {
1955 $newdata .= $token->[2];
1959 $$data = $newdata if $global_did_mod;
1964 my @args = grep { $_ } split(/\//, $_[0]);
1965 my $mode = shift @args;
1972 return "support/faq/$id.html";
1974 return "support/faq/";
1978 my $user = LJ
::canonical_username
(shift);
1980 return "memories.bml?user=$user";
1982 return "memories.bml";
1986 my $user = LJ
::canonical_username
(shift);
1988 return "pubkey.bml?user=$user";
1990 return "pubkey.bml";
1996 return "support/see_request.bml?id=$id";
2002 my $user = LJ
::canonical_username
(shift);
2004 return "todo/?user=$user";
2010 my $user = LJ
::canonical_username
(shift);
2011 return "" if grep { /[\"\'\<\>\n\&]/ } @_;
2012 return $_[0] eq 'profile' ?
2013 "userinfo.bml?user=$user" :
2014 "users/$user/" . join("", map { "$_/" } @_ );
2017 my $user = LJ
::canonical_username
(shift);
2019 return "userinfo.bml?user=$user";
2021 return "userinfo.bml";
2025 my $user = LJ
::canonical_username
(shift);
2027 return "allpics.bml?user=$user";
2029 return "allpics.bml";
2034 my $uri = $modes{$mode} ?
$modes{$mode}->(@args) : "error:bogus-lj-url";
2036 return "$LJ::SITEROOT/$uri";
2039 my $subject_eat = [qw
[head title style layer iframe applet object param base
]];
2040 my $subject_allow = [qw
[a b i u em strong cite
]];
2041 my $subject_remove = [qw
[bgsound embed object caption
link font noscript lj
-userpic
]];
2044 return unless $$ref =~ /[\<\>]/;
2045 my $opts = shift || {};
2050 'eat' => $subject_eat,
2052 'allow' => $subject_allow,
2053 'remove' => $subject_remove,
2054 'autoclose' => $subject_allow,
2055 'noearlyclose' => 1,
2056 'remove_attribs' => [qw
/id class style/],
2061 ## returns a pure text subject (needed in links, email headers, etc...)
2062 my $subjectall_eat = [qw
[head title style layer iframe applet object lj
-spoiler
]];
2063 sub clean_subject_all
{
2065 return unless $$ref =~ /[\<\>]/;
2069 'eat' => $subjectall_eat,
2072 'autoclose' => $subject_allow,
2073 'noearlyclose' => 1,
2077 # wrapper around clean_subject_all; this also trims the subject to the given length
2078 sub clean_and_trim_subject
{
2080 my $length = shift || 40;
2082 LJ
::CleanHTML
::clean_subject_all
($ref);
2084 $$ref = LJ
::text_trim
($$ref, 0, $length);
2087 my $event_eat = [qw
[head title style layer applet object xml param base
]];
2088 my $event_remove = [qw
[bgsound embed object
link body meta noscript plaintext noframes
]];
2090 my @comment_close = qw(
2091 a sub sup xmp bdo q span
2092 b i u tt s strike big small font
2093 abbr acronym cite code dfn em kbd samp strong var del ins
2094 h1 h2 h3 h4 h5 h6 div blockquote address pre center
2096 table tr td th tbody tfoot thead colgroup caption
2097 area map form textarea blink
2099 my @comment_all = (@comment_close, qw{img br hr p col iframe audio video source
});
2101 my $userbio_eat = $event_eat;
2102 my $userbio_remove = $event_remove;
2103 my @userbio_close = @comment_close;
2106 my ($ref, $opts) = @_;
2108 # old prototype was passing in the ref and preformatted flag.
2109 # now the second argument is a hashref of options, so convert it to support the old way.
2110 unless (ref $opts eq "HASH") {
2111 $opts = { 'preformatted' => $opts };
2114 my $wordlength = defined $opts->{'wordlength'} ?
$opts->{'wordlength'} : 40;
2116 # fast path: no markup or URLs to linkify, and no suspend message needed
2117 if ($$ref !~ /\<|\>|http/ && $$ref !~ /(.*?)\.?жж\.рф/ && $$ref !~ /(.*?)\.?живойжурнал\.рф/ && ! $opts->{preformatted
} && !$opts->{suspend_msg
}) {
2118 $$ref =~ s/(\S{$wordlength,})/break_word($1,$wordlength)/eg if $wordlength;
2119 $$ref =~ s/\r?\n/<br \/>/g
;
2123 my $cleancss = $opts->{'journalid'} ?
2124 ! $LJ::STYLE_TRUSTED
{ $opts->{'journalid'} } : 0;
2126 my $strongcleancss = $cleancss;
2128 my $poster = LJ
::load_userid
( $opts->{'posterid'} );
2129 my $journal = LJ
::load_userid
( $opts->{'journalid'} );
2130 my $active_journal = LJ
::get_active_journal
();
2132 $poster->get_cap('no_strong_clean_css') &&
2133 $poster->equals($journal) &&
2134 $poster->equals($active_journal) )
2136 $strongcleancss = 0;
2139 # slow path: needs to be run through the cleaner
2142 'wordlength' => $wordlength,
2143 'addbreaks' => $opts->{'preformatted'} ?
0 : 1,
2144 'cutpreview' => $opts->{'cutpreview'},
2145 'posterid' => $opts->{'posterid'},
2146 'eat' => $event_eat,
2148 'remove' => $event_remove,
2149 'autoclose' => \
@comment_close,
2150 'cleancss' => $cleancss,
2151 'strongcleancss' => $strongcleancss,
2152 'noearlyclose' => 1,
2154 'ljrepost_allowed' => 1,
2159 sub pre_clean_event_for_entryform
{
2162 ## fast path - no html tags
2163 return unless $$ref =~ /</;
2166 my $data = Encode
::decode_utf8
($$ref);
2167 my $p = HTML
::TokeParser
->new(\
$data);
2171 while (my $token = $p->get_token) {
2172 my $type = $token->[0];
2175 my $tag = $token->[1];
2176 my $hash = $token->[2]; # attributes
2177 my $attrs = $token->[3]; # attribute names, in original order
2180 if ($tag eq 'script') {
2181 $p->get_tag("/$tag");
2184 if ($tag eq 'meta') {
2187 if ($tag =~ /:set$/) {
2190 unless ($tag =~ /^\w([\w\-:_]*\w)?\/?
$/) {
2194 my $autoclose = delete $hash->{'/'};
2195 foreach my $attr (keys %$hash) {
2196 if ($attr =~ /^(?:on|dynsrc)/) {
2197 delete $hash->{$attr};
2199 } elsif ($attr eq 'href' || $attr eq 'src') {
2200 if ($hash->{$attr} =~ /^data/) {
2201 delete $hash->{$attr};
2205 if ($attr =~ /(?:^=)|[\x0b\x0d]/) {
2208 unless ($attr =~ /^[\w_:-]+$/) {
2209 delete $hash->{$attr};
2212 my $tmp = $hash->{$attr};
2213 $tmp =~ s/[\t\n\0]//g;
2214 if ($tmp =~ /(?:jscript|livescript|javascript|vbscript|about):/ix) {
2215 delete $hash->{$attr};
2218 ## TODO: css & xslt js expressions
2220 ## reconstruct the tag
2221 $newdata .= "<$tag";
2223 $newdata .= " $_=\"" . LJ
::ehtml
($hash->{$_}) . "\"" if exists $hash->{$_};
2225 $newdata .= ($autoclose) ?
" />" : ">";
2226 } elsif ($type eq 'E' or $type eq 'PI') {
2227 ## close (end) tags and processing instructions
2228 $newdata .= $token->[2];
2230 $newdata .= $token->[1];
2234 # extra-paranoid check
2235 1 while $newdata =~ s/<script\b//ig;
2237 $$ref = Encode
::encode_utf8
($newdata);
2240 sub get_okay_comment_tags
{
2241 return @comment_all;
2245 # ref: scalarref of text to clean, gets cleaned in-place
2246 # opts: either a hashref of opts:
2247 # - preformatted: if true, don't insert breaks and auto-linkify
2248 # - anon_comment: don't linkify things, and prevent <a> tags
2249 # or, opts can just be a boolean scalar, which implies the preformatted flag
2251 my ($ref, $opts) = @_;
2253 unless (ref $opts) {
2254 $opts = { 'preformatted' => $opts,
2258 # fast path: no markup or URLs to linkify
2259 if ($$ref !~ /\<|\>|http/ && $$ref !~ /(.*?)\.?жж\.рф/ && $$ref !~ /(.*?)\.?живойжурнал\.рф/ && ! $opts->{preformatted
}) {
2260 $$ref =~ s/(\S{40,})/break_word($1,40)/eg;
2261 $$ref =~ s/\r?\n/<br \/>/g
;
2265 # slow path: needs to be run through the cleaner
2266 return clean
($ref, {
2269 'addbreaks' => $opts->{preformatted
} ?
0 : 1,
2270 'eat' => [qw
[head title style layer applet object
]],
2272 'allow' => \
@comment_all,
2273 'autoclose' => \
@comment_close,
2275 'strongcleancss' => $opts->{'blocked_content'} ?
0 : 1,
2276 'extractlinks' => $opts->{'anon_comment'},
2277 'extractimages' => $opts->{'anon_comment'},
2278 'noearlyclose' => 1,
2280 'nocss' => $opts->{'nocss'},
2281 'textonly' => $opts->{'textonly'} ?
1 : 0,
2282 'remove_positioning' => 1,
2283 'posterid' => $opts->{'posterid'},
2284 'img_placeholders' => $opts->{'img_placeholders'},
2285 'video_placeholders' => $opts->{'video_placeholders'},
2286 'remove_img_sizes' => $opts->{'remove_img_sizes'},
2287 'remove_video_sizes' => $opts->{'remove_video_sizes'},
2288 'no_encode' => $opts->{'no_encode'},
2292 # ref: scalarref of text to clean, gets cleaned in-place
2294 my ($ref, $opts) = @_;
2296 # slow path: needs to be run through the cleaner
2297 return clean
($ref, {
2301 'eat' => [qw
[head title style layer applet object
]],
2303 'allow' => \
@comment_all,
2304 'autoclose' => \
@comment_close,
2306 'strongcleancss' => 1,
2307 'noearlyclose' => 1,
2309 'nocss' => $opts->{'nocss'},
2310 'textonly' => $opts->{'textonly'} ?
1 : 0,
2311 'remove_positioning' => 1,
2316 my ($ref, %opts) = @_;
2318 return undef unless ref $ref;
2321 'wordlength' => 100,
2323 'attrstrip' => [qw
[style
]],
2325 'noearlyclose' => 1,
2327 'eat' => $userbio_eat,
2328 'remove' => $userbio_remove,
2329 'autoclose' => \
@userbio_close,
2334 clean
($ref, \
%final_opts);
2337 sub clean_s1_style
{
2342 LJ
::parse_vars
(\
$s1, \
%tmpl);
2343 foreach my $v (keys %tmpl) {
2345 'eat' => [qw
[layer script object embed applet
]],
2347 'keepcomments' => 1, # allows CSS to work
2348 'clean_js_css' => 1,
2353 return Storable
::nfreeze
(\
%tmpl);
2356 sub s1_attribute_clean
{
2359 $a =~ s/\"/"/g;
2360 $a =~ s/\'/&\#39;/g;
2365 if ($a =~ /((?
:(?
:v\s
*b
)|(?
:j\s
*a\s
*v\s
*a
))\s
*s\s
*c\s
*r\s
*i\s
*p\s
*t
|
2366 a\s
*b\s
*o\s
*u\s
*t
)\s
*:/ix
) { return ""; }
2372 my $allow_all = shift;
2374 # strip leading and trailing spaces
2378 return '' unless $url;
2380 unless ($allow_all) {
2381 # see what protocol they want, default to http
2383 $pref = $1 if $url =~ /^(https?|ftp|webcal):/;
2385 # strip out the protocol section
2386 $url =~ s!^.*?:/*!!;
2388 return '' unless $url;
2391 $url = "$pref://$url";
2394 if ($LJ::DEBUG
{'aol_http_to_ftp'}) {
2395 # aol blocks http referred from lj, but ftp has no referer header.
2396 if ($url =~ m!^http://(?:www\.)?(?:members|hometown|users)\.aol\.com/!) {
2397 $url =~ s!^http!ftp!;
2405 my ($word, $at) = @_;
2406 return $word unless $at;
2408 $word =~ s/((?:$onechar){$at})\B/$1<wbr \/>/g
;
2416 my @tags_remove = qw(bgsound embed object link body meta noscript plaintext noframes);
2417 my @tags_allow = qw(lj);
2419 LJ
::CleanHTML
::clean
($ref, {
2421 'wordlength' => 160,
2422 'undefined_tags' => 'eat',
2423 'allow' => \
@tags_allow,
2424 'remove' => \
@tags_remove,
2426 'noearlyclose' => 1,
2431 # Trimming should really be part of the CleanHTML::clean method,
2432 # but that method is currently too complicated to do this the right way.
2433 # For now, just cut off the last broken (unclosed) tag.
2436 my $trunc = LJ
::text_trim
($$ref, 640, 320);
2437 if ($$ref ne $trunc) {
2438 $trunc =~ s/(\W+\w+)$//; # cut off last space and chars right from it.
2440 # cut off last unclosed tag
2441 if ($trunc =~ m!\</?([^>]+)$!) { # ... <tag or ... </tag
2443 $trunc =~ s!</?\Q$tag\E>?.*?$!!;
2446 # add '...' to the tail
2447 $$ref = $trunc . ' ...';