3 require_once 'HTMLPurifier/AttrDef.php';
4 require_once 'HTMLPurifier/URIScheme.php';
5 require_once 'HTMLPurifier/URISchemeRegistry.php';
6 require_once 'HTMLPurifier/AttrDef/URI/Host.php';
7 require_once 'HTMLPurifier/PercentEncoder.php';
// Registers the URI.DefaultScheme directive. The arguments appear to be
// (namespace, name, default value, type, description) -- confirm against
// HTMLPurifier_ConfigSchema::define(). validate() in this file reads the
// directive to pick a scheme object when the URI carries no scheme.
// NOTE(review): the closing line of this call is not present in this excerpt.
9 HTMLPurifier_ConfigSchema
::define(
10 'URI', 'DefaultScheme', 'http', 'string',
11 'Defines through what scheme the output will be served, in order to '.
12 'select the proper object validator when no scheme information is present.'
// Registers the URI.Host directive (string or null, default null).
// validate() in this file reads it when external links or external resources
// are disabled: an absolute URI is rejected outright if no host is configured,
// and otherwise its host is compared label by label, right to left, against
// this value, so subdomains of the configured host pass and parent domains
// do not.
// The description previously read "whether or an absolute URI is ... or not";
// the stray "or" has been removed.
HTMLPurifier_ConfigSchema::define(
    'URI', 'Host', null, 'string/null',
    'Defines the domain name of the server, so we can determine whether '.
    'an absolute URI is from your website or not. Not strictly necessary, '.
    'as users should be using relative URIs to reference resources on your '.
    'website. It will, however, let you use absolute URIs to link to '.
    'subdomains of the domain you post here: i.e. example.com will allow '.
    'sub.example.com. However, higher up domains will still be excluded: '.
    'if you set %URI.Host to sub.example.com, example.com will be blocked. '.
    'This directive has been available since 1.2.0.'
);
// Registers the URI.DisableExternal directive (bool, default false).
// validate() in this file consults it for every URI that has an authority
// part: when set, the URI is only kept if URI.Host is configured and the
// URI host matches it.
// The description fragments are joined with the concatenation operator, which
// inserts nothing between them, so each fragment must carry its own trailing
// space. The fragment ending in "no" lacked one and rendered as "nolinks".
HTMLPurifier_ConfigSchema::define(
    'URI', 'DisableExternal', false, 'bool',
    'Disables links to external websites. This is a highly effective '.
    'anti-spam and anti-pagerank-leech measure, but comes at a hefty price: no '.
    'links or images outside of your domain will be allowed. Non-linkified '.
    'URIs will still be preserved. If you want to be able to link to '.
    'subdomains or use absolute URIs, specify %URI.Host for your website. '.
    'This directive has been available since 1.2.0.'
);
// Remaining URI directive registrations. Each call appears to take
// (namespace, name, default value, type, description) -- confirm against
// HTMLPurifier_ConfigSchema::define().
// NOTE(review): the closing line of each call is not present in this excerpt.

// URI.DisableExternalResources (bool, default false). validate() combines it
// with the embeds_resource flag when deciding whether an absolute URI must
// match URI.Host.
37 HTMLPurifier_ConfigSchema
::define(
38 'URI', 'DisableExternalResources', false, 'bool',
39 'Disables the embedding of external resources, preventing users from '.
40 'embedding things like images from other hosts. This prevents '.
41 'access tracking (good for email viewers), bandwidth leeching, '.
42 'cross-site request forging, goatse.cx posting, and '.
43 'other nasties, but also results in '.
44 'a loss of end-user functionality (they can\'t directly post a pic '.
45 'they posted from Flickr anymore). Use it if you don\'t have a '.
46 'robust user-content moderation team. This directive has been '.
47 'available since 1.3.0.'
// URI.DisableResources (bool, default false). No visible line of this excerpt
// reads it; presumably it is consumed elsewhere -- verify.
50 HTMLPurifier_ConfigSchema
::define(
51 'URI', 'DisableResources', false, 'bool',
52 'Disables embedding resources, essentially meaning no pictures. You can '.
53 'still link to them though. See %URI.DisableExternalResources for why '.
54 'this might be a good idea. This directive has been available since 1.3.0.'
// URI.Munge (string or null, default null). validate() substitutes the
// rawurlencode()d result for the %s placeholder of this template.
57 HTMLPurifier_ConfigSchema
::define(
58 'URI', 'Munge', null, 'string/null',
59 'Munges all browsable (usually http, https and ftp) URI\'s into some URL '.
60 'redirection service. Pass this directive a URI, with %s inserted where '.
61 'the url-encoded original URI should be inserted (sample: '.
62 '<code>http://www.google.com/url?q=%s</code>). '.
63 'This prevents PageRank leaks, while being as transparent as possible '.
64 'to users (you may also want to add some client side JavaScript to '.
65 'override the text in the statusbar). Warning: many security experts '.
66 'believe that this form of protection does not deter spam-bots. '.
67 'You can also use this directive to redirect users to a splash page '.
68 'telling them they are leaving your website. '.
69 'This directive has been available since 1.3.0.'
// URI.HostBlacklist (list, default empty array). checkBlacklist() reads it
// and tests each entry as a substring of the host.
72 HTMLPurifier_ConfigSchema
::define(
73 'URI', 'HostBlacklist', array(), 'list',
74 'List of strings that are forbidden in the host of any URI. Use it to '.
75 'kill domain names of spam, etc. Note that it will catch anything in '.
76 'the domain, so <tt>moo.com</tt> will catch <tt>moo.com.example.com</tt>. '.
77 'This directive has been available since 1.3.0.'
// URI.Disable (bool, default false). validate() returns false immediately
// when it is set.
80 HTMLPurifier_ConfigSchema
::define(
81 'URI', 'Disable', false, 'bool',
82 'Disables all URIs in all forms. Not sure why you\'d want to do that '.
83 '(after all, the Internet\'s founded on the notion of a hyperlink). '.
84 'This directive has been available since 1.3.0.'
// Alias between Attr.DisableURI and URI.Disable; presumably the first pair
// is the old name and the second the target -- confirm against defineAlias().
86 HTMLPurifier_ConfigSchema
::defineAlias('Attr', 'DisableURI', 'URI', 'Disable');
89 * Validates a URI as defined by RFC 3986.
90 * @note Scheme-specific mechanics deferred to HTMLPurifier_URIScheme
92 class HTMLPurifier_AttrDef_URI
extends HTMLPurifier_AttrDef
100 * @param $embeds_resource Does the URI here result in an extra HTTP request?
// Old-style constructor: the method shares the name of the class.
// Creates the host validator and the percent-encoding helper used by
// validate(), and stores the embeds_resource flag cast to bool (default false).
// NOTE(review): PHP 8 no longer treats a same-named method as a constructor,
// so these properties would stay unset there unless a __construct() exists
// elsewhere -- verify against the full file.
// NOTE(review): the closing line of this method is not present in this excerpt.
102 function HTMLPurifier_AttrDef_URI($embeds_resource = false) {
103 $this->host
= new HTMLPurifier_AttrDef_URI_Host();
104 $this->PercentEncoder
= new HTMLPurifier_PercentEncoder();
105 $this->embeds_resource
= (bool) $embeds_resource;
// Validates a URI attribute value.
// Visible steps: bail out when URI.Disable is set, pass the input through
// parseCDATA() and the percent-encoder, split it with a regexp, resolve a
// scheme object from the registry, check the authority against the
// external-link, blacklist and URI.Host rules, hand the components to the
// scheme object, then rebuild the string and optionally munge it.
// Returns false for every rejection visible here.
// NOTE(review): the final return statement and a number of short lines
// (closing braces, branch keywords, the start of the $r_URI assignment,
// some initialisations) are not present in this excerpt; presumably the
// rebuilt $result is returned -- confirm against the full file.
108 function validate($uri, $config, &$context) {
110 // We'll write stack-based parsers later, for now, use regexps to
111 // get things working as fast as possible (irony)
113 if ($config->get('URI', 'Disable')) return false;
116 $uri = $this->parseCDATA($uri);
118 // fix up percent-encoding
119 $uri = $this->PercentEncoder
->normalize($uri);
121 // while it would be nice to use parse_url(), that's specifically
122 // for HTTP and thus won't work for our generic URI parsing
124 // according to the RFC... (but this cuts corners, i.e. non-validating)
// NOTE(review): the line that begins the $r_URI assignment and the line that
// ends it are not visible here. The number in each trailing comment is the
// capture group that holds the bare component (see the $matches reads below).
126 '(([^:/?#<>\'"]+):)?'. // 2. Scheme
127 '(//([^/?#<>\'"]*))?'. // 4. Authority
128 '([^?#<>\'"]*)'. // 5. Path
129 '(\?([^#<>\'"]*))?'. // 7. Query
130 '(#([^<>\'"]*))?'. // 9. Fragment
134 $result = preg_match($r_URI, $uri, $matches);
136 if (!$result) return false; // invalid URI
138 // separate out parts
// Groups 1, 3, 6 and 8 include the delimiter; a component is null when its
// wrapping group is empty, so an absent part is distinguishable from an
// empty one only where the delimiter itself was present.
139 $scheme = !empty($matches[1]) ?
$matches[2] : null;
140 $authority = !empty($matches[3]) ?
$matches[4] : null;
141 $path = $matches[5]; // always present, can be empty
142 $query = !empty($matches[6]) ?
$matches[7] : null;
143 $fragment = !empty($matches[8]) ?
$matches[9] : null;
147 $registry =& HTMLPurifier_URISchemeRegistry
::instance();
148 if ($scheme !== null) {
149 // no need to validate the scheme's fmt since we do that when we
150 // retrieve the specific scheme object from the registry
// lower-case the scheme before the registry lookup
151 $scheme = ctype_lower($scheme) ?
$scheme : strtolower($scheme);
152 $scheme_obj = $registry->getScheme($scheme, $config, $context);
153 if (!$scheme_obj) return false; // invalid scheme, clean it out
// Lookup using URI.DefaultScheme; presumably the else-branch for URIs that
// carry no scheme (the branch keyword line is not visible) -- confirm.
155 $scheme_obj = $registry->getScheme(
156 $config->get('URI', 'DefaultScheme'), $config, $context
161 // the URI we're processing embeds a resource in the page, but the URI
162 // it references cannot be located
// NOTE(review): the body of this branch is not visible in this excerpt.
163 if ($this->embeds_resource
&& !$scheme_obj->browsable
) {
168 if ($authority !== null) {
170 // remove URI if it's absolute and we disabled externals or
171 // if it's absolute and embedded and we disabled external resources
// NOTE(review): the opening of the condition that the next three lines
// belong to is not visible in this excerpt.
174 $config->get('URI', 'DisableExternal') ||
176 $config->get('URI', 'DisableExternalResources') &&
177 $this->embeds_resource
// with no URI.Host configured the URI is rejected outright
180 $our_host = $config->get('URI', 'Host');
181 if ($our_host === null) return false;
// Pieces of the authority regexp: an optional userinfo followed by @, then
// either a bracketed literal or a run of non-colon characters as the host,
// then an optional colon and digits as the port.
184 $HEXDIG = '[A-Fa-f0-9]';
185 $unreserved = 'A-Za-z0-9-._~'; // make sure you wrap with []
186 $sub_delims = '!$&\'()'; // needs []
187 $pct_encoded = "%$HEXDIG$HEXDIG";
188 $r_userinfo = "(?:[$unreserved$sub_delims:]|$pct_encoded)*";
189 $r_authority = "/^(($r_userinfo)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
191 preg_match($r_authority, $authority, $matches);
193 $userinfo = !empty($matches[1]) ?
$matches[2] : null;
194 $host = !empty($matches[3]) ?
$matches[3] : null;
195 $port = !empty($matches[4]) ?
$matches[5] : null;
// ports outside 1..65535 are discarded rather than rejecting the whole URI
198 if ($port !== null) {
200 if ($port < 1 ||
$port > 65535) $port = null;
// the host validator signals failure with false; treat that as no host
203 $host = $this->host
->validate($host, $config, $context);
204 if ($host === false) $host = null;
206 if ($this->checkBlacklist($host, $config, $context)) return false;
208 // more lenient absolute checking
// Compare dot-separated labels from the right: every label of URI.Host must
// be matched by the URI host, so subdomains of URI.Host pass while parent
// or unrelated domains are rejected.
// NOTE(review): $host may be null at this point (see the assignment above);
// confirm how the full file handles that case.
209 if (isset($our_host)) {
210 $host_parts = array_reverse(explode('.', $host));
212 $our_host_parts = array_reverse(explode('.', $our_host));
213 foreach ($our_host_parts as $i => $discard) {
214 if (!isset($host_parts[$i])) return false;
215 if ($host_parts[$i] != $our_host_parts[$i]) return false;
219 // userinfo and host are validated within the regexp
// presumably the branch taken when the URI has no authority -- confirm
222 $port = $host = $userinfo = null;
226 // query and fragment are quite simple in terms of definition:
227 // *( pchar / "/" / "?" ), so define their validation routines
228 // when we start fixing percent encoding
232 // path gets to be validated against a hodge-podge of rules depending
233 // on the status of authority and scheme, but it's not that important,
234 // esp. since it won't be applicable to everyone
238 // okay, now we defer execution to the subobject for more processing
239 // note that $fragment is omitted
240 list($userinfo, $host, $port, $path, $query) =
241 $scheme_obj->validateComponents(
242 $userinfo, $host, $port, $path, $query, $config, $context
246 // reconstruct authority
// NOTE(review): the initialisation of $authority and the line appending the
// host are not visible in this excerpt.
248 if (!is_null($userinfo) ||
!is_null($host) ||
!is_null($port)) {
250 if($userinfo !== null) $authority .= $userinfo . '@';
252 if($port !== null) $authority .= ':' . $port;
255 // reconstruct the result
// NOTE(review): the initialisation of $result and the line appending the
// path are not visible in this excerpt.
257 if ($scheme !== null) $result .= "$scheme:";
258 if ($authority !== null) $result .= "//$authority";
260 if ($query !== null) $result .= "?$query";
261 if ($fragment !== null) $result .= "#$fragment";
263 // munge if necessary
// Only browsable schemes with URI.Munge set are considered, and only URIs
// that have an authority are rewritten: the url-encoded result replaces the
// %s placeholder of the munge template.
264 $munge = $config->get('URI', 'Munge');
265 if (!empty($scheme_obj->browsable
) && $munge !== null) {
266 if ($authority !== null) {
267 $result = str_replace('%s', rawurlencode($result), $munge);
276 * Checks a host against an array blacklist
277 * @param $host Host to check
278 * @param $config HTMLPurifier_Config instance
279 * @param $context HTMLPurifier_Context instance
280 * @return bool Is spam?
282 function checkBlacklist($host, &$config, &$context) {
283 $blacklist = $config->get('URI', 'HostBlacklist');
284 if (!empty($blacklist)) {
285 foreach($blacklist as $blacklisted_host_fragment) {
286 if (strpos($host, $blacklisted_host_fragment) !== false) {