3 require_once 'HTMLPurifier/URI.php';
6 * Parses a URI into the components and fragment identifier as specified
9 class HTMLPurifier_URIParser
13 * Instance of HTMLPurifier_PercentEncoder to do normalization with.
17 function HTMLPurifier_URIParser() {
18 $this->percentEncoder
= new HTMLPurifier_PercentEncoder();
23 * @param $uri string URI to parse
24 * @return HTMLPurifier_URI representation of URI. This representation has
25 * not been validated yet and may not conform to RFC.
27 function parse($uri) {
29 $uri = $this->percentEncoder
->normalize($uri);
31 // Regexp is as per Appendix B.
32 // Note that ["<>] are an addition to the RFC's recommended
33 // characters, because they represent external delimiters.
35 '(([^:/?#"<>]+):)?'. // 2. Scheme
36 '(//([^/?#"<>]*))?'. // 4. Authority
37 '([^?#"<>]*)'. // 5. Path
38 '(\?([^#"<>]*))?'. // 7. Query
39 '(#([^"<>]*))?'. // 8. Fragment
43 $result = preg_match($r_URI, $uri, $matches);
45 if (!$result) return false; // *really* invalid URI
48 $scheme = !empty($matches[1]) ?
$matches[2] : null;
49 $authority = !empty($matches[3]) ?
$matches[4] : null;
50 $path = $matches[5]; // always present, can be empty
51 $query = !empty($matches[6]) ?
$matches[7] : null;
52 $fragment = !empty($matches[8]) ?
$matches[9] : null;
54 // further parse authority
55 if ($authority !== null) {
56 $r_authority = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
58 preg_match($r_authority, $authority, $matches);
59 $userinfo = !empty($matches[1]) ?
$matches[2] : null;
60 $host = !empty($matches[3]) ?
$matches[3] : '';
61 $port = !empty($matches[4]) ?
(int) $matches[5] : null;
63 $port = $host = $userinfo = null;
66 return new HTMLPurifier_URI(
67 $scheme, $userinfo, $host, $port, $path, $query, $fragment);