<?php

/**
 * HTML Purifier's internal representation of a URI.
 * @note
 *      Internal data-structures are completely escaped. If the data needs
 *      to be used in a non-URI context (which is very unlikely), be sure
 *      to decode it first. The URI may not necessarily be well-formed until
 *      validate() is called.
 */
class HTMLPurifier_URI
{

    /**
     * The individual URI components. A null value means the component is
     * absent; toString() only emits a component's delimiter when the
     * component is non-null.
     */
    public $scheme, $userinfo, $host, $port, $path, $query, $fragment;

    /**
     * Stores the supplied components on the object.
     * @note Automatically normalizes scheme and port: a non-null scheme is
     *       lowercased and a non-null port is cast to an integer. All other
     *       components are stored exactly as given.
     * @param $scheme   Scheme (e.g. "http"), or null
     * @param $userinfo Userinfo part of the authority, or null
     * @param $host     Host part of the authority, or null
     * @param $port     Port (cast to int), or null
     * @param $path     Path
     * @param $query    Query string without the leading "?", or null
     * @param $fragment Fragment without the leading "#", or null
     */
    public function __construct($scheme, $userinfo, $host, $port, $path, $query, $fragment) {
        // ctype_lower() lets us skip strtolower() for already-lowercase schemes
        $this->scheme = is_null($scheme) || ctype_lower($scheme) ? $scheme : strtolower($scheme);
        $this->userinfo = $userinfo;
        $this->host = $host;
        $this->port = is_null($port) ? $port : (int) $port;
        $this->path = $path;
        $this->query = $query;
        $this->fragment = $fragment;
    }

    /**
     * Retrieves a scheme object corresponding to the URI's scheme/default
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return Scheme object appropriate for validating this URI, or false
     *         if the registry has no usable object for the scheme. When the
     *         default scheme itself cannot be retrieved, an E_USER_WARNING
     *         is triggered before false is returned.
     */
    public function getSchemeObj($config, $context) {
        $registry = HTMLPurifier_URISchemeRegistry::instance();
        if ($this->scheme !== null) {
            $scheme_obj = $registry->getScheme($this->scheme, $config, $context);
            if (!$scheme_obj) return false; // invalid scheme, clean it out
        } else {
            // no scheme: retrieve the default one
            $def = $config->getDefinition('URI');
            $scheme_obj = $registry->getScheme($def->defaultScheme, $config, $context);
            if (!$scheme_obj) {
                // something funky happened to the default scheme object
                trigger_error(
                    'Default scheme object "' . $def->defaultScheme . '" was not readable',
                    E_USER_WARNING
                );
                return false;
            }
        }
        return $scheme_obj;
    }

    /**
     * Generic validation method applicable for all schemes. May modify
     * this URI in order to get it into a compliant form.
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return True if validation/filtering succeeds, false if failure.
     *         This generic implementation always returns true; offending
     *         components are nulled, emptied or percent-encoded instead.
     */
    public function validate($config, $context) {

        // ABNF definitions from RFC 3986
        $chars_sub_delims = '!$&\'()*+,;=';
        $chars_pchar = $chars_sub_delims . ':@';

        // validate host
        if (!is_null($this->host)) {
            $host_def = new HTMLPurifier_AttrDef_URI_Host();
            $this->host = $host_def->validate($this->host, $config, $context);
            // a rejected host is treated as if no host had been given
            if ($this->host === false) $this->host = null;
        }

        // validate scheme
        // NOTE: It's not appropriate to check whether or not this
        // scheme is in our registry, since a URIFilter may convert a
        // URI that we don't allow into one we do.  So instead, we just
        // check if the scheme can be dropped because there is no host
        // and it is our default scheme.
        // The host test is parenthesized explicitly: && binds tighter than
        // ||, and a scheme that is already null has nothing to drop.
        if (!is_null($this->scheme) && (is_null($this->host) || $this->host === '')) {
            // support for relative paths is pretty abysmal when the
            // scheme is present, so axe it when possible
            $def = $config->getDefinition('URI');
            if ($def->defaultScheme === $this->scheme) {
                $this->scheme = null;
            }
        }

        // validate username
        if (!is_null($this->userinfo)) {
            $encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':');
            $this->userinfo = $encoder->encode($this->userinfo);
        }

        // validate port
        if (!is_null($this->port)) {
            if ($this->port < 1 || $this->port > 65535) $this->port = null;
        }

        // validate path
        $segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/');
        if (!is_null($this->host)) { // this catches $this->host === ''
            // path-abempty (hier and relative)
            // http://www.example.com/my/path
            // //www.example.com/my/path (looks odd, but works, and
            //                            recognized by most browsers)
            // (this set is valid or invalid on a scheme by scheme
            // basis, so we'll deal with it later)
            // file:///my/path
            // ///my/path
            $this->path = $segments_encoder->encode($this->path);
        } elseif ($this->path !== '') {
            if ($this->path[0] === '/') {
                // path-absolute (hier and relative)
                // http:/my/path
                // /my/path
                if (strlen($this->path) >= 2 && $this->path[1] === '/') {
                    // This could happen if the host gets stripped out:
                    // without a host, a leading "//" would be re-read as
                    // the start of an authority, so the path is discarded
                    // http://my/path
                    // //my/path
                    $this->path = '';
                } else {
                    $this->path = $segments_encoder->encode($this->path);
                }
            } elseif (!is_null($this->scheme)) {
                // path-rootless (hier)
                // http:my/path
                // Short circuit evaluation means we don't need to check nz
                $this->path = $segments_encoder->encode($this->path);
            } else {
                // path-noscheme (relative)
                // my/path
                // (once again, not checking nz)
                // the first segment uses an encoder that does not preserve
                // ":" so it cannot be mistaken for a scheme delimiter
                $segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
                $c = strpos($this->path, '/');
                if ($c !== false) {
                    $this->path =
                        $segment_nc_encoder->encode(substr($this->path, 0, $c)) .
                        $segments_encoder->encode(substr($this->path, $c));
                } else {
                    $this->path = $segment_nc_encoder->encode($this->path);
                }
            }
        } else {
            // path-empty (hier and relative)
            $this->path = ''; // just to be safe
        }

        // qf = query and fragment
        $qf_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/?');

        if (!is_null($this->query)) {
            $this->query = $qf_encoder->encode($this->query);
        }

        if (!is_null($this->fragment)) {
            $this->fragment = $qf_encoder->encode($this->fragment);
        }

        return true;

    }

    /**
     * Convert URI back to string
     * @return String URI appropriate for output
     */
    public function toString() {
        // reconstruct authority
        $authority = null;
        // there is a rendering difference between a null authority
        // (http:foo-bar) and an empty string authority
        // (http:///foo-bar).
        if (!is_null($this->host)) {
            $authority = '';
            if (!is_null($this->userinfo)) $authority .= $this->userinfo . '@';
            $authority .= $this->host;
            if (!is_null($this->port)) $authority .= ':' . $this->port;
        }

        // Reconstruct the result
        // One might wonder about parsing quirks from browsers after
        // this reconstruction.  Unfortunately, parsing behavior depends
        // on what *scheme* was employed (file:///foo is handled *very*
        // differently than http:///foo), so unfortunately we have to
        // defer to the schemes to do the right thing.
        $result = '';
        if (!is_null($this->scheme)) $result .= $this->scheme . ':';
        if (!is_null($authority)) $result .= '//' . $authority;
        $result .= $this->path;
        if (!is_null($this->query)) $result .= '?' . $this->query;
        if (!is_null($this->fragment)) $result .= '#' . $this->fragment;

        return $result;
    }

    /**
     * Returns true if this URL might be considered a 'local' URL given
     * the current context.  This is true when the host is null, or
     * when it matches the host supplied to the configuration.
     *
     * Note that this does not do any scheme checking (URI.Munge, I'm
     * looking at you).
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context (not used here)
     * @return True if the URI is considered local, false otherwise
     */
    public function isLocal($config, $context) {
        if ($this->host === null) return true;
        $uri_def = $config->getDefinition('URI');
        if ($uri_def->host === $this->host) return true;
        return false;
    }

}

// vim: et sw=4 sts=4