mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2024-12-22 16:31:53 +00:00
Dramatically rewrite null host URI handling.
Basically, browsers don't parse what should be valid URIs correctly, so we have to go through some backbends to accommodate them. Specifically, for browsable URIs, the following URIs have unintended behavior: - ///example.com - http:/example.com - http:///example.com Furthermore, if the path begins with //, modifying these URLs must be done with care, because if you remove the host-name component, the parse tree changes. I've modified the engine to follow correct URI semantics as much as possible while outputting browser-compatible code, and to invalidate the URI in cases where we can't deal with it. There has been a refactoring of URIScheme so that this important check is always performed, introducing a new member variable may_omit_host which is true on the data, file, mailto and news schemes. This also fixes bypass bugs on URI.Munge. Signed-off-by: Edward Z. Yang <ezyang@mit.edu>
This commit is contained in:
parent
a32d5b52e1
commit
e76f4b45d0
2
NEWS
2
NEWS
@ -39,6 +39,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
||||
raw definition. Reported by ajh.
|
||||
- Switch to using require_once in the Bootstrap to work around bad
|
||||
interaction with Zend Debugger and APC. Reported by Antonio Parraga.
|
||||
- Fix URI handling when hostname is missing but scheme is present.
|
||||
Reported by Neike Taika-Tessaro.
|
||||
|
||||
4.2.0, released 2010-09-15
|
||||
! Added %Core.RemoveProcessingInstructions, which lets you remove
|
||||
|
@ -23,6 +23,12 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
|
||||
|
||||
public function validate($string, $config, $context) {
|
||||
$length = strlen($string);
|
||||
// empty hostname is OK; it's usually semantically equivalent:
|
||||
// the default host as defined by a URI scheme is used:
|
||||
//
|
||||
// If the URI scheme defines a default for host, then that
|
||||
// default applies when the host subcomponent is undefined
|
||||
// or when the registered name is empty (zero length).
|
||||
if ($string === '') return '';
|
||||
if ($length > 1 && $string[0] === '[' && $string[$length-1] === ']') {
|
||||
//IPv6
|
||||
|
@ -67,14 +67,6 @@ class HTMLPurifier_URI
|
||||
$chars_gen_delims = ':/?#[]@';
|
||||
$chars_pchar = $chars_sub_delims . ':@';
|
||||
|
||||
// validate scheme (MUST BE FIRST!)
|
||||
if (!is_null($this->scheme) && is_null($this->host)) {
|
||||
$def = $config->getDefinition('URI');
|
||||
if ($def->defaultScheme === $this->scheme) {
|
||||
$this->scheme = null;
|
||||
}
|
||||
}
|
||||
|
||||
// validate host
|
||||
if (!is_null($this->host)) {
|
||||
$host_def = new HTMLPurifier_AttrDef_URI_Host();
|
||||
@ -82,6 +74,21 @@ class HTMLPurifier_URI
|
||||
if ($this->host === false) $this->host = null;
|
||||
}
|
||||
|
||||
// validate scheme
|
||||
// NOTE: It's not appropriate to check whether or not this
|
||||
// scheme is in our registry, since a URIFilter may convert a
|
||||
// URI that we don't allow into one we do. So instead, we just
|
||||
// check if the scheme can be dropped because there is no host
|
||||
// and it is our default scheme.
|
||||
if (!is_null($this->scheme) && is_null($this->host) || $this->host === '') {
|
||||
// support for relative paths is pretty abysmal when the
|
||||
// scheme is present, so axe it when possible
|
||||
$def = $config->getDefinition('URI');
|
||||
if ($def->defaultScheme === $this->scheme) {
|
||||
$this->scheme = null;
|
||||
}
|
||||
}
|
||||
|
||||
// validate username
|
||||
if (!is_null($this->userinfo)) {
|
||||
$encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':');
|
||||
@ -96,23 +103,38 @@ class HTMLPurifier_URI
|
||||
// validate path
|
||||
$path_parts = array();
|
||||
$segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/');
|
||||
if (!is_null($this->host)) {
|
||||
if (!is_null($this->host)) { // this catches $this->host === ''
|
||||
// path-abempty (hier and relative)
|
||||
// http://www.example.com/my/path
|
||||
// //www.example.com/my/path (looks odd, but works, and
|
||||
// recognized by most browsers)
|
||||
// (this set is valid or invalid on a scheme by scheme
|
||||
// basis, so we'll deal with it later)
|
||||
// file:///my/path
|
||||
// ///my/path
|
||||
$this->path = $segments_encoder->encode($this->path);
|
||||
} elseif ($this->path !== '' && $this->path[0] === '/') {
|
||||
} elseif ($this->path !== '') {
|
||||
if ($this->path[0] === '/') {
|
||||
// path-absolute (hier and relative)
|
||||
// http:/my/path
|
||||
// /my/path
|
||||
if (strlen($this->path) >= 2 && $this->path[1] === '/') {
|
||||
// This shouldn't ever happen!
|
||||
// This could happen if both the host gets stripped
|
||||
// out
|
||||
// http://my/path
|
||||
// //my/path
|
||||
$this->path = '';
|
||||
} else {
|
||||
$this->path = $segments_encoder->encode($this->path);
|
||||
}
|
||||
} elseif (!is_null($this->scheme) && $this->path !== '') {
|
||||
} elseif (!is_null($this->scheme)) {
|
||||
// path-rootless (hier)
|
||||
// http:my/path
|
||||
// Short circuit evaluation means we don't need to check nz
|
||||
$this->path = $segments_encoder->encode($this->path);
|
||||
} elseif (is_null($this->scheme) && $this->path !== '') {
|
||||
} else {
|
||||
// path-noscheme (relative)
|
||||
// my/path
|
||||
// (once again, not checking nz)
|
||||
$segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
|
||||
$c = strpos($this->path, '/');
|
||||
@ -123,6 +145,7 @@ class HTMLPurifier_URI
|
||||
} else {
|
||||
$this->path = $segment_nc_encoder->encode($this->path);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// path-empty (hier and relative)
|
||||
$this->path = ''; // just to be safe
|
||||
@ -150,6 +173,9 @@ class HTMLPurifier_URI
|
||||
public function toString() {
|
||||
// reconstruct authority
|
||||
$authority = null;
|
||||
// there is a rendering difference between a null authority
|
||||
// (http:foo-bar) and an empty string authority
|
||||
// (http:///foo-bar).
|
||||
if (!is_null($this->host)) {
|
||||
$authority = '';
|
||||
if(!is_null($this->userinfo)) $authority .= $this->userinfo . '@';
|
||||
@ -157,7 +183,12 @@ class HTMLPurifier_URI
|
||||
if(!is_null($this->port)) $authority .= ':' . $this->port;
|
||||
}
|
||||
|
||||
// reconstruct the result
|
||||
// Reconstruct the result
|
||||
// One might wonder about parsing quirks from browsers after
|
||||
// this reconstruction. Unfortunately, parsing behavior depends
|
||||
// on what *scheme* was employed (file:///foo is handled *very*
|
||||
// differently than http:///foo), so unfortunately we have to
|
||||
// defer to the schemes to do the right thing.
|
||||
$result = '';
|
||||
if (!is_null($this->scheme)) $result .= $this->scheme . ':';
|
||||
if (!is_null($authority)) $result .= '//' . $authority;
|
||||
|
@ -3,11 +3,13 @@
|
||||
/**
|
||||
* Validator for the components of a URI for a specific scheme
|
||||
*/
|
||||
class HTMLPurifier_URIScheme
|
||||
abstract class HTMLPurifier_URIScheme
|
||||
{
|
||||
|
||||
/**
|
||||
* Scheme's default port (integer)
|
||||
* Scheme's default port (integer). If an explicit port number is
|
||||
* specified that coincides with the default port, it will be
|
||||
* elided.
|
||||
*/
|
||||
public $default_port = null;
|
||||
|
||||
@ -24,17 +26,62 @@ class HTMLPurifier_URIScheme
|
||||
public $hierarchical = false;
|
||||
|
||||
/**
|
||||
* Validates the components of a URI
|
||||
* @note This implementation should be called by children if they define
|
||||
* a default port, as it does port processing.
|
||||
* @param $uri Instance of HTMLPurifier_URI
|
||||
* Whether or not the URI may omit a hostname when the scheme is
|
||||
* explicitly specified, ala file:///path/to/file. As of writing,
|
||||
* 'file' is the only scheme that browsers support this properly.
|
||||
*/
|
||||
public $may_omit_host = false;
|
||||
|
||||
/**
|
||||
* Validates the components of a URI for a specific scheme.
|
||||
* @param $uri Reference to a HTMLPurifier_URI object
|
||||
* @param $config HTMLPurifier_Config object
|
||||
* @param $context HTMLPurifier_Context object
|
||||
* @return Bool success or failure
|
||||
*/
|
||||
public abstract function doValidate(&$uri, $config, $context);
|
||||
|
||||
/**
|
||||
* Public interface for validating components of a URI. Performs a
|
||||
* bunch of default actions. Don't overload this method.
|
||||
* @param $uri Reference to a HTMLPurifier_URI object
|
||||
* @param $config HTMLPurifier_Config object
|
||||
* @param $context HTMLPurifier_Context object
|
||||
* @return Bool success or failure
|
||||
*/
|
||||
public function validate(&$uri, $config, $context) {
|
||||
if ($this->default_port == $uri->port) $uri->port = null;
|
||||
return true;
|
||||
// kludge: browsers do funny things when the scheme but not the
|
||||
// authority is set
|
||||
if (!$this->may_omit_host &&
|
||||
// if the scheme is present, a missing host is always in error
|
||||
(!is_null($uri->scheme) && ($uri->host === '' || is_null($uri->host))) ||
|
||||
// if the scheme is not present, a *blank* host is in error,
|
||||
// since this translates into '///path' which most browsers
|
||||
// interpret as being 'http://path'.
|
||||
(is_null($uri->scheme) && $uri->host === '')
|
||||
) {
|
||||
do {
|
||||
if (is_null($uri->scheme)) {
|
||||
if (substr($uri->path, 0, 2) != '//') {
|
||||
$uri->host = null;
|
||||
break;
|
||||
}
|
||||
// URI is '////path', so we cannot nullify the
|
||||
// host to preserve semantics. Try expanding the
|
||||
// hostname instead (fall through)
|
||||
}
|
||||
// first see if we can manually insert a hostname
|
||||
$host = $config->get('URI.Host');
|
||||
if (!is_null($host)) {
|
||||
$uri->host = $host;
|
||||
} else {
|
||||
// we can't do anything sensible, reject the URL.
|
||||
return false;
|
||||
}
|
||||
} while (false);
|
||||
}
|
||||
return $this->doValidate($uri, $config, $context);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -13,8 +13,11 @@ class HTMLPurifier_URIScheme_data extends HTMLPurifier_URIScheme {
|
||||
'image/gif' => true,
|
||||
'image/png' => true,
|
||||
);
|
||||
// this is actually irrelevant since we only write out the path
|
||||
// component
|
||||
public $may_omit_host = true;
|
||||
|
||||
public function validate(&$uri, $config, $context) {
|
||||
public function doValidate(&$uri, $config, $context) {
|
||||
$result = explode(',', $uri->path, 2);
|
||||
$is_base64 = false;
|
||||
$charset = null;
|
||||
|
@ -9,8 +9,14 @@ class HTMLPurifier_URIScheme_file extends HTMLPurifier_URIScheme {
|
||||
// machines, so placing them as an img src is incorrect.
|
||||
public $browsable = false;
|
||||
|
||||
public function validate(&$uri, $config, $context) {
|
||||
parent::validate($uri, $config, $context);
|
||||
// Basically the *only* URI scheme for which this is true, since
|
||||
// accessing files on the local machine is very common. In fact,
|
||||
// browsers on some operating systems don't understand the
|
||||
// authority, though I hear it is used on Windows to refer to
|
||||
// network shares.
|
||||
public $may_omit_host = true;
|
||||
|
||||
public function doValidate(&$uri, $config, $context) {
|
||||
// Authentication method is not supported
|
||||
$uri->userinfo = null;
|
||||
// file:// makes no provisions for accessing the resource
|
||||
|
@ -9,8 +9,7 @@ class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme {
|
||||
public $browsable = true; // usually
|
||||
public $hierarchical = true;
|
||||
|
||||
public function validate(&$uri, $config, $context) {
|
||||
parent::validate($uri, $config, $context);
|
||||
public function doValidate(&$uri, $config, $context) {
|
||||
$uri->query = null;
|
||||
|
||||
// typecode check
|
||||
|
@ -9,8 +9,7 @@ class HTMLPurifier_URIScheme_http extends HTMLPurifier_URIScheme {
|
||||
public $browsable = true;
|
||||
public $hierarchical = true;
|
||||
|
||||
public function validate(&$uri, $config, $context) {
|
||||
parent::validate($uri, $config, $context);
|
||||
public function doValidate(&$uri, $config, $context) {
|
||||
$uri->userinfo = null;
|
||||
return true;
|
||||
}
|
||||
|
@ -12,9 +12,9 @@
|
||||
class HTMLPurifier_URIScheme_mailto extends HTMLPurifier_URIScheme {
|
||||
|
||||
public $browsable = false;
|
||||
public $may_omit_host = true;
|
||||
|
||||
public function validate(&$uri, $config, $context) {
|
||||
parent::validate($uri, $config, $context);
|
||||
public function doValidate(&$uri, $config, $context) {
|
||||
$uri->userinfo = null;
|
||||
$uri->host = null;
|
||||
$uri->port = null;
|
||||
|
@ -6,9 +6,9 @@
|
||||
class HTMLPurifier_URIScheme_news extends HTMLPurifier_URIScheme {
|
||||
|
||||
public $browsable = false;
|
||||
public $may_omit_host = true;
|
||||
|
||||
public function validate(&$uri, $config, $context) {
|
||||
parent::validate($uri, $config, $context);
|
||||
public function doValidate(&$uri, $config, $context) {
|
||||
$uri->userinfo = null;
|
||||
$uri->host = null;
|
||||
$uri->port = null;
|
||||
|
@ -8,8 +8,7 @@ class HTMLPurifier_URIScheme_nntp extends HTMLPurifier_URIScheme {
|
||||
public $default_port = 119;
|
||||
public $browsable = false;
|
||||
|
||||
public function validate(&$uri, $config, $context) {
|
||||
parent::validate($uri, $config, $context);
|
||||
public function doValidate(&$uri, $config, $context) {
|
||||
$uri->userinfo = null;
|
||||
$uri->query = null;
|
||||
return true;
|
||||
|
@ -74,6 +74,15 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness
|
||||
$this->assertDef('mailto:this-looks-like-a-path@example.com');
|
||||
}
|
||||
|
||||
function testResolveNullSchemeAmbiguity() {
|
||||
$this->assertDef('///foo', '/foo');
|
||||
}
|
||||
|
||||
function testResolveNullSchemeDoubleAmbiguity() {
|
||||
$this->config->set('URI.Host', 'example.com');
|
||||
$this->assertDef('////foo', '//example.com//foo');
|
||||
}
|
||||
|
||||
function testURIDefinitionValidation() {
|
||||
$parser = new HTMLPurifier_URIParser();
|
||||
$uri = $parser->parse('http://example.com');
|
||||
|
52
tests/HTMLPurifier/HTMLT/munge.htmlt
Normal file
52
tests/HTMLPurifier/HTMLT/munge.htmlt
Normal file
@ -0,0 +1,52 @@
|
||||
--INI--
|
||||
URI.Munge = "/r/%s"
|
||||
URI.AllowedSchemes = http,ftp,file
|
||||
--HTML--
|
||||
<a href="google.com">foo</a>
|
||||
<a href="/google.com">foo</a>
|
||||
<a href="//google.com">foo</a>
|
||||
<a href="///google.com">foo</a>
|
||||
<a href="////google.com">foo</a>
|
||||
|
||||
<a href="http:google.com">foo</a>
|
||||
<a href="http:/google.com">foo</a>
|
||||
<a href="http://google.com">foo</a>
|
||||
<a href="http:///google.com">foo</a>
|
||||
<a href="http:////google.com">foo</a>
|
||||
|
||||
<a href="ftp:google.com">foo</a>
|
||||
<a href="ftp:/google.com">foo</a>
|
||||
<a href="ftp://google.com">foo</a>
|
||||
<a href="ftp:///google.com">foo</a>
|
||||
<a href="ftp:////google.com">foo</a>
|
||||
|
||||
<a href="file:google.com">foo</a>
|
||||
<a href="file:/google.com">foo</a>
|
||||
<a href="file://google.com">foo</a>
|
||||
<a href="file:///google.com">foo</a>
|
||||
<a href="file:////google.com">foo</a>
|
||||
--EXPECT--
|
||||
<a href="google.com">foo</a>
|
||||
<a href="/google.com">foo</a>
|
||||
<a href="/r/%2F%2Fgoogle.com">foo</a>
|
||||
<a href="/google.com">foo</a>
|
||||
<a>foo</a>
|
||||
|
||||
<a href="google.com">foo</a>
|
||||
<a href="/google.com">foo</a>
|
||||
<a href="/r/http%3A%2F%2Fgoogle.com">foo</a>
|
||||
<a href="/google.com">foo</a>
|
||||
<a>foo</a>
|
||||
|
||||
<a>foo</a>
|
||||
<a>foo</a>
|
||||
<a href="/r/ftp%3A%2F%2Fgoogle.com">foo</a>
|
||||
<a>foo</a>
|
||||
<a>foo</a>
|
||||
|
||||
<a href="file:google.com">foo</a>
|
||||
<a href="file:/google.com">foo</a>
|
||||
<a href="file://google.com">foo</a>
|
||||
<a href="file:///google.com">foo</a>
|
||||
<a href="file:////google.com">foo</a>
|
||||
--# vim: et sw=4 sts=4
|
@ -172,6 +172,17 @@ class HTMLPurifier_URISchemeTest extends HTMLPurifier_URIHarness
|
||||
);
|
||||
}
|
||||
|
||||
function test_file_local() {
|
||||
$this->assertValidation(
|
||||
'file:///foo/bar?baz#frag',
|
||||
'file:///foo/bar#frag'
|
||||
);
|
||||
}
|
||||
|
||||
function test_ftp_empty_host() {
|
||||
$this->assertValidation('ftp:///example.com', false);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// vim: et sw=4 sts=4
|
||||
|
@ -157,7 +157,7 @@ class HTMLPurifier_URITest extends HTMLPurifier_URIHarness
|
||||
}
|
||||
|
||||
function test_validate_invalidHostThatLooksLikeIPv6() {
|
||||
$this->assertValidation('http://[2001:0db8:85z3:08d3:1319:8a2e:0370:7334]', 'http:');
|
||||
$this->assertValidation('http://[2001:0db8:85z3:08d3:1319:8a2e:0370:7334]', '');
|
||||
}
|
||||
|
||||
function test_validate_removeRedundantScheme() {
|
||||
|
Loading…
Reference in New Issue
Block a user