0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-09-18 18:25:18 +00:00

[2.1.0] Implement MakeAbsolute URI filter

- Move some directives with complex dependencies to URIDefinition
- Fix a missing extends
- Add hierarchical information to URI schemes
- Fix bug in URIHarness.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1346 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2007-08-02 21:47:24 +00:00
parent 25fe416ab2
commit 7bccc24977
15 changed files with 411 additions and 49 deletions

2
NEWS
View File

@ -20,6 +20,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
! Standalone file now available, which greatly reduces the amount of
includes (although there are still a few files that reside in the
standalone folder)
! Relative URIs can now be transformed into their absolute equivalents
using %URI.Base and %URI.MakeAbsolute
- AutoFormatters emit friendly error messages if tags or attributes they
need are not allowed
- ConfigForm's compactification of directive names is now configurable

View File

@ -22,8 +22,6 @@ time. Note the naming convention: %Namespace.Directive
%URI.AddRelNofollow - will add rel="nofollow" to all links, preventing the
spread of ill-gotten pagerank
%URI.RelativeToAbsolute - transforms all relative URIs to absolute form
%URI.HostBlacklistRegex - regexes that if matching the host are disallowed
%URI.HostWhitelist - domain names that are excluded from the host blacklist
%URI.HostPolicy - determines whether or not its reject all and then whitelist

View File

@ -7,54 +7,59 @@ require_once 'HTMLPurifier/URISchemeRegistry.php';
require_once 'HTMLPurifier/AttrDef/URI/Host.php';
require_once 'HTMLPurifier/PercentEncoder.php';
HTMLPurifier_ConfigSchema::define(
'URI', 'DefaultScheme', 'http', 'string',
'Defines through what scheme the output will be served, in order to '.
'select the proper object validator when no scheme information is present.'
);
// special case filtering directives
HTMLPurifier_ConfigSchema::define(
'URI', 'Host', null, 'string/null',
'Defines the domain name of the server, so we can determine whether or '.
'an absolute URI is from your website or not. Not strictly necessary, '.
'as users should be using relative URIs to reference resources on your '.
'website. It will, however, let you use absolute URIs to link to '.
'subdomains of the domain you post here: i.e. example.com will allow '.
'sub.example.com. However, higher up domains will still be excluded: '.
'if you set %URI.Host to sub.example.com, example.com will be blocked. '.
'This directive has been available since 1.2.0.'
);
'URI', 'Munge', null, 'string/null', '
<p>
Munges all browsable (usually http, https and ftp)
absolute URI\'s into another URI, usually a URI redirection service.
This directive accepts a URI, formatted with a <code>%s</code> where
the url-encoded original URI should be inserted (sample:
<code>http://www.google.com/url?q=%s</code>).
</p>
<p>
Uses for this directive:
</p>
<ul>
<li>
Prevent PageRank leaks, while being fairly transparent
to users (you may also want to add some client side JavaScript to
override the text in the statusbar). <strong>Notice</strong>:
Many security experts believe that this form of protection does not deter spam-bots.
</li>
<li>
Redirect users to a splash page telling them they are leaving your
website. While this is poor usability practice, it is often mandated
in corporate environments.
</li>
</ul>
<p>
This directive has been available since 1.3.0.
</p>
');
// disabling directives
HTMLPurifier_ConfigSchema::define(
'URI', 'DisableResources', false, 'bool',
'Disables embedding resources, essentially meaning no pictures. You can '.
'still link to them though. See %URI.DisableExternalResources for why '.
'this might be a good idea. This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'URI', 'Munge', null, 'string/null',
'Munges all browsable (usually http, https and ftp) URI\'s into some URL '.
'redirection service. Pass this directive a URI, with %s inserted where '.
'the url-encoded original URI should be inserted (sample: '.
'<code>http://www.google.com/url?q=%s</code>). '.
'This prevents PageRank leaks, while being as transparent as possible '.
'to users (you may also want to add some client side JavaScript to '.
'override the text in the statusbar). Warning: many security experts '.
'believe that this form of protection does not deter spam-bots. '.
'You can also use this directive to redirect users to a splash page '.
'telling them they are leaving your website. '.
'This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'URI', 'Disable', false, 'bool',
'Disables all URIs in all forms. Not sure why you\'d want to do that '.
'(after all, the Internet\'s founded on the notion of a hyperlink). '.
'This directive has been available since 1.3.0.'
);
'URI', 'Disable', false, 'bool', '
<p>
Disables all URIs in all forms. Not sure why you\'d want to do that
(after all, the Internet\'s founded on the notion of a hyperlink).
This directive has been available since 1.3.0.
</p>
');
HTMLPurifier_ConfigSchema::defineAlias('Attr', 'DisableURI', 'URI', 'Disable');
HTMLPurifier_ConfigSchema::define(
'URI', 'DisableResources', false, 'bool', '
<p>
Disables embedding resources, essentially meaning no pictures. You can
still link to them though. See %URI.DisableExternalResources for why
this might be a good idea. This directive has been available since 1.3.0.
</p>
');
/**
* Validates a URI as defined by RFC 3986.
* @note Scheme-specific mechanics deferred to HTMLPurifier_URIScheme
@ -118,7 +123,7 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
// munge scheme off if necessary (this must be last)
if (!is_null($uri->scheme) && is_null($uri->host)) {
if ($config->get('URI', 'DefaultScheme') == $uri->scheme) {
if ($uri_def->defaultScheme == $uri->scheme) {
$uri->scheme = null;
}
}

View File

@ -37,11 +37,12 @@ class HTMLPurifier_URI
if (!$scheme_obj) return false; // invalid scheme, clean it out
} else {
// no scheme: retrieve the default one
$scheme_obj = $registry->getScheme($config->get('URI', 'DefaultScheme'), $config, $context);
$def = $config->getDefinition('URI');
$scheme_obj = $registry->getScheme($def->defaultScheme, $config, $context);
if (!$scheme_obj) {
// something funky happened to the default scheme object
trigger_error(
'Default scheme object "' . $config->get('URI', 'DefaultScheme') . '" was not readable',
'Default scheme object "' . $def->defaultScheme . '" was not readable',
E_USER_WARNING
);
return false;
@ -107,5 +108,12 @@ class HTMLPurifier_URI
return $result;
}
/**
* Returns a copy of the URI object
*/
function copy() {
return unserialize(serialize($this));
}
}

View File

@ -2,10 +2,12 @@
require_once 'HTMLPurifier/Definition.php';
require_once 'HTMLPurifier/URIFilter.php';
require_once 'HTMLPurifier/URIParser.php';
require_once 'HTMLPurifier/URIFilter/DisableExternal.php';
require_once 'HTMLPurifier/URIFilter/DisableExternalResources.php';
require_once 'HTMLPurifier/URIFilter/HostBlacklist.php';
require_once 'HTMLPurifier/URIFilter/MakeAbsolute.php';
HTMLPurifier_ConfigSchema::define(
'URI', 'DefinitionID', null, 'string/null', '
@ -25,6 +27,48 @@ HTMLPurifier_ConfigSchema::define(
</p>
');
// informative URI directives
HTMLPurifier_ConfigSchema::define(
'URI', 'DefaultScheme', 'http', 'string', '
<p>
Defines through what scheme the output will be served, in order to
select the proper object validator when no scheme information is present.
</p>
');
HTMLPurifier_ConfigSchema::define(
'URI', 'Host', null, 'string/null', '
<p>
Defines the domain name of the server, so we can determine whether or
an absolute URI is from your website or not. Not strictly necessary,
as users should be using relative URIs to reference resources on your
website. It will, however, let you use absolute URIs to link to
subdomains of the domain you post here: i.e. example.com will allow
sub.example.com. However, higher up domains will still be excluded:
if you set %URI.Host to sub.example.com, example.com will be blocked.
<strong>Note:</strong> This directive overrides %URI.Base because
a given page may be on a sub-domain, but you wish HTML Purifier to be
more relaxed and allow some of the parent domains too.
This directive has been available since 1.2.0.
</p>
');
HTMLPurifier_ConfigSchema::define(
'URI', 'Base', null, 'string/null', '
<p>
The base URI is the URI of the document this purified HTML will be
inserted into. This information is important if HTML Purifier needs
to calculate absolute URIs from relative URIs, such as when %URI.MakeAbsolute
is on. You may use a non-absolute URI for this value, but behavior
may vary (%URI.MakeAbsolute deals nicely with both absolute and
relative paths, but forwards-compatibility is not guaranteed).
<strong>Warning:</strong> If set, the scheme on this URI
overrides the one specified by %URI.DefaultScheme. This directive has
been available since 2.1.0.
</p>
');
class HTMLPurifier_URIDefinition extends HTMLPurifier_Definition
{
@ -32,10 +76,26 @@ class HTMLPurifier_URIDefinition extends HTMLPurifier_Definition
var $filters = array();
var $registeredFilters = array();
/**
* HTMLPurifier_URI object of the base specified at %URI.Base
*/
var $base;
/**
* String host to consider "home" base
*/
var $host;
/**
* Name of default scheme based on %URI.DefaultScheme and %URI.Base
*/
var $defaultScheme;
function HTMLPurifier_URIDefinition() {
$this->registerFilter(new HTMLPurifier_URIFilter_DisableExternal());
$this->registerFilter(new HTMLPurifier_URIFilter_DisableExternalResources());
$this->registerFilter(new HTMLPurifier_URIFilter_HostBlacklist());
$this->registerFilter(new HTMLPurifier_URIFilter_MakeAbsolute());
}
function registerFilter($filter) {
@ -43,6 +103,11 @@ class HTMLPurifier_URIDefinition extends HTMLPurifier_Definition
}
function doSetup($config) {
$this->setupFilters($config);
$this->setupMemberVariables($config);
}
function setupFilters($config) {
foreach ($this->registeredFilters as $name => $filter) {
$conf = $config->get('URI', $name);
if ($conf !== false && $conf !== null) {
@ -53,6 +118,18 @@ class HTMLPurifier_URIDefinition extends HTMLPurifier_Definition
unset($this->registeredFilters);
}
function setupMemberVariables($config) {
$this->host = $config->get('URI', 'Host');
$base_uri = $config->get('URI', 'Base');
if (!is_null($base_uri)) {
$parser = new HTMLPurifier_URIParser();
$this->base = $parser->parse($base_uri);
$this->defaultScheme = $this->base->scheme;
if (is_null($this->host)) $this->host = $this->base->host;
}
if (is_null($this->defaultScheme)) $this->defaultScheme = $config->get('URI', 'DefaultScheme');
}
function filter(&$uri, $config, &$context) {
foreach ($this->filters as $name => $x) {
$result = $this->filters[$name]->filter($uri, $config, $context);

View File

@ -10,7 +10,7 @@ HTMLPurifier_ConfigSchema::define(
'This directive has been available since 1.3.0.'
);
class HTMLPurifier_URIFilter_HostBlacklist
class HTMLPurifier_URIFilter_HostBlacklist extends HTMLPurifier_URIFilter
{
var $name = 'HostBlacklist';
var $blacklist = array();

View File

@ -0,0 +1,115 @@
<?php
// does not support network paths
require_once 'HTMLPurifier/URIFilter.php';
HTMLPurifier_ConfigSchema::define(
'URI', 'MakeAbsolute', false, 'bool', '
<p>
Converts all URIs into absolute forms. This is useful when the HTML
being filtered assumes a specific base path, but will actually be
viewed in a different context (and setting an alternate base URI is
not possible). %URI.Base must be set for this directive to work.
This directive has been available since 2.1.0.
</p>
');
class HTMLPurifier_URIFilter_MakeAbsolute extends HTMLPurifier_URIFilter
{
var $name = 'MakeAbsolute';
var $base;
var $basePathStack = array();
function prepare($config) {
$def = $config->getDefinition('URI');
$this->base = $def->base;
if (is_null($this->base)) {
trigger_error('URI.MakeAbsolute is being ignored due to lack of value for URI.Base configuration', E_USER_ERROR);
return;
}
$this->base->fragment = null; // fragment is invalid for base URI
$stack = explode('/', $this->base->path);
array_pop($stack); // discard last segment
$stack = $this->_collapseStack($stack); // do pre-parsing
$this->basePathStack = $stack;
}
function filter(&$uri, $config, &$context) {
if (is_null($this->base)) return true; // abort early
if (
$uri->path === '' && is_null($uri->scheme) &&
is_null($uri->host) && is_null($uri->query) && is_null($uri->fragment)
) {
// reference to current document
$uri = $this->base->copy();
return true;
}
if (!is_null($uri->scheme)) {
// absolute URI already: don't change
if (!is_null($uri->host)) return true;
$scheme_obj = $uri->getSchemeObj($config, $context);
if (!$scheme_obj->hierarchical) {
// non-hierarchal URI with explicit scheme, don't change
return true;
}
// special case: had a scheme but always is hierarchical and had no authority
}
if (!is_null($uri->host)) {
// network path, don't bother
return true;
}
if ($uri->path === '') {
$uri->path = $this->base->path;
}elseif ($uri->path[0] !== '/') {
// relative path, needs more complicated processing
$stack = explode('/', $uri->path);
$new_stack = array_merge($this->basePathStack, $stack);
$new_stack = $this->_collapseStack($new_stack);
$uri->path = implode('/', $new_stack);
}
// re-combine
$uri->scheme = $this->base->scheme;
if (is_null($uri->userinfo)) $uri->userinfo = $this->base->userinfo;
if (is_null($uri->host)) $uri->host = $this->base->host;
if (is_null($uri->port)) $uri->port = $this->base->port;
return true;
}
/**
* Resolve dots and double-dots in a path stack
* @private
*/
function _collapseStack($stack) {
$result = array();
for ($i = 0; isset($stack[$i]); $i++) {
$is_folder = false;
// absorb an internally duplicated slash
if ($stack[$i] == '' && $i && isset($stack[$i+1])) continue;
if ($stack[$i] == '..') {
if (!empty($result)) {
$segment = array_pop($result);
if ($segment === '' && empty($result)) {
// error case: attempted to back out too far:
// restore the leading slash
$result[] = '';
} elseif ($segment === '..') {
$result[] = '..'; // cannot remove .. with ..
}
} else {
// relative path, preserve the double-dots
$result[] = '..';
}
$is_folder = true;
continue;
}
if ($stack[$i] == '.') {
// silently absorb
$is_folder = true;
continue;
}
$result[] = $stack[$i];
}
if ($is_folder) $result[] = '';
return $result;
}
}

View File

@ -19,6 +19,12 @@ class HTMLPurifier_URIScheme
*/
var $browsable = false;
/**
* Whether or not the URI always uses <hier_part>, resolves edge cases
* with making relative URIs absolute
*/
var $hierarchical = false;
/**
* Validates the components of a URI
* @note This implementation should be called by children if they define

View File

@ -9,6 +9,7 @@ class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme {
var $default_port = 21;
var $browsable = true; // usually
var $hierarchical = true;
function validate(&$uri, $config, &$context) {
parent::validate($uri, $config, $context);

View File

@ -9,6 +9,7 @@ class HTMLPurifier_URIScheme_http extends HTMLPurifier_URIScheme {
var $default_port = 80;
var $browsable = true;
var $hierarchical = true;
function validate(&$uri, $config, &$context) {
parent::validate($uri, $config, $context);

View File

@ -31,4 +31,29 @@ class HTMLPurifier_URIDefinitionTest extends HTMLPurifier_URIHarness
$this->assertFalse($def->filter($uri, $this->config, $this->context));
}
function test_setupMemberVariables_collisionPrecedenceIsHostBaseScheme() {
$this->config->set('URI', 'Host', $host = 'example.com');
$this->config->set('URI', 'Base', $base = 'http://sub.example.com/foo/bar.html');
$this->config->set('URI', 'DefaultScheme', 'ftp');
$def = new HTMLPurifier_URIDefinition();
$def->setupMemberVariables($this->config);
$this->assertIdentical($def->host, $host);
$this->assertIdentical($def->base, $this->createURI($base));
$this->assertIdentical($def->defaultScheme, 'http'); // not ftp!
}
function test_setupMemberVariables_onlyScheme() {
$this->config->set('URI', 'DefaultScheme', 'ftp');
$def = new HTMLPurifier_URIDefinition();
$def->setupMemberVariables($this->config);
$this->assertIdentical($def->defaultScheme, 'ftp');
}
function test_setupMemberVariables_onlyBase() {
$this->config->set('URI', 'Base', 'http://sub.example.com/foo/bar.html');
$def = new HTMLPurifier_URIDefinition();
$def->setupMemberVariables($this->config);
$this->assertIdentical($def->host, 'sub.example.com');
}
}

View File

@ -9,6 +9,7 @@ class HTMLPurifier_URIFilter_DisableExternalResourcesTest extends
function setUp() {
parent::setUp();
$this->filter = new HTMLPurifier_URIFilter_DisableExternalResources();
$var = true;
$this->context->register('EmbeddedURI', $var);
}

View File

@ -0,0 +1,122 @@
<?php
require_once 'HTMLPurifier/URIFilter/MakeAbsolute.php';
require_once 'HTMLPurifier/URIFilterHarness.php';
class HTMLPurifier_URIFilter_MakeAbsoluteTest extends HTMLPurifier_URIFilterHarness
{
function setUp() {
parent::setUp();
$this->filter = new HTMLPurifier_URIFilter_MakeAbsolute();
$this->setBase();
}
function setBase($base = 'http://example.com/foo/bar.html?q=s#frag') {
$this->config->set('URI', 'Base', $base);
}
// corresponding to RFC 2396
function testPreserveAbsolute() {
$this->assertFiltering('http://example.com/foo.html');
}
function testFilterBlank() {
$this->assertFiltering('', 'http://example.com/foo/bar.html?q=s');
}
function testFilterEmptyPath() {
$this->assertFiltering('?q=s#frag', 'http://example.com/foo/bar.html?q=s#frag');
}
function testPreserveAltScheme() {
$this->assertFiltering('mailto:bob@example.com');
}
function testFilterIgnoreHTTPSpecialCase() {
$this->assertFiltering('http:/', 'http://example.com/');
}
function testFilterAbsolutePath() {
$this->assertFiltering('/foo.txt', 'http://example.com/foo.txt');
}
function testFilterRelativePath() {
$this->assertFiltering('baz.txt', 'http://example.com/foo/baz.txt');
}
function testFilterRelativePathWithInternalDot() {
$this->assertFiltering('./baz.txt', 'http://example.com/foo/baz.txt');
}
function testFilterRelativePathWithEndingDot() {
$this->assertFiltering('baz/.', 'http://example.com/foo/baz/');
}
function testFilterRelativePathDot() {
$this->assertFiltering('.', 'http://example.com/foo/');
}
function testFilterRelativePathWithInternalDotDot() {
$this->assertFiltering('../baz.txt', 'http://example.com/baz.txt');
}
function testFilterRelativePathWithEndingDotDot() {
$this->assertFiltering('..', 'http://example.com/');
}
function testFilterRelativePathTooManyDotDots() {
$this->assertFiltering('../../', 'http://example.com/');
}
function testFilterAppendingQueryAndFragment() {
$this->assertFiltering('/foo.php?q=s#frag', 'http://example.com/foo.php?q=s#frag');
}
// edge cases below
function testFilterAbsolutePathBase() {
$this->setBase('/foo/baz.txt');
$this->assertFiltering('test.php', '/foo/test.php');
}
function testFilterAbsolutePathBaseDirectory() {
$this->setBase('/foo/');
$this->assertFiltering('test.php', '/foo/test.php');
}
function testFilterAbsolutePathBaseBelow() {
$this->setBase('/foo/baz.txt');
$this->assertFiltering('../../test.php', '/test.php');
}
function testFilterRelativePathBase() {
$this->setBase('foo/baz.html');
$this->assertFiltering('foo.php', 'foo/foo.php');
}
function testFilterRelativePathBaseBelow() {
$this->setBase('../baz.html');
$this->assertFiltering('test/strike.html', '../test/strike.html');
}
function testFilterRelativePathBaseWithAbsoluteURI() {
$this->setBase('../baz.html');
$this->assertFiltering('/test/strike.html');
}
function testFilterRelativePathBaseWithDot() {
$this->setBase('../baz.html');
$this->assertFiltering('.', '../');
}
// error case
function testErrorNoBase() {
$this->setBase(null);
$this->expectError('URI.MakeAbsolute is being ignored due to lack of value for URI.Base configuration');
$this->assertFiltering('foo/bar.txt');
}
}

View File

@ -13,7 +13,7 @@ class HTMLPurifier_URIHarness extends HTMLPurifier_Harness
*/
function prepareURI(&$uri, &$expect_uri) {
$parser = new HTMLPurifier_URIParser();
if ($expect_uri === true) $uri = $expect_uri;
if ($expect_uri === true) $expect_uri = $uri;
$uri = $parser->parse($uri);
if ($expect_uri !== false) {
$expect_uri = $parser->parse($expect_uri);

View File

@ -106,6 +106,7 @@ $test_files[] = 'HTMLPurifier/URIDefinitionTest.php';
$test_files[] = 'HTMLPurifier/URIFilter/DisableExternalTest.php';
$test_files[] = 'HTMLPurifier/URIFilter/DisableExternalResourcesTest.php';
$test_files[] = 'HTMLPurifier/URIFilter/HostBlacklistTest.php';
$test_files[] = 'HTMLPurifier/URIFilter/MakeAbsoluteTest.php';
$test_files[] = 'HTMLPurifier/URIParserTest.php';
$test_files[] = 'HTMLPurifier/URISchemeRegistryTest.php';
$test_files[] = 'HTMLPurifier/URISchemeTest.php';