diff --git a/NEWS b/NEWS index a4c3d59c..a6509bdd 100644 --- a/NEWS +++ b/NEWS @@ -20,6 +20,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier ! Standalone file now available, which greatly reduces the amount of includes (although there are still a few files that reside in the standalone folder) +! Relative URIs can now be transformed into their absolute equivalents + using %URI.Base and %URI.MakeAbsolute - AutoFormatters emit friendly error messages if tags or attributes they need are not allowed - ConfigForm's compactification of directive names is now configurable diff --git a/docs/proposal-new-directives.txt b/docs/proposal-new-directives.txt index 2c08ddbb..b3351b4c 100644 --- a/docs/proposal-new-directives.txt +++ b/docs/proposal-new-directives.txt @@ -22,8 +22,6 @@ time. Note the naming convention: %Namespace.Directive %URI.AddRelNofollow - will add rel="nofollow" to all links, preventing the spread of ill-gotten pagerank -%URI.RelativeToAbsolute - transforms all relative URIs to absolute form - %URI.HostBlacklistRegex - regexes that if matching the host are disallowed %URI.HostWhitelist - domain names that are excluded from the host blacklist %URI.HostPolicy - determines whether or not its reject all and then whitelist diff --git a/library/HTMLPurifier/AttrDef/URI.php b/library/HTMLPurifier/AttrDef/URI.php index 3b97e007..dcf9849c 100644 --- a/library/HTMLPurifier/AttrDef/URI.php +++ b/library/HTMLPurifier/AttrDef/URI.php @@ -7,54 +7,59 @@ require_once 'HTMLPurifier/URISchemeRegistry.php'; require_once 'HTMLPurifier/AttrDef/URI/Host.php'; require_once 'HTMLPurifier/PercentEncoder.php'; -HTMLPurifier_ConfigSchema::define( - 'URI', 'DefaultScheme', 'http', 'string', - 'Defines through what scheme the output will be served, in order to '. - 'select the proper object validator when no scheme information is present.' -); +// special case filtering directives HTMLPurifier_ConfigSchema::define( - 'URI', 'Host', null, 'string/null', - 'Defines the domain name of the server, so we can determine whether or '. - 'an absolute URI is from your website or not. Not strictly necessary, '. - 'as users should be using relative URIs to reference resources on your '. - 'website. It will, however, let you use absolute URIs to link to '. - 'subdomains of the domain you post here: i.e. example.com will allow '. - 'sub.example.com. However, higher up domains will still be excluded: '. - 'if you set %URI.Host to sub.example.com, example.com will be blocked. '. - 'This directive has been available since 1.2.0.' -); + 'URI', 'Munge', null, 'string/null', ' +

+ Munges all browsable (usually http, https and ftp) + absolute URI\'s into another URI, usually a URI redirection service. + This directive accepts a URI, formatted with a %s where + the url-encoded original URI should be inserted (sample: + http://www.google.com/url?q=%s). +

+

+ Uses for this directive: +

+ +

+ This directive has been available since 1.3.0. +

+'); + +// disabling directives HTMLPurifier_ConfigSchema::define( - 'URI', 'DisableResources', false, 'bool', - 'Disables embedding resources, essentially meaning no pictures. You can '. - 'still link to them though. See %URI.DisableExternalResources for why '. - 'this might be a good idea. This directive has been available since 1.3.0.' -); - -HTMLPurifier_ConfigSchema::define( - 'URI', 'Munge', null, 'string/null', - 'Munges all browsable (usually http, https and ftp) URI\'s into some URL '. - 'redirection service. Pass this directive a URI, with %s inserted where '. - 'the url-encoded original URI should be inserted (sample: '. - 'http://www.google.com/url?q=%s). '. - 'This prevents PageRank leaks, while being as transparent as possible '. - 'to users (you may also want to add some client side JavaScript to '. - 'override the text in the statusbar). Warning: many security experts '. - 'believe that this form of protection does not deter spam-bots. '. - 'You can also use this directive to redirect users to a splash page '. - 'telling them they are leaving your website. '. - 'This directive has been available since 1.3.0.' -); - -HTMLPurifier_ConfigSchema::define( - 'URI', 'Disable', false, 'bool', - 'Disables all URIs in all forms. Not sure why you\'d want to do that '. - '(after all, the Internet\'s founded on the notion of a hyperlink). '. - 'This directive has been available since 1.3.0.' -); + 'URI', 'Disable', false, 'bool', ' +

+ Disables all URIs in all forms. Not sure why you\'d want to do that + (after all, the Internet\'s founded on the notion of a hyperlink). + This directive has been available since 1.3.0. +

+'); HTMLPurifier_ConfigSchema::defineAlias('Attr', 'DisableURI', 'URI', 'Disable'); +HTMLPurifier_ConfigSchema::define( + 'URI', 'DisableResources', false, 'bool', ' +

+ Disables embedding resources, essentially meaning no pictures. You can + still link to them though. See %URI.DisableExternalResources for why + this might be a good idea. This directive has been available since 1.3.0. +

+'); + /** * Validates a URI as defined by RFC 3986. * @note Scheme-specific mechanics deferred to HTMLPurifier_URIScheme @@ -118,7 +123,7 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef // munge scheme off if necessary (this must be last) if (!is_null($uri->scheme) && is_null($uri->host)) { - if ($config->get('URI', 'DefaultScheme') == $uri->scheme) { + if ($uri_def->defaultScheme == $uri->scheme) { $uri->scheme = null; } } diff --git a/library/HTMLPurifier/URI.php b/library/HTMLPurifier/URI.php index cec71436..ed7ffdd6 100644 --- a/library/HTMLPurifier/URI.php +++ b/library/HTMLPurifier/URI.php @@ -37,11 +37,12 @@ class HTMLPurifier_URI if (!$scheme_obj) return false; // invalid scheme, clean it out } else { // no scheme: retrieve the default one - $scheme_obj = $registry->getScheme($config->get('URI', 'DefaultScheme'), $config, $context); + $def = $config->getDefinition('URI'); + $scheme_obj = $registry->getScheme($def->defaultScheme, $config, $context); if (!$scheme_obj) { // something funky happened to the default scheme object trigger_error( - 'Default scheme object "' . $config->get('URI', 'DefaultScheme') . '" was not readable', + 'Default scheme object "' . $def->defaultScheme . '" was not readable', E_USER_WARNING ); return false; @@ -107,5 +108,12 @@ class HTMLPurifier_URI return $result; } + /** + * Returns a copy of the URI object + */ + function copy() { + return unserialize(serialize($this)); + } + } diff --git a/library/HTMLPurifier/URIDefinition.php b/library/HTMLPurifier/URIDefinition.php index 0623c983..131c95de 100644 --- a/library/HTMLPurifier/URIDefinition.php +++ b/library/HTMLPurifier/URIDefinition.php @@ -2,10 +2,12 @@ require_once 'HTMLPurifier/Definition.php'; require_once 'HTMLPurifier/URIFilter.php'; +require_once 'HTMLPurifier/URIParser.php'; require_once 'HTMLPurifier/URIFilter/DisableExternal.php'; require_once 'HTMLPurifier/URIFilter/DisableExternalResources.php'; require_once 'HTMLPurifier/URIFilter/HostBlacklist.php'; +require_once 'HTMLPurifier/URIFilter/MakeAbsolute.php'; HTMLPurifier_ConfigSchema::define( 'URI', 'DefinitionID', null, 'string/null', ' @@ -25,6 +27,48 @@ HTMLPurifier_ConfigSchema::define(

'); +// informative URI directives + +HTMLPurifier_ConfigSchema::define( + 'URI', 'DefaultScheme', 'http', 'string', ' +

+ Defines through what scheme the output will be served, in order to + select the proper object validator when no scheme information is present. +

+'); + +HTMLPurifier_ConfigSchema::define( + 'URI', 'Host', null, 'string/null', ' +

+ Defines the domain name of the server, so we can determine whether or + an absolute URI is from your website or not. Not strictly necessary, + as users should be using relative URIs to reference resources on your + website. It will, however, let you use absolute URIs to link to + subdomains of the domain you post here: i.e. example.com will allow + sub.example.com. However, higher up domains will still be excluded: + if you set %URI.Host to sub.example.com, example.com will be blocked. + Note: This directive overrides %URI.Base because + a given page may be on a sub-domain, but you wish HTML Purifier to be + more relaxed and allow some of the parent domains too. + This directive has been available since 1.2.0. +

+'); + +HTMLPurifier_ConfigSchema::define( + 'URI', 'Base', null, 'string/null', ' +

+ The base URI is the URI of the document this purified HTML will be + inserted into. This information is important if HTML Purifier needs + to calculate absolute URIs from relative URIs, such as when %URI.MakeAbsolute + is on. You may use a non-absolute URI for this value, but behavior + may vary (%URI.MakeAbsolute deals nicely with both absolute and + relative paths, but forwards-compatibility is not guaranteed). + Warning: If set, the scheme on this URI + overrides the one specified by %URI.DefaultScheme. This directive has + been available since 2.1.0. +

+'); + class HTMLPurifier_URIDefinition extends HTMLPurifier_Definition { @@ -32,10 +76,26 @@ class HTMLPurifier_URIDefinition extends HTMLPurifier_Definition var $filters = array(); var $registeredFilters = array(); + /** + * HTMLPurifier_URI object of the base specified at %URI.Base + */ + var $base; + + /** + * String host to consider "home" base + */ + var $host; + + /** + * Name of default scheme based on %URI.DefaultScheme and %URI.Base + */ + var $defaultScheme; + function HTMLPurifier_URIDefinition() { $this->registerFilter(new HTMLPurifier_URIFilter_DisableExternal()); $this->registerFilter(new HTMLPurifier_URIFilter_DisableExternalResources()); $this->registerFilter(new HTMLPurifier_URIFilter_HostBlacklist()); + $this->registerFilter(new HTMLPurifier_URIFilter_MakeAbsolute()); } function registerFilter($filter) { @@ -43,6 +103,11 @@ class HTMLPurifier_URIDefinition extends HTMLPurifier_Definition } function doSetup($config) { + $this->setupFilters($config); + $this->setupMemberVariables($config); + } + + function setupFilters($config) { foreach ($this->registeredFilters as $name => $filter) { $conf = $config->get('URI', $name); if ($conf !== false && $conf !== null) { @@ -53,6 +118,18 @@ class HTMLPurifier_URIDefinition extends HTMLPurifier_Definition unset($this->registeredFilters); } + function setupMemberVariables($config) { + $this->host = $config->get('URI', 'Host'); + $base_uri = $config->get('URI', 'Base'); + if (!is_null($base_uri)) { + $parser = new HTMLPurifier_URIParser(); + $this->base = $parser->parse($base_uri); + $this->defaultScheme = $this->base->scheme; + if (is_null($this->host)) $this->host = $this->base->host; + } + if (is_null($this->defaultScheme)) $this->defaultScheme = $config->get('URI', 'DefaultScheme'); + } + function filter(&$uri, $config, &$context) { foreach ($this->filters as $name => $x) { $result = $this->filters[$name]->filter($uri, $config, $context); diff --git a/library/HTMLPurifier/URIFilter/HostBlacklist.php b/library/HTMLPurifier/URIFilter/HostBlacklist.php index 5f0d790e..d3429d5c 100644 --- a/library/HTMLPurifier/URIFilter/HostBlacklist.php +++ b/library/HTMLPurifier/URIFilter/HostBlacklist.php @@ -10,7 +10,7 @@ HTMLPurifier_ConfigSchema::define( 'This directive has been available since 1.3.0.' ); -class HTMLPurifier_URIFilter_HostBlacklist +class HTMLPurifier_URIFilter_HostBlacklist extends HTMLPurifier_URIFilter { var $name = 'HostBlacklist'; var $blacklist = array(); diff --git a/library/HTMLPurifier/URIFilter/MakeAbsolute.php b/library/HTMLPurifier/URIFilter/MakeAbsolute.php new file mode 100644 index 00000000..9935dc6e --- /dev/null +++ b/library/HTMLPurifier/URIFilter/MakeAbsolute.php @@ -0,0 +1,115 @@ + + Converts all URIs into absolute forms. This is useful when the HTML + being filtered assumes a specific base path, but will actually be + viewed in a different context (and setting an alternate base URI is + not possible). %URI.Base must be set for this directive to work. + This directive has been available since 2.1.0. +

+'); + +class HTMLPurifier_URIFilter_MakeAbsolute extends HTMLPurifier_URIFilter +{ + var $name = 'MakeAbsolute'; + var $base; + var $basePathStack = array(); + function prepare($config) { + $def = $config->getDefinition('URI'); + $this->base = $def->base; + if (is_null($this->base)) { + trigger_error('URI.MakeAbsolute is being ignored due to lack of value for URI.Base configuration', E_USER_ERROR); + return; + } + $this->base->fragment = null; // fragment is invalid for base URI + $stack = explode('/', $this->base->path); + array_pop($stack); // discard last segment + $stack = $this->_collapseStack($stack); // do pre-parsing + $this->basePathStack = $stack; + } + function filter(&$uri, $config, &$context) { + if (is_null($this->base)) return true; // abort early + if ( + $uri->path === '' && is_null($uri->scheme) && + is_null($uri->host) && is_null($uri->query) && is_null($uri->fragment) + ) { + // reference to current document + $uri = $this->base->copy(); + return true; + } + if (!is_null($uri->scheme)) { + // absolute URI already: don't change + if (!is_null($uri->host)) return true; + $scheme_obj = $uri->getSchemeObj($config, $context); + if (!$scheme_obj->hierarchical) { + // non-hierarchal URI with explicit scheme, don't change + return true; + } + // special case: had a scheme but always is hierarchical and had no authority + } + if (!is_null($uri->host)) { + // network path, don't bother + return true; + } + if ($uri->path === '') { + $uri->path = $this->base->path; + }elseif ($uri->path[0] !== '/') { + // relative path, needs more complicated processing + $stack = explode('/', $uri->path); + $new_stack = array_merge($this->basePathStack, $stack); + $new_stack = $this->_collapseStack($new_stack); + $uri->path = implode('/', $new_stack); + } + // re-combine + $uri->scheme = $this->base->scheme; + if (is_null($uri->userinfo)) $uri->userinfo = $this->base->userinfo; + if (is_null($uri->host)) $uri->host = $this->base->host; + if (is_null($uri->port)) $uri->port = $this->base->port; + return true; + } + + /** + * Resolve dots and double-dots in a path stack + * @private + */ + function _collapseStack($stack) { + $result = array(); + for ($i = 0; isset($stack[$i]); $i++) { + $is_folder = false; + // absorb an internally duplicated slash + if ($stack[$i] == '' && $i && isset($stack[$i+1])) continue; + if ($stack[$i] == '..') { + if (!empty($result)) { + $segment = array_pop($result); + if ($segment === '' && empty($result)) { + // error case: attempted to back out too far: + // restore the leading slash + $result[] = ''; + } elseif ($segment === '..') { + $result[] = '..'; // cannot remove .. with .. + } + } else { + // relative path, preserve the double-dots + $result[] = '..'; + } + $is_folder = true; + continue; + } + if ($stack[$i] == '.') { + // silently absorb + $is_folder = true; + continue; + } + $result[] = $stack[$i]; + } + if ($is_folder) $result[] = ''; + return $result; + } +} + diff --git a/library/HTMLPurifier/URIScheme.php b/library/HTMLPurifier/URIScheme.php index a34cdd4a..41c02f70 100644 --- a/library/HTMLPurifier/URIScheme.php +++ b/library/HTMLPurifier/URIScheme.php @@ -19,6 +19,12 @@ class HTMLPurifier_URIScheme */ var $browsable = false; + /** + * Whether or not the URI always uses , resolves edge cases + * with making relative URIs absolute + */ + var $hierarchical = false; + /** * Validates the components of a URI * @note This implementation should be called by children if they define diff --git a/library/HTMLPurifier/URIScheme/ftp.php b/library/HTMLPurifier/URIScheme/ftp.php index 950fe032..5555ef33 100644 --- a/library/HTMLPurifier/URIScheme/ftp.php +++ b/library/HTMLPurifier/URIScheme/ftp.php @@ -9,6 +9,7 @@ class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme { var $default_port = 21; var $browsable = true; // usually + var $hierarchical = true; function validate(&$uri, $config, &$context) { parent::validate($uri, $config, $context); diff --git a/library/HTMLPurifier/URIScheme/http.php b/library/HTMLPurifier/URIScheme/http.php index 262e2bd9..7abc6680 100644 --- a/library/HTMLPurifier/URIScheme/http.php +++ b/library/HTMLPurifier/URIScheme/http.php @@ -9,6 +9,7 @@ class HTMLPurifier_URIScheme_http extends HTMLPurifier_URIScheme { var $default_port = 80; var $browsable = true; + var $hierarchical = true; function validate(&$uri, $config, &$context) { parent::validate($uri, $config, $context); diff --git a/tests/HTMLPurifier/URIDefinitionTest.php b/tests/HTMLPurifier/URIDefinitionTest.php index cc782840..149f89b2 100644 --- a/tests/HTMLPurifier/URIDefinitionTest.php +++ b/tests/HTMLPurifier/URIDefinitionTest.php @@ -31,4 +31,29 @@ class HTMLPurifier_URIDefinitionTest extends HTMLPurifier_URIHarness $this->assertFalse($def->filter($uri, $this->config, $this->context)); } + function test_setupMemberVariables_collisionPrecedenceIsHostBaseScheme() { + $this->config->set('URI', 'Host', $host = 'example.com'); + $this->config->set('URI', 'Base', $base = 'http://sub.example.com/foo/bar.html'); + $this->config->set('URI', 'DefaultScheme', 'ftp'); + $def = new HTMLPurifier_URIDefinition(); + $def->setupMemberVariables($this->config); + $this->assertIdentical($def->host, $host); + $this->assertIdentical($def->base, $this->createURI($base)); + $this->assertIdentical($def->defaultScheme, 'http'); // not ftp! + } + + function test_setupMemberVariables_onlyScheme() { + $this->config->set('URI', 'DefaultScheme', 'ftp'); + $def = new HTMLPurifier_URIDefinition(); + $def->setupMemberVariables($this->config); + $this->assertIdentical($def->defaultScheme, 'ftp'); + } + + function test_setupMemberVariables_onlyBase() { + $this->config->set('URI', 'Base', 'http://sub.example.com/foo/bar.html'); + $def = new HTMLPurifier_URIDefinition(); + $def->setupMemberVariables($this->config); + $this->assertIdentical($def->host, 'sub.example.com'); + } + } diff --git a/tests/HTMLPurifier/URIFilter/DisableExternalResourcesTest.php b/tests/HTMLPurifier/URIFilter/DisableExternalResourcesTest.php index 4362cbd8..545e421b 100644 --- a/tests/HTMLPurifier/URIFilter/DisableExternalResourcesTest.php +++ b/tests/HTMLPurifier/URIFilter/DisableExternalResourcesTest.php @@ -9,6 +9,7 @@ class HTMLPurifier_URIFilter_DisableExternalResourcesTest extends function setUp() { parent::setUp(); + $this->filter = new HTMLPurifier_URIFilter_DisableExternalResources(); $var = true; $this->context->register('EmbeddedURI', $var); } diff --git a/tests/HTMLPurifier/URIFilter/MakeAbsoluteTest.php b/tests/HTMLPurifier/URIFilter/MakeAbsoluteTest.php new file mode 100644 index 00000000..d509a6a1 --- /dev/null +++ b/tests/HTMLPurifier/URIFilter/MakeAbsoluteTest.php @@ -0,0 +1,122 @@ +filter = new HTMLPurifier_URIFilter_MakeAbsolute(); + $this->setBase(); + } + + function setBase($base = 'http://example.com/foo/bar.html?q=s#frag') { + $this->config->set('URI', 'Base', $base); + } + + // corresponding to RFC 2396 + + function testPreserveAbsolute() { + $this->assertFiltering('http://example.com/foo.html'); + } + + function testFilterBlank() { + $this->assertFiltering('', 'http://example.com/foo/bar.html?q=s'); + } + + function testFilterEmptyPath() { + $this->assertFiltering('?q=s#frag', 'http://example.com/foo/bar.html?q=s#frag'); + } + + function testPreserveAltScheme() { + $this->assertFiltering('mailto:bob@example.com'); + } + + function testFilterIgnoreHTTPSpecialCase() { + $this->assertFiltering('http:/', 'http://example.com/'); + } + + function testFilterAbsolutePath() { + $this->assertFiltering('/foo.txt', 'http://example.com/foo.txt'); + } + + function testFilterRelativePath() { + $this->assertFiltering('baz.txt', 'http://example.com/foo/baz.txt'); + } + + function testFilterRelativePathWithInternalDot() { + $this->assertFiltering('./baz.txt', 'http://example.com/foo/baz.txt'); + } + + function testFilterRelativePathWithEndingDot() { + $this->assertFiltering('baz/.', 'http://example.com/foo/baz/'); + } + + function testFilterRelativePathDot() { + $this->assertFiltering('.', 'http://example.com/foo/'); + } + + function testFilterRelativePathWithInternalDotDot() { + $this->assertFiltering('../baz.txt', 'http://example.com/baz.txt'); + } + + function testFilterRelativePathWithEndingDotDot() { + $this->assertFiltering('..', 'http://example.com/'); + } + + function testFilterRelativePathTooManyDotDots() { + $this->assertFiltering('../../', 'http://example.com/'); + } + + function testFilterAppendingQueryAndFragment() { + $this->assertFiltering('/foo.php?q=s#frag', 'http://example.com/foo.php?q=s#frag'); + } + + // edge cases below + + function testFilterAbsolutePathBase() { + $this->setBase('/foo/baz.txt'); + $this->assertFiltering('test.php', '/foo/test.php'); + } + + function testFilterAbsolutePathBaseDirectory() { + $this->setBase('/foo/'); + $this->assertFiltering('test.php', '/foo/test.php'); + } + + function testFilterAbsolutePathBaseBelow() { + $this->setBase('/foo/baz.txt'); + $this->assertFiltering('../../test.php', '/test.php'); + } + + function testFilterRelativePathBase() { + $this->setBase('foo/baz.html'); + $this->assertFiltering('foo.php', 'foo/foo.php'); + } + + function testFilterRelativePathBaseBelow() { + $this->setBase('../baz.html'); + $this->assertFiltering('test/strike.html', '../test/strike.html'); + } + + function testFilterRelativePathBaseWithAbsoluteURI() { + $this->setBase('../baz.html'); + $this->assertFiltering('/test/strike.html'); + } + + function testFilterRelativePathBaseWithDot() { + $this->setBase('../baz.html'); + $this->assertFiltering('.', '../'); + } + + // error case + + function testErrorNoBase() { + $this->setBase(null); + $this->expectError('URI.MakeAbsolute is being ignored due to lack of value for URI.Base configuration'); + $this->assertFiltering('foo/bar.txt'); + } + +} diff --git a/tests/HTMLPurifier/URIHarness.php b/tests/HTMLPurifier/URIHarness.php index bec80845..63e6d7d6 100644 --- a/tests/HTMLPurifier/URIHarness.php +++ b/tests/HTMLPurifier/URIHarness.php @@ -13,7 +13,7 @@ class HTMLPurifier_URIHarness extends HTMLPurifier_Harness */ function prepareURI(&$uri, &$expect_uri) { $parser = new HTMLPurifier_URIParser(); - if ($expect_uri === true) $uri = $expect_uri; + if ($expect_uri === true) $expect_uri = $uri; $uri = $parser->parse($uri); if ($expect_uri !== false) { $expect_uri = $parser->parse($expect_uri); diff --git a/tests/test_files.php b/tests/test_files.php index 44bc86f2..93766376 100644 --- a/tests/test_files.php +++ b/tests/test_files.php @@ -106,6 +106,7 @@ $test_files[] = 'HTMLPurifier/URIDefinitionTest.php'; $test_files[] = 'HTMLPurifier/URIFilter/DisableExternalTest.php'; $test_files[] = 'HTMLPurifier/URIFilter/DisableExternalResourcesTest.php'; $test_files[] = 'HTMLPurifier/URIFilter/HostBlacklistTest.php'; +$test_files[] = 'HTMLPurifier/URIFilter/MakeAbsoluteTest.php'; $test_files[] = 'HTMLPurifier/URIParserTest.php'; $test_files[] = 'HTMLPurifier/URISchemeRegistryTest.php'; $test_files[] = 'HTMLPurifier/URISchemeTest.php';