diff --git a/NEWS b/NEWS index 38a1ac94..007a3fb7 100644 --- a/NEWS +++ b/NEWS @@ -10,6 +10,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier ========================== 2.1.0, unknown release date +# flush-htmldefinition-cache.php superseded in favor of a generic + flush-definition-cache.php script ! Phorum mod implemented for HTML Purifier ! With %Core.AggressivelyFixLt, <3 and similar emoticons no longer trigger HTML removal in PHP5 (DOMLex). This directive is not necessary @@ -43,7 +45,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier already exists. May clobber autoload, so I need to keep an eye on it . ConfigSchema heavily optimized, will only collect information and validate definitions when HTMLPURIFIER_SCHEMA_STRICT is true. -. AttrDef_URI unit tests refactored +. AttrDef_URI unit tests and implementation refactored . benchmarks/ directory now protected from public view with .htaccess file; run the tests via command line . URI scheme is munged off if there is no authority and the scheme is the diff --git a/library/HTMLPurifier/AttrDef/CSS/URI.php b/library/HTMLPurifier/AttrDef/CSS/URI.php index 107545cc..b71a8585 100644 --- a/library/HTMLPurifier/AttrDef/CSS/URI.php +++ b/library/HTMLPurifier/AttrDef/CSS/URI.php @@ -15,7 +15,7 @@ class HTMLPurifier_AttrDef_CSS_URI extends HTMLPurifier_AttrDef_URI { function HTMLPurifier_AttrDef_CSS_URI() { - $this->HTMLPurifier_AttrDef_URI(true); // always embedded + parent::HTMLPurifier_AttrDef_URI(true); // always embedded } function validate($uri_string, $config, &$context) { diff --git a/library/HTMLPurifier/AttrDef/URI.php b/library/HTMLPurifier/AttrDef/URI.php index 979e44b3..8c76fbf3 100644 --- a/library/HTMLPurifier/AttrDef/URI.php +++ b/library/HTMLPurifier/AttrDef/URI.php @@ -1,6 +1,7 @@ host = new HTMLPurifier_AttrDef_URI_Host(); + $this->parser = new HTMLPurifier_URIParser(); $this->embeds_resource = (bool) $embeds_resource; } @@ -108,43 +110,18 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef static $PercentEncoder = null; if ($PercentEncoder === null) $PercentEncoder = new HTMLPurifier_PercentEncoder(); - // We'll write stack-based parsers later, for now, use regexps to - // get things working as fast as possible (irony) - if ($config->get('URI', 'Disable')) return false; - // parse as CDATA + // initial operations $uri = $this->parseCDATA($uri); - - // fix up percent-encoding $uri = $PercentEncoder->normalize($uri); - // while it would be nice to use parse_url(), that's specifically - // for HTTP and thus won't work for our generic URI parsing - - // according to the RFC... (but this cuts corners, i.e. non-validating) - $r_URI = '!'. - '(([^:/?#<>\'"]+):)?'. // 2. Scheme - '(//([^/?#<>\'"]*))?'. // 4. Authority - '([^?#<>\'"]*)'. // 5. Path - '(\?([^#<>\'"]*))?'. // 7. Query - '(#([^<>\'"]*))?'. // 8. Fragment - '!'; - - $matches = array(); - $result = preg_match($r_URI, $uri, $matches); - - if (!$result) return false; // *really* invalid URI - - // seperate out parts - $scheme = !empty($matches[1]) ? $matches[2] : null; - $authority = !empty($matches[3]) ? $matches[4] : null; - $path = $matches[5]; // always present, can be empty - $query = !empty($matches[6]) ? $matches[7] : null; - $fragment = !empty($matches[8]) ? $matches[9] : null; - - + // parse the URI + $parsed_uri = $this->parser->parse($uri); + if ($parsed_uri === false) return false; + list($scheme, $userinfo, $host, $port, $path, $query, $fragment) = $parsed_uri; + // retrieve the scheme object $registry =& HTMLPurifier_URISchemeRegistry::instance(); $default_scheme = $config->get('URI', 'DefaultScheme'); if ($scheme !== null) { @@ -154,31 +131,25 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef $scheme_obj = $registry->getScheme($scheme, $config, $context); if (!$scheme_obj) return false; // invalid scheme, clean it out } else { - $scheme_obj = $registry->getScheme( - $default_scheme, $config, $context - ); + // no scheme: retrieve the default one + $scheme_obj = $registry->getScheme($default_scheme, $config, $context); + if (!$scheme_obj) { + // something funky happened to the default scheme object + trigger_error( + 'Default scheme object "' . $config->get('URI', 'DefaultScheme') . '" was not readable', + E_USER_WARNING + ); + return false; + } } - - // something funky weird happened in the registry, abort! - if (!$scheme_obj) { - trigger_error( - 'Default scheme object "' . $config->get('URI', 'DefaultScheme') . '" was not readable', - E_USER_WARNING - ); - return false; - } - - // the URI we're processing embeds_resource a resource in the page, but the URI - // it references cannot be located if ($this->embeds_resource && !$scheme_obj->browsable) { + // the URI we're processing embeds_resource a resource in the + // page, but the URI it references cannot be physically retrieved return false; } - - if ($authority !== null) { - - // ridiculously inefficient - + // validate host + if ($host !== null) { // remove URI if it's absolute and we disabled externals or // if it's absolute and embedded and we disabled external resources unset($our_host); @@ -192,29 +163,10 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef $our_host = $config->get('URI', 'Host'); if ($our_host === null) return false; } - - $HEXDIG = '[A-Fa-f0-9]'; - $unreserved = 'A-Za-z0-9-._~'; // make sure you wrap with [] - $sub_delims = '!$&\'()'; // needs [] - $pct_encoded = "%$HEXDIG$HEXDIG"; - $r_userinfo = "(?:[$unreserved$sub_delims:]|$pct_encoded)*"; - $r_authority = "/^(($r_userinfo)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/"; - $matches = array(); - preg_match($r_authority, $authority, $matches); - // overloads regexp! - $userinfo = !empty($matches[1]) ? $matches[2] : null; - $host = !empty($matches[3]) ? $matches[3] : null; - $port = !empty($matches[4]) ? $matches[5] : null; - - // validate port - if ($port !== null) { - $port = (int) $port; - if ($port < 1 || $port > 65535) $port = null; - } - $host = $this->host->validate($host, $config, $context); if ($host === false) $host = null; + // check host against blacklist if ($this->checkBlacklist($host, $config, $context)) return false; // more lenient absolute checking @@ -227,11 +179,11 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef if ($host_parts[$i] != $our_host_parts[$i]) return false; } } - - // userinfo and host are validated within the regexp - - } else { - $port = $host = $userinfo = null; + } + + // validate port + if ($port !== null) { + if ($port < 1 || $port > 65535) $port = null; } diff --git a/library/HTMLPurifier/URIParser.php b/library/HTMLPurifier/URIParser.php new file mode 100644 index 00000000..44a24440 --- /dev/null +++ b/library/HTMLPurifier/URIParser.php @@ -0,0 +1,58 @@ +\'"]+):)?'. // 2. Scheme + '(//([^/?#<>\'"]*))?'. // 4. Authority + '([^?#<>\'"]*)'. // 5. Path + '(\?([^#<>\'"]*))?'. // 7. Query + '(#([^<>\'"]*))?'. // 8. Fragment + '!'; + + $matches = array(); + $result = preg_match($r_URI, $uri, $matches); + + if (!$result) return false; // *really* invalid URI + + // seperate out parts + $scheme = !empty($matches[1]) ? $matches[2] : null; + $authority = !empty($matches[3]) ? $matches[4] : null; + $path = $matches[5]; // always present, can be empty + $query = !empty($matches[6]) ? $matches[7] : null; + $fragment = !empty($matches[8]) ? $matches[9] : null; + + // further parse authority + if ($authority !== null) { + // ridiculously inefficient: it's a stacked regex! + $HEXDIG = '[A-Fa-f0-9]'; + $unreserved = 'A-Za-z0-9-._~'; // make sure you wrap with [] + $sub_delims = '!$&\'()'; // needs [] + $pct_encoded = "%$HEXDIG$HEXDIG"; + $r_userinfo = "(?:[$unreserved$sub_delims:]|$pct_encoded)*"; + $r_authority = "/^(($r_userinfo)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/"; + $matches = array(); + preg_match($r_authority, $authority, $matches); + $userinfo = !empty($matches[1]) ? $matches[2] : null; + $host = !empty($matches[3]) ? $matches[3] : ''; + $port = !empty($matches[4]) ? (int) $matches[5] : null; + } else { + $port = $host = $userinfo = null; + } + + return array($scheme, $userinfo, $host, $port, $path, $query, $fragment); + } + +} + diff --git a/maintenance/flush-htmldefinition-cache.php b/maintenance/flush-definition-cache.php similarity index 60% rename from maintenance/flush-htmldefinition-cache.php rename to maintenance/flush-definition-cache.php index c6d31bfb..07f1442b 100644 --- a/maintenance/flush-htmldefinition-cache.php +++ b/maintenance/flush-definition-cache.php @@ -10,14 +10,18 @@ if (php_sapi_name() != 'cli') { exit; } -echo 'Flushing cache... '; +echo "Flushing cache... \n"; require_once(dirname(__FILE__) . '/../library/HTMLPurifier.auto.php'); $config = HTMLPurifier_Config::createDefault(); -$cache = new HTMLPurifier_DefinitionCache_Serializer('HTML'); -$cache->flush($config); +$names = array('HTML', 'CSS', 'Test'); +foreach ($names as $name) { + echo " - Flushing $name\n"; + $cache = new HTMLPurifier_DefinitionCache_Serializer($name); + $cache->flush($config); +} echo 'Cache flushed successfully.'; diff --git a/tests/HTMLPurifier/AttrDef/URITest.php b/tests/HTMLPurifier/AttrDef/URITest.php index fc8500b4..33d058c5 100644 --- a/tests/HTMLPurifier/AttrDef/URITest.php +++ b/tests/HTMLPurifier/AttrDef/URITest.php @@ -66,48 +66,6 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness } - function testParsingRegular() { - $this->assertParsing( - 'http://www.example.com/webhp?q=foo#result2', - null, 'www.example.com', null, '/webhp', 'q=foo' - ); - } - - function testParsingPortAndUsername() { - $this->assertParsing( - 'http://user@authority.part:80/now/the/path?query#fragment', - 'user', 'authority.part', 80, '/now/the/path', 'query' - ); - } - - function testParsingPercentEncoding() { - $this->assertParsing( - 'http://en.wikipedia.org/wiki/Clich%C3%A9', - null, 'en.wikipedia.org', null, '/wiki/Clich%C3%A9', null - ); - } - - function testParsingEmptyQuery() { - $this->assertParsing( - 'http://www.example.com/?#', - null, 'www.example.com', null, '/', '' - ); - } - - function testParsingEmptyPath() { - $this->assertParsing( - 'http://www.example.com', - null, 'www.example.com', null, '', null - ); - } - - function testParsingOpaqueURI() { - $this->assertParsing( - 'mailto:bob@example.com', - null, null, null, 'bob@example.com', null - ); - } - function testParsingImproperPercentEncoding() { // even though we don't resolve percent entities, we have to fix // improper percent-encodes. Taken one at a time: @@ -125,38 +83,6 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness ); } - function testParsingIPv4Address() { - $this->assertParsing( - 'http://192.0.34.166/', - null, '192.0.34.166', null, '/', null - ); - } - - function testParsingFakeIPv4Address() { - $this->assertParsing( - 'http://333.123.32.123/', - null, '333.123.32.123', null, '/', null - ); - } - - function testParsingIPv6Address() { - $this->assertParsing( - 'http://[2001:db8::7]/c=GB?objectClass?one', - null, '[2001:db8::7]', null, '/c=GB', 'objectClass?one' - ); - } - - // We will not implement punycode encoding, that's up to the browsers - // We also will not implement percent to IDNA encoding transformations: - // if you need to use an international domain in a link, make sure that - // you've got it in UTF-8 and send it in raw (no encoding). - function testParsingInternationalizedDomainName() { - $this->assertParsing( - "http://t\xC5\xABdali\xC5\x86.lv", - null, "t\xC5\xABdali\xC5\x86.lv", null, '', null - ); - } - function testParsingInvalidHostThatLooksLikeIPv6Address() { $this->assertParsing( 'http://[2001:0db8:85z3:08d3:1319:8a2e:0370:7334]', @@ -164,13 +90,6 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness ); } - function testParsingInvalidPort() { - $this->assertParsing( - 'http://example.com:foobar', - null, 'example.com', null, '', null - ); - } - function testParsingOverLargePort() { $this->assertParsing( 'http://example.com:65536', @@ -178,49 +97,6 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness ); } - function testParsingPathAbsolute() { // note this is different from path-rootless - $this->assertParsing( - 'http:/this/is/path', - null, null, null, '/this/is/path', null - ); - } - - function testParsingPathRootless() { - // this should not be used but is allowed - $this->assertParsing( - 'http:this/is/path', - null, null, null, 'this/is/path', null - ); - } - - function testParsingPathEmpty() { - $this->assertParsing( - 'http:', - null, null, null, '', null - ); - } - - function testParsingRelativeURI() { - $this->assertParsing( - '/a/b', - null, null, null, '/a/b', null - ); - } - - function testParsingMalformedTag() { - $this->assertParsing( - 'http://www.google.com/\'>"', - null, 'www.google.com', null, '/', null - ); - } - - function testParsingEmpty() { - $this->assertParsing( - '', - null, null, null, '', null - ); - } - // OUTPUT RELATED TESTS // scheme is mocked to ensure only the URI is being tested diff --git a/tests/HTMLPurifier/URIParserTest.php b/tests/HTMLPurifier/URIParserTest.php new file mode 100644 index 00000000..1bca977d --- /dev/null +++ b/tests/HTMLPurifier/URIParserTest.php @@ -0,0 +1,138 @@ +prepareCommon($config, $context); + $parser = new HTMLPurifier_URIParser(); + $result = $parser->parse($uri, $config, $context); + $this->assertEqual($result, array($scheme, $userinfo, $host, $port, $path, $query, $fragment)); + } + + function testRegular() { + $this->assertParsing( + 'http://www.example.com/webhp?q=foo#result2', + 'http', null, 'www.example.com', null, '/webhp', 'q=foo', 'result2' + ); + } + + function testPortAndUsername() { + $this->assertParsing( + 'http://user@authority.part:80/now/the/path?query#fragment', + 'http', 'user', 'authority.part', 80, '/now/the/path', 'query', 'fragment' + ); + } + + function testPercentEncoding() { + $this->assertParsing( + 'http://en.wikipedia.org/wiki/Clich%C3%A9', + 'http', null, 'en.wikipedia.org', null, '/wiki/Clich%C3%A9', null, null + ); + } + + function testEmptyQuery() { + $this->assertParsing( + 'http://www.example.com/?#', + 'http', null, 'www.example.com', null, '/', '', null + ); + } + + function testEmptyPath() { + $this->assertParsing( + 'http://www.example.com', + 'http', null, 'www.example.com', null, '', null, null + ); + } + + function testOpaqueURI() { + $this->assertParsing( + 'mailto:bob@example.com', + 'mailto', null, null, null, 'bob@example.com', null, null + ); + } + + function testIPv4Address() { + $this->assertParsing( + 'http://192.0.34.166/', + 'http', null, '192.0.34.166', null, '/', null, null + ); + } + + function testFakeIPv4Address() { + $this->assertParsing( + 'http://333.123.32.123/', + 'http', null, '333.123.32.123', null, '/', null, null + ); + } + + function testIPv6Address() { + $this->assertParsing( + 'http://[2001:db8::7]/c=GB?objectClass?one', + 'http', null, '[2001:db8::7]', null, '/c=GB', 'objectClass?one', null + ); + } + + function testInternationalizedDomainName() { + $this->assertParsing( + "http://t\xC5\xABdali\xC5\x86.lv", + 'http', null, "t\xC5\xABdali\xC5\x86.lv", null, '', null, null + ); + } + + function testInvalidPort() { + $this->assertParsing( + 'http://example.com:foobar', + 'http', null, 'example.com', null, '', null, null + ); + } + + function testPathAbsolute() { + $this->assertParsing( + 'http:/this/is/path', + 'http', null, null, null, '/this/is/path', null, null + ); + } + + function testPathRootless() { + // this should not be used but is allowed + $this->assertParsing( + 'http:this/is/path', + 'http', null, null, null, 'this/is/path', null, null + ); + } + + function testPathEmpty() { + $this->assertParsing( + 'http:', + 'http', null, null, null, '', null, null + ); + } + + function testRelativeURI() { + $this->assertParsing( + '/a/b', + null, null, null, null, '/a/b', null, null + ); + } + + function testMalformedTag() { + $this->assertParsing( + 'http://www.example.com/\'>"', + 'http', null, 'www.example.com', null, '/', null, null + ); + } + + function testEmpty() { + $this->assertParsing( + '', + null, null, null, null, '', null, null + ); + } + +} + diff --git a/tests/test_files.php b/tests/test_files.php index 5920981e..845807c7 100644 --- a/tests/test_files.php +++ b/tests/test_files.php @@ -102,6 +102,7 @@ $test_files[] = 'HTMLPurifier/Strategy/RemoveForeignElements_ErrorsTest.php'; $test_files[] = 'HTMLPurifier/Strategy/ValidateAttributesTest.php'; $test_files[] = 'HTMLPurifier/TagTransformTest.php'; $test_files[] = 'HTMLPurifier/TokenTest.php'; +$test_files[] = 'HTMLPurifier/URIParserTest.php'; $test_files[] = 'HTMLPurifier/URISchemeRegistryTest.php'; $test_files[] = 'HTMLPurifier/URISchemeTest.php'; $test_files[] = 'HTMLPurifierTest.php';