0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-09-18 18:25:18 +00:00

[2.1.0] Refactor AttrDef_URI: removed URIParser functionality

- Genericized flush-definition-cache script

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1333 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2007-08-01 14:55:09 +00:00
parent 2a002857ce
commit 8c9dbe142d
8 changed files with 237 additions and 206 deletions

4
NEWS
View File

@ -10,6 +10,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
==========================
2.1.0, unknown release date
# flush-htmldefinition-cache.php superseded by the generic
flush-definition-cache.php script
! Phorum mod implemented for HTML Purifier
! With %Core.AggressivelyFixLt, <3 and similar emoticons no longer
trigger HTML removal in PHP5 (DOMLex). This directive is not necessary
@ -43,7 +45,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
already exists. May clobber autoload, so I need to keep an eye on it
. ConfigSchema heavily optimized, will only collect information and validate
definitions when HTMLPURIFIER_SCHEMA_STRICT is true.
. AttrDef_URI unit tests refactored
. AttrDef_URI unit tests and implementation refactored
. benchmarks/ directory now protected from public view with .htaccess file;
run the tests via command line
. URI scheme is munged off if there is no authority and the scheme is the

View File

@ -15,7 +15,7 @@ class HTMLPurifier_AttrDef_CSS_URI extends HTMLPurifier_AttrDef_URI
{
function HTMLPurifier_AttrDef_CSS_URI() {
$this->HTMLPurifier_AttrDef_URI(true); // always embedded
parent::HTMLPurifier_AttrDef_URI(true); // always embedded
}
function validate($uri_string, $config, &$context) {

View File

@ -1,6 +1,7 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/URIParser.php';
require_once 'HTMLPurifier/URIScheme.php';
require_once 'HTMLPurifier/URISchemeRegistry.php';
require_once 'HTMLPurifier/AttrDef/URI/Host.php';
@ -92,7 +93,7 @@ HTMLPurifier_ConfigSchema::defineAlias('Attr', 'DisableURI', 'URI', 'Disable');
class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
{
var $host;
var $host, $parser;
var $embeds_resource;
/**
@ -100,6 +101,7 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
*/
function HTMLPurifier_AttrDef_URI($embeds_resource = false) {
    // Normalise the flag to a strict boolean before storing it.
    $this->embeds_resource = $embeds_resource ? true : false;
    // Collaborators: a host validator and the URI parser.
    $this->host   = new HTMLPurifier_AttrDef_URI_Host();
    $this->parser = new HTMLPurifier_URIParser();
}
@ -108,43 +110,18 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
static $PercentEncoder = null;
if ($PercentEncoder === null) $PercentEncoder = new HTMLPurifier_PercentEncoder();
// We'll write stack-based parsers later, for now, use regexps to
// get things working as fast as possible (irony)
if ($config->get('URI', 'Disable')) return false;
// parse as CDATA
// initial operations
$uri = $this->parseCDATA($uri);
// fix up percent-encoding
$uri = $PercentEncoder->normalize($uri);
// while it would be nice to use parse_url(), that's specifically
// for HTTP and thus won't work for our generic URI parsing
// according to the RFC... (but this cuts corners, i.e. non-validating)
$r_URI = '!'.
'(([^:/?#<>\'"]+):)?'. // 2. Scheme
'(//([^/?#<>\'"]*))?'. // 4. Authority
'([^?#<>\'"]*)'. // 5. Path
'(\?([^#<>\'"]*))?'. // 7. Query
'(#([^<>\'"]*))?'. // 8. Fragment
'!';
$matches = array();
$result = preg_match($r_URI, $uri, $matches);
if (!$result) return false; // *really* invalid URI
// seperate out parts
$scheme = !empty($matches[1]) ? $matches[2] : null;
$authority = !empty($matches[3]) ? $matches[4] : null;
$path = $matches[5]; // always present, can be empty
$query = !empty($matches[6]) ? $matches[7] : null;
$fragment = !empty($matches[8]) ? $matches[9] : null;
// parse the URI
$parsed_uri = $this->parser->parse($uri);
if ($parsed_uri === false) return false;
list($scheme, $userinfo, $host, $port, $path, $query, $fragment) = $parsed_uri;
// retrieve the scheme object
$registry =& HTMLPurifier_URISchemeRegistry::instance();
$default_scheme = $config->get('URI', 'DefaultScheme');
if ($scheme !== null) {
@ -154,31 +131,25 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
$scheme_obj = $registry->getScheme($scheme, $config, $context);
if (!$scheme_obj) return false; // invalid scheme, clean it out
} else {
$scheme_obj = $registry->getScheme(
$default_scheme, $config, $context
);
// no scheme: retrieve the default one
$scheme_obj = $registry->getScheme($default_scheme, $config, $context);
if (!$scheme_obj) {
// something funky happened to the default scheme object
trigger_error(
'Default scheme object "' . $config->get('URI', 'DefaultScheme') . '" was not readable',
E_USER_WARNING
);
return false;
}
}
// something funky weird happened in the registry, abort!
if (!$scheme_obj) {
trigger_error(
'Default scheme object "' . $config->get('URI', 'DefaultScheme') . '" was not readable',
E_USER_WARNING
);
return false;
}
// the URI we're processing embeds_resource a resource in the page, but the URI
// it references cannot be located
if ($this->embeds_resource && !$scheme_obj->browsable) {
// the URI we're processing embeds a resource in the
// page, but the URI it references cannot be physically retrieved
return false;
}
if ($authority !== null) {
// ridiculously inefficient
// validate host
if ($host !== null) {
// remove URI if it's absolute and we disabled externals or
// if it's absolute and embedded and we disabled external resources
unset($our_host);
@ -192,29 +163,10 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
$our_host = $config->get('URI', 'Host');
if ($our_host === null) return false;
}
$HEXDIG = '[A-Fa-f0-9]';
$unreserved = 'A-Za-z0-9-._~'; // make sure you wrap with []
$sub_delims = '!$&\'()'; // needs []
$pct_encoded = "%$HEXDIG$HEXDIG";
$r_userinfo = "(?:[$unreserved$sub_delims:]|$pct_encoded)*";
$r_authority = "/^(($r_userinfo)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
$matches = array();
preg_match($r_authority, $authority, $matches);
// overloads regexp!
$userinfo = !empty($matches[1]) ? $matches[2] : null;
$host = !empty($matches[3]) ? $matches[3] : null;
$port = !empty($matches[4]) ? $matches[5] : null;
// validate port
if ($port !== null) {
$port = (int) $port;
if ($port < 1 || $port > 65535) $port = null;
}
$host = $this->host->validate($host, $config, $context);
if ($host === false) $host = null;
// check host against blacklist
if ($this->checkBlacklist($host, $config, $context)) return false;
// more lenient absolute checking
@ -227,11 +179,11 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
if ($host_parts[$i] != $our_host_parts[$i]) return false;
}
}
// userinfo and host are validated within the regexp
} else {
$port = $host = $userinfo = null;
}
// validate port
if ($port !== null) {
if ($port < 1 || $port > 65535) $port = null;
}

View File

@ -0,0 +1,58 @@
<?php
/**
 * Parses a URI into the components and fragment identifier as specified
 * by RFC 2396.
 *
 * The parser only splits the string apart; it performs no validation of the
 * component values (for example, the port range is not checked here).
 */
class HTMLPurifier_URIParser
{

    /**
     * Parses a URI
     * @param $uri string URI to parse
     * @return array(scheme, userinfo, host, int port, path, query, fragment)
     *         components, or false if the URI did not match at all.
     *         Components that are absent from the URI are null, except:
     *         path is always a string (possibly empty), and host is an
     *         empty string when an authority is present but names no host.
     *         port is null when no digits follow the colon.
     */
    function parse($uri) {
        $r_URI = '!'.
            '(([^:/?#<>\'"]+):)?'. // 2. Scheme
            '(//([^/?#<>\'"]*))?'. // 4. Authority
            '([^?#<>\'"]*)'.       // 5. Path
            '(\?([^#<>\'"]*))?'.   // 7. Query
            '(#([^<>\'"]*))?'.     // 9. Fragment
            '!';

        $matches = array();
        $result = preg_match($r_URI, $uri, $matches);

        if (!$result) return false; // *really* invalid URI

        // separate out parts
        // The outer groups (1, 3, 6, 8) include their delimiter (':', '//',
        // '?', '#'), so testing them with empty() reliably tells "present"
        // from "absent" even when the inner value is '' or '0'.
        $scheme     = !empty($matches[1]) ? $matches[2] : null;
        $authority  = !empty($matches[3]) ? $matches[4] : null;
        $path       = $matches[5]; // always present, can be empty
        $query      = !empty($matches[6]) ? $matches[7] : null;
        $fragment   = !empty($matches[8]) ? $matches[9] : null;

        // further parse authority
        if ($authority !== null) {
            // ridiculously inefficient: it's a stacked regex!
            $HEXDIG = '[A-Fa-f0-9]';
            $unreserved = 'A-Za-z0-9-._~'; // make sure you wrap with []
            $sub_delims = '!$&\'()'; // needs []
            $pct_encoded = "%$HEXDIG$HEXDIG";
            $r_userinfo = "(?:[$unreserved$sub_delims:]|$pct_encoded)*";
            $r_authority = "/^(($r_userinfo)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
            $matches = array();
            preg_match($r_authority, $authority, $matches);

            // group 1 carries the trailing '@', so empty() is safe here
            $userinfo = !empty($matches[1]) ? $matches[2] : null;

            // Group 3 is the bare host. empty() must NOT be used on it:
            // empty('0') is true, which would turn the host "0" into ''.
            $host = isset($matches[3]) ? $matches[3] : '';

            // Only produce a port when digits were actually supplied;
            // casting an empty digit string would yield a bogus port 0.
            $port = (isset($matches[5]) && $matches[5] !== '')
                ? (int) $matches[5]
                : null;
        } else {
            $port = $host = $userinfo = null;
        }

        return array($scheme, $userinfo, $host, $port, $path, $query, $fragment);
    }

}

View File

@ -10,14 +10,18 @@ if (php_sapi_name() != 'cli') {
exit;
}
echo 'Flushing cache... ';
echo "Flushing cache... \n";
require_once(dirname(__FILE__) . '/../library/HTMLPurifier.auto.php');
$config = HTMLPurifier_Config::createDefault();
$cache = new HTMLPurifier_DefinitionCache_Serializer('HTML');
$cache->flush($config);
$names = array('HTML', 'CSS', 'Test');
foreach ($names as $name) {
echo " - Flushing $name\n";
$cache = new HTMLPurifier_DefinitionCache_Serializer($name);
$cache->flush($config);
}
echo 'Cache flushed successfully.';

View File

@ -66,48 +66,6 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness
}
function testParsingRegular() {
$this->assertParsing(
'http://www.example.com/webhp?q=foo#result2',
null, 'www.example.com', null, '/webhp', 'q=foo'
);
}
function testParsingPortAndUsername() {
$this->assertParsing(
'http://user@authority.part:80/now/the/path?query#fragment',
'user', 'authority.part', 80, '/now/the/path', 'query'
);
}
function testParsingPercentEncoding() {
$this->assertParsing(
'http://en.wikipedia.org/wiki/Clich%C3%A9',
null, 'en.wikipedia.org', null, '/wiki/Clich%C3%A9', null
);
}
function testParsingEmptyQuery() {
$this->assertParsing(
'http://www.example.com/?#',
null, 'www.example.com', null, '/', ''
);
}
function testParsingEmptyPath() {
$this->assertParsing(
'http://www.example.com',
null, 'www.example.com', null, '', null
);
}
function testParsingOpaqueURI() {
$this->assertParsing(
'mailto:bob@example.com',
null, null, null, 'bob@example.com', null
);
}
function testParsingImproperPercentEncoding() {
// even though we don't resolve percent entities, we have to fix
// improper percent-encodes. Taken one at a time:
@ -125,38 +83,6 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness
);
}
function testParsingIPv4Address() {
$this->assertParsing(
'http://192.0.34.166/',
null, '192.0.34.166', null, '/', null
);
}
function testParsingFakeIPv4Address() {
$this->assertParsing(
'http://333.123.32.123/',
null, '333.123.32.123', null, '/', null
);
}
function testParsingIPv6Address() {
$this->assertParsing(
'http://[2001:db8::7]/c=GB?objectClass?one',
null, '[2001:db8::7]', null, '/c=GB', 'objectClass?one'
);
}
// We will not implement punycode encoding, that's up to the browsers
// We also will not implement percent to IDNA encoding transformations:
// if you need to use an international domain in a link, make sure that
// you've got it in UTF-8 and send it in raw (no encoding).
function testParsingInternationalizedDomainName() {
$this->assertParsing(
"http://t\xC5\xABdali\xC5\x86.lv",
null, "t\xC5\xABdali\xC5\x86.lv", null, '', null
);
}
function testParsingInvalidHostThatLooksLikeIPv6Address() {
$this->assertParsing(
'http://[2001:0db8:85z3:08d3:1319:8a2e:0370:7334]',
@ -164,13 +90,6 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness
);
}
function testParsingInvalidPort() {
$this->assertParsing(
'http://example.com:foobar',
null, 'example.com', null, '', null
);
}
function testParsingOverLargePort() {
$this->assertParsing(
'http://example.com:65536',
@ -178,49 +97,6 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness
);
}
function testParsingPathAbsolute() { // note this is different from path-rootless
$this->assertParsing(
'http:/this/is/path',
null, null, null, '/this/is/path', null
);
}
function testParsingPathRootless() {
// this should not be used but is allowed
$this->assertParsing(
'http:this/is/path',
null, null, null, 'this/is/path', null
);
}
function testParsingPathEmpty() {
$this->assertParsing(
'http:',
null, null, null, '', null
);
}
function testParsingRelativeURI() {
$this->assertParsing(
'/a/b',
null, null, null, '/a/b', null
);
}
function testParsingMalformedTag() {
$this->assertParsing(
'http://www.google.com/\'>"',
null, 'www.google.com', null, '/', null
);
}
function testParsingEmpty() {
$this->assertParsing(
'',
null, null, null, '', null
);
}
// OUTPUT RELATED TESTS
// scheme is mocked to ensure only the URI is being tested

View File

@ -0,0 +1,138 @@
<?php
require_once 'HTMLPurifier/URIParser.php';
class HTMLPurifier_URIParserTest extends HTMLPurifier_Harness
{

    /**
     * Parses $uri with a fresh HTMLPurifier_URIParser and asserts that the
     * result equals the seven expected components, in the order
     * scheme, userinfo, host, port, path, query, fragment.
     */
    function assertParsing(
        $uri, $scheme, $userinfo, $host, $port, $path, $query, $fragment, $config = null, $context = null
    ) {
        $expected = array($scheme, $userinfo, $host, $port, $path, $query, $fragment);
        $this->prepareCommon($config, $context);
        $uri_parser = new HTMLPurifier_URIParser();
        $actual = $uri_parser->parse($uri, $config, $context);
        $this->assertEqual($actual, $expected);
    }

    // Every component except userinfo and port is present.
    function testRegular() {
        $uri = 'http://www.example.com/webhp?q=foo#result2';
        $this->assertParsing($uri, 'http', null, 'www.example.com', null, '/webhp', 'q=foo', 'result2');
    }

    // The authority carries both a userinfo part and a numeric port.
    function testPortAndUsername() {
        $uri = 'http://user@authority.part:80/now/the/path?query#fragment';
        $this->assertParsing($uri, 'http', 'user', 'authority.part', 80, '/now/the/path', 'query', 'fragment');
    }

    // Percent-encoded octets in the path are passed through untouched.
    function testPercentEncoding() {
        $uri = 'http://en.wikipedia.org/wiki/Clich%C3%A9';
        $this->assertParsing($uri, 'http', null, 'en.wikipedia.org', null, '/wiki/Clich%C3%A9', null, null);
    }

    // A bare '?' followed by a bare '#'.
    function testEmptyQuery() {
        $uri = 'http://www.example.com/?#';
        $this->assertParsing($uri, 'http', null, 'www.example.com', null, '/', '', null);
    }

    // Authority with nothing after it: the path is an empty string.
    function testEmptyPath() {
        $uri = 'http://www.example.com';
        $this->assertParsing($uri, 'http', null, 'www.example.com', null, '', null, null);
    }

    // No '//' after the scheme, so everything lands in the path.
    function testOpaqueURI() {
        $uri = 'mailto:bob@example.com';
        $this->assertParsing($uri, 'mailto', null, null, null, 'bob@example.com', null, null);
    }

    function testIPv4Address() {
        $uri = 'http://192.0.34.166/';
        $this->assertParsing($uri, 'http', null, '192.0.34.166', null, '/', null, null);
    }

    // The parser does not validate octet ranges; the host comes back as-is.
    function testFakeIPv4Address() {
        $uri = 'http://333.123.32.123/';
        $this->assertParsing($uri, 'http', null, '333.123.32.123', null, '/', null, null);
    }

    // Bracketed host; the query itself contains a '?'.
    function testIPv6Address() {
        $uri = 'http://[2001:db8::7]/c=GB?objectClass?one';
        $this->assertParsing($uri, 'http', null, '[2001:db8::7]', null, '/c=GB', 'objectClass?one', null);
    }

    // Raw UTF-8 bytes in the host are returned unchanged.
    function testInternationalizedDomainName() {
        $uri = "http://t\xC5\xABdali\xC5\x86.lv";
        $this->assertParsing($uri, 'http', null, "t\xC5\xABdali\xC5\x86.lv", null, '', null, null);
    }

    // Non-numeric text after the colon does not produce a port.
    function testInvalidPort() {
        $uri = 'http://example.com:foobar';
        $this->assertParsing($uri, 'http', null, 'example.com', null, '', null, null);
    }

    // Single leading slash: no authority, absolute path.
    function testPathAbsolute() {
        $uri = 'http:/this/is/path';
        $this->assertParsing($uri, 'http', null, null, null, '/this/is/path', null, null);
    }

    function testPathRootless() {
        // this should not be used but is allowed
        $uri = 'http:this/is/path';
        $this->assertParsing($uri, 'http', null, null, null, 'this/is/path', null, null);
    }

    // Scheme only.
    function testPathEmpty() {
        $uri = 'http:';
        $this->assertParsing($uri, 'http', null, null, null, '', null, null);
    }

    // No scheme and no authority.
    function testRelativeURI() {
        $uri = '/a/b';
        $this->assertParsing($uri, null, null, null, null, '/a/b', null, null);
    }

    // Quote and angle-bracket characters end the parsed path.
    function testMalformedTag() {
        $uri = 'http://www.example.com/\'>"';
        $this->assertParsing($uri, 'http', null, 'www.example.com', null, '/', null, null);
    }

    // The empty string still parses, yielding only an empty path.
    function testEmpty() {
        $uri = '';
        $this->assertParsing($uri, null, null, null, null, '', null, null);
    }

}

View File

@ -102,6 +102,7 @@ $test_files[] = 'HTMLPurifier/Strategy/RemoveForeignElements_ErrorsTest.php';
$test_files[] = 'HTMLPurifier/Strategy/ValidateAttributesTest.php';
$test_files[] = 'HTMLPurifier/TagTransformTest.php';
$test_files[] = 'HTMLPurifier/TokenTest.php';
$test_files[] = 'HTMLPurifier/URIParserTest.php';
$test_files[] = 'HTMLPurifier/URISchemeRegistryTest.php';
$test_files[] = 'HTMLPurifier/URISchemeTest.php';
$test_files[] = 'HTMLPurifierTest.php';