0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-01-03 05:11:52 +00:00

[3.1.0] Revamp URI handling of percent encoding and validation.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1709 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2008-05-14 02:19:00 +00:00
parent 77ce3e8b4a
commit cb5d5d0648
13 changed files with 261 additions and 50 deletions

2
NEWS
View File

@ -32,6 +32,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
! Commas, not dashes, used for serializer IDs. This change is forwards-compatible ! Commas, not dashes, used for serializer IDs. This change is forwards-compatible
and allows for version numbers like "3.1.0-dev". and allows for version numbers like "3.1.0-dev".
! %HTML.Allowed deals gracefully with whitespace anywhere, anytime! ! %HTML.Allowed deals gracefully with whitespace anywhere, anytime!
! HTML Purifier's URI handling is a lot more robust, with much stricter
validation checks and better percent encoding handling.
- InterchangeBuilder now alphabetizes its lists - InterchangeBuilder now alphabetizes its lists
- Validation error in configdoc output fixed - Validation error in configdoc output fixed
- Iconv and other encoding errors muted even with custom error handlers that - Iconv and other encoding errors muted even with custom error handlers that

3
TODO
View File

@ -11,6 +11,8 @@ If no interest is expressed for a feature that may require a considerable
amount of effort to implement, it may get endlessly delayed. Do not be amount of effort to implement, it may get endlessly delayed. Do not be
afraid to cast your vote for the next feature to be implemented! afraid to cast your vote for the next feature to be implemented!
- Implement validation for query and for fragment
FUTURE VERSIONS FUTURE VERSIONS
--------------- ---------------
@ -47,6 +49,7 @@ FUTURE VERSIONS
AttrDef class). Probably will use CSSTidy class? AttrDef class). Probably will use CSSTidy class?
# More control over allowed CSS properties using a modularization # More control over allowed CSS properties using a modularization
# HTML 5 support # HTML 5 support
# IRI support
- Standardize token armor for all areas of processing - Standardize token armor for all areas of processing
- Convert RTL/LTR override characters to <bdo> tags, or vice versa on demand. - Convert RTL/LTR override characters to <bdo> tags, or vice versa on demand.
Also, enable disabling of directionality Also, enable disabling of directionality

View File

@ -215,12 +215,12 @@
</directive> </directive>
<directive id="URI.Disable"> <directive id="URI.Disable">
<file name="HTMLPurifier/AttrDef/URI.php"> <file name="HTMLPurifier/AttrDef/URI.php">
<line>24</line> <line>23</line>
</file> </file>
</directive> </directive>
<directive id="URI.Munge"> <directive id="URI.Munge">
<file name="HTMLPurifier/AttrDef/URI.php"> <file name="HTMLPurifier/AttrDef/URI.php">
<line>78</line> <line>68</line>
</file> </file>
</directive> </directive>
<directive id="Core.ColorKeywords"> <directive id="Core.ColorKeywords">

View File

@ -7,7 +7,7 @@
class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
{ {
protected $parser, $percentEncoder; protected $parser;
protected $embedsResource; protected $embedsResource;
/** /**
@ -15,7 +15,6 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
*/ */
public function __construct($embeds_resource = false) { public function __construct($embeds_resource = false) {
$this->parser = new HTMLPurifier_URIParser(); $this->parser = new HTMLPurifier_URIParser();
$this->percentEncoder = new HTMLPurifier_PercentEncoder();
$this->embedsResource = (bool) $embeds_resource; $this->embedsResource = (bool) $embeds_resource;
} }
@ -23,9 +22,7 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
if ($config->get('URI', 'Disable')) return false; if ($config->get('URI', 'Disable')) return false;
// initial operations
$uri = $this->parseCDATA($uri); $uri = $this->parseCDATA($uri);
$uri = $this->percentEncoder->normalize($uri);
// parse the URI // parse the URI
$uri = $this->parser->parse($uri); $uri = $this->parser->parse($uri);
@ -61,13 +58,6 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
$context->destroy('EmbeddedURI'); $context->destroy('EmbeddedURI');
if (!$ok) return false; if (!$ok) return false;
// munge scheme off if necessary (this must be last)
if (!is_null($uri->scheme) && is_null($uri->host)) {
if ($uri_def->defaultScheme == $uri->scheme) {
$uri->scheme = null;
}
}
// back to string // back to string
$result = $uri->toString(); $result = $uri->toString();

View File

@ -36,11 +36,23 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
$ipv4 = $this->ipv4->validate($string, $config, $context); $ipv4 = $this->ipv4->validate($string, $config, $context);
if ($ipv4 !== false) return $ipv4; if ($ipv4 !== false) return $ipv4;
// validate a domain name here, do filtering, etc etc etc // A regular domain name.
// We could use this, but it would break I18N domain names // This breaks I18N domain names, but we don't have proper IRI support,
//$match = preg_match('/^[a-z0-9][\w\-\.]*[a-z0-9]$/i', $string); // so force users to insert Punycode. If there's complaining we'll
//if (!$match) return false; // try to fix things into an international friendly form.
// The productions describing this are:
$a = '[a-z]'; // alpha
$an = '[a-z0-9]'; // alphanum
$and = '[a-z0-9-]'; // alphanum | "-"
// domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
$domainlabel = "$an($and*$an)?";
// toplabel = alpha | alpha *( alphanum | "-" ) alphanum
$toplabel = "$a($and*$an)?";
// hostname = *( domainlabel "." ) toplabel [ "." ]
$match = preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string);
if (!$match) return false;
return $string; return $string;
} }

View File

@ -2,12 +2,68 @@
/** /**
* Class that handles operations involving percent-encoding in URIs. * Class that handles operations involving percent-encoding in URIs.
*
* @warning
* Be careful when reusing instances of PercentEncoder. The object
* you use for normalize() SHOULD NOT be used for encode(), or
* vice-versa.
*/ */
class HTMLPurifier_PercentEncoder class HTMLPurifier_PercentEncoder
{ {
/** /**
* Fix up percent-encoding by decoding unreserved characters and normalizing * Reserved characters to preserve when using encode().
*/
protected $preserve = array();
/**
 * Builds the lookup table of byte values that are left unescaped.
 *
 * The table always holds the unreserved set (digits, ASCII letters,
 * "-", ".", "_" and "~"); every byte of $preserve is added on top.
 *
 * @param $preserve String of extra characters to preserve, or false
 *                  (the default) to preserve the unreserved set only.
 */
public function __construct($preserve = false) {
    // Unreserved characters as inclusive ranges of byte values; a
    // single character is a range whose two ends coincide.
    $unreserved = array(
        array(48, 57),   // digits
        array(65, 90),   // upper-case
        array(97, 122),  // lower-case
        array(45, 45),   // Dash -
        array(46, 46),   // Period .
        array(95, 95),   // Underscore _
        array(126, 126), // Tilde ~
    );
    foreach ($unreserved as $bounds) {
        for ($code = $bounds[0]; $code <= $bounds[1]; $code++) {
            $this->preserve[$code] = true;
        }
    }
    // Nothing extra requested: the unreserved set is all we keep.
    if ($preserve === false) return;
    // Extra characters the caller does not want escaped.
    $length = strlen($preserve);
    for ($pos = 0; $pos < $length; $pos++) {
        $this->preserve[ord($preserve[$pos])] = true;
    }
}
/**
 * Our replacement for urlencode: percent-encodes every byte of the
 * string except percent signs and the bytes registered in $preserve
 * (the unreserved set plus any extra characters given at construction).
 * @note
 *      Assumes that the string has already been normalized, making any
 *      and all percent escape sequences valid. Percents will not be
 *      re-escaped, regardless of their status in $preserve
 * @param $string String to be encoded
 * @return Encoded string.
 */
public function encode($string) {
    $encoded = '';
    $length = strlen($string);
    for ($pos = 0; $pos < $length; $pos++) {
        $char = $string[$pos];
        // Percent signs and preserved bytes are copied through verbatim.
        if ($char === '%' || isset($this->preserve[ord($char)])) {
            $encoded .= $char;
            continue;
        }
        // Everything else becomes an upper-case, two-digit hex escape.
        $encoded .= sprintf('%%%02X', ord($char));
    }
    return $encoded;
}
/**
* Fix up percent-encoding by decoding unreserved characters and normalizing.
* @warning This function is affected by $preserve, even though the
* usual desired behavior is for this not to preserve those
* characters. Be careful when reusing instances of PercentEncoder!
* @param $string String to normalize * @param $string String to normalize
*/ */
public function normalize($string) { public function normalize($string) {
@ -27,12 +83,7 @@ class HTMLPurifier_PercentEncoder
continue; continue;
} }
$int = hexdec($encoding); $int = hexdec($encoding);
if ( if (isset($this->preserve[$int])) {
($int >= 48 && $int <= 57) || // digits
($int >= 65 && $int <= 90) || // uppercase letters
($int >= 97 && $int <= 122) || // lowercase letters
$int == 126 || $int == 45 || $int == 46 || $int == 95 // ~-._
) {
$ret .= chr($int) . $text; $ret .= chr($int) . $text;
continue; continue;
} }

View File

@ -1,7 +1,12 @@
<?php <?php
/** /**
* HTML Purifier's internal representation of a URI * HTML Purifier's internal representation of a URI.
* @note
* Internal data-structures are completely escaped. If the data needs
* to be used in a non-URI context (which is very unlikely), be sure
* to decode it first. The URI may not necessarily be well-formed until
* validate() is called.
*/ */
class HTMLPurifier_URI class HTMLPurifier_URI
{ {
@ -49,13 +54,27 @@ class HTMLPurifier_URI
} }
/** /**
* Generic validation method applicable for all schemes * Generic validation method applicable for all schemes. May modify
* this URI in order to get it into a compliant form.
* @param $config Instance of HTMLPurifier_Config * @param $config Instance of HTMLPurifier_Config
* @param $context Instance of HTMLPurifier_Context * @param $context Instance of HTMLPurifier_Context
* @return True if validation/filtering succeeds, false if failure * @return True if validation/filtering succeeds, false if failure
*/ */
public function validate($config, $context) { public function validate($config, $context) {
// ABNF definitions from RFC 3986
$chars_sub_delims = '!$&\'()*+,;=';
$chars_gen_delims = ':/?#[]@';
$chars_pchar = $chars_sub_delims . ':@';
// validate scheme (MUST BE FIRST!)
if (!is_null($this->scheme) && is_null($this->host)) {
$def = $config->getDefinition('URI');
if ($def->defaultScheme === $this->scheme) {
$this->scheme = null;
}
}
// validate host // validate host
if (!is_null($this->host)) { if (!is_null($this->host)) {
$host_def = new HTMLPurifier_AttrDef_URI_Host(); $host_def = new HTMLPurifier_AttrDef_URI_Host();
@ -63,18 +82,51 @@ class HTMLPurifier_URI
if ($this->host === false) $this->host = null; if ($this->host === false) $this->host = null;
} }
// validate userinfo
if (!is_null($this->userinfo)) {
$encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':');
$this->userinfo = $encoder->encode($this->userinfo);
}
// validate port // validate port
if (!is_null($this->port)) { if (!is_null($this->port)) {
if ($this->port < 1 || $this->port > 65535) $this->port = null; if ($this->port < 1 || $this->port > 65535) $this->port = null;
} }
// query and fragment are quite simple in terms of definition: // validate path
// *( pchar / "/" / "?" ), so define their validation routines $path_parts = array();
// when we start fixing percent encoding $segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/');
if (!is_null($this->host)) {
// path gets to be validated against a hodge-podge of rules depending // path-abempty (hier and relative)
// on the status of authority and scheme, but it's not that important, $this->path = $segments_encoder->encode($this->path);
// esp. since it won't be applicable to everyone } elseif ($this->path !== '' && $this->path[0] === '/') {
// path-absolute (hier and relative)
if (strlen($this->path) >= 2 && $this->path[1] === '/') {
// This shouldn't ever happen!
$this->path = '';
} else {
$this->path = $segments_encoder->encode($this->path);
}
} elseif (!is_null($this->scheme) && $this->path !== '') {
// path-rootless (hier)
// Short circuit evaluation means we don't need to check nz
$this->path = $segments_encoder->encode($this->path);
} elseif (is_null($this->scheme) && $this->path !== '') {
// path-noscheme (relative)
// (once again, not checking nz)
$segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
$c = strpos($this->path, '/');
if ($c !== false) {
$this->path =
$segment_nc_encoder->encode(substr($this->path, 0, $c)) .
$segments_encoder->encode(substr($this->path, $c));
} else {
$this->path = $segment_nc_encoder->encode($this->path);
}
} else {
// path-empty (hier and relative)
$this->path = ''; // just to be safe
}
return true; return true;

View File

@ -2,24 +2,39 @@
/** /**
* Parses a URI into the components and fragment identifier as specified * Parses a URI into the components and fragment identifier as specified
* by RFC 2396. * by RFC 3986.
* @todo Replace regexps with a native PHP parser
*/ */
class HTMLPurifier_URIParser class HTMLPurifier_URIParser
{ {
/** /**
* Parses a URI * Instance of HTMLPurifier_PercentEncoder to do normalization with.
*/
protected $percentEncoder;
/**
 * Creates the percent encoder that parse() uses to normalize its input.
 */
public function __construct() {
    $encoder = new HTMLPurifier_PercentEncoder();
    $this->percentEncoder = $encoder;
}
/**
* Parses a URI.
* @param $uri string URI to parse * @param $uri string URI to parse
* @return HTMLPurifier_URI representation of URI * @return HTMLPurifier_URI representation of URI. This representation has
* not been validated yet and may not conform to RFC.
*/ */
public function parse($uri) { public function parse($uri) {
$uri = $this->percentEncoder->normalize($uri);
// Regexp is as per Appendix B.
// Note that ["<>] are an addition to the RFC's recommended
// characters, because they represent external delimiters.
$r_URI = '!'. $r_URI = '!'.
'(([^:/?#<>\'"]+):)?'. // 2. Scheme '(([^:/?#"<>]+):)?'. // 2. Scheme
'(//([^/?#<>\'"]*))?'. // 4. Authority '(//([^/?#"<>]*))?'. // 4. Authority
'([^?#<>\'"]*)'. // 5. Path '([^?#"<>]*)'. // 5. Path
'(\?([^#<>\'"]*))?'. // 7. Query '(\?([^#"<>]*))?'. // 7. Query
'(#([^<>\'"]*))?'. // 8. Fragment '(#([^"<>]*))?'. // 8. Fragment
'!'; '!';
$matches = array(); $matches = array();
@ -36,13 +51,7 @@ class HTMLPurifier_URIParser
// further parse authority // further parse authority
if ($authority !== null) { if ($authority !== null) {
// ridiculously inefficient: it's a stacked regex! $r_authority = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
$HEXDIG = '[A-Fa-f0-9]';
$unreserved = 'A-Za-z0-9-._~'; // make sure you wrap with []
$sub_delims = '!$&\'()'; // needs []
$pct_encoded = "%$HEXDIG$HEXDIG";
$r_userinfo = "(?:[$unreserved$sub_delims:]|$pct_encoded)*";
$r_authority = "/^(($r_userinfo)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
$matches = array(); $matches = array();
preg_match($r_authority, $authority, $matches); preg_match($r_authority, $authority, $matches);
$userinfo = !empty($matches[1]) ? $matches[2] : null; $userinfo = !empty($matches[1]) ? $matches[2] : null;

View File

@ -14,6 +14,27 @@ class HTMLPurifier_AttrDef_URI_HostTest extends HTMLPurifier_AttrDefHarness
$this->assertDef('124.15.6.89'); // IPv4 $this->assertDef('124.15.6.89'); // IPv4
$this->assertDef('www.google.com'); // reg-name $this->assertDef('www.google.com'); // reg-name
// more domain name tests
$this->assertDef('test.');
$this->assertDef('sub.test.');
$this->assertDef('.test', false);
$this->assertDef('ff');
$this->assertDef('1f', false);
$this->assertDef('-f', false);
$this->assertDef('f1');
$this->assertDef('f-', false);
$this->assertDef('sub.ff');
$this->assertDef('sub.1f', false);
$this->assertDef('sub.-f', false);
$this->assertDef('sub.f1');
$this->assertDef('sub.f-', false);
$this->assertDef('ff.top');
$this->assertDef('1f.top');
$this->assertDef('-f.top', false);
$this->assertDef('ff.top');
$this->assertDef('f1.top');
$this->assertDef('f-.top', false);
} }
} }

View File

@ -29,6 +29,19 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness
); );
} }
function testPercentEncoding() {
    // The expected output has no scheme, and the colon inside the first
    // path segment comes back percent-encoded.
    $input    = 'http:colon:mercenary';
    $expected = 'colon%3Amercenary';
    $this->assertDef($input, $expected);
}
function testPercentEncodingPreserve() {
    // Unreserved characters and the sub-delims ! * ( ) ' in a path.
    // No expected value is passed; presumably the harness then requires
    // the output to equal the input -- confirm against assertDef().
    $uri = 'http://www.example.com/abcABC123-_.!~*()\'';
    $this->assertDef($uri);
}
function testEmbeds() { function testEmbeds() {
$this->def = new HTMLPurifier_AttrDef_URI(true); $this->def = new HTMLPurifier_AttrDef_URI(true);
$this->assertDef('http://sub.example.com/alas?foo=asd'); $this->assertDef('http://sub.example.com/alas?foo=asd');

View File

@ -35,5 +35,28 @@ class HTMLPurifier_PercentEncoderTest extends HTMLPurifier_Harness
} }
/**
 * Asserts that a fresh encoder turns $string into $expect.
 * @param $string   String handed to encode()
 * @param $expect   Expected result; true means "identical to $string"
 * @param $preserve Extra characters to preserve, passed to the constructor
 */
function assertEncode($string, $expect = true, $preserve = false) {
    $encoder  = new HTMLPurifier_PercentEncoder($preserve);
    $expected = ($expect === true) ? $string : $expect;
    $this->assertIdentical($encoder->encode($string), $expected);
}
function test_encode_noChange() {
    // Letters, digits and - _ ~ . must pass through untouched.
    $unreserved = 'abc012-_~.';
    $this->assertEncode($unreserved);
}
function test_encode_encode() {
    // A character outside the preserved set is escaped.
    $input    = '>';
    $expected = '%3E';
    $this->assertEncode($input, $expected);
}
function test_encode_preserve() {
    // '<' is explicitly preserved, so only '>' gets escaped.
    $input     = '<>';
    $expected  = '<%3E';
    $preserved = '<';
    $this->assertEncode($input, $expected, $preserved);
}
function test_encode_low() {
    // Byte value 1 must be zero-padded to two hex digits.
    $input    = "\1";
    $expected = '%01';
    $this->assertEncode($input, $expected);
}
} }

View File

@ -13,6 +13,13 @@ class HTMLPurifier_URIParserTest extends HTMLPurifier_Harness
$this->assertEqual($result, $expect); $this->assertEqual($result, $expect);
} }
function testPercentNormalization() {
    // '%G' is not a valid escape sequence, so the stray percent sign is
    // expected back as %25 (fifth slot after the URI, presumably the
    // path -- confirm against assertParsing()).
    $input = '%G';
    $path  = '%25G';
    $this->assertParsing($input, null, null, null, null, $path, null, null);
}
function testRegular() { function testRegular() {
$this->assertParsing( $this->assertParsing(
'http://www.example.com/webhp?q=foo#result2', 'http://www.example.com/webhp?q=foo#result2',
@ -121,7 +128,7 @@ class HTMLPurifier_URIParserTest extends HTMLPurifier_Harness
function testMalformedTag() { function testMalformedTag() {
$this->assertParsing( $this->assertParsing(
'http://www.example.com/\'>"', 'http://www.example.com/>',
'http', null, 'www.example.com', null, '/', null, null 'http', null, 'www.example.com', null, '/', null, null
); );
} }

View File

@ -160,4 +160,32 @@ class HTMLPurifier_URITest extends HTMLPurifier_URIHarness
$this->assertValidation('http://[2001:0db8:85z3:08d3:1319:8a2e:0370:7334]', 'http:'); $this->assertValidation('http://[2001:0db8:85z3:08d3:1319:8a2e:0370:7334]', 'http:');
} }
function test_validate_removeRedundantScheme() {
    // With the scheme gone, a colon in the first segment is escaped;
    // the colon after the slash is left alone.
    $input    = 'http:foo:/:';
    $expected = 'foo%3A/:';
    $this->assertValidation($input, $expected);
}
function test_validate_username() {
    // Non-ASCII bytes in userinfo are escaped; the colon is kept.
    $input    = "http://user\xE3\x91\x94:@foo.com";
    $expected = 'http://user%E3%91%94:@foo.com';
    $this->assertValidation($input, $expected);
}
function test_validate_path_abempty() {
    // Path following an authority: bytes escaped, colon kept.
    $input    = "http://host/\xE3\x91\x94:";
    $expected = 'http://host/%E3%91%94:';
    $this->assertValidation($input, $expected);
}
function test_validate_path_absolute() {
    // Path starting with a single slash and no authority.
    $input    = "/\xE3\x91\x94:";
    $expected = '/%E3%91%94:';
    $this->assertValidation($input, $expected);
}
function test_validate_path_rootless() {
    // Scheme present, no authority, path without a leading slash.
    $input    = "mailto:\xE3\x91\x94:";
    $expected = 'mailto:%E3%91%94:';
    $this->assertValidation($input, $expected);
}
function test_validate_path_noscheme() {
    // Relative reference with neither scheme nor leading slash.
    $input    = "\xE3\x91\x94";
    $expected = '%E3%91%94';
    $this->assertValidation($input, $expected);
}
function test_validate_path_empty() {
    // URI with an authority and an empty path.
    $uri = 'http://google.com';
    $this->assertValidation($uri);
}
} }