mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2024-12-22 08:21:52 +00:00
[3.1.0] Revamp URI handling of percent encoding and validation.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1709 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
parent
77ce3e8b4a
commit
cb5d5d0648
2
NEWS
2
NEWS
@ -32,6 +32,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
||||
! Commas, not dashes, used for serializer IDs. This change is forwards-compatible
|
||||
and allows for version numbers like "3.1.0-dev".
|
||||
! %HTML.Allowed deals gracefully with whitespace anywhere, anytime!
|
||||
! HTML Purifier's URI handling is a lot more robust, with much stricter
|
||||
validation checks and better percent encoding handling.
|
||||
- InterchangeBuilder now alphabetizes its lists
|
||||
- Validation error in configdoc output fixed
|
||||
- Iconv and other encoding errors muted even with custom error handlers that
|
||||
|
3
TODO
3
TODO
@ -11,6 +11,8 @@ If no interest is expressed for a feature that may require a considerable
|
||||
amount of effort to implement, it may get endlessly delayed. Do not be
|
||||
afraid to cast your vote for the next feature to be implemented!
|
||||
|
||||
- Implement validation for query and for fragment
|
||||
|
||||
FUTURE VERSIONS
|
||||
---------------
|
||||
|
||||
@ -47,6 +49,7 @@ FUTURE VERSIONS
|
||||
AttrDef class). Probably will use CSSTidy class?
|
||||
# More control over allowed CSS properties using a modularization
|
||||
# HTML 5 support
|
||||
# IRI support
|
||||
- Standardize token armor for all areas of processing
|
||||
- Convert RTL/LTR override characters to <bdo> tags, or vice versa on demand.
|
||||
Also, enable disabling of directionality
|
||||
|
@ -215,12 +215,12 @@
|
||||
</directive>
|
||||
<directive id="URI.Disable">
|
||||
<file name="HTMLPurifier/AttrDef/URI.php">
|
||||
<line>24</line>
|
||||
<line>23</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="URI.Munge">
|
||||
<file name="HTMLPurifier/AttrDef/URI.php">
|
||||
<line>78</line>
|
||||
<line>68</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Core.ColorKeywords">
|
||||
|
@ -7,7 +7,7 @@
|
||||
class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
protected $parser, $percentEncoder;
|
||||
protected $parser;
|
||||
protected $embedsResource;
|
||||
|
||||
/**
|
||||
@ -15,7 +15,6 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
|
||||
*/
|
||||
public function __construct($embeds_resource = false) {
|
||||
$this->parser = new HTMLPurifier_URIParser();
|
||||
$this->percentEncoder = new HTMLPurifier_PercentEncoder();
|
||||
$this->embedsResource = (bool) $embeds_resource;
|
||||
}
|
||||
|
||||
@ -23,9 +22,7 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
|
||||
|
||||
if ($config->get('URI', 'Disable')) return false;
|
||||
|
||||
// initial operations
|
||||
$uri = $this->parseCDATA($uri);
|
||||
$uri = $this->percentEncoder->normalize($uri);
|
||||
|
||||
// parse the URI
|
||||
$uri = $this->parser->parse($uri);
|
||||
@ -61,13 +58,6 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
|
||||
$context->destroy('EmbeddedURI');
|
||||
if (!$ok) return false;
|
||||
|
||||
// munge scheme off if necessary (this must be last)
|
||||
if (!is_null($uri->scheme) && is_null($uri->host)) {
|
||||
if ($uri_def->defaultScheme == $uri->scheme) {
|
||||
$uri->scheme = null;
|
||||
}
|
||||
}
|
||||
|
||||
// back to string
|
||||
$result = $uri->toString();
|
||||
|
||||
|
@ -36,11 +36,23 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
|
||||
$ipv4 = $this->ipv4->validate($string, $config, $context);
|
||||
if ($ipv4 !== false) return $ipv4;
|
||||
|
||||
// validate a domain name here, do filtering, etc etc etc
|
||||
// A regular domain name.
|
||||
|
||||
// We could use this, but it would break I18N domain names
|
||||
//$match = preg_match('/^[a-z0-9][\w\-\.]*[a-z0-9]$/i', $string);
|
||||
//if (!$match) return false;
|
||||
// This breaks I18N domain names, but we don't have proper IRI support,
|
||||
// so force users to insert Punycode. If there's complaining we'll
|
||||
// try to fix things into an international friendly form.
|
||||
|
||||
// The productions describing this are:
|
||||
$a = '[a-z]'; // alpha
|
||||
$an = '[a-z0-9]'; // alphanum
|
||||
$and = '[a-z0-9-]'; // alphanum | "-"
|
||||
// domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
|
||||
$domainlabel = "$an($and*$an)?";
|
||||
// toplabel = alpha | alpha *( alphanum | "-" ) alphanum
|
||||
$toplabel = "$a($and*$an)?";
|
||||
// hostname = *( domainlabel "." ) toplabel [ "." ]
|
||||
$match = preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string);
|
||||
if (!$match) return false;
|
||||
|
||||
return $string;
|
||||
}
|
||||
|
@ -2,12 +2,68 @@
|
||||
|
||||
/**
|
||||
* Class that handles operations involving percent-encoding in URIs.
|
||||
*
|
||||
* @warning
|
||||
* Be careful when reusing instances of PercentEncoder. The object
|
||||
* you use for normalize() SHOULD NOT be used for encode(), or
|
||||
* vice-versa.
|
||||
*/
|
||||
class HTMLPurifier_PercentEncoder
|
||||
{
|
||||
|
||||
/**
|
||||
* Fix up percent-encoding by decoding unreserved characters and normalizing
|
||||
* Reserved characters to preserve when using encode().
|
||||
*/
|
||||
protected $preserve = array();
|
||||
|
||||
/**
|
||||
* String of characters that should be preserved while using encode().
|
||||
*/
|
||||
public function __construct($preserve = false) {
|
||||
// unreserved letters, ought to const-ify
|
||||
for ($i = 48; $i <= 57; $i++) $this->preserve[$i] = true; // digits
|
||||
for ($i = 65; $i <= 90; $i++) $this->preserve[$i] = true; // upper-case
|
||||
for ($i = 97; $i <= 122; $i++) $this->preserve[$i] = true; // lower-case
|
||||
$this->preserve[45] = true; // Dash -
|
||||
$this->preserve[46] = true; // Period .
|
||||
$this->preserve[95] = true; // Underscore _
|
||||
$this->preserve[126]= true; // Tilde ~
|
||||
|
||||
// extra letters not to escape
|
||||
if ($preserve !== false) {
|
||||
for ($i = 0, $c = strlen($preserve); $i < $c; $i++) {
|
||||
$this->preserve[ord($preserve[$i])] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Our replacement for urlencode, it encodes all non-reserved characters,
|
||||
* as well as any extra characters that were instructed to be preserved.
|
||||
* @note
|
||||
* Assumes that the string has already been normalized, making any
|
||||
* and all percent escape sequences valid. Percents will not be
|
||||
* re-escaped, regardless of their status in $preserve
|
||||
* @param $string String to be encoded
|
||||
* @return Encoded string.
|
||||
*/
|
||||
public function encode($string) {
|
||||
$ret = '';
|
||||
for ($i = 0, $c = strlen($string); $i < $c; $i++) {
|
||||
if ($string[$i] !== '%' && !isset($this->preserve[$int = ord($string[$i])]) ) {
|
||||
$ret .= '%' . sprintf('%02X', $int);
|
||||
} else {
|
||||
$ret .= $string[$i];
|
||||
}
|
||||
}
|
||||
return $ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fix up percent-encoding by decoding unreserved characters and normalizing.
|
||||
* @warning This function is affected by $preserve, even though the
|
||||
* usual desired behavior is for this not to preserve those
|
||||
* characters. Be careful when reusing instances of PercentEncoder!
|
||||
* @param $string String to normalize
|
||||
*/
|
||||
public function normalize($string) {
|
||||
@ -27,12 +83,7 @@ class HTMLPurifier_PercentEncoder
|
||||
continue;
|
||||
}
|
||||
$int = hexdec($encoding);
|
||||
if (
|
||||
($int >= 48 && $int <= 57) || // digits
|
||||
($int >= 65 && $int <= 90) || // uppercase letters
|
||||
($int >= 97 && $int <= 122) || // lowercase letters
|
||||
$int == 126 || $int == 45 || $int == 46 || $int == 95 // ~-._
|
||||
) {
|
||||
if (isset($this->preserve[$int])) {
|
||||
$ret .= chr($int) . $text;
|
||||
continue;
|
||||
}
|
||||
|
@ -1,7 +1,12 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* HTML Purifier's internal representation of a URI
|
||||
* HTML Purifier's internal representation of a URI.
|
||||
* @note
|
||||
* Internal data-structures are completely escaped. If the data needs
|
||||
* to be used in a non-URI context (which is very unlikely), be sure
|
||||
* to decode it first. The URI may not necessarily be well-formed until
|
||||
* validate() is called.
|
||||
*/
|
||||
class HTMLPurifier_URI
|
||||
{
|
||||
@ -49,13 +54,27 @@ class HTMLPurifier_URI
|
||||
}
|
||||
|
||||
/**
|
||||
* Generic validation method applicable for all schemes
|
||||
* Generic validation method applicable for all schemes. May modify
|
||||
* this URI in order to get it into a compliant form.
|
||||
* @param $config Instance of HTMLPurifier_Config
|
||||
* @param $context Instance of HTMLPurifier_Context
|
||||
* @return True if validation/filtering succeeds, false if failure
|
||||
*/
|
||||
public function validate($config, $context) {
|
||||
|
||||
// ABNF definitions from RFC 3986
|
||||
$chars_sub_delims = '!$&\'()*+,;=';
|
||||
$chars_gen_delims = ':/?#[]@';
|
||||
$chars_pchar = $chars_sub_delims . ':@';
|
||||
|
||||
// validate scheme (MUST BE FIRST!)
|
||||
if (!is_null($this->scheme) && is_null($this->host)) {
|
||||
$def = $config->getDefinition('URI');
|
||||
if ($def->defaultScheme === $this->scheme) {
|
||||
$this->scheme = null;
|
||||
}
|
||||
}
|
||||
|
||||
// validate host
|
||||
if (!is_null($this->host)) {
|
||||
$host_def = new HTMLPurifier_AttrDef_URI_Host();
|
||||
@ -63,18 +82,51 @@ class HTMLPurifier_URI
|
||||
if ($this->host === false) $this->host = null;
|
||||
}
|
||||
|
||||
// validate username
|
||||
if (!is_null($this->userinfo)) {
|
||||
$encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':');
|
||||
$this->userinfo = $encoder->encode($this->userinfo);
|
||||
}
|
||||
|
||||
// validate port
|
||||
if (!is_null($this->port)) {
|
||||
if ($this->port < 1 || $this->port > 65535) $this->port = null;
|
||||
}
|
||||
|
||||
// query and fragment are quite simple in terms of definition:
|
||||
// *( pchar / "/" / "?" ), so define their validation routines
|
||||
// when we start fixing percent encoding
|
||||
|
||||
// path gets to be validated against a hodge-podge of rules depending
|
||||
// on the status of authority and scheme, but it's not that important,
|
||||
// esp. since it won't be applicable to everyone
|
||||
// validate path
|
||||
$path_parts = array();
|
||||
$segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/');
|
||||
if (!is_null($this->host)) {
|
||||
// path-abempty (hier and relative)
|
||||
$this->path = $segments_encoder->encode($this->path);
|
||||
} elseif ($this->path !== '' && $this->path[0] === '/') {
|
||||
// path-absolute (hier and relative)
|
||||
if (strlen($this->path) >= 2 && $this->path[1] === '/') {
|
||||
// This shouldn't ever happen!
|
||||
$this->path = '';
|
||||
} else {
|
||||
$this->path = $segments_encoder->encode($this->path);
|
||||
}
|
||||
} elseif (!is_null($this->scheme) && $this->path !== '') {
|
||||
// path-rootless (hier)
|
||||
// Short circuit evaluation means we don't need to check nz
|
||||
$this->path = $segments_encoder->encode($this->path);
|
||||
} elseif (is_null($this->scheme) && $this->path !== '') {
|
||||
// path-noscheme (relative)
|
||||
// (once again, not checking nz)
|
||||
$segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
|
||||
$c = strpos($this->path, '/');
|
||||
if ($c !== false) {
|
||||
$this->path =
|
||||
$segment_nc_encoder->encode(substr($this->path, 0, $c)) .
|
||||
$segments_encoder->encode(substr($this->path, $c));
|
||||
} else {
|
||||
$this->path = $segment_nc_encoder->encode($this->path);
|
||||
}
|
||||
} else {
|
||||
// path-empty (hier and relative)
|
||||
$this->path = ''; // just to be safe
|
||||
}
|
||||
|
||||
return true;
|
||||
|
||||
|
@ -2,24 +2,39 @@
|
||||
|
||||
/**
|
||||
* Parses a URI into the components and fragment identifier as specified
|
||||
* by RFC 2396.
|
||||
* @todo Replace regexps with a native PHP parser
|
||||
* by RFC 3986.
|
||||
*/
|
||||
class HTMLPurifier_URIParser
|
||||
{
|
||||
|
||||
/**
|
||||
* Parses a URI
|
||||
* Instance of HTMLPurifier_PercentEncoder to do normalization with.
|
||||
*/
|
||||
protected $percentEncoder;
|
||||
|
||||
public function __construct() {
|
||||
$this->percentEncoder = new HTMLPurifier_PercentEncoder();
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses a URI.
|
||||
* @param $uri string URI to parse
|
||||
* @return HTMLPurifier_URI representation of URI
|
||||
* @return HTMLPurifier_URI representation of URI. This representation has
|
||||
* not been validated yet and may not conform to RFC.
|
||||
*/
|
||||
public function parse($uri) {
|
||||
|
||||
$uri = $this->percentEncoder->normalize($uri);
|
||||
|
||||
// Regexp is as per Appendix B.
|
||||
// Note that ["<>] are an addition to the RFC's recommended
|
||||
// characters, because they represent external delimeters.
|
||||
$r_URI = '!'.
|
||||
'(([^:/?#<>\'"]+):)?'. // 2. Scheme
|
||||
'(//([^/?#<>\'"]*))?'. // 4. Authority
|
||||
'([^?#<>\'"]*)'. // 5. Path
|
||||
'(\?([^#<>\'"]*))?'. // 7. Query
|
||||
'(#([^<>\'"]*))?'. // 8. Fragment
|
||||
'(([^:/?#"<>]+):)?'. // 2. Scheme
|
||||
'(//([^/?#"<>]*))?'. // 4. Authority
|
||||
'([^?#"<>]*)'. // 5. Path
|
||||
'(\?([^#"<>]*))?'. // 7. Query
|
||||
'(#([^"<>]*))?'. // 8. Fragment
|
||||
'!';
|
||||
|
||||
$matches = array();
|
||||
@ -36,13 +51,7 @@ class HTMLPurifier_URIParser
|
||||
|
||||
// further parse authority
|
||||
if ($authority !== null) {
|
||||
// ridiculously inefficient: it's a stacked regex!
|
||||
$HEXDIG = '[A-Fa-f0-9]';
|
||||
$unreserved = 'A-Za-z0-9-._~'; // make sure you wrap with []
|
||||
$sub_delims = '!$&\'()'; // needs []
|
||||
$pct_encoded = "%$HEXDIG$HEXDIG";
|
||||
$r_userinfo = "(?:[$unreserved$sub_delims:]|$pct_encoded)*";
|
||||
$r_authority = "/^(($r_userinfo)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
|
||||
$r_authority = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
|
||||
$matches = array();
|
||||
preg_match($r_authority, $authority, $matches);
|
||||
$userinfo = !empty($matches[1]) ? $matches[2] : null;
|
||||
|
@ -14,6 +14,27 @@ class HTMLPurifier_AttrDef_URI_HostTest extends HTMLPurifier_AttrDefHarness
|
||||
$this->assertDef('124.15.6.89'); // IPv4
|
||||
$this->assertDef('www.google.com'); // reg-name
|
||||
|
||||
// more domain name tests
|
||||
$this->assertDef('test.');
|
||||
$this->assertDef('sub.test.');
|
||||
$this->assertDef('.test', false);
|
||||
$this->assertDef('ff');
|
||||
$this->assertDef('1f', false);
|
||||
$this->assertDef('-f', false);
|
||||
$this->assertDef('f1');
|
||||
$this->assertDef('f-', false);
|
||||
$this->assertDef('sub.ff');
|
||||
$this->assertDef('sub.1f', false);
|
||||
$this->assertDef('sub.-f', false);
|
||||
$this->assertDef('sub.f1');
|
||||
$this->assertDef('sub.f-', false);
|
||||
$this->assertDef('ff.top');
|
||||
$this->assertDef('1f.top');
|
||||
$this->assertDef('-f.top', false);
|
||||
$this->assertDef('ff.top');
|
||||
$this->assertDef('f1.top');
|
||||
$this->assertDef('f-.top', false);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -29,6 +29,19 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness
|
||||
);
|
||||
}
|
||||
|
||||
function testPercentEncoding() {
|
||||
$this->assertDef(
|
||||
'http:colon:mercenary',
|
||||
'colon%3Amercenary'
|
||||
);
|
||||
}
|
||||
|
||||
function testPercentEncodingPreserve() {
|
||||
$this->assertDef(
|
||||
'http://www.example.com/abcABC123-_.!~*()\''
|
||||
);
|
||||
}
|
||||
|
||||
function testEmbeds() {
|
||||
$this->def = new HTMLPurifier_AttrDef_URI(true);
|
||||
$this->assertDef('http://sub.example.com/alas?foo=asd');
|
||||
|
@ -35,5 +35,28 @@ class HTMLPurifier_PercentEncoderTest extends HTMLPurifier_Harness
|
||||
|
||||
}
|
||||
|
||||
function assertEncode($string, $expect = true, $preserve = false) {
|
||||
if ($expect === true) $expect = $string;
|
||||
$encoder = new HTMLPurifier_PercentEncoder($preserve);
|
||||
$result = $encoder->encode($string);
|
||||
$this->assertIdentical($result, $expect);
|
||||
}
|
||||
|
||||
function test_encode_noChange() {
|
||||
$this->assertEncode('abc012-_~.');
|
||||
}
|
||||
|
||||
function test_encode_encode() {
|
||||
$this->assertEncode('>', '%3E');
|
||||
}
|
||||
|
||||
function test_encode_preserve() {
|
||||
$this->assertEncode('<>', '<%3E', '<');
|
||||
}
|
||||
|
||||
function test_encode_low() {
|
||||
$this->assertEncode("\1", '%01');
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -13,6 +13,13 @@ class HTMLPurifier_URIParserTest extends HTMLPurifier_Harness
|
||||
$this->assertEqual($result, $expect);
|
||||
}
|
||||
|
||||
function testPercentNormalization() {
|
||||
$this->assertParsing(
|
||||
'%G',
|
||||
null, null, null, null, '%25G', null, null
|
||||
);
|
||||
}
|
||||
|
||||
function testRegular() {
|
||||
$this->assertParsing(
|
||||
'http://www.example.com/webhp?q=foo#result2',
|
||||
@ -121,7 +128,7 @@ class HTMLPurifier_URIParserTest extends HTMLPurifier_Harness
|
||||
|
||||
function testMalformedTag() {
|
||||
$this->assertParsing(
|
||||
'http://www.example.com/\'>"',
|
||||
'http://www.example.com/>',
|
||||
'http', null, 'www.example.com', null, '/', null, null
|
||||
);
|
||||
}
|
||||
|
@ -160,4 +160,32 @@ class HTMLPurifier_URITest extends HTMLPurifier_URIHarness
|
||||
$this->assertValidation('http://[2001:0db8:85z3:08d3:1319:8a2e:0370:7334]', 'http:');
|
||||
}
|
||||
|
||||
function test_validate_removeRedundantScheme() {
|
||||
$this->assertValidation('http:foo:/:', 'foo%3A/:');
|
||||
}
|
||||
|
||||
function test_validate_username() {
|
||||
$this->assertValidation("http://user\xE3\x91\x94:@foo.com", 'http://user%E3%91%94:@foo.com');
|
||||
}
|
||||
|
||||
function test_validate_path_abempty() {
|
||||
$this->assertValidation("http://host/\xE3\x91\x94:", 'http://host/%E3%91%94:');
|
||||
}
|
||||
|
||||
function test_validate_path_absolute() {
|
||||
$this->assertValidation("/\xE3\x91\x94:", '/%E3%91%94:');
|
||||
}
|
||||
|
||||
function test_validate_path_rootless() {
|
||||
$this->assertValidation("mailto:\xE3\x91\x94:", 'mailto:%E3%91%94:');
|
||||
}
|
||||
|
||||
function test_validate_path_noscheme() {
|
||||
$this->assertValidation("\xE3\x91\x94", '%E3%91%94');
|
||||
}
|
||||
|
||||
function test_validate_path_empty() {
|
||||
$this->assertValidation('http://google.com');
|
||||
}
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user