0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-12-22 16:31:53 +00:00

[1.3.0] More control of URIs granted

# Invalid images are now removed, rather than replaced with a dud <img src="" alt="Invalid image" />. Previous behavior can be restored with new directive %Core.RemoveInvalidImg set to false.
! New directives %URI.DisableExternalResources and %URI.DisableResources
! New directive %Attr.DisableURI, which eliminates all hyperlinking
- Missing "Available since" documentation added

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@575 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2006-11-23 23:59:20 +00:00
parent 61b6ee7183
commit 49cb2a4a7c
10 changed files with 168 additions and 50 deletions

5
NEWS
View File

@ -11,6 +11,9 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
1.3.0, unknown release date 1.3.0, unknown release date
(major feature release) (major feature release)
# Invalid images are now removed, rather than replaced with a dud
<img src="" alt="Invalid image" />. Previous behavior can be restored
with new directive %Core.RemoveInvalidImg set to false.
! (X)HTML Strict now supported ! (X)HTML Strict now supported
+ Transparently handles inline elements in block context (blockquote) + Transparently handles inline elements in block context (blockquote)
! Added GET method to demo for easier validation, added 50kb max input size ! Added GET method to demo for easier validation, added 50kb max input size
@ -19,6 +22,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
! New directives %HTML.AllowedElements and %HTML.AllowedAttributes to let ! New directives %HTML.AllowedElements and %HTML.AllowedAttributes to let
users narrow the set of allowed tags users narrow the set of allowed tags
! <li value="4"> and <ul start="2"> now allowed in loose mode ! <li value="4"> and <ul start="2"> now allowed in loose mode
! New directives %URI.DisableExternalResources and %URI.DisableResources
! New directive %Attr.DisableURI, which eliminates all hyperlinking
- Added missing type to ChildDef_Chameleon - Added missing type to ChildDef_Chameleon
- Remove Tidy option from demo if there is not Tidy available - Remove Tidy option from demo if there is not Tidy available
. ChildDef_Required guards against empty tags . ChildDef_Required guards against empty tags

54
TODO
View File

@ -1,41 +1,51 @@
TODO List TODO List
= KEY ====================
# Flagship
- Regular
? At-risk
==========================
1.3 release 1.3 release
- Enable strict-compliant (X)HTML output # More extensive URI filtering schemes (see docs/proposal-new-directives.txt)
- Requires to some extent 2.0 formatters to save elements in blockquote # Allow for background-image and list-style-image (intrinsically tied to above)
- Make URI validation routines tighter (especially mailto) - Aggressive caching
- More extensive URI filtering schemes (see docs/proposal-new-directives.txt) - Pretty-printer of *Definition, allowing users to see at a glance what is
- Allow for background-image and list-style-image (see above) allowed and what isn't
- Error logging for filtering/cleanup procedures ? Rich set* methods and config file loaders for HTMLPurifier_Config
- Rich set* methods and config file loaders for HTMLPurifier_Config ? Configuration profiles: sets of directives that get set with one func call
- Caching of everything ? ConfigSchema directive aliases (so we can rename some of them)
- Configuration profiles: sets of directives that get set with one func call ? URI validation routines tighter (see docs/dev-code-quality.html) (COMPLEX)
1.4 release 1.4 release
- Add various "levels" of cleaning # Error logging for filtering/cleanup procedures
- Requires I18N facilities to be created first (COMPLEX)
1.5 release
# Add pre-packaged "levels" of cleaning (custom behavior already done)
- More fine-grained control over escaping behavior - More fine-grained control over escaping behavior
- Silently drop content in between SCRIPT tags (can be generalized to allow - Silently drop content in between SCRIPT tags (can be generalized to allow
specification of elements that, when detected as foreign, trigger removal specification of elements that, when detected as foreign, trigger removal
of children, although unbalanced tags could wreak havoc (or at least of children, although unbalanced tags could wreak havoc (or at least
delete the rest of the document)). delete the rest of the document)).
1.5 release 1.6 release
- Additional support for poorly written HTML # Additional support for poorly written HTML
- Implement all non-essential attribute transforms - Implement all non-essential attribute transforms (BIG!)
- Microsoft Word HTML cleaning (i.e. MsoNormal) - Microsoft Word HTML cleaning (i.e. MsoNormal, but research essential!)
- Friendly strict handling of <address> (block -> <br>)
2.0 release 2.0 release
- Formatters for plaintext # Formatters for plaintext (COMPLEX)
- Auto-paragraphing (be sure to leverage fact that we know when things - Auto-paragraphing (be sure to leverage fact that we know when things
shouldn't be paragraphed, such as lists and tables). shouldn't be paragraphed, such as lists and tables).
- Linkify URLs - Linkify URLs
- Smileys - Smileys
- Linkification for HTML Purifier docs: notably configuration and - Linkification for HTML Purifier docs: notably configuration and classes
class names
3.0 release 3.0 release
- Extended HTML capabilities based on namespacing and tag transforms - Extended HTML capabilities based on namespacing and tag transforms (COMPLEX)
- Hooks for adding custom processors to custom namespaced tags and - Hooks for adding custom processors to custom namespaced tags and
attributes, offer default implementation attributes, offer default implementation
- Lots of documentation and samples - Lots of documentation and samples
@ -43,7 +53,11 @@ TODO List
Ongoing Ongoing
- Lots of profiling, make it faster! - Lots of profiling, make it faster!
- Plugins for major CMSes (very tricky issue) - Plugins for major CMSes (COMPLEX)
- Drupal
- WordPress
- eFiction
- more! (look for ones that use WYSIWYGs)
Unknown release (on a scratch-an-itch basis) Unknown release (on a scratch-an-itch basis)
- Fixes for Firefox's inability to handle COL alignment props (Bug 915) - Fixes for Firefox's inability to handle COL alignment props (Bug 915)
@ -54,7 +68,7 @@ Unknown release (on a scratch-an-itch basis)
- Append something to duplicate IDs so they're still usable (impl. note: the - Append something to duplicate IDs so they're still usable (impl. note: the
dupe detector would also need to detect the suffix as well) dupe detector would also need to detect the suffix as well)
- Have 'lang' attribute be checked against official lists - Have 'lang' attribute be checked against official lists
- Info on how to embed YouTube videos (and related content) without patches - Docs on how to embed YouTube videos (and friends) without patches
Encoding workarounds Encoding workarounds
- Non-lossy dumb alternate character encoding transformations, achieved by - Non-lossy dumb alternate character encoding transformations, achieved by

View File

@ -53,7 +53,3 @@ time. Note the naming convention: %Namespace.Directive
absolute DNS. While this is actually the preferred method according to absolute DNS. While this is actually the preferred method according to
the RFC, most people opt to use a relative domain name relative to . (root). the RFC, most people opt to use a relative domain name relative to . (root).
%URI.DisableExternalResources - disallow resource links (i.e. URIs that result
in immediate requests, such as src in IMG) to external websites
%HTML.DisableImg - disables all images

View File

@ -24,7 +24,7 @@ HTMLPurifier_ConfigSchema::define(
'This directive has been available since 1.2.0.' 'This directive has been available since 1.2.0.'
); );
HTMLPurifier_ConfigSchema::Define( HTMLPurifier_ConfigSchema::define(
'URI', 'DisableExternal', false, 'bool', 'URI', 'DisableExternal', false, 'bool',
'Disables links to external websites. This is a highly effective '. 'Disables links to external websites. This is a highly effective '.
'anti-spam and anti-pagerank-leech measure, but comes at a hefty price: no'. 'anti-spam and anti-pagerank-leech measure, but comes at a hefty price: no'.
@ -34,6 +34,26 @@ HTMLPurifier_ConfigSchema::Define(
'This directive has been available since 1.2.0.' 'This directive has been available since 1.2.0.'
); );
// Registers the %URI.DisableExternalResources directive with the values
// false and 'bool' (presumably its default and its type -- confirm against
// HTMLPurifier_ConfigSchema::define). HTMLPurifier_AttrDef_URI::validate()
// reads it together with the definition's embeds_resource flag, and only
// for URIs that carry an authority component.
HTMLPurifier_ConfigSchema::define(
'URI', 'DisableExternalResources', false, 'bool',
'Disables the embedding of external resources, preventing users from '.
'embedding things like images from other hosts. This prevents '.
'access tracking (good for email viewers), bandwidth leeching, '.
'cross-site request forging, goatse.cx posting, and '.
'other nasties, but also results in '.
'a loss of end-user functionality (they can\'t directly post a pic '.
'they posted from Flickr anymore). Use it if you don\'t have a '.
'robust user-content moderation team. This directive has been '.
'available since 1.3.0.'
);
// Registers the %URI.DisableResources directive with the values false and
// 'bool' (presumably its default and its type -- confirm against
// HTMLPurifier_ConfigSchema::define).
// NOTE(review): none of the code visible alongside this definition reads
// %URI.DisableResources -- confirm that the directive is actually enforced.
HTMLPurifier_ConfigSchema::define(
'URI', 'DisableResources', false, 'bool',
'Disables embedding resources, essentially meaning no pictures. You can '.
'still link to them though. See %URI.DisableExternalResources for why '.
'this might be a good idea. This directive has been available since 1.3.0.'
);
/** /**
* Validates a URI as defined by RFC 3986. * Validates a URI as defined by RFC 3986.
* @note Scheme-specific mechanics deferred to HTMLPurifier_URIScheme * @note Scheme-specific mechanics deferred to HTMLPurifier_URIScheme
@ -43,15 +63,15 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
var $host; var $host;
var $PercentEncoder; var $PercentEncoder;
var $embeds; var $embeds_resource;
/** /**
* @param $embeds Does the URI here result in an extra HTTP request? * @param $embeds_resource Does the URI here result in an extra HTTP request?
*/ */
function HTMLPurifier_AttrDef_URI($embeds = false) { function HTMLPurifier_AttrDef_URI($embeds_resource = false) {
$this->host = new HTMLPurifier_AttrDef_Host(); $this->host = new HTMLPurifier_AttrDef_Host();
$this->PercentEncoder = new HTMLPurifier_PercentEncoder(); $this->PercentEncoder = new HTMLPurifier_PercentEncoder();
$this->embeds = (bool) $embeds; $this->embeds_resource = (bool) $embeds_resource;
} }
function validate($uri, $config, &$context) { function validate($uri, $config, &$context) {
@ -105,18 +125,25 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
} }
// the URI we're processing embeds a resource in the page, but the URI // the URI we're processing embeds a resource in the page, but the URI
// it references cannot be located // it references cannot be located
if ($this->embeds && !$scheme_obj->browsable) { if ($this->embeds_resource && !$scheme_obj->browsable) {
return false; return false;
} }
if ($authority !== null) { if ($authority !== null) {
// remove URI if it's absolute and we disallow externals // remove URI if it's absolute and we disabled externals or
// if it's absolute and embedded and we disabled external resources
unset($our_host); unset($our_host);
if ($config->get('URI', 'DisableExternal')) { if (
$config->get('URI', 'DisableExternal') ||
(
$config->get('URI', 'DisableExternalResources') &&
$this->embeds_resource
)
) {
$our_host = $config->get('URI', 'Host'); $our_host = $config->get('URI', 'Host');
if ($our_host === null) return false; if ($our_host === null) return false;
} }

View File

@ -43,7 +43,8 @@ HTMLPurifier_ConfigSchema::define(
HTMLPurifier_ConfigSchema::define( HTMLPurifier_ConfigSchema::define(
'HTML', 'Strict', false, 'bool', 'HTML', 'Strict', false, 'bool',
'Determines whether or not to use Transitional (loose) or Strict rulesets.' 'Determines whether or not to use Transitional (loose) or Strict rulesets. '.
'This directive has been available since 1.3.0.'
); );
HTMLPurifier_ConfigSchema::define( HTMLPurifier_ConfigSchema::define(
@ -53,14 +54,16 @@ HTMLPurifier_ConfigSchema::define(
'Example: by default value, <code>&lt;blockquote&gt;Foo&lt;/blockquote&gt;</code> '. 'Example: by default value, <code>&lt;blockquote&gt;Foo&lt;/blockquote&gt;</code> '.
'would become <code>&lt;blockquote&gt;&lt;p&gt;Foo&lt;/p&gt;&lt;/blockquote&gt;</code>. The '. 'would become <code>&lt;blockquote&gt;&lt;p&gt;Foo&lt;/p&gt;&lt;/blockquote&gt;</code>. The '.
'<code>&lt;p&gt;</code> tags can be replaced '. '<code>&lt;p&gt;</code> tags can be replaced '.
'with whatever you desire, as long as it is a block level element.' 'with whatever you desire, as long as it is a block level element. '.
'This directive has been available since 1.3.0.'
); );
HTMLPurifier_ConfigSchema::define( HTMLPurifier_ConfigSchema::define(
'HTML', 'Parent', 'div', 'string', 'HTML', 'Parent', 'div', 'string',
'String name of element that HTML fragment passed to library will be '. 'String name of element that HTML fragment passed to library will be '.
'inserted in. An interesting variation would be using span as the '. 'inserted in. An interesting variation would be using span as the '.
'parent element, meaning that only inline tags would be allowed.' 'parent element, meaning that only inline tags would be allowed. '.
'This directive has been available since 1.3.0.'
); );
HTMLPurifier_ConfigSchema::define( HTMLPurifier_ConfigSchema::define(
@ -72,7 +75,8 @@ HTMLPurifier_ConfigSchema::define(
'supported in the first place (like embed). If you change this, you '. 'supported in the first place (like embed). If you change this, you '.
'probably also want to change %HTML.AllowedAttributes. '. 'probably also want to change %HTML.AllowedAttributes. '.
'<strong>Warning:</strong> If another directive conflicts with the '. '<strong>Warning:</strong> If another directive conflicts with the '.
'elements here, <em>that</em> directive will win and override.' 'elements here, <em>that</em> directive will win and override. '.
'This directive has been available since 1.3.0.'
); );
HTMLPurifier_ConfigSchema::define( HTMLPurifier_ConfigSchema::define(
@ -84,7 +88,14 @@ HTMLPurifier_ConfigSchema::define(
'elements here, <em>that</em> directive will win and override. For '. 'elements here, <em>that</em> directive will win and override. For '.
'example, %HTML.EnableAttrID will take precedence over *.id in this '. 'example, %HTML.EnableAttrID will take precedence over *.id in this '.
'directive. You must set that directive to true before you can use '. 'directive. You must set that directive to true before you can use '.
'IDs at all.' 'IDs at all. This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'Attr', 'DisableURI', false, 'bool',
'Disables all URIs in all forms. Not sure why you\'d want to do that '.
'(after all, the Internet\'s founded on the notion of a hyperlink). '.
'This directive has been available since 1.3.0.'
); );
/** /**
@ -444,16 +455,18 @@ class HTMLPurifier_HTMLDefinition
$this->info['td']->attr['colspan'] = $this->info['td']->attr['colspan'] =
$this->info['th']->attr['colspan'] = $e__NumberSpan; $this->info['th']->attr['colspan'] = $e__NumberSpan;
$e_URI = new HTMLPurifier_AttrDef_URI(); if (!$config->get('Attr', 'DisableURI')) {
$this->info['a']->attr['href'] = $e_URI = new HTMLPurifier_AttrDef_URI();
$this->info['img']->attr['longdesc'] = $this->info['a']->attr['href'] =
$this->info['del']->attr['cite'] = $this->info['img']->attr['longdesc'] =
$this->info['ins']->attr['cite'] = $this->info['del']->attr['cite'] =
$this->info['blockquote']->attr['cite'] = $this->info['ins']->attr['cite'] =
$this->info['q']->attr['cite'] = $e_URI; $this->info['blockquote']->attr['cite'] =
$this->info['q']->attr['cite'] = $e_URI;
// URI that causes HTTP request // URI that causes HTTP request
$this->info['img']->attr['src'] = new HTMLPurifier_AttrDef_URI(true); $this->info['img']->attr['src'] = new HTMLPurifier_AttrDef_URI(true);
}
if (!$this->strict) { if (!$this->strict) {
$this->info['li']->attr['value'] = new HTMLPurifier_AttrDef_Integer(); $this->info['li']->attr['value'] = new HTMLPurifier_AttrDef_Integer();

View File

@ -5,6 +5,14 @@ require_once 'HTMLPurifier/HTMLDefinition.php';
require_once 'HTMLPurifier/Generator.php'; require_once 'HTMLPurifier/Generator.php';
require_once 'HTMLPurifier/TagTransform.php'; require_once 'HTMLPurifier/TagTransform.php';
// Registers the %Core.RemoveInvalidImg directive with the values true and
// 'bool' (presumably its default and its type -- confirm against
// HTMLPurifier_ConfigSchema::define).
// NOTE(review): the img special case of RemoveForeignElements visible in
// this change skips such tokens without consulting this directive -- confirm
// that setting it to false really restores the pre-1.3.0 behavior.
HTMLPurifier_ConfigSchema::define(
'Core', 'RemoveInvalidImg', true, 'bool',
'This directive enables pre-emptive URI checking in <code>img</code> '.
'tags, as the attribute validation strategy is not authorized to '.
'remove elements from the document. This directive has been available '.
'since 1.3.0, revert to pre-1.3.0 behavior by setting to false.'
);
/** /**
* Removes all unrecognized tags from the list of tokens. * Removes all unrecognized tags from the list of tokens.
* *
@ -25,7 +33,23 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
if (!empty( $token->is_tag )) { if (!empty( $token->is_tag )) {
// DEFINITION CALL // DEFINITION CALL
if (isset($definition->info[$token->name])) { if (isset($definition->info[$token->name])) {
// leave untouched // leave untouched, except for a few special cases:
// hard-coded image special case, pre-emptively drop
// if not available. Probably not abstract-able
if ( $token->name == 'img' ) {
if (!isset($token->attr['src'])) continue;
if (!isset($definition->info['img']->attr['src'])) {
continue;
}
$token->attr['src'] =
$definition->
info['img']->
attr['src']->
validate($token->attr['src']);
if ($token->attr['src'] === false) continue;
}
} elseif ( } elseif (
isset($definition->info_tag_transform[$token->name]) isset($definition->info_tag_transform[$token->name])
) { ) {

View File

@ -271,6 +271,20 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness
} }
/**
 * Exercises %URI.DisableExternalResources against two URI definitions:
 * one built without the embeds-resource flag and one built with it.
 * Only the absolute URI on the flagged definition is asserted with an
 * expected result of false; every other case is asserted with assertDef's
 * default expectation.
 */
function testDisableExternalResources() {
    $absolute_uri = 'http://sub.example.com/alas?foo=asd';
    $relative_uri = '/img.png';

    $this->config->set('URI', 'DisableExternalResources', true);

    // definition constructed with the default (non-embedding) argument
    $this->def = new HTMLPurifier_AttrDef_URI();
    $this->assertDef($absolute_uri);
    $this->assertDef($relative_uri);

    // definition constructed with the embeds-resource argument set
    $this->def = new HTMLPurifier_AttrDef_URI(true);
    $this->assertDef($absolute_uri, false);
    $this->assertDef($relative_uri);
}
} }
?> ?>

View File

@ -42,6 +42,12 @@ class HTMLPurifier_Strategy_RemoveForeignElementsTest
' Warning!</span>' ' Warning!</span>'
); );
// test removal of img tag
$this->assertResult(
'<img />',
''
);
} }
} }

View File

@ -125,6 +125,9 @@ class HTMLPurifier_Strategy_ValidateAttributesTest extends
); );
// test required attributes for img // test required attributes for img
// (this should never happen, as RemoveForeignElements
// should have removed the offending image tag)
$this->assertResult( $this->assertResult(
'<img />', '<img />',
'<img src="" alt="Invalid image" />' '<img src="" alt="Invalid image" />'

View File

@ -8,6 +8,10 @@ class HTMLPurifier_Test extends UnitTestCase
{ {
var $purifier; var $purifier;
/**
 * Stores a HTMLPurifier constructed without arguments in $this->purifier.
 * Presumably invoked by the test framework before each test method
 * (confirm against UnitTestCase); testDisableURI replaces the instance
 * with one built from a custom config.
 */
function setUp() {
$this->purifier = new HTMLPurifier();
}
function assertPurification($input, $expect = null) { function assertPurification($input, $expect = null) {
if ($expect === null) $expect = $input; if ($expect === null) $expect = $input;
$result = $this->purifier->purify($input); $result = $this->purifier->purify($input);
@ -15,7 +19,6 @@ class HTMLPurifier_Test extends UnitTestCase
} }
function testNull() { function testNull() {
$this->purifier = new HTMLPurifier();
$this->assertPurification("Null byte\0", "Null byte"); $this->assertPurification("Null byte\0", "Null byte");
} }
@ -53,6 +56,19 @@ class HTMLPurifier_Test extends UnitTestCase
} }
/**
 * With %Attr.DisableURI set to true on a default config, an img tag
 * whose only attribute is src is expected to purify to the empty string.
 */
function testDisableURI() {
    $uriless_config = HTMLPurifier_Config::createDefault();
    $uriless_config->set('Attr', 'DisableURI', true);
    $this->purifier = new HTMLPurifier($uriless_config);

    $input = '<img src="foobar"/>';
    $expected = '';
    $this->assertPurification($input, $expected);
}
} }
?> ?>