0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-09-18 18:25:18 +00:00

[1.3.0] More control of URIs granted

# Invalid images are now removed, rather than replaced with a dud <img src="" alt="Invalid image" />. Previous behavior can be restored with new directive %Core.RemoveInvalidImg set to false.
! New directives %URI.DisableExternalResources and %URI.DisableResources
! New directive %Attr.DisableURI, which eliminates all hyperlinking
- Missing "Available since" documentation added

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@575 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2006-11-23 23:59:20 +00:00
parent 61b6ee7183
commit 49cb2a4a7c
10 changed files with 168 additions and 50 deletions

5
NEWS
View File

@ -11,6 +11,9 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
1.3.0, unknown release date
(major feature release)
# Invalid images are now removed, rather than replaced with a dud
<img src="" alt="Invalid image" />. Previous behavior can be restored
with new directive %Core.RemoveInvalidImg set to false.
! (X)HTML Strict now supported
+ Transparently handles inline elements in block context (blockquote)
! Added GET method to demo for easier validation, added 50kb max input size
@ -19,6 +22,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
! New directives %HTML.AllowedElements and %HTML.AllowedAttributes to let
users narrow the set of allowed tags
! <li value="4"> and <ul start="2"> now allowed in loose mode
! New directives %URI.DisableExternalResources and %URI.DisableResources
! New directive %Attr.DisableURI, which eliminates all hyperlinking
- Added missing type to ChildDef_Chameleon
- Remove Tidy option from demo if there is not Tidy available
. ChildDef_Required guards against empty tags

54
TODO
View File

@ -1,41 +1,51 @@
TODO List
= KEY ====================
# Flagship
- Regular
? At-risk
==========================
1.3 release
- Enable strict-compliant (X)HTML output
- Requires to some extent 2.0 formatters to save elements in blockquote
- Make URI validation routines tighter (especially mailto)
- More extensive URI filtering schemes (see docs/proposal-new-directives.txt)
- Allow for background-image and list-style-image (see above)
- Error logging for filtering/cleanup procedures
- Rich set* methods and config file loaders for HTMLPurifier_Config
- Caching of everything
- Configuration profiles: sets of directives that get set with one func call
# More extensive URI filtering schemes (see docs/proposal-new-directives.txt)
# Allow for background-image and list-style-image (intrinsically tied to above)
- Aggressive caching
- Pretty-printer of *Definition, allowing users to see at a glance what is
allowed and what isn't
? Rich set* methods and config file loaders for HTMLPurifier_Config
? Configuration profiles: sets of directives that get set with one func call
? ConfigSchema directive aliases (so we can rename some of them)
? URI validation routines tighter (see docs/dev-code-quality.html) (COMPLEX)
1.4 release
- Add various "levels" of cleaning
# Error logging for filtering/cleanup procedures
- Requires I18N facilities to be created first (COMPLEX)
1.5 release
# Add pre-packaged "levels" of cleaning (custom behavior already done)
- More fine-grained control over escaping behavior
- Silently drop content inbetween SCRIPT tags (can be generalized to allow
specification of elements that, when detected as foreign, trigger removal
of children, although unbalanced tags could wreck havoc (or at least
delete the rest of the document)).
1.5 release
- Additional support for poorly written HTML
- Implement all non-essential attribute transforms
- Microsoft Word HTML cleaning (i.e. MsoNormal)
1.6 release
# Additional support for poorly written HTML
- Implement all non-essential attribute transforms (BIG!)
- Microsoft Word HTML cleaning (i.e. MsoNormal, but research essential!)
- Friendly strict handling of <address> (block -> <br>)
2.0 release
- Formatters for plaintext
# Formatters for plaintext (COMPLEX)
- Auto-paragraphing (be sure to leverage fact that we know when things
shouldn't be paragraphed, such as lists and tables).
- Linkify URLs
- Smileys
- Linkification for HTML Purifier docs: notably configuration and
class names
- Linkification for HTML Purifier docs: notably configuration and classes
3.0 release
- Extended HTML capabilities based on namespacing and tag transforms
- Extended HTML capabilities based on namespacing and tag transforms (COMPLEX)
- Hooks for adding custom processors to custom namespaced tags and
attributes, offer default implementation
- Lots of documentation and samples
@ -43,7 +53,11 @@ TODO List
Ongoing
- Lots of profiling, make it faster!
- Plugins for major CMSes (very tricky issue)
- Plugins for major CMSes (COMPLEX)
- Drupal
- WordPress
- eFiction
- more! (look for ones that use WYSIWYGs)
Unknown release (on a scratch-an-itch basis)
- Fixes for Firefox's inability to handle COL alignment props (Bug 915)
@ -54,7 +68,7 @@ Unknown release (on a scratch-an-itch basis)
- Append something to duplicate IDs so they're still usable (impl. note: the
dupe detector would also need to detect the suffix as well)
- Have 'lang' attribute be checked against official lists
- Info on how to embed YouTube videos (and related content) without patches
- Docs on how to embed YouTube videos (and friends) without patches
Encoding workarounds
- Non-lossy dumb alternate character encoding transformations, achieved by

View File

@ -53,7 +53,3 @@ time. Note the naming convention: %Namespace.Directive
absolute DNS. While this is actually the preferred method according to
the RFC, most people opt to use a relative domain name relative to . (root).
%URI.DisableExternalResources - disallow resource links (i.e. URIs that result
in immediate requests, such as src in IMG) to external websites
%HTML.DisableImg - disables all images

View File

@ -24,7 +24,7 @@ HTMLPurifier_ConfigSchema::define(
'This directive has been available since 1.2.0.'
);
HTMLPurifier_ConfigSchema::Define(
HTMLPurifier_ConfigSchema::define(
'URI', 'DisableExternal', false, 'bool',
'Disables links to external websites. This is a highly effective '.
'anti-spam and anti-pagerank-leech measure, but comes at a hefty price: no'.
@ -34,6 +34,26 @@ HTMLPurifier_ConfigSchema::Define(
'This directive has been available since 1.2.0.'
);
HTMLPurifier_ConfigSchema::define(
'URI', 'DisableExternalResources', false, 'bool',
'Disables the embedding of external resources, preventing users from '.
'embedding things like images from other hosts. This prevents '.
'access tracking (good for email viewers), bandwidth leeching, '.
'cross-site request forging, goatse.cx posting, and '.
'other nasties, but also results in '.
'a loss of end-user functionality (they can\'t directly post a pic '.
'they posted from Flickr anymore). Use it if you don\'t have a '.
'robust user-content moderation team. This directive has been '.
'available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'URI', 'DisableResources', false, 'bool',
'Disables embedding resources, essentially meaning no pictures. You can '.
'still link to them though. See %URI.DisableExternalResources for why '.
'this might be a good idea. This directive has been available since 1.3.0.'
);
/**
* Validates a URI as defined by RFC 3986.
* @note Scheme-specific mechanics deferred to HTMLPurifier_URIScheme
@ -43,15 +63,15 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
var $host;
var $PercentEncoder;
var $embeds;
var $embeds_resource;
/**
* @param $embeds Does the URI here result in an extra HTTP request?
* @param $embeds_resource_resource Does the URI here result in an extra HTTP request?
*/
function HTMLPurifier_AttrDef_URI($embeds = false) {
function HTMLPurifier_AttrDef_URI($embeds_resource = false) {
$this->host = new HTMLPurifier_AttrDef_Host();
$this->PercentEncoder = new HTMLPurifier_PercentEncoder();
$this->embeds = (bool) $embeds;
$this->embeds_resource = (bool) $embeds_resource;
}
function validate($uri, $config, &$context) {
@ -105,18 +125,25 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
}
// the URI we're processing embeds a resource in the page, but the URI
// the URI we're processing embeds_resource a resource in the page, but the URI
// it references cannot be located
if ($this->embeds && !$scheme_obj->browsable) {
if ($this->embeds_resource && !$scheme_obj->browsable) {
return false;
}
if ($authority !== null) {
// remove URI if it's absolute and we disallow externals
// remove URI if it's absolute and we disabled externals or
// if it's absolute and embedded and we disabled external resources
unset($our_host);
if ($config->get('URI', 'DisableExternal')) {
if (
$config->get('URI', 'DisableExternal') ||
(
$config->get('URI', 'DisableExternalResources') &&
$this->embeds_resource
)
) {
$our_host = $config->get('URI', 'Host');
if ($our_host === null) return false;
}

View File

@ -43,7 +43,8 @@ HTMLPurifier_ConfigSchema::define(
HTMLPurifier_ConfigSchema::define(
'HTML', 'Strict', false, 'bool',
'Determines whether or not to use Transitional (loose) or Strict rulesets.'
'Determines whether or not to use Transitional (loose) or Strict rulesets. '.
'This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
@ -53,14 +54,16 @@ HTMLPurifier_ConfigSchema::define(
'Example: by default value, <code>&lt;blockquote&gt;Foo&lt;/blockquote&gt;</code> '.
'would become <code>&lt;blockquote&gt;&lt;p&gt;Foo&lt;/p&gt;&lt;/blockquote&gt;</code>. The '.
'<code>&lt;p&gt;</code> tags can be replaced '.
'with whatever you desire, as long as it is a block level element.'
'with whatever you desire, as long as it is a block level element. '.
'This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'HTML', 'Parent', 'div', 'string',
'String name of element that HTML fragment passed to library will be '.
'inserted in. An interesting variation would be using span as the '.
'parent element, meaning that only inline tags would be allowed.'
'parent element, meaning that only inline tags would be allowed. '.
'This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
@ -72,7 +75,8 @@ HTMLPurifier_ConfigSchema::define(
'supported in the first place (like embed). If you change this, you '.
'probably also want to change %HTML.AllowedAttributes. '.
'<strong>Warning:</strong> If another directive conflicts with the '.
'elements here, <em>that</em> directive will win and override.'
'elements here, <em>that</em> directive will win and override. '.
'This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
@ -84,7 +88,14 @@ HTMLPurifier_ConfigSchema::define(
'elements here, <em>that</em> directive will win and override. For '.
'example, %HTML.EnableAttrID will take precedence over *.id in this '.
'directive. You must set that directive to true before you can use '.
'IDs at all.'
'IDs at all. This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'Attr', 'DisableURI', false, 'bool',
'Disables all URIs in all forms. Not sure why you\'d want to do that '.
'(after all, the Internet\'s founded on the notion of a hyperlink). '.
'This directive has been available since 1.3.0.'
);
/**
@ -444,16 +455,18 @@ class HTMLPurifier_HTMLDefinition
$this->info['td']->attr['colspan'] =
$this->info['th']->attr['colspan'] = $e__NumberSpan;
$e_URI = new HTMLPurifier_AttrDef_URI();
$this->info['a']->attr['href'] =
$this->info['img']->attr['longdesc'] =
$this->info['del']->attr['cite'] =
$this->info['ins']->attr['cite'] =
$this->info['blockquote']->attr['cite'] =
$this->info['q']->attr['cite'] = $e_URI;
// URI that causes HTTP request
$this->info['img']->attr['src'] = new HTMLPurifier_AttrDef_URI(true);
if (!$config->get('Attr', 'DisableURI')) {
$e_URI = new HTMLPurifier_AttrDef_URI();
$this->info['a']->attr['href'] =
$this->info['img']->attr['longdesc'] =
$this->info['del']->attr['cite'] =
$this->info['ins']->attr['cite'] =
$this->info['blockquote']->attr['cite'] =
$this->info['q']->attr['cite'] = $e_URI;
// URI that causes HTTP request
$this->info['img']->attr['src'] = new HTMLPurifier_AttrDef_URI(true);
}
if (!$this->strict) {
$this->info['li']->attr['value'] = new HTMLPurifier_AttrDef_Integer();

View File

@ -5,6 +5,14 @@ require_once 'HTMLPurifier/HTMLDefinition.php';
require_once 'HTMLPurifier/Generator.php';
require_once 'HTMLPurifier/TagTransform.php';
HTMLPurifier_ConfigSchema::define(
'Core', 'RemoveInvalidImg', true, 'bool',
'This directive enables pre-emptive URI checking in <code>img</code> '.
'tags, as the attribute validation strategy is not authorized to '.
'remove elements from the document. This directive has been available '.
'since 1.3.0, revert to pre-1.3.0 behavior by setting to false.'
);
/**
* Removes all unrecognized tags from the list of tokens.
*
@ -25,7 +33,23 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
if (!empty( $token->is_tag )) {
// DEFINITION CALL
if (isset($definition->info[$token->name])) {
// leave untouched
// leave untouched, except for a few special cases:
// hard-coded image special case, pre-emptively drop
// if not available. Probably not abstract-able
if ( $token->name == 'img' ) {
if (!isset($token->attr['src'])) continue;
if (!isset($definition->info['img']->attr['src'])) {
continue;
}
$token->attr['src'] =
$definition->
info['img']->
attr['src']->
validate($token->attr['src']);
if ($token->attr['src'] === false) continue;
}
} elseif (
isset($definition->info_tag_transform[$token->name])
) {

View File

@ -271,6 +271,20 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness
}
function testDisableExternalResources() {
$this->config->set('URI', 'DisableExternalResources', true);
$this->def = new HTMLPurifier_AttrDef_URI();
$this->assertDef('http://sub.example.com/alas?foo=asd');
$this->assertDef('/img.png');
$this->def = new HTMLPurifier_AttrDef_URI(true);
$this->assertDef('http://sub.example.com/alas?foo=asd', false);
$this->assertDef('/img.png');
}
}
?>

View File

@ -42,6 +42,12 @@ class HTMLPurifier_Strategy_RemoveForeignElementsTest
' Warning!</span>'
);
// test removal of img tag
$this->assertResult(
'<img />',
''
);
}
}

View File

@ -125,6 +125,9 @@ class HTMLPurifier_Strategy_ValidateAttributesTest extends
);
// test required attributes for img
// (this should never happen, as RemoveForeignElements
// should have removed the offending image tag)
$this->assertResult(
'<img />',
'<img src="" alt="Invalid image" />'

View File

@ -8,6 +8,10 @@ class HTMLPurifier_Test extends UnitTestCase
{
var $purifier;
function setUp() {
$this->purifier = new HTMLPurifier();
}
function assertPurification($input, $expect = null) {
if ($expect === null) $expect = $input;
$result = $this->purifier->purify($input);
@ -15,7 +19,6 @@ class HTMLPurifier_Test extends UnitTestCase
}
function testNull() {
$this->purifier = new HTMLPurifier();
$this->assertPurification("Null byte\0", "Null byte");
}
@ -53,6 +56,19 @@ class HTMLPurifier_Test extends UnitTestCase
}
function testDisableURI() {
$config = HTMLPurifier_Config::createDefault();
$config->set('Attr', 'DisableURI', true);
$this->purifier = new HTMLPurifier($config);
$this->assertPurification(
'<img src="foobar"/>',
''
);
}
}
?>