0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-12-22 08:21:52 +00:00

[1.7.0] Add native support for required elements

- Factored out large portion of ValidateAttributes to AttrValidator
- Implemented ValidateAttributes armor
- Fix clear cache bug
- Implement armoring for ValidateAttributes

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1174 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2007-06-20 21:39:28 +00:00
parent 8bbb73e47d
commit 69996acc9e
17 changed files with 247 additions and 128 deletions

2
NEWS
View File

@ -40,6 +40,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
! Config object gives more friendly error messages when things go wrong ! Config object gives more friendly error messages when things go wrong
! Advanced API implemented: easy functions for creating elements (addElement) ! Advanced API implemented: easy functions for creating elements (addElement)
and attributes (addAttribute) on HTMLDefinition and attributes (addAttribute) on HTMLDefinition
! Add native support for required attributes
- Deprecated and removed EnableRedundantUTF8Cleaning. It didn't even work! - Deprecated and removed EnableRedundantUTF8Cleaning. It didn't even work!
- DOMLex will not emit errors when a custom error handler that does not - DOMLex will not emit errors when a custom error handler that does not
honor error_reporting is used honor error_reporting is used
@ -63,6 +64,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
. DirectLex can now track line-numbers . DirectLex can now track line-numbers
. Preliminary error collector is in place, although no code actually reports . Preliminary error collector is in place, although no code actually reports
errors yet errors yet
. Factor out most of ValidateAttributes to new AttrValidator class
1.6.1, released 2007-05-05 1.6.1, released 2007-05-05
! Support for more deprecated attributes via transformations: ! Support for more deprecated attributes via transformations:

View File

@ -82,19 +82,47 @@ class HTMLPurifier_AttrCollections
* @param $attr_types HTMLPurifier_AttrTypes instance * @param $attr_types HTMLPurifier_AttrTypes instance
*/ */
function expandIdentifiers(&$attr, $attr_types) { function expandIdentifiers(&$attr, $attr_types) {
// because foreach will process new elements we add, make sure we
// skip duplicates
$processed = array();
foreach ($attr as $def_i => $def) { foreach ($attr as $def_i => $def) {
// skip inclusions
if ($def_i === 0) continue; if ($def_i === 0) continue;
if (!is_string($def)) continue;
if (isset($processed[$def_i])) continue;
// determine whether or not attribute is required
if ($required = (strpos($def_i, '*') !== false)) {
// rename the definition
unset($attr[$def_i]);
$def_i = trim($def_i, '*');
$attr[$def_i] = $def;
}
$processed[$def_i] = true;
// if we've already got a literal object, move on
if (is_object($def)) {
// preserve previous required
$attr[$def_i]->required = ($required || $attr[$def_i]->required);
continue;
}
if ($def === false) { if ($def === false) {
unset($attr[$def_i]); unset($attr[$def_i]);
continue; continue;
} }
if ($t = $attr_types->get($def)) { if ($t = $attr_types->get($def)) {
$attr[$def_i] = $t; $attr[$def_i] = $t;
$attr[$def_i]->required = $required;
} else { } else {
unset($attr[$def_i]); unset($attr[$def_i]);
} }
} }
} }
} }

View File

@ -14,11 +14,17 @@ class HTMLPurifier_AttrDef
{ {
/** /**
* Tells us whether or not an HTML attribute is minimized. Only the * Tells us whether or not an HTML attribute is minimized. Has no
* boolean attribute vapourware would use this. * meaning in other contexts.
*/ */
var $minimized = false; var $minimized = false;
/**
* Tells us whether or not an HTML attribute is required. Has no
* meaning in other contexts
*/
var $required = false;
/** /**
* Validates and cleans passed string according to a definition. * Validates and cleans passed string according to a definition.
* *

View File

@ -20,7 +20,10 @@ HTMLPurifier_ConfigSchema::define(
); );
/** /**
* Post-transform that ensures the required attrs of img (alt and src) are set * Transform that supplies default values for the src and alt attributes
* in img tags, as well as prevents the img tag from being removed
* because of a missing alt attribute. This needs to be registered as both
* a pre and post attribute transform.
*/ */
class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform
{ {
@ -29,6 +32,7 @@ class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform
$src = true; $src = true;
if (!isset($attr['src'])) { if (!isset($attr['src'])) {
if ($config->get('Core', 'RemoveInvalidImg')) return $attr;
$attr['src'] = $config->get('Attr', 'DefaultInvalidImage'); $attr['src'] = $config->get('Attr', 'DefaultInvalidImage');
$src = false; $src = false;
} }

View File

@ -0,0 +1,105 @@
<?php
/**
 * Validates the attributes of a single token against the HTML definition
 * obtained from the configuration object.
 */
class HTMLPurifier_AttrValidator
{

    /**
     * Runs a token's attributes through the pre-transforms, the
     * per-attribute validation and the post-transforms, then stores the
     * outcome on the token.
     * @param $token Token whose name selects the element definition and
     *        whose attr array is processed
     * @param $config Configuration object; getHTMLDefinition() is called
     *        on it and it is handed to every transform and validator
     * @param $context Context object, handed to every transform and
     *        validator
     * @return The token, with attr replaced by the processed attributes
     */
    function validateToken($token, &$config, &$context) {

        $definition = $config->getHTMLDefinition();

        // attribute definitions that apply to every element
        // DEFINITION CALL
        $global_defs = $definition->info_global_attr;

        // work on a local copy; the token is only written to at the end
        $attr = $token->attr;

        // pre-transforms: the document-wide ones run first ...
        foreach ($definition->info_attr_transform_pre as $pre) {
            $attr = $pre->transform($attr, $config, $context);
        }

        // ... followed by the ones registered for this element only
        // ex. <p align="right"> to <p style="text-align:right;">
        foreach ($definition->info[$token->name]->attr_transform_pre as $pre) {
            $attr = $pre->transform($attr, $config, $context);
        }

        // attribute definitions specific to this element; these take
        // precedence over $global_defs
        // DEFINITION CALL
        $local_defs = $definition->info[$token->name]->attr;

        // NB: the loop walks a snapshot of $attr, so removing or rewriting
        // entries inside the body is safe
        foreach ($attr as $name => $value) {

            $has_local = isset($local_defs[$name]);

            if ($has_local && $local_defs[$name] !== false) {
                // the element has its own rule for this attribute
                $result = $local_defs[$name]->validate(
                    $value, $config, $context
                );
            } elseif (!$has_local && isset($global_defs[$name])) {
                // no element-specific rule, fall back on the global one
                $result = $global_defs[$name]->validate(
                    $value, $config, $context
                );
            } else {
                // either the element definition holds an explicit false
                // for this attribute (which bans it even when a global
                // rule exists), or nobody has ever heard of the attribute
                $result = false;
            }

            if (is_string($result)) {
                // validator handed back a cleaned-up value
                $attr[$name] = $result;
            } elseif ($result === false || $result === null) {
                // validator rejected the attribute
                unset($attr[$name]);
            }
            // any other result leaves the attribute exactly as it was;
            // array results (multi-attribute substitution) are not
            // supported

        }

        // post-transforms, again global before element-specific
        // ex. <x lang="fr"> to <x lang="fr" xml:lang="fr">
        foreach ($definition->info_attr_transform_post as $post) {
            $attr = $post->transform($attr, $config, $context);
        }

        // ex. <bdo> to <bdo dir="ltr">
        foreach ($definition->info[$token->name]->attr_transform_post as $post) {
            $attr = $post->transform($attr, $config, $context);
        }

        // commit the processed attributes and hand the token back
        $token->attr = $attr;
        return $token;

    }

}
?>

View File

@ -85,6 +85,13 @@ class HTMLPurifier_ElementDef
*/ */
var $descendants_are_inline = false; var $descendants_are_inline = false;
/**
* List of the names of required attributes this element has. Dynamically
* populated.
* @public
*/
var $required_attr = array();
/** /**
* Lookup table of tags excluded from all descendants of this tag. * Lookup table of tags excluded from all descendants of this tag.
* @note SGML permits exclusions for all descendants, but this is * @note SGML permits exclusions for all descendants, but this is
@ -174,6 +181,13 @@ class HTMLPurifier_ElementDef
} }
} }
/**
 * Produces a duplicate of this element definition by round-tripping
 * it through serialize() and unserialize().
 * @return Copy of this element definition
 */
function copy() {
    $frozen = serialize($this);
    return unserialize($frozen);
}
} }
?> ?>

View File

@ -19,13 +19,15 @@ class HTMLPurifier_HTMLModule_Image extends HTMLPurifier_HTMLModule
$img =& $this->addElement( $img =& $this->addElement(
'img', true, 'Inline', 'Empty', 'Common', 'img', true, 'Inline', 'Empty', 'Common',
array( array(
'alt' => 'Text', 'alt*' => 'Text',
'height' => 'Length', 'height' => 'Length',
'longdesc' => 'URI', 'longdesc' => 'URI',
'src' => new HTMLPurifier_AttrDef_URI(true), // embedded 'src*' => new HTMLPurifier_AttrDef_URI(true), // embedded
'width' => 'Length' 'width' => 'Length'
) )
); );
// kind of strange, but splitting things up would be inefficient
$img->attr_transform_pre[] =
$img->attr_transform_post[] = $img->attr_transform_post[] =
new HTMLPurifier_AttrTransform_ImgRequired(); new HTMLPurifier_AttrTransform_ImgRequired();
} }

View File

@ -405,7 +405,11 @@ class HTMLPurifier_HTMLModuleManager
foreach($this->elementLookup[$name] as $module_name) { foreach($this->elementLookup[$name] as $module_name) {
$module = $modules[$module_name]; $module = $modules[$module_name];
$new_def = $module->info[$name];
// copy is used because, ideally speaking, the original
// definition should not be modified. Usually, this will
// make no difference, but we copy for consistency's sake
$new_def = $module->info[$name]->copy();
// refuse to create/merge in a definition that is deemed unsafe // refuse to create/merge in a definition that is deemed unsafe
if (!$trusted && ($new_def->safe === false)) { if (!$trusted && ($new_def->safe === false)) {
@ -443,6 +447,13 @@ class HTMLPurifier_HTMLModuleManager
$this->contentSets->generateChildDef($def, $module); $this->contentSets->generateChildDef($def, $module);
} }
// add information on required attributes
foreach ($def->attr as $attr_name => $attr_def) {
if ($attr_def->required) {
$def->required_attr[] = $attr_name;
}
}
return $def; return $def;

View File

@ -5,6 +5,8 @@ require_once 'HTMLPurifier/HTMLDefinition.php';
require_once 'HTMLPurifier/Generator.php'; require_once 'HTMLPurifier/Generator.php';
require_once 'HTMLPurifier/TagTransform.php'; require_once 'HTMLPurifier/TagTransform.php';
require_once 'HTMLPurifier/AttrValidator.php';
HTMLPurifier_ConfigSchema::define( HTMLPurifier_ConfigSchema::define(
'Core', 'RemoveInvalidImg', true, 'bool', 'Core', 'RemoveInvalidImg', true, 'bool',
'This directive enables pre-emptive URI checking in <code>img</code> '. 'This directive enables pre-emptive URI checking in <code>img</code> '.
@ -41,6 +43,8 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
$remove_invalid_img = $config->get('Core', 'RemoveInvalidImg'); $remove_invalid_img = $config->get('Core', 'RemoveInvalidImg');
$remove_script_contents = $config->get('Core', 'RemoveScriptContents'); $remove_script_contents = $config->get('Core', 'RemoveScriptContents');
$attr_validator = new HTMLPurifier_AttrValidator();
// removes tokens until it reaches a closing tag with its value // removes tokens until it reaches a closing tag with its value
$remove_until = false; $remove_until = false;
@ -65,24 +69,23 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
} }
if (isset($definition->info[$token->name])) { if (isset($definition->info[$token->name])) {
// leave untouched, except for a few special cases:
// hard-coded image special case, pre-emptively drop // mostly everything's good, but
// if not available. Probably not abstract-able // we need to make sure required attributes are in order
if ( $token->name == 'img' && $remove_invalid_img ) { if (
if (!isset($token->attr['src'])) { $definition->info[$token->name]->required_attr &&
continue; ($token->name != 'img' || $remove_invalid_img) // ensure config option still works
) {
$token = $attr_validator->validateToken($token, $config, $context);
$ok = true;
foreach ($definition->info[$token->name]->required_attr as $name) {
if (!isset($token->attr[$name])) {
$ok = false;
break;
}
} }
if (!isset($definition->info['img']->attr['src'])) { if (!$ok) continue;
continue; $token->armor['ValidateAttributes'] = true;
}
$token->attr['src'] =
$definition->
info['img']->
attr['src']->
validate($token->attr['src'],
$config, $context);
if ($token->attr['src'] === false) continue;
} }
} elseif ($escape_invalid_tags) { } elseif ($escape_invalid_tags) {

View File

@ -4,6 +4,8 @@ require_once 'HTMLPurifier/Strategy.php';
require_once 'HTMLPurifier/HTMLDefinition.php'; require_once 'HTMLPurifier/HTMLDefinition.php';
require_once 'HTMLPurifier/IDAccumulator.php'; require_once 'HTMLPurifier/IDAccumulator.php';
require_once 'HTMLPurifier/AttrValidator.php';
HTMLPurifier_ConfigSchema::define( HTMLPurifier_ConfigSchema::define(
'Attr', 'IDBlacklist', array(), 'list', 'Attr', 'IDBlacklist', array(), 'list',
'Array of IDs not allowed in the document.'); 'Array of IDs not allowed in the document.');
@ -17,16 +19,13 @@ class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
function execute($tokens, $config, &$context) { function execute($tokens, $config, &$context) {
$definition = $config->getHTMLDefinition();
// setup id_accumulator context // setup id_accumulator context
$id_accumulator = new HTMLPurifier_IDAccumulator(); $id_accumulator = new HTMLPurifier_IDAccumulator();
$id_accumulator->load($config->get('Attr', 'IDBlacklist')); $id_accumulator->load($config->get('Attr', 'IDBlacklist'));
$context->register('IDAccumulator', $id_accumulator); $context->register('IDAccumulator', $id_accumulator);
// create alias to global definition array, see also $defs // setup validator
// DEFINITION CALL $validator = new HTMLPurifier_AttrValidator();
$d_defs = $definition->info_global_attr;
foreach ($tokens as $key => $token) { foreach ($tokens as $key => $token) {
@ -34,91 +33,12 @@ class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
// namely start and empty tags // namely start and empty tags
if ($token->type !== 'start' && $token->type !== 'empty') continue; if ($token->type !== 'start' && $token->type !== 'empty') continue;
// copy out attributes for easy manipulation // skip tokens that are armored
$attr = $token->attr; if (!empty($token->armor['ValidateAttributes'])) continue;
// do global transformations (pre) $tokens[$key] = $validator->validateToken($token, $config, $context);
// nothing currently utilizes this
foreach ($definition->info_attr_transform_pre as $transform) {
$attr = $transform->transform($attr, $config, $context);
}
// do local transformations only applicable to this element (pre)
// ex. <p align="right"> to <p style="text-align:right;">
foreach ($definition->info[$token->name]->attr_transform_pre
as $transform
) {
$attr = $transform->transform($attr, $config, $context);
}
// create alias to this element's attribute definition array, see
// also $d_defs (global attribute definition array)
// DEFINITION CALL
$defs = $definition->info[$token->name]->attr;
// iterate through all the attribute keypairs
// Watch out for name collisions: $key has previously been used
foreach ($attr as $attr_key => $value) {
// call the definition
if ( isset($defs[$attr_key]) ) {
// there is a local definition defined
if ($defs[$attr_key] === false) {
// We've explicitly been told not to allow this element.
// This is usually when there's a global definition
// that must be overridden.
// Theoretically speaking, we could have a
// AttrDef_DenyAll, but this is faster!
$result = false;
} else {
// validate according to the element's definition
$result = $defs[$attr_key]->validate(
$value, $config, $context
);
}
} elseif ( isset($d_defs[$attr_key]) ) {
// there is a global definition defined, validate according
// to the global definition
$result = $d_defs[$attr_key]->validate(
$value, $config, $context
);
} else {
// system never heard of the attribute? DELETE!
$result = false;
}
// put the results into effect
if ($result === false || $result === null) {
// remove the attribute
unset($attr[$attr_key]);
} elseif (is_string($result)) {
// simple substitution
$attr[$attr_key] = $result;
}
// we'd also want slightly more complicated substitution
// involving an array as the return value,
// although we're not sure how colliding attributes would
// resolve (certain ones would be completely overriden,
// others would prepend themselves).
}
// post transforms
// ex. <x lang="fr"> to <x lang="fr" xml:lang="fr">
foreach ($definition->info_attr_transform_post as $transform) {
$attr = $transform->transform($attr, $config, $context);
}
// ex. <bdo> to <bdo dir="ltr">
foreach ($definition->info[$token->name]->attr_transform_post as $transform) {
$attr = $transform->transform($attr, $config, $context);
}
// commit changes
// could interfere with flyweight implementation
$tokens[$key]->attr = $attr;
} }
$context->destroy('IDAccumulator'); $context->destroy('IDAccumulator');
return $tokens; return $tokens;

View File

@ -13,6 +13,12 @@ class HTMLPurifier_Token {
var $type; /**< Type of node to bypass <tt>is_a()</tt>. @public */ var $type; /**< Type of node to bypass <tt>is_a()</tt>. @public */
var $line; /**< Line number node was on in source document. Null if unknown. @public */ var $line; /**< Line number node was on in source document. Null if unknown. @public */
/**
* Lookup array of processing that this token is exempt from.
* Currently, the only valid value is "ValidateAttributes".
*/
var $armor = array();
/** /**
* Copies the tag into a new one (clone substitute). * Copies the tag into a new one (clone substitute).
* @return Copied token * @return Copied token

View File

@ -2,7 +2,7 @@
<?php <?php
/** /**
* Flushes the HTMLDefinition serial cache * Flushes the default HTMLDefinition serial cache
*/ */
if (php_sapi_name() != 'cli') { if (php_sapi_name() != 'cli') {
@ -14,8 +14,10 @@ echo 'Flushing cache... ';
require_once(dirname(__FILE__) . '/../library/HTMLPurifier.auto.php'); require_once(dirname(__FILE__) . '/../library/HTMLPurifier.auto.php');
$config = HTMLPurifier_Config::createDefault();
$cache = new HTMLPurifier_DefinitionCache_Serializer('HTML'); $cache = new HTMLPurifier_DefinitionCache_Serializer('HTML');
$cache->flush(); $cache->flush($config);
echo 'Cache flushed successfully.'; echo 'Cache flushed successfully.';

View File

@ -110,17 +110,24 @@ class HTMLPurifier_AttrCollectionsTest extends UnitTestCase
$attr = array( $attr = array(
'attr1' => 'Color', 'attr1' => 'Color',
'attr2' => 'URI' 'attr2*' => 'URI'
); );
$types->setReturnValue('get', 'ColorObject', array('Color')); $c_object = new HTMLPurifier_AttrDef();
$types->setReturnValue('get', 'URIObject', array('URI')); $c_object->_name = 'Color'; // for testing purposes only
$u_object = new HTMLPurifier_AttrDef();
$u_object->_name = 'URL'; // for testing purposes only
$types->setReturnValue('get', $c_object, array('Color'));
$types->setReturnValue('get', $u_object, array('URI'));
$collections->expandIdentifiers($attr, $types); $collections->expandIdentifiers($attr, $types);
$u_object->required = true;
$this->assertIdentical( $this->assertIdentical(
$attr, $attr,
array( array(
'attr1' => 'ColorObject', 'attr1' => $c_object,
'attr2' => 'URIObject' 'attr2' => $u_object
) )
); );

View File

@ -15,7 +15,10 @@ class HTMLPurifier_AttrTransform_ImgRequiredTest extends HTMLPurifier_AttrTransf
$this->assertResult( $this->assertResult(
array(), array(),
array('src' => '', 'alt' => 'Invalid image') array('src' => '', 'alt' => 'Invalid image'),
array(
'Core.RemoveInvalidImg' => false
)
); );
$this->assertResult( $this->assertResult(
@ -23,7 +26,8 @@ class HTMLPurifier_AttrTransform_ImgRequiredTest extends HTMLPurifier_AttrTransf
array('src' => 'blank.png', 'alt' => 'Pawned!'), array('src' => 'blank.png', 'alt' => 'Pawned!'),
array( array(
'Attr.DefaultInvalidImage' => 'blank.png', 'Attr.DefaultInvalidImage' => 'blank.png',
'Attr.DefaultInvalidImageAlt' => 'Pawned!' 'Attr.DefaultInvalidImageAlt' => 'Pawned!',
'Core.RemoveInvalidImg' => false
) )
); );
@ -34,7 +38,10 @@ class HTMLPurifier_AttrTransform_ImgRequiredTest extends HTMLPurifier_AttrTransf
$this->assertResult( $this->assertResult(
array('alt' => 'intrigue'), array('alt' => 'intrigue'),
array('alt' => 'intrigue', 'src' => '') array('alt' => 'intrigue', 'src' => ''),
array(
'Core.RemoveInvalidImg' => false
)
); );
} }

View File

@ -9,7 +9,8 @@ class HTMLPurifier_HTMLModuleManagerTest extends UnitTestCase
$manager = new HTMLPurifier_HTMLModuleManager(); $manager = new HTMLPurifier_HTMLModuleManager();
$manager->doctypes->register('Blank'); // doctype normally is blank... $manager->doctypes->register('Blank'); // doctype normally is blank...
$attrdef_nmtokens = 1; // magic number $attrdef_nmtokens = new HTMLPurifier_AttrDef();
$attrdef_nmtokens->_name = 'nmtokens'; // for testing only
generate_mock_once('HTMLPurifier_AttrDef'); generate_mock_once('HTMLPurifier_AttrDef');
$attrdef =& new HTMLPurifier_AttrDefMock($this); $attrdef =& new HTMLPurifier_AttrDefMock($this);

View File

@ -61,7 +61,7 @@ class HTMLPurifier_Strategy_RemoveForeignElementsTest
); );
// test preservation of valid img tag // test preservation of valid img tag
$this->assertResult('<img src="foobar.gif" />'); $this->assertResult('<img src="foobar.gif" alt="foobar.gif" />');
// test preservation of invalid img tag when removal is disabled // test preservation of invalid img tag when removal is disabled
$this->assertResult( $this->assertResult(

View File

@ -217,11 +217,10 @@ class HTMLPurifier_Strategy_ValidateAttributesTest extends
} }
function testImg() { function testImg() {
// (this should never happen, as RemoveForeignElements
// should have removed the offending image tag)
$this->assertResult( $this->assertResult(
'<img />', '<img />',
'<img src="" alt="Invalid image" />' '<img src="" alt="Invalid image" />',
array('Core.RemoveInvalidImg' => false)
); );
$this->assertResult( $this->assertResult(
@ -231,12 +230,14 @@ class HTMLPurifier_Strategy_ValidateAttributesTest extends
$this->assertResult( $this->assertResult(
'<img alt="pretty picture" />', '<img alt="pretty picture" />',
'<img alt="pretty picture" src="" />' '<img alt="pretty picture" src="" />',
array('Core.RemoveInvalidImg' => false)
); );
// mailto in image is not allowed // mailto in image is not allowed
$this->assertResult( $this->assertResult(
'<img src="mailto:foo@example.com" />', '<img src="mailto:foo@example.com" />',
'<img src="" alt="Invalid image" />' '<img alt="mailto:foo@example.com" src="" />',
array('Core.RemoveInvalidImg' => false)
); );
// align transformation // align transformation
$this->assertResult( $this->assertResult(