0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-12-22 08:21:52 +00:00

[1.7.0] Add native support for required elements

- Factored out large portion of ValidateAttributes to AttrValidator
- Implemented ValidateAttributes armor
- Fix clear cache bug
- Implement armoring for ValidateAttributes

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1174 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2007-06-20 21:39:28 +00:00
parent 8bbb73e47d
commit 69996acc9e
17 changed files with 247 additions and 128 deletions

2
NEWS
View File

@ -40,6 +40,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
! Config object gives more friendly error messages when things go wrong
! Advanced API implemented: easy functions for creating elements (addElement)
and attributes (addAttribute) on HTMLDefinition
! Add native support for required attributes
- Deprecated and removed EnableRedundantUTF8Cleaning. It didn't even work!
- DOMLex will not emit errors when a custom error handler that does not
honor error_reporting is used
@ -63,6 +64,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
. DirectLex can now track line-numbers
. Preliminary error collector is in place, although no code actually reports
errors yet
. Factor out most of ValidateAttributes to new AttrValidator class
1.6.1, released 2007-05-05
! Support for more deprecated attributes via transformations:

View File

@ -82,19 +82,47 @@ class HTMLPurifier_AttrCollections
* @param $attr_types HTMLPurifier_AttrTypes instance
*/
function expandIdentifiers(&$attr, $attr_types) {
// because foreach will process new elements we add, make sure we
// skip duplicates
$processed = array();
foreach ($attr as $def_i => $def) {
// skip inclusions
if ($def_i === 0) continue;
if (!is_string($def)) continue;
if (isset($processed[$def_i])) continue;
// determine whether or not attribute is required
if ($required = (strpos($def_i, '*') !== false)) {
// rename the definition
unset($attr[$def_i]);
$def_i = trim($def_i, '*');
$attr[$def_i] = $def;
}
$processed[$def_i] = true;
// if we've already got a literal object, move on
if (is_object($def)) {
// preserve previous required
$attr[$def_i]->required = ($required || $attr[$def_i]->required);
continue;
}
if ($def === false) {
unset($attr[$def_i]);
continue;
}
if ($t = $attr_types->get($def)) {
$attr[$def_i] = $t;
$attr[$def_i]->required = $required;
} else {
unset($attr[$def_i]);
}
}
}
}

View File

@ -14,11 +14,17 @@ class HTMLPurifier_AttrDef
{
/**
* Tells us whether or not an HTML attribute is minimized. Only the
* boolean attribute vapourware would use this.
* Tells us whether or not an HTML attribute is minimized. Has no
* meaning in other contexts.
*/
var $minimized = false;
/**
* Tells us whether or not an HTML attribute is required. Has no
* meaning in other contexts
*/
var $required = false;
/**
* Validates and cleans passed string according to a definition.
*

View File

@ -20,7 +20,10 @@ HTMLPurifier_ConfigSchema::define(
);
/**
* Post-transform that ensures the required attrs of img (alt and src) are set
* Transform that supplies default values for the src and alt attributes
* in img tags, as well as prevents the img tag from being removed
* because of a missing alt tag. This needs to be registered as both
* a pre and post attribute transform.
*/
class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform
{
@ -29,6 +32,7 @@ class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform
$src = true;
if (!isset($attr['src'])) {
if ($config->get('Core', 'RemoveInvalidImg')) return $attr;
$attr['src'] = $config->get('Attr', 'DefaultInvalidImage');
$src = false;
}

View File

@ -0,0 +1,105 @@
<?php
class HTMLPurifier_AttrValidator
{
function validateToken($token, &$config, &$context) {
$definition = $config->getHTMLDefinition();
// create alias to global definition array, see also $defs
// DEFINITION CALL
$d_defs = $definition->info_global_attr;
// copy out attributes for easy manipulation
$attr = $token->attr;
// do global transformations (pre)
// nothing currently utilizes this
foreach ($definition->info_attr_transform_pre as $transform) {
$attr = $transform->transform($attr, $config, $context);
}
// do local transformations only applicable to this element (pre)
// ex. <p align="right"> to <p style="text-align:right;">
foreach ($definition->info[$token->name]->attr_transform_pre
as $transform
) {
$attr = $transform->transform($attr, $config, $context);
}
// create alias to this element's attribute definition array, see
// also $d_defs (global attribute definition array)
// DEFINITION CALL
$defs = $definition->info[$token->name]->attr;
// iterate through all the attribute keypairs
// Watch out for name collisions: $key has previously been used
foreach ($attr as $attr_key => $value) {
// call the definition
if ( isset($defs[$attr_key]) ) {
// there is a local definition defined
if ($defs[$attr_key] === false) {
// We've explicitly been told not to allow this element.
// This is usually when there's a global definition
// that must be overridden.
// Theoretically speaking, we could have a
// AttrDef_DenyAll, but this is faster!
$result = false;
} else {
// validate according to the element's definition
$result = $defs[$attr_key]->validate(
$value, $config, $context
);
}
} elseif ( isset($d_defs[$attr_key]) ) {
// there is a global definition defined, validate according
// to the global definition
$result = $d_defs[$attr_key]->validate(
$value, $config, $context
);
} else {
// system never heard of the attribute? DELETE!
$result = false;
}
// put the results into effect
if ($result === false || $result === null) {
// remove the attribute
unset($attr[$attr_key]);
} elseif (is_string($result)) {
// simple substitution
$attr[$attr_key] = $result;
}
// we'd also want slightly more complicated substitution
// involving an array as the return value,
// although we're not sure how colliding attributes would
// resolve (certain ones would be completely overriden,
// others would prepend themselves).
}
// post transforms
// ex. <x lang="fr"> to <x lang="fr" xml:lang="fr">
foreach ($definition->info_attr_transform_post as $transform) {
$attr = $transform->transform($attr, $config, $context);
}
// ex. <bdo> to <bdo dir="ltr">
foreach ($definition->info[$token->name]->attr_transform_post as $transform) {
$attr = $transform->transform($attr, $config, $context);
}
// commit changes
$token->attr = $attr;
return $token;
}
}
?>

View File

@ -85,6 +85,13 @@ class HTMLPurifier_ElementDef
*/
var $descendants_are_inline = false;
/**
* List of the names of required attributes this element has. Dynamically
* populated.
* @public
*/
var $required_attr = array();
/**
* Lookup table of tags excluded from all descendants of this tag.
* @note SGML permits exclusions for all descendants, but this is
@ -174,6 +181,13 @@ class HTMLPurifier_ElementDef
}
}
/**
* Retrieves a copy of the element definition
*/
function copy() {
return unserialize(serialize($this));
}
}
?>

View File

@ -19,13 +19,15 @@ class HTMLPurifier_HTMLModule_Image extends HTMLPurifier_HTMLModule
$img =& $this->addElement(
'img', true, 'Inline', 'Empty', 'Common',
array(
'alt' => 'Text',
'alt*' => 'Text',
'height' => 'Length',
'longdesc' => 'URI',
'src' => new HTMLPurifier_AttrDef_URI(true), // embedded
'src*' => new HTMLPurifier_AttrDef_URI(true), // embedded
'width' => 'Length'
)
);
// kind of strange, but splitting things up would be inefficient
$img->attr_transform_pre[] =
$img->attr_transform_post[] =
new HTMLPurifier_AttrTransform_ImgRequired();
}

View File

@ -405,7 +405,11 @@ class HTMLPurifier_HTMLModuleManager
foreach($this->elementLookup[$name] as $module_name) {
$module = $modules[$module_name];
$new_def = $module->info[$name];
// copy is used because, ideally speaking, the original
// definition should not be modified. Usually, this will
// make no difference, but for consistency's sake
$new_def = $module->info[$name]->copy();
// refuse to create/merge in a definition that is deemed unsafe
if (!$trusted && ($new_def->safe === false)) {
@ -443,6 +447,13 @@ class HTMLPurifier_HTMLModuleManager
$this->contentSets->generateChildDef($def, $module);
}
// add information on required attributes
foreach ($def->attr as $attr_name => $attr_def) {
if ($attr_def->required) {
$def->required_attr[] = $attr_name;
}
}
return $def;

View File

@ -5,6 +5,8 @@ require_once 'HTMLPurifier/HTMLDefinition.php';
require_once 'HTMLPurifier/Generator.php';
require_once 'HTMLPurifier/TagTransform.php';
require_once 'HTMLPurifier/AttrValidator.php';
HTMLPurifier_ConfigSchema::define(
'Core', 'RemoveInvalidImg', true, 'bool',
'This directive enables pre-emptive URI checking in <code>img</code> '.
@ -41,6 +43,8 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
$remove_invalid_img = $config->get('Core', 'RemoveInvalidImg');
$remove_script_contents = $config->get('Core', 'RemoveScriptContents');
$attr_validator = new HTMLPurifier_AttrValidator();
// removes tokens until it reaches a closing tag with its value
$remove_until = false;
@ -65,24 +69,23 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
}
if (isset($definition->info[$token->name])) {
// leave untouched, except for a few special cases:
// hard-coded image special case, pre-emptively drop
// if not available. Probably not abstract-able
if ( $token->name == 'img' && $remove_invalid_img ) {
if (!isset($token->attr['src'])) {
continue;
// mostly everything's good, but
// we need to make sure required attributes are in order
if (
$definition->info[$token->name]->required_attr &&
($token->name != 'img' || $remove_invalid_img) // ensure config option still works
) {
$token = $attr_validator->validateToken($token, $config, $context);
$ok = true;
foreach ($definition->info[$token->name]->required_attr as $name) {
if (!isset($token->attr[$name])) {
$ok = false;
break;
}
}
if (!isset($definition->info['img']->attr['src'])) {
continue;
}
$token->attr['src'] =
$definition->
info['img']->
attr['src']->
validate($token->attr['src'],
$config, $context);
if ($token->attr['src'] === false) continue;
if (!$ok) continue;
$token->armor['ValidateAttributes'] = true;
}
} elseif ($escape_invalid_tags) {

View File

@ -4,6 +4,8 @@ require_once 'HTMLPurifier/Strategy.php';
require_once 'HTMLPurifier/HTMLDefinition.php';
require_once 'HTMLPurifier/IDAccumulator.php';
require_once 'HTMLPurifier/AttrValidator.php';
HTMLPurifier_ConfigSchema::define(
'Attr', 'IDBlacklist', array(), 'list',
'Array of IDs not allowed in the document.');
@ -17,16 +19,13 @@ class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
function execute($tokens, $config, &$context) {
$definition = $config->getHTMLDefinition();
// setup id_accumulator context
$id_accumulator = new HTMLPurifier_IDAccumulator();
$id_accumulator->load($config->get('Attr', 'IDBlacklist'));
$context->register('IDAccumulator', $id_accumulator);
// create alias to global definition array, see also $defs
// DEFINITION CALL
$d_defs = $definition->info_global_attr;
// setup validator
$validator = new HTMLPurifier_AttrValidator();
foreach ($tokens as $key => $token) {
@ -34,91 +33,12 @@ class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
// namely start and empty tags
if ($token->type !== 'start' && $token->type !== 'empty') continue;
// copy out attributes for easy manipulation
$attr = $token->attr;
// skip tokens that are armored
if (!empty($token->armor['ValidateAttributes'])) continue;
// do global transformations (pre)
// nothing currently utilizes this
foreach ($definition->info_attr_transform_pre as $transform) {
$attr = $transform->transform($attr, $config, $context);
}
// do local transformations only applicable to this element (pre)
// ex. <p align="right"> to <p style="text-align:right;">
foreach ($definition->info[$token->name]->attr_transform_pre
as $transform
) {
$attr = $transform->transform($attr, $config, $context);
}
// create alias to this element's attribute definition array, see
// also $d_defs (global attribute definition array)
// DEFINITION CALL
$defs = $definition->info[$token->name]->attr;
// iterate through all the attribute keypairs
// Watch out for name collisions: $key has previously been used
foreach ($attr as $attr_key => $value) {
// call the definition
if ( isset($defs[$attr_key]) ) {
// there is a local definition defined
if ($defs[$attr_key] === false) {
// We've explicitly been told not to allow this element.
// This is usually when there's a global definition
// that must be overridden.
// Theoretically speaking, we could have a
// AttrDef_DenyAll, but this is faster!
$result = false;
} else {
// validate according to the element's definition
$result = $defs[$attr_key]->validate(
$value, $config, $context
);
}
} elseif ( isset($d_defs[$attr_key]) ) {
// there is a global definition defined, validate according
// to the global definition
$result = $d_defs[$attr_key]->validate(
$value, $config, $context
);
} else {
// system never heard of the attribute? DELETE!
$result = false;
}
// put the results into effect
if ($result === false || $result === null) {
// remove the attribute
unset($attr[$attr_key]);
} elseif (is_string($result)) {
// simple substitution
$attr[$attr_key] = $result;
}
// we'd also want slightly more complicated substitution
// involving an array as the return value,
// although we're not sure how colliding attributes would
// resolve (certain ones would be completely overriden,
// others would prepend themselves).
}
// post transforms
// ex. <x lang="fr"> to <x lang="fr" xml:lang="fr">
foreach ($definition->info_attr_transform_post as $transform) {
$attr = $transform->transform($attr, $config, $context);
}
// ex. <bdo> to <bdo dir="ltr">
foreach ($definition->info[$token->name]->attr_transform_post as $transform) {
$attr = $transform->transform($attr, $config, $context);
}
// commit changes
// could interfere with flyweight implementation
$tokens[$key]->attr = $attr;
$tokens[$key] = $validator->validateToken($token, $config, $context);
}
$context->destroy('IDAccumulator');
return $tokens;

View File

@ -13,6 +13,12 @@ class HTMLPurifier_Token {
var $type; /**< Type of node to bypass <tt>is_a()</tt>. @public */
var $line; /**< Line number node was on in source document. Null if unknown. @public */
/**
* Lookup array of processing that this token is exempt from.
* Currently, the only valid value is "ValidateAttributes".
*/
var $armor = array();
/**
* Copies the tag into a new one (clone substitute).
* @return Copied token

View File

@ -2,7 +2,7 @@
<?php
/**
* Flushes the HTMLDefinition serial cache
* Flushes the default HTMLDefinition serial cache
*/
if (php_sapi_name() != 'cli') {
@ -14,8 +14,10 @@ echo 'Flushing cache... ';
require_once(dirname(__FILE__) . '/../library/HTMLPurifier.auto.php');
$config = HTMLPurifier_Config::createDefault();
$cache = new HTMLPurifier_DefinitionCache_Serializer('HTML');
$cache->flush();
$cache->flush($config);
echo 'Cache flushed successfully.';

View File

@ -110,17 +110,24 @@ class HTMLPurifier_AttrCollectionsTest extends UnitTestCase
$attr = array(
'attr1' => 'Color',
'attr2' => 'URI'
'attr2*' => 'URI'
);
$types->setReturnValue('get', 'ColorObject', array('Color'));
$types->setReturnValue('get', 'URIObject', array('URI'));
$c_object = new HTMLPurifier_AttrDef();
$c_object->_name = 'Color'; // for testing purposes only
$u_object = new HTMLPurifier_AttrDef();
$u_object->_name = 'URL'; // for testing purposes only
$types->setReturnValue('get', $c_object, array('Color'));
$types->setReturnValue('get', $u_object, array('URI'));
$collections->expandIdentifiers($attr, $types);
$u_object->required = true;
$this->assertIdentical(
$attr,
array(
'attr1' => 'ColorObject',
'attr2' => 'URIObject'
'attr1' => $c_object,
'attr2' => $u_object
)
);

View File

@ -15,7 +15,10 @@ class HTMLPurifier_AttrTransform_ImgRequiredTest extends HTMLPurifier_AttrTransf
$this->assertResult(
array(),
array('src' => '', 'alt' => 'Invalid image')
array('src' => '', 'alt' => 'Invalid image'),
array(
'Core.RemoveInvalidImg' => false
)
);
$this->assertResult(
@ -23,7 +26,8 @@ class HTMLPurifier_AttrTransform_ImgRequiredTest extends HTMLPurifier_AttrTransf
array('src' => 'blank.png', 'alt' => 'Pawned!'),
array(
'Attr.DefaultInvalidImage' => 'blank.png',
'Attr.DefaultInvalidImageAlt' => 'Pawned!'
'Attr.DefaultInvalidImageAlt' => 'Pawned!',
'Core.RemoveInvalidImg' => false
)
);
@ -34,7 +38,10 @@ class HTMLPurifier_AttrTransform_ImgRequiredTest extends HTMLPurifier_AttrTransf
$this->assertResult(
array('alt' => 'intrigue'),
array('alt' => 'intrigue', 'src' => '')
array('alt' => 'intrigue', 'src' => ''),
array(
'Core.RemoveInvalidImg' => false
)
);
}

View File

@ -9,7 +9,8 @@ class HTMLPurifier_HTMLModuleManagerTest extends UnitTestCase
$manager = new HTMLPurifier_HTMLModuleManager();
$manager->doctypes->register('Blank'); // doctype normally is blank...
$attrdef_nmtokens = 1; // magic number
$attrdef_nmtokens = new HTMLPurifier_AttrDef();
$attrdef_nmtokens->_name = 'nmtokens'; // for testing only
generate_mock_once('HTMLPurifier_AttrDef');
$attrdef =& new HTMLPurifier_AttrDefMock($this);

View File

@ -61,7 +61,7 @@ class HTMLPurifier_Strategy_RemoveForeignElementsTest
);
// test preservation of valid img tag
$this->assertResult('<img src="foobar.gif" />');
$this->assertResult('<img src="foobar.gif" alt="foobar.gif" />');
// test preservation of invalid img tag when removal is disabled
$this->assertResult(

View File

@ -217,11 +217,10 @@ class HTMLPurifier_Strategy_ValidateAttributesTest extends
}
function testImg() {
// (this should never happen, as RemoveForeignElements
// should have removed the offending image tag)
$this->assertResult(
'<img />',
'<img src="" alt="Invalid image" />'
'<img src="" alt="Invalid image" />',
array('Core.RemoveInvalidImg' => false)
);
$this->assertResult(
@ -231,12 +230,14 @@ class HTMLPurifier_Strategy_ValidateAttributesTest extends
$this->assertResult(
'<img alt="pretty picture" />',
'<img alt="pretty picture" src="" />'
'<img alt="pretty picture" src="" />',
array('Core.RemoveInvalidImg' => false)
);
// mailto in image is not allowed
$this->assertResult(
'<img src="mailto:foo@example.com" />',
'<img src="" alt="Invalid image" />'
'<img alt="mailto:foo@example.com" src="" />',
array('Core.RemoveInvalidImg' => false)
);
// align transformation
$this->assertResult(