From 69996acc9e14a851a8873660a180292550d22010 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Wed, 20 Jun 2007 21:39:28 +0000 Subject: [PATCH] [1.7.0] Add native support for required elements - Factored out large portion of ValidateAttributes to AttrValidator - Implemented ValidateAttributes armor - Fix clear cache bug - Implement armoring for ValidateAttributes git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1174 48356398-32a2-884e-a903-53898d9a118a --- NEWS | 2 + library/HTMLPurifier/AttrCollections.php | 30 ++++- library/HTMLPurifier/AttrDef.php | 10 +- .../AttrTransform/ImgRequired.php | 6 +- library/HTMLPurifier/AttrValidator.php | 105 ++++++++++++++++++ library/HTMLPurifier/ElementDef.php | 14 +++ library/HTMLPurifier/HTMLModule/Image.php | 6 +- library/HTMLPurifier/HTMLModuleManager.php | 13 ++- .../Strategy/RemoveForeignElements.php | 35 +++--- .../Strategy/ValidateAttributes.php | 96 ++-------------- library/HTMLPurifier/Token.php | 6 + maintenance/flush-htmldefinition-cache.php | 6 +- tests/HTMLPurifier/AttrCollectionsTest.php | 17 ++- .../AttrTransform/ImgRequiredTest.php | 13 ++- tests/HTMLPurifier/HTMLModuleManagerTest.php | 3 +- .../Strategy/RemoveForeignElementsTest.php | 2 +- .../Strategy/ValidateAttributesTest.php | 11 +- 17 files changed, 247 insertions(+), 128 deletions(-) create mode 100644 library/HTMLPurifier/AttrValidator.php diff --git a/NEWS b/NEWS index 9fcf645a..a9f437af 100644 --- a/NEWS +++ b/NEWS @@ -40,6 +40,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier ! Config object gives more friendly error messages when things go wrong ! Advanced API implemented: easy functions for creating elements (addElement) and attributes (addAttribute) on HTMLDefinition +! Add native support for required attributes - Deprecated and removed EnableRedundantUTF8Cleaning. It didn't even work! - DOMLex will not emit errors when a custom error handler that does not honor error_reporting is used @@ -63,6 +64,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier . DirectLex can now track line-numbers . Preliminary error collector is in place, although no code actually reports errors yet +. Factor out most of ValidateAttributes to new AttrValidator class 1.6.1, released 2007-05-05 ! Support for more deprecated attributes via transformations: diff --git a/library/HTMLPurifier/AttrCollections.php b/library/HTMLPurifier/AttrCollections.php index da5445db..61bce40e 100644 --- a/library/HTMLPurifier/AttrCollections.php +++ b/library/HTMLPurifier/AttrCollections.php @@ -82,19 +82,47 @@ class HTMLPurifier_AttrCollections * @param $attr_types HTMLPurifier_AttrTypes instance */ function expandIdentifiers(&$attr, $attr_types) { + + // because foreach will process new elements we add, make sure we + // skip duplicates + $processed = array(); + foreach ($attr as $def_i => $def) { + // skip inclusions if ($def_i === 0) continue; - if (!is_string($def)) continue; + + if (isset($processed[$def_i])) continue; + + // determine whether or not attribute is required + if ($required = (strpos($def_i, '*') !== false)) { + // rename the definition + unset($attr[$def_i]); + $def_i = trim($def_i, '*'); + $attr[$def_i] = $def; + } + + $processed[$def_i] = true; + + // if we've already got a literal object, move on + if (is_object($def)) { + // preserve previous required + $attr[$def_i]->required = ($required || $attr[$def_i]->required); + continue; + } + if ($def === false) { unset($attr[$def_i]); continue; } + if ($t = $attr_types->get($def)) { $attr[$def_i] = $t; + $attr[$def_i]->required = $required; } else { unset($attr[$def_i]); } } + } } diff --git a/library/HTMLPurifier/AttrDef.php b/library/HTMLPurifier/AttrDef.php index 3efc8e69..d9d2d944 100644 --- a/library/HTMLPurifier/AttrDef.php +++ b/library/HTMLPurifier/AttrDef.php @@ -14,11 +14,17 @@ class HTMLPurifier_AttrDef { /** - * Tells us whether or not an HTML attribute is minimized. Only the - * boolean attribute vapourware would use this. + * Tells us whether or not an HTML attribute is minimized. Has no + * meaning in other contexts. */ var $minimized = false; + /** + * Tells us whether or not an HTML attribute is required. Has no + * meaning in other contexts + */ + var $required = false; + /** * Validates and cleans passed string according to a definition. * diff --git a/library/HTMLPurifier/AttrTransform/ImgRequired.php b/library/HTMLPurifier/AttrTransform/ImgRequired.php index 4ff356d8..159afd2f 100644 --- a/library/HTMLPurifier/AttrTransform/ImgRequired.php +++ b/library/HTMLPurifier/AttrTransform/ImgRequired.php @@ -20,7 +20,10 @@ HTMLPurifier_ConfigSchema::define( ); /** - * Post-transform that ensures the required attrs of img (alt and src) are set + * Transform that supplies default values for the src and alt attributes + * in img tags, as well as prevents the img tag from being removed + * because of a missing alt tag. This needs to be registered as both + * a pre and post attribute transform. */ class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform { @@ -29,6 +32,7 @@ class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform $src = true; if (!isset($attr['src'])) { + if ($config->get('Core', 'RemoveInvalidImg')) return $attr; $attr['src'] = $config->get('Attr', 'DefaultInvalidImage'); $src = false; } diff --git a/library/HTMLPurifier/AttrValidator.php b/library/HTMLPurifier/AttrValidator.php new file mode 100644 index 00000000..d6a1a563 --- /dev/null +++ b/library/HTMLPurifier/AttrValidator.php @@ -0,0 +1,105 @@ +getHTMLDefinition(); + + // create alias to global definition array, see also $defs + // DEFINITION CALL + $d_defs = $definition->info_global_attr; + + // copy out attributes for easy manipulation + $attr = $token->attr; + + // do global transformations (pre) + // nothing currently utilizes this + foreach ($definition->info_attr_transform_pre as $transform) { + $attr = $transform->transform($attr, $config, $context); + } + + // do local transformations only applicable to this element (pre) + // ex.

to

+ foreach ($definition->info[$token->name]->attr_transform_pre + as $transform + ) { + $attr = $transform->transform($attr, $config, $context); + } + + // create alias to this element's attribute definition array, see + // also $d_defs (global attribute definition array) + // DEFINITION CALL + $defs = $definition->info[$token->name]->attr; + + // iterate through all the attribute keypairs + // Watch out for name collisions: $key has previously been used + foreach ($attr as $attr_key => $value) { + + // call the definition + if ( isset($defs[$attr_key]) ) { + // there is a local definition defined + if ($defs[$attr_key] === false) { + // We've explicitly been told not to allow this element. + // This is usually when there's a global definition + // that must be overridden. + // Theoretically speaking, we could have a + // AttrDef_DenyAll, but this is faster! + $result = false; + } else { + // validate according to the element's definition + $result = $defs[$attr_key]->validate( + $value, $config, $context + ); + } + } elseif ( isset($d_defs[$attr_key]) ) { + // there is a global definition defined, validate according + // to the global definition + $result = $d_defs[$attr_key]->validate( + $value, $config, $context + ); + } else { + // system never heard of the attribute? DELETE! + $result = false; + } + + // put the results into effect + if ($result === false || $result === null) { + // remove the attribute + unset($attr[$attr_key]); + } elseif (is_string($result)) { + // simple substitution + $attr[$attr_key] = $result; + } + + // we'd also want slightly more complicated substitution + // involving an array as the return value, + // although we're not sure how colliding attributes would + // resolve (certain ones would be completely overriden, + // others would prepend themselves). + } + + // post transforms + + // ex. to + foreach ($definition->info_attr_transform_post as $transform) { + $attr = $transform->transform($attr, $config, $context); + } + + // ex. to + foreach ($definition->info[$token->name]->attr_transform_post as $transform) { + $attr = $transform->transform($attr, $config, $context); + } + + // commit changes + $token->attr = $attr; + return $token; + + } + + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/ElementDef.php b/library/HTMLPurifier/ElementDef.php index 1811f521..286ca8b6 100644 --- a/library/HTMLPurifier/ElementDef.php +++ b/library/HTMLPurifier/ElementDef.php @@ -85,6 +85,13 @@ class HTMLPurifier_ElementDef */ var $descendants_are_inline = false; + /** + * List of the names of required attributes this element has. Dynamically + * populated. + * @public + */ + var $required_attr = array(); + /** * Lookup table of tags excluded from all descendants of this tag. * @note SGML permits exclusions for all descendants, but this is @@ -174,6 +181,13 @@ class HTMLPurifier_ElementDef } } + /** + * Retrieves a copy of the element definition + */ + function copy() { + return unserialize(serialize($this)); + } + } ?> diff --git a/library/HTMLPurifier/HTMLModule/Image.php b/library/HTMLPurifier/HTMLModule/Image.php index dc9c732b..6b1574c3 100644 --- a/library/HTMLPurifier/HTMLModule/Image.php +++ b/library/HTMLPurifier/HTMLModule/Image.php @@ -19,13 +19,15 @@ class HTMLPurifier_HTMLModule_Image extends HTMLPurifier_HTMLModule $img =& $this->addElement( 'img', true, 'Inline', 'Empty', 'Common', array( - 'alt' => 'Text', + 'alt*' => 'Text', 'height' => 'Length', 'longdesc' => 'URI', - 'src' => new HTMLPurifier_AttrDef_URI(true), // embedded + 'src*' => new HTMLPurifier_AttrDef_URI(true), // embedded 'width' => 'Length' ) ); + // kind of strange, but splitting things up would be inefficient + $img->attr_transform_pre[] = $img->attr_transform_post[] = new HTMLPurifier_AttrTransform_ImgRequired(); } diff --git a/library/HTMLPurifier/HTMLModuleManager.php b/library/HTMLPurifier/HTMLModuleManager.php index ed1a8edb..f6b1b089 100644 --- a/library/HTMLPurifier/HTMLModuleManager.php +++ b/library/HTMLPurifier/HTMLModuleManager.php @@ -405,7 +405,11 @@ class HTMLPurifier_HTMLModuleManager foreach($this->elementLookup[$name] as $module_name) { $module = $modules[$module_name]; - $new_def = $module->info[$name]; + + // copy is used because, ideally speaking, the original + // definition should not be modified. Usually, this will + // make no difference, but for consistency's sake + $new_def = $module->info[$name]->copy(); // refuse to create/merge in a definition that is deemed unsafe if (!$trusted && ($new_def->safe === false)) { @@ -443,6 +447,13 @@ class HTMLPurifier_HTMLModuleManager $this->contentSets->generateChildDef($def, $module); } + + // add information on required attributes + foreach ($def->attr as $attr_name => $attr_def) { + if ($attr_def->required) { + $def->required_attr[] = $attr_name; + } + } return $def; diff --git a/library/HTMLPurifier/Strategy/RemoveForeignElements.php b/library/HTMLPurifier/Strategy/RemoveForeignElements.php index f62e4653..9a1e80c0 100644 --- a/library/HTMLPurifier/Strategy/RemoveForeignElements.php +++ b/library/HTMLPurifier/Strategy/RemoveForeignElements.php @@ -5,6 +5,8 @@ require_once 'HTMLPurifier/HTMLDefinition.php'; require_once 'HTMLPurifier/Generator.php'; require_once 'HTMLPurifier/TagTransform.php'; +require_once 'HTMLPurifier/AttrValidator.php'; + HTMLPurifier_ConfigSchema::define( 'Core', 'RemoveInvalidImg', true, 'bool', 'This directive enables pre-emptive URI checking in img '. @@ -41,6 +43,8 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy $remove_invalid_img = $config->get('Core', 'RemoveInvalidImg'); $remove_script_contents = $config->get('Core', 'RemoveScriptContents'); + $attr_validator = new HTMLPurifier_AttrValidator(); + // removes tokens until it reaches a closing tag with its value $remove_until = false; @@ -65,24 +69,23 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy } if (isset($definition->info[$token->name])) { - // leave untouched, except for a few special cases: - // hard-coded image special case, pre-emptively drop - // if not available. Probably not abstract-able - if ( $token->name == 'img' && $remove_invalid_img ) { - if (!isset($token->attr['src'])) { - continue; + // mostly everything's good, but + // we need to make sure required attributes are in order + if ( + $definition->info[$token->name]->required_attr && + ($token->name != 'img' || $remove_invalid_img) // ensure config option still works + ) { + $token = $attr_validator->validateToken($token, $config, $context); + $ok = true; + foreach ($definition->info[$token->name]->required_attr as $name) { + if (!isset($token->attr[$name])) { + $ok = false; + break; + } } - if (!isset($definition->info['img']->attr['src'])) { - continue; - } - $token->attr['src'] = - $definition-> - info['img']-> - attr['src']-> - validate($token->attr['src'], - $config, $context); - if ($token->attr['src'] === false) continue; + if (!$ok) continue; + $token->armor['ValidateAttributes'] = true; } } elseif ($escape_invalid_tags) { diff --git a/library/HTMLPurifier/Strategy/ValidateAttributes.php b/library/HTMLPurifier/Strategy/ValidateAttributes.php index 07744f80..1c9e09b3 100644 --- a/library/HTMLPurifier/Strategy/ValidateAttributes.php +++ b/library/HTMLPurifier/Strategy/ValidateAttributes.php @@ -4,6 +4,8 @@ require_once 'HTMLPurifier/Strategy.php'; require_once 'HTMLPurifier/HTMLDefinition.php'; require_once 'HTMLPurifier/IDAccumulator.php'; +require_once 'HTMLPurifier/AttrValidator.php'; + HTMLPurifier_ConfigSchema::define( 'Attr', 'IDBlacklist', array(), 'list', 'Array of IDs not allowed in the document.'); @@ -17,16 +19,13 @@ class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy function execute($tokens, $config, &$context) { - $definition = $config->getHTMLDefinition(); - // setup id_accumulator context $id_accumulator = new HTMLPurifier_IDAccumulator(); $id_accumulator->load($config->get('Attr', 'IDBlacklist')); $context->register('IDAccumulator', $id_accumulator); - // create alias to global definition array, see also $defs - // DEFINITION CALL - $d_defs = $definition->info_global_attr; + // setup validator + $validator = new HTMLPurifier_AttrValidator(); foreach ($tokens as $key => $token) { @@ -34,91 +33,12 @@ class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy // namely start and empty tags if ($token->type !== 'start' && $token->type !== 'empty') continue; - // copy out attributes for easy manipulation - $attr = $token->attr; + // skip tokens that are armored + if (!empty($token->armor['ValidateAttributes'])) continue; - // do global transformations (pre) - // nothing currently utilizes this - foreach ($definition->info_attr_transform_pre as $transform) { - $attr = $transform->transform($attr, $config, $context); - } - - // do local transformations only applicable to this element (pre) - // ex.

to

- foreach ($definition->info[$token->name]->attr_transform_pre - as $transform - ) { - $attr = $transform->transform($attr, $config, $context); - } - - // create alias to this element's attribute definition array, see - // also $d_defs (global attribute definition array) - // DEFINITION CALL - $defs = $definition->info[$token->name]->attr; - - // iterate through all the attribute keypairs - // Watch out for name collisions: $key has previously been used - foreach ($attr as $attr_key => $value) { - - // call the definition - if ( isset($defs[$attr_key]) ) { - // there is a local definition defined - if ($defs[$attr_key] === false) { - // We've explicitly been told not to allow this element. - // This is usually when there's a global definition - // that must be overridden. - // Theoretically speaking, we could have a - // AttrDef_DenyAll, but this is faster! - $result = false; - } else { - // validate according to the element's definition - $result = $defs[$attr_key]->validate( - $value, $config, $context - ); - } - } elseif ( isset($d_defs[$attr_key]) ) { - // there is a global definition defined, validate according - // to the global definition - $result = $d_defs[$attr_key]->validate( - $value, $config, $context - ); - } else { - // system never heard of the attribute? DELETE! - $result = false; - } - - // put the results into effect - if ($result === false || $result === null) { - // remove the attribute - unset($attr[$attr_key]); - } elseif (is_string($result)) { - // simple substitution - $attr[$attr_key] = $result; - } - - // we'd also want slightly more complicated substitution - // involving an array as the return value, - // although we're not sure how colliding attributes would - // resolve (certain ones would be completely overriden, - // others would prepend themselves). - } - - // post transforms - - // ex. to - foreach ($definition->info_attr_transform_post as $transform) { - $attr = $transform->transform($attr, $config, $context); - } - - // ex. to - foreach ($definition->info[$token->name]->attr_transform_post as $transform) { - $attr = $transform->transform($attr, $config, $context); - } - - // commit changes - // could interfere with flyweight implementation - $tokens[$key]->attr = $attr; + $tokens[$key] = $validator->validateToken($token, $config, $context); } + $context->destroy('IDAccumulator'); return $tokens; diff --git a/library/HTMLPurifier/Token.php b/library/HTMLPurifier/Token.php index 82b2a88c..dfcc5cbc 100644 --- a/library/HTMLPurifier/Token.php +++ b/library/HTMLPurifier/Token.php @@ -13,6 +13,12 @@ class HTMLPurifier_Token { var $type; /**< Type of node to bypass is_a(). @public */ var $line; /**< Line number node was on in source document. Null if unknown. @public */ + /** + * Lookup array of processing that this token is exempt from. + * Currently, the only valid value is "ValidateAttributes". + */ + var $armor = array(); + /** * Copies the tag into a new one (clone substitute). * @return Copied token diff --git a/maintenance/flush-htmldefinition-cache.php b/maintenance/flush-htmldefinition-cache.php index a2d3fd48..780be373 100644 --- a/maintenance/flush-htmldefinition-cache.php +++ b/maintenance/flush-htmldefinition-cache.php @@ -2,7 +2,7 @@ flush(); +$cache->flush($config); echo 'Cache flushed successfully.'; diff --git a/tests/HTMLPurifier/AttrCollectionsTest.php b/tests/HTMLPurifier/AttrCollectionsTest.php index c121fd50..22cb81b4 100644 --- a/tests/HTMLPurifier/AttrCollectionsTest.php +++ b/tests/HTMLPurifier/AttrCollectionsTest.php @@ -110,17 +110,24 @@ class HTMLPurifier_AttrCollectionsTest extends UnitTestCase $attr = array( 'attr1' => 'Color', - 'attr2' => 'URI' + 'attr2*' => 'URI' ); - $types->setReturnValue('get', 'ColorObject', array('Color')); - $types->setReturnValue('get', 'URIObject', array('URI')); + $c_object = new HTMLPurifier_AttrDef(); + $c_object->_name = 'Color'; // for testing purposes only + $u_object = new HTMLPurifier_AttrDef(); + $u_object->_name = 'URL'; // for testing purposes only + + $types->setReturnValue('get', $c_object, array('Color')); + $types->setReturnValue('get', $u_object, array('URI')); + $collections->expandIdentifiers($attr, $types); + $u_object->required = true; $this->assertIdentical( $attr, array( - 'attr1' => 'ColorObject', - 'attr2' => 'URIObject' + 'attr1' => $c_object, + 'attr2' => $u_object ) ); diff --git a/tests/HTMLPurifier/AttrTransform/ImgRequiredTest.php b/tests/HTMLPurifier/AttrTransform/ImgRequiredTest.php index dff045ee..a9ad9a8c 100644 --- a/tests/HTMLPurifier/AttrTransform/ImgRequiredTest.php +++ b/tests/HTMLPurifier/AttrTransform/ImgRequiredTest.php @@ -15,7 +15,10 @@ class HTMLPurifier_AttrTransform_ImgRequiredTest extends HTMLPurifier_AttrTransf $this->assertResult( array(), - array('src' => '', 'alt' => 'Invalid image') + array('src' => '', 'alt' => 'Invalid image'), + array( + 'Core.RemoveInvalidImg' => false + ) ); $this->assertResult( @@ -23,7 +26,8 @@ class HTMLPurifier_AttrTransform_ImgRequiredTest extends HTMLPurifier_AttrTransf array('src' => 'blank.png', 'alt' => 'Pawned!'), array( 'Attr.DefaultInvalidImage' => 'blank.png', - 'Attr.DefaultInvalidImageAlt' => 'Pawned!' + 'Attr.DefaultInvalidImageAlt' => 'Pawned!', + 'Core.RemoveInvalidImg' => false ) ); @@ -34,7 +38,10 @@ class HTMLPurifier_AttrTransform_ImgRequiredTest extends HTMLPurifier_AttrTransf $this->assertResult( array('alt' => 'intrigue'), - array('alt' => 'intrigue', 'src' => '') + array('alt' => 'intrigue', 'src' => ''), + array( + 'Core.RemoveInvalidImg' => false + ) ); } diff --git a/tests/HTMLPurifier/HTMLModuleManagerTest.php b/tests/HTMLPurifier/HTMLModuleManagerTest.php index b677b1dc..4f94bc1b 100644 --- a/tests/HTMLPurifier/HTMLModuleManagerTest.php +++ b/tests/HTMLPurifier/HTMLModuleManagerTest.php @@ -9,7 +9,8 @@ class HTMLPurifier_HTMLModuleManagerTest extends UnitTestCase $manager = new HTMLPurifier_HTMLModuleManager(); $manager->doctypes->register('Blank'); // doctype normally is blank... - $attrdef_nmtokens = 1; // magic number + $attrdef_nmtokens = new HTMLPurifier_AttrDef(); + $attrdef_nmtokens->_name = 'nmtokens'; // for testing only generate_mock_once('HTMLPurifier_AttrDef'); $attrdef =& new HTMLPurifier_AttrDefMock($this); diff --git a/tests/HTMLPurifier/Strategy/RemoveForeignElementsTest.php b/tests/HTMLPurifier/Strategy/RemoveForeignElementsTest.php index 7d1f2b48..a357abbb 100644 --- a/tests/HTMLPurifier/Strategy/RemoveForeignElementsTest.php +++ b/tests/HTMLPurifier/Strategy/RemoveForeignElementsTest.php @@ -61,7 +61,7 @@ class HTMLPurifier_Strategy_RemoveForeignElementsTest ); // test preservation of valid img tag - $this->assertResult(''); + $this->assertResult('foobar.gif'); // test preservation of invalid img tag when removal is disabled $this->assertResult( diff --git a/tests/HTMLPurifier/Strategy/ValidateAttributesTest.php b/tests/HTMLPurifier/Strategy/ValidateAttributesTest.php index 54a49a31..06b8060a 100644 --- a/tests/HTMLPurifier/Strategy/ValidateAttributesTest.php +++ b/tests/HTMLPurifier/Strategy/ValidateAttributesTest.php @@ -217,11 +217,10 @@ class HTMLPurifier_Strategy_ValidateAttributesTest extends } function testImg() { - // (this should never happen, as RemoveForeignElements - // should have removed the offending image tag) $this->assertResult( '', - 'Invalid image' + 'Invalid image', + array('Core.RemoveInvalidImg' => false) ); $this->assertResult( @@ -231,12 +230,14 @@ class HTMLPurifier_Strategy_ValidateAttributesTest extends $this->assertResult( 'pretty picture', - 'pretty picture' + 'pretty picture', + array('Core.RemoveInvalidImg' => false) ); // mailto in image is not allowed $this->assertResult( '', - 'Invalid image' + 'mailto:foo@example.com', + array('Core.RemoveInvalidImg' => false) ); // align transformation $this->assertResult(