From 77d9e05a0763ce443d48373d5c00f3acb89aea20 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Sun, 4 Feb 2007 03:53:57 +0000 Subject: [PATCH] [1.5.0] Massive refactoring for Blockquote and Chameleon to be more extensible and accommodating of XHTMLDefinition. - Fixed buggy chameleon-support for ins and del . Removed context variable ParentType, replaced with IsInline, which is false when you're not inline and an integer of the parent that caused you to become inline when you are (so possibly zero) . Removed ElementDef->type in favor of ElementDef->descendants_are_inline and HTMLDefinition->content_sets . StrictBlockquote now reports what elements it's supposed to allow, rather than what it does allow . Removed HTMLDefinition->info_flow_elements in favor of HTMLDefinition->content_sets['Flow'] . Removed redundant "exclusionary" definitions from DTD roster . StrictBlockquote now requires a construction parameter as if it were a Required ChildDef; this is the "real" set of allowed elements git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@710 48356398-32a2-884e-a903-53898d9a118a --- NEWS | 13 +++++ library/HTMLPurifier/ChildDef/Chameleon.php | 21 +++----- .../ChildDef/StrictBlockquote.php | 22 +++++--- library/HTMLPurifier/HTMLDefinition.php | 54 ++++++------------- library/HTMLPurifier/HTMLModule/Text.php | 1 + .../HTMLPurifier/Printer/HTMLDefinition.php | 7 ++- library/HTMLPurifier/Strategy/FixNesting.php | 19 ++++--- library/HTMLPurifier/XHTMLDefinition.php | 49 +++++++++++------ tests/HTMLPurifier/ChildDef/ChameleonTest.php | 6 +-- .../ChildDef/StrictBlockquoteTest.php | 2 +- .../HTMLPurifier/Strategy/FixNestingTest.php | 15 +++++- 11 files changed, 114 insertions(+), 95 deletions(-) diff --git a/NEWS b/NEWS index 63ee7bf8..ae82e27f 100644 --- a/NEWS +++ b/NEWS @@ -12,8 +12,21 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier 1.5.0, unknown release date ! 
Added a rudimentary I18N and L10N system modeled off MediaWiki - Allow 'x' subtag in language codes +- Fixed buggy chameleon-support for ins and del . Added support for IDREF attributes (i.e. for) . Renamed HTMLPurifier_AttrDef_Class to HTMLPurifier_AttrDef_Nmtokens +. Removed context variable ParentType, replaced with IsInline, which + is false when you're not inline and an integer of the parent that + caused you to become inline when you are (so possibly zero) +. Removed ElementDef->type in favor of ElementDef->descendants_are_inline + and HTMLDefinition->content_sets +. StrictBlockquote now reports what elements it's supposed to allow, + rather than what it does allow +. Removed HTMLDefinition->info_flow_elements in favor of + HTMLDefinition->content_sets['Flow'] +. Removed redundant "exclusionary" definitions from DTD roster +. StrictBlockquote now requires a construction parameter as if it + were a Required ChildDef; this is the "real" set of allowed elements 1.4.2, unknown release date ! 
docs/enduser-utf8.html explains how to use UTF-8 and HTML Purifier diff --git a/library/HTMLPurifier/ChildDef/Chameleon.php b/library/HTMLPurifier/ChildDef/Chameleon.php index feb84a15..afe0299f 100644 --- a/library/HTMLPurifier/ChildDef/Chameleon.php +++ b/library/HTMLPurifier/ChildDef/Chameleon.php @@ -38,22 +38,13 @@ class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef } function validateChildren($tokens_of_children, $config, &$context) { - $parent_type = $context->get('ParentType'); - switch ($parent_type) { - case 'unknown': - case 'inline': - $result = $this->inline->validateChildren( - $tokens_of_children, $config, $context); - break; - case 'block': - $result = $this->block->validateChildren( - $tokens_of_children, $config, $context); - break; - default: - trigger_error('Invalid context', E_USER_ERROR); - return false; + if ($context->get('IsInline') === false) { + return $this->block->validateChildren( + $tokens_of_children, $config, $context); + } else { + return $this->inline->validateChildren( + $tokens_of_children, $config, $context); } - return $result; } } diff --git a/library/HTMLPurifier/ChildDef/StrictBlockquote.php b/library/HTMLPurifier/ChildDef/StrictBlockquote.php index 980acac3..0352e488 100644 --- a/library/HTMLPurifier/ChildDef/StrictBlockquote.php +++ b/library/HTMLPurifier/ChildDef/StrictBlockquote.php @@ -4,27 +4,31 @@ require_once 'HTMLPurifier/ChildDef/Required.php'; /** * Takes the contents of blockquote when in strict and reformats for validation. 
- * - * From XHTML 1.0 Transitional to Strict, there is a notable change where */ class HTMLPurifier_ChildDef_StrictBlockquote extends HTMLPurifier_ChildDef_Required { + var $real_elements; + var $fake_elements; var $allow_empty = true; var $type = 'strictblockquote'; var $init = false; - function HTMLPurifier_ChildDef_StrictBlockquote() {} function validateChildren($tokens_of_children, $config, &$context) { $def = $config->getHTMLDefinition(); if (!$this->init) { // allow all inline elements - $this->elements = $def->info_flow_elements; - $this->elements['#PCDATA'] = true; + $this->real_elements = $this->elements; + $this->fake_elements = $def->content_sets['Flow']; + $this->fake_elements['#PCDATA'] = true; $this->init = true; } + // trick the parent class into thinking it allows more + $this->elements = $this->fake_elements; $result = parent::validateChildren($tokens_of_children, $config, $context); + $this->elements = $this->real_elements; + if ($result === false) return array(); if ($result === true) $result = $tokens_of_children; @@ -40,8 +44,10 @@ extends HTMLPurifier_ChildDef_Required // ifs are nested for readability if (!$is_inline) { if (!$depth) { - if (($token->type == 'text') || - ($def->info[$token->name]->type == 'inline')) { + if ( + $token->type == 'text' || + !isset($this->elements[$token->name]) + ) { $is_inline = true; $ret[] = $block_wrap_start; } @@ -50,7 +56,7 @@ extends HTMLPurifier_ChildDef_Required if (!$depth) { // starting tokens have been inline text / empty if ($token->type == 'start' || $token->type == 'empty') { - if ($def->info[$token->name]->type == 'block') { + if (isset($this->elements[$token->name])) { // ended $ret[] = $block_wrap_end; $is_inline = false; diff --git a/library/HTMLPurifier/HTMLDefinition.php b/library/HTMLPurifier/HTMLDefinition.php index 00bcface..ea2582c2 100644 --- a/library/HTMLPurifier/HTMLDefinition.php +++ b/library/HTMLPurifier/HTMLDefinition.php @@ -163,23 +163,19 @@ class HTMLPurifier_HTMLDefinition 
var $info_attr_transform_pre = array(); /** - * List of HTMLPurifier_AttrTransform to be performed after validation/ + * List of HTMLPurifier_AttrTransform to be performed after validation. * @public */ var $info_attr_transform_post = array(); - /** - * Lookup table of flow elements - * @public - */ - var $info_flow_elements = array(); - /** * Boolean is a strict definition? * @public */ var $strict; + var $content_sets = array(); + /** * Initializes the definition, the meat of the class. */ @@ -258,11 +254,6 @@ class HTMLPurifier_HTMLDefinition $e_Block = new HTMLPurifier_ChildDef_Optional($e_block); $e__flow = "#PCDATA | $e_block | form | $e_inline | $e_misc"; $e_Flow = new HTMLPurifier_ChildDef_Optional($e__flow); - $e_a_content = new HTMLPurifier_ChildDef_Optional("#PCDATA". - " | $e_special | $e_fontstyle | $e_phrase | $e_inline_forms | $e_misc_inline"); - $e_pre_content = new HTMLPurifier_ChildDef_Optional("#PCDATA | a". - " | $e_special_basic | $e_fontstyle_basic | $e_phrase_basic | $e_inline_forms". - " | $e_misc_inline"); $e_form_content = new HTMLPurifier_ChildDef_Optional("#PCDATA | $e_block | $e_inline | $e_misc");//unused $e_form_button_content = new HTMLPurifier_ChildDef_Optional( "#PCDATA | p | $e_heading | div | $e_lists | $e_blocktext |". 
@@ -278,7 +269,7 @@ class HTMLPurifier_HTMLDefinition $this->info['div']->child = $e_Flow; if ($this->strict) { - $this->info['blockquote']->child = new HTMLPurifier_ChildDef_StrictBlockquote(); + $this->info['blockquote']->child = new HTMLPurifier_ChildDef_StrictBlockquote($e_block); } else { $this->info['blockquote']->child = $e_Flow; } @@ -337,9 +328,9 @@ class HTMLPurifier_HTMLDefinition $this->info['br']->child = $this->info['hr']->child = new HTMLPurifier_ChildDef_Empty(); - $this->info['pre']->child = $e_pre_content; - - $this->info['a']->child = $e_a_content; + // exclusionary + $this->info['pre']->child = $e_Inline; + $this->info['a']->child = $e_Inline; $this->info['table']->child = new HTMLPurifier_ChildDef_Table(); @@ -355,27 +346,16 @@ class HTMLPurifier_HTMLDefinition $this->info['td']->child = $e_Flow; ////////////////////////////////////////////////////////////////////// - // info[]->type : defines the type of the element (block or inline) + // misc compat stuff with XHTMLDefinition - // unknown until proven inline/block - foreach ($this->info as $i => $x) { - $this->info[$i]->type = 'unknown'; - } - - // reuses $e_Inline and $e_Block - foreach ($e_Inline->elements as $name => $bool) { - if ($name == '#PCDATA') continue; - if (!isset($this->info[$name])) continue; - $this->info[$name]->type = 'inline'; - } - - foreach ($e_Block->elements as $name => $bool) { - if (!isset($this->info[$name])) continue; - $this->info[$name]->type = 'block'; + foreach ($this->info as $key => $def) { + if ($this->info[$key]->child == $e_Inline) { + $this->info[$key]->descendants_are_inline = true; + } } foreach ($e_Flow->elements as $name => $bool) { - $this->info_flow_elements[$name] = true; + $this->content_sets['Flow'][$name] = true; } ////////////////////////////////////////////////////////////////////// @@ -649,11 +629,7 @@ class HTMLPurifier_ElementDef var $content_model; var $content_model_type; - /** - * Type of the tag: inline or block or unknown? 
- * @public - */ - var $type; + var $descendants_are_inline; /** * Lookup table of tags excluded from all descendants of this tag. @@ -663,4 +639,4 @@ class HTMLPurifier_ElementDef } -?> \ No newline at end of file +?> diff --git a/library/HTMLPurifier/HTMLModule/Text.php b/library/HTMLPurifier/HTMLModule/Text.php index 12c91072..35c9d8ed 100644 --- a/library/HTMLPurifier/HTMLModule/Text.php +++ b/library/HTMLPurifier/HTMLModule/Text.php @@ -37,6 +37,7 @@ class HTMLPurifier_HTMLModule_Text extends HTMLPurifier_HTMLModule if ($element == 'br') { $this->info[$element]->content_model_type = 'empty'; } elseif ($element == 'blockquote') { + $this->info[$element]->content_model = 'Heading | Block | List'; $this->info[$element]->content_model_type = 'strictblockquote'; } elseif ($element == 'div') { $this->info[$element]->content_model = '#PCDATA | Flow'; diff --git a/library/HTMLPurifier/Printer/HTMLDefinition.php b/library/HTMLPurifier/Printer/HTMLDefinition.php index 20671912..c7314bcd 100644 --- a/library/HTMLPurifier/Printer/HTMLDefinition.php +++ b/library/HTMLPurifier/Printer/HTMLDefinition.php @@ -16,8 +16,7 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer $this->config =& $config; if (isset($_GET['x'])) { // hidden settings - $this->def = new HTMLPurifier_XHTMLDefinition(); - $this->def->initialize($config); + $this->def = new HTMLPurifier_XHTMLDefinition($config); $this->def->setup($config); } else { $this->def = $config->getHTMLDefinition(); @@ -88,8 +87,8 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer $ret .= $this->element('th', "<$name>", array('class'=>'heavy', 'colspan' => 2)); $ret .= $this->end('tr'); $ret .= $this->start('tr'); - $ret .= $this->element('th', 'Type'); - $ret .= $this->element('td', ucfirst($def->type)); + $ret .= $this->element('th', 'Inline content'); + $ret .= $this->element('td', $def->descendants_are_inline ? 
'Yes' : 'No'); $ret .= $this->end('tr'); if (!empty($def->excludes)) { $ret .= $this->start('tr'); diff --git a/library/HTMLPurifier/Strategy/FixNesting.php b/library/HTMLPurifier/Strategy/FixNesting.php index dd5a920f..08f90756 100644 --- a/library/HTMLPurifier/Strategy/FixNesting.php +++ b/library/HTMLPurifier/Strategy/FixNesting.php @@ -49,8 +49,8 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy $tokens[] = new HTMLPurifier_Token_End($parent_name); // setup the context variables - $parent_type = 'unknown'; // reference var that we alter - $context->register('ParentType', $parent_type); + $is_inline = false; // reference var that we alter + $context->register('IsInline', $is_inline); //####################################################################// // Loop initialization @@ -115,11 +115,16 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy } // calculate context - if (isset($parent_def)) { - $parent_type = $parent_def->type; + if ($is_inline === false) { + // check if conditions make it inline + if (!empty($parent_def) && $parent_def->descendants_are_inline) { + $is_inline = $count - 1; + } } else { - // generally found in specialized elements like UL - $parent_type = 'unknown'; + // check if we're out of inline + if ($count === $is_inline) { + $is_inline = false; + } } //################################################################// @@ -273,7 +278,7 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy array_pop($tokens); // remove context variables - $context->destroy('ParentType'); + $context->destroy('IsInline'); //####################################################################// // Return diff --git a/library/HTMLPurifier/XHTMLDefinition.php b/library/HTMLPurifier/XHTMLDefinition.php index bc35bf0e..38efb64c 100644 --- a/library/HTMLPurifier/XHTMLDefinition.php +++ b/library/HTMLPurifier/XHTMLDefinition.php @@ -19,8 +19,9 @@ class HTMLPurifier_XHTMLDefinition extends 
HTMLPurifier_HTMLDefinition var $modules = array(); var $attr_types; var $attr_collection; + var $content_sets; - function initialize($config) { + function HTMLPurifier_XHTMLDefinition($config) { $this->modules['Text'] = new HTMLPurifier_HTMLModule_Text(); $this->modules['Hypertext'] = new HTMLPurifier_HTMLModule_Hypertext(); @@ -48,6 +49,7 @@ class HTMLPurifier_XHTMLDefinition extends HTMLPurifier_HTMLDefinition } } } + // perform content_set expansions foreach ($content_sets as $i => $set) { // only performed once, so infinite recursion is not @@ -59,41 +61,48 @@ class HTMLPurifier_XHTMLDefinition extends HTMLPurifier_HTMLDefinition array_values($content_sets), $set); } + // define convenient variables $content_sets_keys = array_keys($content_sets); $content_sets_values = array_values($content_sets); + foreach ($content_sets as $name => $set) { + $this->content_sets[$name] = $this->convertToLookup($set); + } foreach ($this->modules as $module_i => $module) { - foreach ($module->info as $element_i => $element) { - $element =& $this->modules[$module_i]->info[$element_i]; + foreach ($module->info as $name => $def) { + $def =& $this->modules[$module_i]->info[$name]; // attribute value expansions - $this->attr_collection->performInclusions($element->attr); + $this->attr_collection->performInclusions($def->attr); $this->attr_collection->expandStringIdentifiers( - $element->attr, $this->attr_types); + $def->attr, $this->attr_types); // perform content model expansions - $content_model = $element->content_model; + $content_model = $def->content_model; if (is_string($content_model)) { - $element->content_model = str_replace( + if (strpos($content_model, 'Inline') !== false) { + $def->descendants_are_inline = true; + } + $def->content_model = str_replace( $content_sets_keys, $content_sets_values, $content_model); } // get child def from content model - $element->child = $this->getChildDef($element); + $def->child = $this->getChildDef($def); // setup info - 
$this->info[$element_i] = $element; - if ($this->info_parent == $element_i) { - $this->info_parent_def = $this->info[$element_i]; + $this->info[$name] = $def; + if ($this->info_parent == $name) { + $this->info_parent_def = $this->info[$name]; } } } } - function getChildDef($element) { - $value = $element->content_model; - $type = $element->content_model_type; + function getChildDef($def) { + $value = $def->content_model; + $type = $def->content_model_type; switch ($type) { case 'required': return new HTMLPurifier_ChildDef_Required($value); @@ -102,7 +111,7 @@ class HTMLPurifier_XHTMLDefinition extends HTMLPurifier_HTMLDefinition case 'empty': return new HTMLPurifier_ChildDef_Empty(); case 'strictblockquote': - return new HTMLPurifier_ChildDef_StrictBlockquote(); + return new HTMLPurifier_ChildDef_StrictBlockquote($value); case 'table': return new HTMLPurifier_ChildDef_Table(); case 'chameleon': @@ -114,6 +123,14 @@ class HTMLPurifier_XHTMLDefinition extends HTMLPurifier_HTMLDefinition return HTMLPurifier_ChildDef_Empty(); } + function convertToLookup($string) { + $array = explode('|', str_replace(' ', '', $string)); + foreach ($array as $i => $k) { + $array[$i] = true; + } + return $array; + } + } -?> \ No newline at end of file +?> diff --git a/tests/HTMLPurifier/ChildDef/ChameleonTest.php b/tests/HTMLPurifier/ChildDef/ChameleonTest.php index b4181196..529d9193 100644 --- a/tests/HTMLPurifier/ChildDef/ChameleonTest.php +++ b/tests/HTMLPurifier/ChildDef/ChameleonTest.php @@ -15,17 +15,17 @@ class HTMLPurifier_ChildDef_ChameleonTest extends HTMLPurifier_ChildDefHarness $this->assertResult( 'Allowed.', true, - array(), array('ParentType' => 'inline') + array(), array('IsInline' => true) ); $this->assertResult( '
Not allowed.
', '', - array(), array('ParentType' => 'inline') + array(), array('IsInline' => true) ); $this->assertResult( '
Allowed.
', true, - array(), array('ParentType' => 'block') + array(), array('IsInline' => false) ); } diff --git a/tests/HTMLPurifier/ChildDef/StrictBlockquoteTest.php b/tests/HTMLPurifier/ChildDef/StrictBlockquoteTest.php index 27aacc81..56405e91 100644 --- a/tests/HTMLPurifier/ChildDef/StrictBlockquoteTest.php +++ b/tests/HTMLPurifier/ChildDef/StrictBlockquoteTest.php @@ -9,7 +9,7 @@ extends HTMLPurifier_ChildDefHarness function test() { - $this->obj = new HTMLPurifier_ChildDef_StrictBlockquote(); + $this->obj = new HTMLPurifier_ChildDef_StrictBlockquote('div | p'); $this->assertResult(''); $this->assertResult('

Valid

'); diff --git a/tests/HTMLPurifier/Strategy/FixNestingTest.php b/tests/HTMLPurifier/Strategy/FixNestingTest.php index 38bd996b..55fc108b 100644 --- a/tests/HTMLPurifier/Strategy/FixNestingTest.php +++ b/tests/HTMLPurifier/Strategy/FixNestingTest.php @@ -70,19 +70,30 @@ class HTMLPurifier_Strategy_FixNestingTest extends HTMLPurifier_StrategyHarness 'Not allowed!' ); - // block in inline ins not allowed - $this->assertResult( + $this->assertResult( // alt config '
Not allowed!
', '<div>Not allowed!</div>', array('Core.EscapeInvalidChildren' => true) ); + // test block element that has inline content + $this->assertResult( + '

Not allowed!

', + '

Not allowed!

' + ); + // test exclusions $this->assertResult( 'Not allowed', '' ); + // stacked ins/del + $this->assertResult( + '

Not allowed!

', + '

Not allowed!

' + ); + // test inline parent $this->assertResult( 'Bold', true, array('HTML.Parent' => 'span')