diff --git a/NEWS b/NEWS index 60eda910..493afb8a 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier quotes, apostrophes and less than or greater than signs. - Enforce alphanumeric namespace and directive names for configuration. - Directive documentation generation using XSLT +- Table child definition made more flexible, will fix up poorly ordered elements 1.0.2, unknown release date (bugfix release may be dropped if no bugs found) diff --git a/TODO b/TODO index 3bd98579..bd014883 100644 --- a/TODO +++ b/TODO @@ -6,7 +6,6 @@ Ongoing - Plugins for major CMSes (very tricky issue) 1.1 release - - Rewrite table's child definition to be faster, smart, and regexp free - Allow HTML 4.01 output (cosmetic changes to the generator) - Formatters for plaintext - Auto-paragraphing (be sure to leverage fact that we know when things diff --git a/library/HTMLPurifier/ChildDef.php b/library/HTMLPurifier/ChildDef.php index 91bda67e..04c73c80 100644 --- a/library/HTMLPurifier/ChildDef.php +++ b/library/HTMLPurifier/ChildDef.php @@ -5,13 +5,6 @@ // false = delete parent node and all children // array(...) = replace children nodes with these -// this is the hardest one to implement. We'll use fancy regexp tricks -// right now, we only expect it to return TRUE or FALSE (it won't attempt -// to fix the tree) - -// we may end up writing custom code for each HTML case -// in order to make it self correcting - HTMLPurifier_ConfigDef::define( 'Core', 'EscapeInvalidChildren', false, 'bool', 'When true, a child is found that is not allowed in the context of the '. @@ -62,9 +55,7 @@ class HTMLPurifier_ChildDef * Custom validation class, accepts DTD child definitions * * @warning Currently this class is an all or nothing proposition, that is, - * it will only give a bool return value. Table is the only - * child definition that uses this class, and we ought to give - * it a dedicated one. + * it will only give a bool return value. */ class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef { @@ -307,4 +298,129 @@ class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef } } +/** + * Definition for tables + */ +class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef +{ + var $allow_empty = false; + var $type = 'table'; + function HTMLPurifier_ChildDef_Table() {} + function validateChildren($tokens_of_children, $config, $context) { + if (empty($tokens_of_children)) return false; + + // this ensures that the loop gets run one last time before closing + // up. It's a little bit of a hack, but it works! Just make sure you + // get rid of the token later. + $tokens_of_children[] = false; + + // only one of these elements is allowed in a table + $caption = false; + $thead = false; + $tfoot = false; + + // as many of these as you want + $cols = array(); + $content = array(); + + $nesting = 0; // current depth so we can determine nodes + $is_collecting = false; // are we globbing together tokens to package + // into one of the collectors? + $collection = array(); // collected nodes + + foreach ($tokens_of_children as $token) { + $is_child = ($nesting == 0); + + if ($token === false) { + // terminating sequence started + } elseif ($token->type == 'start') { + $nesting++; + } elseif ($token->type == 'end') { + $nesting--; + } + + // handle node collection + if ($is_collecting) { + if ($is_child) { + // okay, let's stash the tokens away + // first token tells us the type of the collection + switch ($collection[0]->name) { + case 'tr': + case 'tbody': + $content[] = $collection; + break; + case 'caption': + if ($caption !== false) break; + $caption = $collection; + break; + case 'thead': + case 'tfoot': + // access the appropriate variable, $thead or $tfoot + $var = $collection[0]->name; + if ($$var === false) { + $$var = $collection; + } else { + // transmutate the first and less entries into + // tbody tags, and then put into content + $collection[0]->name = 'tbody'; + $collection[count($collection)-1]->name = 'tbody'; + $content[] = $collection; + } + break; + case 'colgroup': + $cols[] = $collection; + break; + } + $collection = array(); + $is_collecting = false; + } else { + // add the node to the collection + $collection[] = $token; + } + } + + // terminate + if ($token === false) break; + + if ($is_child) { + // determine what we're dealing with + if ($token->name == 'col') { + // the only empty tag in the possie, we can handle it + // immediately + $cols[] = array($token); + continue; + } + switch($token->name) { + case 'caption': + case 'colgroup': + case 'thead': + case 'tfoot': + case 'tbody': + case 'tr': + $is_collecting = true; + $collection[] = $token; + continue; + default: + // unrecognized, drop silently + continue; + } + } + } + + if (empty($content)) return false; + + $ret = array(); + if ($caption !== false) $ret = array_merge($ret, $caption); + if ($cols !== false) foreach ($cols as $token_array) $ret = array_merge($ret, $token_array); + if ($thead !== false) $ret = array_merge($ret, $thead); + if ($tfoot !== false) $ret = array_merge($ret, $tfoot); + foreach ($content as $token_array) $ret = array_merge($ret, $token_array); + + array_pop($tokens_of_children); // remove phantom token + + return ($ret === $tokens_of_children) ? true : $ret; + + } +} + ?> \ No newline at end of file diff --git a/library/HTMLPurifier/HTMLDefinition.php b/library/HTMLPurifier/HTMLDefinition.php index 4d261526..34791ee7 100644 --- a/library/HTMLPurifier/HTMLDefinition.php +++ b/library/HTMLPurifier/HTMLDefinition.php @@ -209,8 +209,7 @@ class HTMLPurifier_HTMLDefinition $this->info['a']->child = $e_a_content; - $this->info['table']->child = new HTMLPurifier_ChildDef_Custom( - '(caption?, (col*|colgroup*), thead?, tfoot?, (tbody+|tr+))'); + $this->info['table']->child = new HTMLPurifier_ChildDef_Table(); // not a real entity, watch the double underscore $e__row = new HTMLPurifier_ChildDef_Required('tr'); diff --git a/library/HTMLPurifier/Strategy/FixNesting.php b/library/HTMLPurifier/Strategy/FixNesting.php index f682ea89..18eb60b5 100644 --- a/library/HTMLPurifier/Strategy/FixNesting.php +++ b/library/HTMLPurifier/Strategy/FixNesting.php @@ -187,6 +187,7 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy if (!$parent_def->child->allow_empty) { // we need to do a double-check $i = $parent_index; + array_pop($stack); } // PROJECTED OPTIMIZATION: Process all children elements before @@ -255,4 +256,4 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy } -?> \ No newline at end of file +?> diff --git a/tests/HTMLPurifier/ChildDefTest.php b/tests/HTMLPurifier/ChildDefTest.php index 7420f72d..33c9907b 100644 --- a/tests/HTMLPurifier/ChildDefTest.php +++ b/tests/HTMLPurifier/ChildDefTest.php @@ -59,7 +59,7 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase } - function atest_table() { + function test_table() { // currently inactive, awaiting augmentation @@ -71,19 +71,33 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase $inputs[0] = ''; $expect[0] = false; - // we really don't care what's inside, because if it turns out - // this tr is illegal, we'll end up re-evaluating the parent node - // anyway. - $inputs[1] = '