[1.1] Table child definition made more flexible, will fix up poorly ordered elements

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@417 48356398-32a2-884e-a903-53898d9a118a
2024-12-22 16:31:53 +00:00 · 2006-09-15 01:52:22 +00:00 · 2006-09-15 01:52:22 +00:00 · e440f25bce
commit e440f25bce
parent 665e80d223
6 changed files with 152 additions and 22 deletions
--- a/1
+++ b/1
@ -6,6 +6,7 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
  quotes, apostrophes and less than or greater than signs.
 - Enforce alphanumeric namespace and directive names for configuration.
 - Directive documentation generation using XSLT
 - Table child definition made more flexible, will fix up poorly ordered elements
 1.0.2, unknown release date
 (bugfix release may be dropped if no bugs found)
--- a/1
+++ b/1
@ -6,7 +6,6 @@ Ongoing
 - Plugins for major CMSes (very tricky issue)
 1.1 release
 - Rewrite table's child definition to be faster, smart, and regexp free
 - Allow HTML 4.01 output (cosmetic changes to the generator)
 - Formatters for plaintext
    - Auto-paragraphing (be sure to leverage fact that we know when things
--- a/library/HTMLPurifier/ChildDef.php
+++ b/library/HTMLPurifier/ChildDef.php
@ -5,13 +5,6 @@
 // false = delete parent node and all children
 // array(...) = replace children nodes with these
 // this is the hardest one to implement. We'll use fancy regexp tricks
 // right now, we only expect it to return TRUE or FALSE (it won't attempt
 // to fix the tree)
 // we may end up writing custom code for each HTML case
 // in order to make it self correcting
 HTMLPurifier_ConfigDef::define(
    'Core', 'EscapeInvalidChildren', false, 'bool',
    'When true, a child is found that is not allowed in the context of the '.
@ -62,9 +55,7 @@ class HTMLPurifier_ChildDef
 * Custom validation class, accepts DTD child definitions
 * 
 * @warning Currently this class is an all or nothing proposition, that is,
- *          it will only give a bool return value.  Table is the only
+ *          it will only give a bool return value.
 *          child definition that uses this class, and we ought to give
 *          it a dedicated one.
 */
 class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
 {
@ -307,4 +298,129 @@ class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef
    }
 }
 /**
 * Child definition for tables.
 *
 * Sorts the direct children of a table into groups (caption, col/colgroup,
 * thead, tfoot, tbody/tr) and re-emits them in that fixed order, so poorly
 * ordered tables get fixed up rather than rejected.
 */
 class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
 {
    var $allow_empty = false;
    var $type = 'table';
    function HTMLPurifier_ChildDef_Table() {}
    /**
     * Validates and, if needed, reorders the children of a table.
     *
     * @param $tokens_of_children Flat array of token objects; each token is
     *        read through its ->type ('start', 'end', anything else leaves
     *        the nesting depth untouched) and ->name properties.
     * @param $config  Not used by this definition.
     * @param $context Not used by this definition.
     * @return false when there are no children or no tbody/tr group was
     *         found, true when the reordered token list is identical to the
     *         input, otherwise the reordered array of tokens.
     */
    function validateChildren($tokens_of_children, $config, $context) {
        if (empty($tokens_of_children)) return false;

        // Sentinel: forces one extra loop iteration so that a group still
        // being collected when the real tokens run out gets stashed. It is
        // popped off again before the final comparison.
        $tokens_of_children[] = false;

        // only one of these elements is allowed in a table
        $caption = false;
        $thead   = false;
        $tfoot   = false;

        // as many of these as you want
        $cols    = array();
        $content = array();

        $nesting = 0;           // current depth, 0 means a direct child
        $is_collecting = false; // are we gathering tokens for one group?
        $collection = array();  // tokens of the group being gathered

        foreach ($tokens_of_children as $token) {

            // depth is sampled BEFORE this token adjusts it, so both the
            // opening token of a child and any token following a closed
            // child count as being at child level
            $is_child = ($nesting == 0);

            if ($token === false) {
                // terminating sequence started
            } elseif ($token->type == 'start') {
                $nesting++;
            } elseif ($token->type == 'end') {
                $nesting--;
            }

            // handle node collection
            if ($is_collecting) {
                if ($is_child) {
                    // back at child level: the previous group is complete,
                    // its first token tells us which kind of group it is
                    switch ($collection[0]->name) {
                        case 'tr':
                        case 'tbody':
                            $content[] = $collection;
                            break;
                        case 'caption':
                            // first caption wins, later ones are dropped
                            if ($caption !== false) break;
                            $caption = $collection;
                            break;
                        case 'thead':
                        case 'tfoot':
                            // variable-variable: selects $thead or $tfoot
                            $var = $collection[0]->name;
                            if ($$var === false) {
                                $$var = $collection;
                            } else {
                                // duplicate: rename the first and last
                                // tokens of the group to tbody and file it
                                // under content instead
                                $collection[0]->name = 'tbody';
                                $collection[count($collection)-1]->name = 'tbody';
                                $content[] = $collection;
                            }
                            break;
                        case 'colgroup':
                            $cols[] = $collection;
                            break;
                    }
                    $collection = array();
                    $is_collecting = false;
                } else {
                    // still inside the group, keep gathering
                    $collection[] = $token;
                }
            }

            // terminate
            if ($token === false) break;

            if ($is_child) {
                // determine what we're dealing with
                if ($token->name == 'col') {
                    // the only empty tag in the posse, we can handle it
                    // immediately
                    $cols[] = array($token);
                    continue;
                }
                // break, not continue: inside a switch PHP treats continue
                // as break anyway and warns about it from 7.3 onwards; the
                // switch is the last statement of the loop body, so the
                // next iteration starts either way
                switch ($token->name) {
                    case 'caption':
                    case 'colgroup':
                    case 'thead':
                    case 'tfoot':
                    case 'tbody':
                    case 'tr':
                        $is_collecting = true;
                        $collection[] = $token;
                        break;
                    default:
                        // unrecognized, drop silently
                        break;
                }
            }
        }

        // a table needs at least one tbody/tr group
        if (empty($content)) return false;

        // reassemble in the fixed order:
        // caption, cols, thead, tfoot, then tbody/tr groups
        $ret = array();
        if ($caption !== false) $ret = array_merge($ret, $caption);
        // $cols is always an array, so no guard is needed here
        foreach ($cols as $token_array) $ret = array_merge($ret, $token_array);
        if ($thead !== false)   $ret = array_merge($ret, $thead);
        if ($tfoot !== false)   $ret = array_merge($ret, $tfoot);
        foreach ($content as $token_array) $ret = array_merge($ret, $token_array);

        array_pop($tokens_of_children); // remove the sentinel

        return ($ret === $tokens_of_children) ? true : $ret;
    }
 }
 ?>
--- a/library/HTMLPurifier/HTMLDefinition.php
+++ b/library/HTMLPurifier/HTMLDefinition.php
@ -209,8 +209,7 @@ class HTMLPurifier_HTMLDefinition
        $this->info['a']->child    = $e_a_content;
-        $this->info['table']->child = new HTMLPurifier_ChildDef_Custom(
+        $this->info['table']->child = new HTMLPurifier_ChildDef_Table();
            '(caption?, (col*|colgroup*), thead?, tfoot?, (tbody+|tr+))');
        // not a real entity, watch the double underscore
        $e__row = new HTMLPurifier_ChildDef_Required('tr');
--- a/library/HTMLPurifier/Strategy/FixNesting.php
+++ b/library/HTMLPurifier/Strategy/FixNesting.php
@ -187,6 +187,7 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
                if (!$parent_def->child->allow_empty) {
                    // we need to do a double-check
                    $i = $parent_index;
                    array_pop($stack);
                }
                // PROJECTED OPTIMIZATION: Process all children elements before
@ -255,4 +256,4 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
 }
-?>
+?>
--- a/tests/HTMLPurifier/ChildDefTest.php
+++ b/tests/HTMLPurifier/ChildDefTest.php
@ -59,7 +59,7 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase
    }
-    function atest_table() {
+    function test_table() {
        // currently inactive, awaiting augmentation
@ -71,19 +71,33 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase
        $inputs[0] = '';
        $expect[0] = false;
-        // we really don't care what's inside, because if it turns out
+        // we're using empty tags to compact the tests: under real circumstances
-        // this tr is illegal, we'll end up re-evaluating the parent node
+        // there would be contents in them
-        // anyway.
+        
-        $inputs[1] = '<tr></tr>';
+        $inputs[1] = '<tr />';
        $expect[1] = true;
-        $inputs[2] = '<caption></caption><col></col><thead></thead>' .
+        $inputs[2] = '<caption /><col /><thead /><tfoot /><tbody>'.
-                     '<tfoot></tfoot><tbody></tbody>';
+            '<tr><td>asdf</td></tr></tbody>';
        $expect[2] = true;
-        $inputs[3] = '<col></col><col></col><col></col><tr></tr>';
+        $inputs[3] = '<col /><col /><col /><tr />';
        $expect[3] = true;
        // mixed up order
        $inputs[4] = '<col /><colgroup /><tbody /><tfoot /><thead /><tr>1</tr><caption /><tr />';
        $expect[4] = '<caption /><col /><colgroup /><thead /><tfoot /><tbody /><tr>1</tr><tr />';
        // duplicates of singles
        // - first caption serves
        // - trailing tfoots/theads get turned into tbodys
        $inputs[5] = '<caption>1</caption><caption /><tbody /><tbody /><tfoot>1</tfoot><tfoot />';
        $expect[5] = '<caption>1</caption><tfoot>1</tfoot><tbody /><tbody /><tbody />';
        // errant text dropped (until bubbling is implemented)
        $inputs[6] = 'foo';
        $expect[6] = false;
        $this->assertSeries($inputs, $expect, $config);
    }