Fix broken table content model, easily seen in XHTML1.1

Signed-off-by: Edward Z. Yang <ezyang@mit.edu>
2024-12-22 08:21:52 +00:00 · 2011-12-26 14:49:26 +08:00 · 2011-12-26 14:49:26 +08:00 · e41af46a8b
commit e41af46a8b
parent 3570c9985a
4 changed files with 111 additions and 6 deletions
--- a/2
+++ b/2
@ -29,6 +29,8 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
  problems.
 - Fix iconv truncation bug, where non-UTF-8 target encodings see
  output truncated after around 8000 characters.
+- Fix broken table content model for XHTML1.1 (and also earlier
+  versions, although the W3C validator doesn't catch those violations)

 4.3.0, released 2011-03-27
 # Fixed broken caching of customized raw definitions, but requires an
--- a/4
+++ b/4
@ -112,6 +112,10 @@ Neat feature related
    3. Extend the tag exclusion system to specify whether or not the
    contents should be dropped or not (currently, there's code that could do
    something like this if it didn't drop the inner text too.)
+ ? Make AutoParagraph also support paragraph-izing double <br> tags, and not
+   just double newlines.  This is kind of tough to do in the current framework,
+   though, and might be reasonably approximated by search replacing double <br>s
+   with newlines before running it through HTML Purifier.

 Maintenance related (slightly boring)
 # CHMOD install script for PEAR installs
--- a/library/HTMLPurifier/ChildDef/Table.php
+++ b/library/HTMLPurifier/ChildDef/Table.php
@ -1,7 +1,33 @@
 <?php

 /**
- * Definition for tables
+ * Definition for tables.  The general idea is to extract out all of the
+ * essential bits, and then reconstruct it later.
+ *
+ * This is a bit confusing, because the DTDs and the W3C
+ * validators seem to disagree on the appropriate definition. The
+ * DTD claims:
+ *
+ *      (CAPTION?, (COL*|COLGROUP*), THEAD?, TFOOT?, TBODY+)
+ *
+ * But actually, the HTML4 spec then has this to say:
+ *
+ *      The TBODY start tag is always required except when the table
+ *      contains only one table body and no table head or foot sections.
+ *      The TBODY end tag may always be safely omitted.
+ *
+ * So the DTD is kind of wrong.  The validator is, unfortunately, kind
+ * of on crack.
+ *
+ * The definition changed again in XHTML1.1; and in my opinion, this
+ * formulation makes the most sense.
+ *
+ *      caption?, ( col* | colgroup* ), (( thead?, tfoot?, tbody+ ) | ( tr+ ))
+ *
+ * Essentially, we have two modes: thead/tfoot/tbody mode, and tr mode.
+ * If we encounter a thead, tfoot or tbody, we are placed in the former
+ * mode, and we *must* wrap any stray tr segments with a tbody. But if
+ * we don't run into any of them, just have tr tags is OK.
 */
 class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
 {
@ -33,6 +59,8 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
        $collection = array(); // collected nodes
        $tag_index = 0; // the first node might be whitespace,
                            // so this tells us where the start tag is
+        $tbody_mode = false; // if true, then we need to wrap any stray
+                             // <tr>s with a <tbody>.

        foreach ($tokens_of_children as $token) {
            $is_child = ($nesting == 0);
@ -51,8 +79,9 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
                    // okay, let's stash the tokens away
                    // first token tells us the type of the collection
                    switch ($collection[$tag_index]->name) {
-                        case 'tr':
                        case 'tbody':
+                            $tbody_mode = true;
+                        case 'tr':
                            $content[] = $collection;
                            break;
                        case 'caption':
@ -61,13 +90,28 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
                            break;
                        case 'thead':
                        case 'tfoot':
+                            $tbody_mode = true;
+                            // XXX This breaks rendering properties with
+                            // Firefox, which never floats a <thead> to
+                            // the top. Ever. (Our scheme will float the
+                            // first <thead> to the top.)  So maybe
+                            // <thead>s that are not first should be
+                            // turned into <tbody>? Very tricky, indeed.
+
                            // access the appropriate variable, $thead or $tfoot
                            $var = $collection[$tag_index]->name;
                            if ($$var === false) {
                                $$var = $collection;
                            } else {
-                                // transmutate the first and less entries into
-                                // tbody tags, and then put into content
+                                // Oops, there's a second one! What
+                                // should we do?  Current behavior is to
+                                // transmutate the first and last entries into
+                                // tbody tags, and then put into content.
+                                // Maybe a better idea is to *attach
+                                // it* to the existing thead or tfoot?
+                                // We don't do this, because Firefox
+                                // doesn't float an extra tfoot to the
+                                // bottom like it does for the first one.
                                $collection[$tag_index]->name = 'tbody';
                                $collection[count($collection)-1]->name = 'tbody';
                                $content[] = $collection;
@ -126,7 +170,48 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
        if ($cols !== false)    foreach ($cols as $token_array) $ret = array_merge($ret, $token_array);
        if ($thead !== false)   $ret = array_merge($ret, $thead);
        if ($tfoot !== false)   $ret = array_merge($ret, $tfoot);
-        foreach ($content as $token_array) $ret = array_merge($ret, $token_array);
+
+        if ($tbody_mode) {
+            // a little tricky, since the start of the collection may be
+            // whitespace
+            $inside_tbody = false;
+            foreach ($content as $token_array) {
+                // find the starting token
+                foreach ($token_array as $t) {
+                    if ($t->name === 'tr' || $t->name === 'tbody') {
+                        break;
+                    }
+                } // iterator variable carries over
+                if ($t->name === 'tr') {
+                    if ($inside_tbody) {
+                        $ret = array_merge($ret, $token_array);
+                    } else {
+                        $ret[] = new HTMLPurifier_Token_Start('tbody');
+                        $ret = array_merge($ret, $token_array);
+                        $inside_tbody = true;
+                    }
+                } elseif ($t->name === 'tbody') {
+                    if ($inside_tbody) {
+                        $ret[] = new HTMLPurifier_Token_End('tbody');
+                        $inside_tbody = false;
+                        $ret = array_merge($ret, $token_array);
+                    } else {
+                        $ret = array_merge($ret, $token_array);
+                    }
+                } else {
+                    trigger_error("tr/tbody in content invariant failed in Table ChildDef", E_USER_ERROR);
+                }
+            }
+            if ($inside_tbody) {
+                $ret[] = new HTMLPurifier_Token_End('tbody');
+            }
+        } else {
+            foreach ($content as $token_array) {
+                // invariant: everything in here is <tr>s
+                $ret = array_merge($ret, $token_array);
+            }
+        }
+
        if (!empty($collection) && $is_collecting == false){
            // grab the trailing space
            $ret = array_merge($ret, $collection);
--- a/tests/HTMLPurifier/ChildDef/TableTest.php
+++ b/tests/HTMLPurifier/ChildDef/TableTest.php
@ -28,7 +28,21 @@ class HTMLPurifier_ChildDef_TableTest extends HTMLPurifier_ChildDefHarness
    function testReorderContents() {
        $this->assertResult(
          '<col /><colgroup /><tbody /><tfoot /><thead /><tr>1</tr><caption /><tr />',
-          '<caption /><col /><colgroup /><thead /><tfoot /><tbody /><tr>1</tr><tr />');
+          '<caption /><col /><colgroup /><thead /><tfoot /><tbody /><tbody><tr>1</tr><tr /></tbody>');
+    }
+
+    function testXhtml11Illegal() {
+        $this->assertResult(
+            '<thead><tr><th>a</th></tr></thead><tr><td>a</td></tr>',
+            '<thead><tr><th>a</th></tr></thead><tbody><tr><td>a</td></tr></tbody>'
+        );
+    }
+
+    function testTrOverflowAndClose() {
+        $this->assertResult(
+            '<tr><td>a</td></tr><tr><td>b</td></tr><tbody><tr><td>c</td></tr></tbody><tr><td>d</td></tr>',
+            '<tbody><tr><td>a</td></tr><tr><td>b</td></tr></tbody><tbody><tr><td>c</td></tr></tbody><tbody><tr><td>d</td></tr></tbody>'
+        );
    }

    function testDuplicateProcessing() {