0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-01-03 05:11:52 +00:00

Fix broken table content model, easily seen in XHTML1.1

Signed-off-by: Edward Z. Yang <ezyang@mit.edu>
This commit is contained in:
Edward Z. Yang 2011-12-26 14:49:26 +08:00
parent 3570c9985a
commit e41af46a8b
4 changed files with 111 additions and 6 deletions

2
NEWS
View File

@ -29,6 +29,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
problems. problems.
- Fix iconv truncation bug, where non-UTF-8 target encodings see - Fix iconv truncation bug, where non-UTF-8 target encodings see
output truncated after around 8000 characters. output truncated after around 8000 characters.
- Fix broken table content model for XHTML1.1 (and also earlier
versions, although the W3C validator doesn't catch those violations)
4.3.0, released 2011-03-27 4.3.0, released 2011-03-27
# Fixed broken caching of customized raw definitions, but requires an # Fixed broken caching of customized raw definitions, but requires an

4
TODO
View File

@ -112,6 +112,10 @@ Neat feature related
3. Extend the tag exclusion system to specify whether or not the 3. Extend the tag exclusion system to specify whether or not the
contents should be dropped or not (currently, there's code that could do contents should be dropped or not (currently, there's code that could do
something like this if it didn't drop the inner text too.) something like this if it didn't drop the inner text too.)
? Make AutoParagraph also support paragraph-izing double <br> tags, and not
just double newlines. This is kind of tough to do in the current framework,
though, and might be reasonably approximated by search replacing double <br>s
with newlines before running it through HTML Purifier.
Maintenance related (slightly boring) Maintenance related (slightly boring)
# CHMOD install script for PEAR installs # CHMOD install script for PEAR installs

View File

@ -1,7 +1,33 @@
<?php <?php
/** /**
* Definition for tables * Definition for tables. The general idea is to extract out all of the
* essential bits, and then reconstruct it later.
*
* This is a bit confusing, because the DTDs and the W3C
* validators seem to disagree on the appropriate definition. The
* DTD claims:
*
* (CAPTION?, (COL*|COLGROUP*), THEAD?, TFOOT?, TBODY+)
*
* But actually, the HTML4 spec then has this to say:
*
* The TBODY start tag is always required except when the table
* contains only one table body and no table head or foot sections.
* The TBODY end tag may always be safely omitted.
*
* So the DTD is kind of wrong. The validator is, unfortunately, kind
* of on crack.
*
* The definition changed again in XHTML1.1; and in my opinion, this
* formulation makes the most sense.
*
* caption?, ( col* | colgroup* ), (( thead?, tfoot?, tbody+ ) | ( tr+ ))
*
* Essentially, we have two modes: thead/tfoot/tbody mode, and tr mode.
* If we encounter a thead, tfoot or tbody, we are placed in the former
* mode, and we *must* wrap any stray tr segments with a tbody. But if
* we don't run into any of them, just have tr tags is OK.
*/ */
class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
{ {
@ -33,6 +59,8 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
$collection = array(); // collected nodes $collection = array(); // collected nodes
$tag_index = 0; // the first node might be whitespace, $tag_index = 0; // the first node might be whitespace,
// so this tells us where the start tag is // so this tells us where the start tag is
$tbody_mode = false; // if true, then we need to wrap any stray
// <tr>s with a <tbody>.
foreach ($tokens_of_children as $token) { foreach ($tokens_of_children as $token) {
$is_child = ($nesting == 0); $is_child = ($nesting == 0);
@ -51,8 +79,9 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
// okay, let's stash the tokens away // okay, let's stash the tokens away
// first token tells us the type of the collection // first token tells us the type of the collection
switch ($collection[$tag_index]->name) { switch ($collection[$tag_index]->name) {
case 'tr':
case 'tbody': case 'tbody':
$tbody_mode = true;
case 'tr':
$content[] = $collection; $content[] = $collection;
break; break;
case 'caption': case 'caption':
@ -61,13 +90,28 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
break; break;
case 'thead': case 'thead':
case 'tfoot': case 'tfoot':
$tbody_mode = true;
// XXX This breaks rendering properties with
// Firefox, which never floats a <thead> to
// the top. Ever. (Our scheme will float the
// first <thead> to the top.) So maybe
// <thead>s that are not first should be
// turned into <tbody>? Very tricky, indeed.
// access the appropriate variable, $thead or $tfoot // access the appropriate variable, $thead or $tfoot
$var = $collection[$tag_index]->name; $var = $collection[$tag_index]->name;
if ($$var === false) { if ($$var === false) {
$$var = $collection; $$var = $collection;
} else { } else {
// transmutate the first and less entries into // Oops, there's a second one! What
// tbody tags, and then put into content // should we do? Current behavior is to
// transmutate the first and last entries into
// tbody tags, and then put into content.
// Maybe a better idea is to *attach
// it* to the existing thead or tfoot?
// We don't do this, because Firefox
// doesn't float an extra tfoot to the
// bottom like it does for the first one.
$collection[$tag_index]->name = 'tbody'; $collection[$tag_index]->name = 'tbody';
$collection[count($collection)-1]->name = 'tbody'; $collection[count($collection)-1]->name = 'tbody';
$content[] = $collection; $content[] = $collection;
@ -126,7 +170,48 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
if ($cols !== false) foreach ($cols as $token_array) $ret = array_merge($ret, $token_array); if ($cols !== false) foreach ($cols as $token_array) $ret = array_merge($ret, $token_array);
if ($thead !== false) $ret = array_merge($ret, $thead); if ($thead !== false) $ret = array_merge($ret, $thead);
if ($tfoot !== false) $ret = array_merge($ret, $tfoot); if ($tfoot !== false) $ret = array_merge($ret, $tfoot);
foreach ($content as $token_array) $ret = array_merge($ret, $token_array);
if ($tbody_mode) {
// a little tricky, since the start of the collection may be
// whitespace
$inside_tbody = false;
foreach ($content as $token_array) {
// find the starting token
foreach ($token_array as $t) {
if ($t->name === 'tr' || $t->name === 'tbody') {
break;
}
} // iterator variable carries over
if ($t->name === 'tr') {
if ($inside_tbody) {
$ret = array_merge($ret, $token_array);
} else {
$ret[] = new HTMLPurifier_Token_Start('tbody');
$ret = array_merge($ret, $token_array);
$inside_tbody = true;
}
} elseif ($t->name === 'tbody') {
if ($inside_tbody) {
$ret[] = new HTMLPurifier_Token_End('tbody');
$inside_tbody = false;
$ret = array_merge($ret, $token_array);
} else {
$ret = array_merge($ret, $token_array);
}
} else {
trigger_error("tr/tbody in content invariant failed in Table ChildDef", E_USER_ERROR);
}
}
if ($inside_tbody) {
$ret[] = new HTMLPurifier_Token_End('tbody');
}
} else {
foreach ($content as $token_array) {
// invariant: everything in here is <tr>s
$ret = array_merge($ret, $token_array);
}
}
if (!empty($collection) && $is_collecting == false){ if (!empty($collection) && $is_collecting == false){
// grab the trailing space // grab the trailing space
$ret = array_merge($ret, $collection); $ret = array_merge($ret, $collection);

View File

@ -28,7 +28,21 @@ class HTMLPurifier_ChildDef_TableTest extends HTMLPurifier_ChildDefHarness
function testReorderContents() { function testReorderContents() {
$this->assertResult( $this->assertResult(
'<col /><colgroup /><tbody /><tfoot /><thead /><tr>1</tr><caption /><tr />', '<col /><colgroup /><tbody /><tfoot /><thead /><tr>1</tr><caption /><tr />',
'<caption /><col /><colgroup /><thead /><tfoot /><tbody /><tr>1</tr><tr />'); '<caption /><col /><colgroup /><thead /><tfoot /><tbody /><tbody><tr>1</tr><tr /></tbody>');
}
function testXhtml11Illegal() {
$this->assertResult(
'<thead><tr><th>a</th></tr></thead><tr><td>a</td></tr>',
'<thead><tr><th>a</th></tr></thead><tbody><tr><td>a</td></tr></tbody>'
);
}
function testTrOverflowAndClose() {
$this->assertResult(
'<tr><td>a</td></tr><tr><td>b</td></tr><tbody><tr><td>c</td></tr></tbody><tr><td>d</td></tr>',
'<tbody><tr><td>a</td></tr><tr><td>b</td></tr></tbody><tbody><tr><td>c</td></tr></tbody><tbody><tr><td>d</td></tr></tbody>'
);
} }
function testDuplicateProcessing() { function testDuplicateProcessing() {