0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-11-08 06:48:42 +00:00

[1.1] Table child definition made more flexible, will fix up poorly ordered elements

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@417 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2006-09-15 01:52:22 +00:00
parent 665e80d223
commit e440f25bce
6 changed files with 152 additions and 22 deletions

1
NEWS
View File

@ -6,6 +6,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
quotes, apostrophes and less than or greater than signs.
- Enforce alphanumeric namespace and directive names for configuration.
- Directive documentation generation using XSLT
- Table child definition made more flexible, will fix up poorly ordered elements
1.0.2, unknown release date
(bugfix release may be dropped if no bugs found)

1
TODO
View File

@ -6,7 +6,6 @@ Ongoing
- Plugins for major CMSes (very tricky issue)
1.1 release
- Rewrite table's child definition to be faster, smart, and regexp free
- Allow HTML 4.01 output (cosmetic changes to the generator)
- Formatters for plaintext
- Auto-paragraphing (be sure to leverage fact that we know when things

View File

@ -5,13 +5,6 @@
// false = delete parent node and all children
// array(...) = replace children nodes with these
// this is the hardest one to implement. We'll use fancy regexp tricks
// right now, we only expect it to return TRUE or FALSE (it won't attempt
// to fix the tree)
// we may end up writing custom code for each HTML case
// in order to make it self correcting
HTMLPurifier_ConfigDef::define(
'Core', 'EscapeInvalidChildren', false, 'bool',
'When true, a child is found that is not allowed in the context of the '.
@ -62,9 +55,7 @@ class HTMLPurifier_ChildDef
* Custom validation class, accepts DTD child definitions
*
* @warning Currently this class is an all or nothing proposition, that is,
* it will only give a bool return value. Table is the only
* child definition that uses this class, and we ought to give
* it a dedicated one.
* it will only give a bool return value.
*/
class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
{
@ -307,4 +298,129 @@ class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef
}
}
/**
* Definition for tables
*/
class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
{
var $allow_empty = false;
var $type = 'table';
function HTMLPurifier_ChildDef_Table() {}
function validateChildren($tokens_of_children, $config, $context) {
if (empty($tokens_of_children)) return false;
// this ensures that the loop gets run one last time before closing
// up. It's a little bit of a hack, but it works! Just make sure you
// get rid of the token later.
$tokens_of_children[] = false;
// only one of these elements is allowed in a table
$caption = false;
$thead = false;
$tfoot = false;
// as many of these as you want
$cols = array();
$content = array();
$nesting = 0; // current depth so we can determine nodes
$is_collecting = false; // are we globbing together tokens to package
// into one of the collectors?
$collection = array(); // collected nodes
foreach ($tokens_of_children as $token) {
$is_child = ($nesting == 0);
if ($token === false) {
// terminating sequence started
} elseif ($token->type == 'start') {
$nesting++;
} elseif ($token->type == 'end') {
$nesting--;
}
// handle node collection
if ($is_collecting) {
if ($is_child) {
// okay, let's stash the tokens away
// first token tells us the type of the collection
switch ($collection[0]->name) {
case 'tr':
case 'tbody':
$content[] = $collection;
break;
case 'caption':
if ($caption !== false) break;
$caption = $collection;
break;
case 'thead':
case 'tfoot':
// access the appropriate variable, $thead or $tfoot
$var = $collection[0]->name;
if ($$var === false) {
$$var = $collection;
} else {
// transmutate the first and less entries into
// tbody tags, and then put into content
$collection[0]->name = 'tbody';
$collection[count($collection)-1]->name = 'tbody';
$content[] = $collection;
}
break;
case 'colgroup':
$cols[] = $collection;
break;
}
$collection = array();
$is_collecting = false;
} else {
// add the node to the collection
$collection[] = $token;
}
}
// terminate
if ($token === false) break;
if ($is_child) {
// determine what we're dealing with
if ($token->name == 'col') {
// the only empty tag in the possie, we can handle it
// immediately
$cols[] = array($token);
continue;
}
switch($token->name) {
case 'caption':
case 'colgroup':
case 'thead':
case 'tfoot':
case 'tbody':
case 'tr':
$is_collecting = true;
$collection[] = $token;
continue;
default:
// unrecognized, drop silently
continue;
}
}
}
if (empty($content)) return false;
$ret = array();
if ($caption !== false) $ret = array_merge($ret, $caption);
if ($cols !== false) foreach ($cols as $token_array) $ret = array_merge($ret, $token_array);
if ($thead !== false) $ret = array_merge($ret, $thead);
if ($tfoot !== false) $ret = array_merge($ret, $tfoot);
foreach ($content as $token_array) $ret = array_merge($ret, $token_array);
array_pop($tokens_of_children); // remove phantom token
return ($ret === $tokens_of_children) ? true : $ret;
}
}
?>

View File

@ -209,8 +209,7 @@ class HTMLPurifier_HTMLDefinition
$this->info['a']->child = $e_a_content;
$this->info['table']->child = new HTMLPurifier_ChildDef_Custom(
'(caption?, (col*|colgroup*), thead?, tfoot?, (tbody+|tr+))');
$this->info['table']->child = new HTMLPurifier_ChildDef_Table();
// not a real entity, watch the double underscore
$e__row = new HTMLPurifier_ChildDef_Required('tr');

View File

@ -187,6 +187,7 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
if (!$parent_def->child->allow_empty) {
// we need to do a double-check
$i = $parent_index;
array_pop($stack);
}
// PROJECTED OPTIMIZATION: Process all children elements before

View File

@ -59,7 +59,7 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase
}
function atest_table() {
function test_table() {
// currently inactive, awaiting augmentation
@ -71,19 +71,33 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase
$inputs[0] = '';
$expect[0] = false;
// we really don't care what's inside, because if it turns out
// this tr is illegal, we'll end up re-evaluating the parent node
// anyway.
$inputs[1] = '<tr></tr>';
// we're using empty tags to compact the tests: under real circumstances
// there would be contents in them
$inputs[1] = '<tr />';
$expect[1] = true;
$inputs[2] = '<caption></caption><col></col><thead></thead>' .
'<tfoot></tfoot><tbody></tbody>';
$inputs[2] = '<caption /><col /><thead /><tfoot /><tbody>'.
'<tr><td>asdf</td></tr></tbody>';
$expect[2] = true;
$inputs[3] = '<col></col><col></col><col></col><tr></tr>';
$inputs[3] = '<col /><col /><col /><tr />';
$expect[3] = true;
// mixed up order
$inputs[4] = '<col /><colgroup /><tbody /><tfoot /><thead /><tr>1</tr><caption /><tr />';
$expect[4] = '<caption /><col /><colgroup /><thead /><tfoot /><tbody /><tr>1</tr><tr />';
// duplicates of singles
// - first caption serves
// - trailing tfoots/theads get turned into tbodys
$inputs[5] = '<caption>1</caption><caption /><tbody /><tbody /><tfoot>1</tfoot><tfoot />';
$expect[5] = '<caption>1</caption><tfoot>1</tfoot><tbody /><tbody /><tbody />';
// errant text dropped (until bubbling is implemented)
$inputs[6] = 'foo';
$expect[6] = false;
$this->assertSeries($inputs, $expect, $config);
}