0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-11-08 06:48:42 +00:00

[1.1] Table child definition made more flexible, will fix up poorly ordered elements

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@417 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2006-09-15 01:52:22 +00:00
parent 665e80d223
commit e440f25bce
6 changed files with 152 additions and 22 deletions

1
NEWS
View File

@ -6,6 +6,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
quotes, apostrophes and less than or greater than signs. quotes, apostrophes and less than or greater than signs.
- Enforce alphanumeric namespace and directive names for configuration. - Enforce alphanumeric namespace and directive names for configuration.
- Directive documentation generation using XSLT - Directive documentation generation using XSLT
- Table child definition made more flexible, will fix up poorly ordered elements
1.0.2, unknown release date 1.0.2, unknown release date
(bugfix release may be dropped if no bugs found) (bugfix release may be dropped if no bugs found)

1
TODO
View File

@ -6,7 +6,6 @@ Ongoing
- Plugins for major CMSes (very tricky issue) - Plugins for major CMSes (very tricky issue)
1.1 release 1.1 release
- Rewrite table's child definition to be faster, smart, and regexp free
- Allow HTML 4.01 output (cosmetic changes to the generator) - Allow HTML 4.01 output (cosmetic changes to the generator)
- Formatters for plaintext - Formatters for plaintext
- Auto-paragraphing (be sure to leverage fact that we know when things - Auto-paragraphing (be sure to leverage fact that we know when things

View File

@ -5,13 +5,6 @@
// false = delete parent node and all children // false = delete parent node and all children
// array(...) = replace children nodes with these // array(...) = replace children nodes with these
// this is the hardest one to implement. We'll use fancy regexp tricks
// right now, we only expect it to return TRUE or FALSE (it won't attempt
// to fix the tree)
// we may end up writing custom code for each HTML case
// in order to make it self correcting
HTMLPurifier_ConfigDef::define( HTMLPurifier_ConfigDef::define(
'Core', 'EscapeInvalidChildren', false, 'bool', 'Core', 'EscapeInvalidChildren', false, 'bool',
'When true, a child is found that is not allowed in the context of the '. 'When true, a child is found that is not allowed in the context of the '.
@ -62,9 +55,7 @@ class HTMLPurifier_ChildDef
* Custom validation class, accepts DTD child definitions * Custom validation class, accepts DTD child definitions
* *
* @warning Currently this class is an all or nothing proposition, that is, * @warning Currently this class is an all or nothing proposition, that is,
* it will only give a bool return value. Table is the only * it will only give a bool return value.
* child definition that uses this class, and we ought to give
* it a dedicated one.
*/ */
class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
{ {
@ -307,4 +298,129 @@ class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef
} }
} }
/**
 * Child definition for tables.
 *
 * Rather than only accepting or rejecting the children of a table, this
 * definition regroups them into the order: caption, col/colgroup, thead,
 * tfoot, then tbody/tr. Only the first caption is kept, surplus thead and
 * tfoot elements are renamed to tbody, and unrecognized children are
 * dropped.
 */
class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
{
    /** Empty tables are not allowed; validateChildren() returns false for them. */
    var $allow_empty = false;
    /** Name identifying this kind of child definition. */
    var $type = 'table';

    function HTMLPurifier_ChildDef_Table() {}

    /**
     * Validates (and if necessary reorders) the children of a table.
     *
     * @param $tokens_of_children Flat array of token objects making up the
     *        table's contents; each token exposes ->type and, for tags,
     *        ->name.
     * @param $config  Not used by this definition.
     * @param $context Not used by this definition.
     * @return false if there are no children or no tr/tbody content was
     *         found, true if the children are already in the correct
     *         order, otherwise the corrected array of tokens.
     */
    function validateChildren($tokens_of_children, $config, $context) {
        if (empty($tokens_of_children)) return false;

        // Sentinel: a collection is only stashed when the *next* child-level
        // token is seen, so a trailing false forces one final pass of the
        // loop. It is popped off again before the final comparison.
        $tokens_of_children[] = false;

        // only one of each of these elements is allowed in a table
        $caption = false;
        $thead   = false;
        $tfoot   = false;

        // as many of these as you want
        $cols    = array();
        $content = array();

        $nesting       = 0;       // current depth, so we can spot direct children
        $is_collecting = false;   // are we gathering the tokens of one child?
        $collection    = array(); // tokens of the child being gathered

        foreach ($tokens_of_children as $token) {

            // depth is tested *before* this token adjusts it, so a child's
            // own start tag counts as child-level while its end tag does not
            $is_child = ($nesting == 0);

            if ($token === false) {
                // terminating sentinel, nothing to adjust
            } elseif ($token->type == 'start') {
                $nesting++;
            } elseif ($token->type == 'end') {
                $nesting--;
            }

            // handle node collection
            if ($is_collecting) {
                if ($is_child) {
                    // a new child-level token ends the previous collection;
                    // its first token tells us which bucket it belongs in
                    switch ($collection[0]->name) {
                        case 'tr':
                        case 'tbody':
                            $content[] = $collection;
                            break;
                        case 'caption':
                            // first caption wins, later ones are discarded
                            if ($caption === false) $caption = $collection;
                            break;
                        case 'thead':
                        case 'tfoot':
                            // access the appropriate variable, $thead or $tfoot
                            $var = $collection[0]->name;
                            if ($$var === false) {
                                $$var = $collection;
                            } else {
                                // duplicate: rename the first and last tokens
                                // to tbody and file under content. Tokens are
                                // objects, so under PHP 5+ handle semantics
                                // this renames the caller's instances too.
                                $collection[0]->name = 'tbody';
                                $collection[count($collection)-1]->name = 'tbody';
                                $content[] = $collection;
                            }
                            break;
                        case 'col':
                        case 'colgroup':
                            $cols[] = $collection;
                            break;
                    }
                    $collection    = array();
                    $is_collecting = false;
                } else {
                    // still inside the child: add the token to the collection
                    $collection[] = $token;
                }
            }

            // terminate once the sentinel has flushed the last collection
            if ($token === false) break;

            // tokens nested inside a child were handled above
            if (!$is_child) continue;

            // tokens without a name cannot be table children: drop silently
            if (!isset($token->name)) continue;

            if ($token->name == 'col' && $token->type != 'start') {
                // col is normally a lone empty tag, so it can be stored
                // immediately without collecting anything after it
                $cols[] = array($token);
                continue;
            }

            switch ($token->name) {
                case 'col': // written as a start/end pair: keep both tokens
                case 'caption':
                case 'colgroup':
                case 'thead':
                case 'tfoot':
                case 'tbody':
                case 'tr':
                    $is_collecting = true;
                    $collection[]  = $token;
                    break;
                default:
                    // unrecognized, drop silently (anything nested inside
                    // it is skipped by the !$is_child check above)
                    break;
            }
        }

        // a table needs at least one tr or tbody
        if (empty($content)) return false;

        // reassemble in the required order
        $ret = array();
        if ($caption !== false) $ret = array_merge($ret, $caption);
        foreach ($cols as $token_array) $ret = array_merge($ret, $token_array);
        if ($thead !== false) $ret = array_merge($ret, $thead);
        if ($tfoot !== false) $ret = array_merge($ret, $tfoot);
        foreach ($content as $token_array) $ret = array_merge($ret, $token_array);

        array_pop($tokens_of_children); // remove the sentinel

        return ($ret === $tokens_of_children) ? true : $ret;
    }
}
?> ?>

View File

@ -209,8 +209,7 @@ class HTMLPurifier_HTMLDefinition
$this->info['a']->child = $e_a_content; $this->info['a']->child = $e_a_content;
$this->info['table']->child = new HTMLPurifier_ChildDef_Custom( $this->info['table']->child = new HTMLPurifier_ChildDef_Table();
'(caption?, (col*|colgroup*), thead?, tfoot?, (tbody+|tr+))');
// not a real entity, watch the double underscore // not a real entity, watch the double underscore
$e__row = new HTMLPurifier_ChildDef_Required('tr'); $e__row = new HTMLPurifier_ChildDef_Required('tr');

View File

@ -187,6 +187,7 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
if (!$parent_def->child->allow_empty) { if (!$parent_def->child->allow_empty) {
// we need to do a double-check // we need to do a double-check
$i = $parent_index; $i = $parent_index;
array_pop($stack);
} }
// PROJECTED OPTIMIZATION: Process all children elements before // PROJECTED OPTIMIZATION: Process all children elements before

View File

@ -59,7 +59,7 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase
} }
function atest_table() { function test_table() {
// currently inactive, awaiting augmentation // currently inactive, awaiting augmentation
@ -71,19 +71,33 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase
$inputs[0] = ''; $inputs[0] = '';
$expect[0] = false; $expect[0] = false;
// we really don't care what's inside, because if it turns out // we're using empty tags to compact the tests: under real circumstances
// this tr is illegal, we'll end up re-evaluating the parent node // there would be contents in them
// anyway.
$inputs[1] = '<tr></tr>'; $inputs[1] = '<tr />';
$expect[1] = true; $expect[1] = true;
$inputs[2] = '<caption></caption><col></col><thead></thead>' . $inputs[2] = '<caption /><col /><thead /><tfoot /><tbody>'.
'<tfoot></tfoot><tbody></tbody>'; '<tr><td>asdf</td></tr></tbody>';
$expect[2] = true; $expect[2] = true;
$inputs[3] = '<col></col><col></col><col></col><tr></tr>'; $inputs[3] = '<col /><col /><col /><tr />';
$expect[3] = true; $expect[3] = true;
// mixed up order
$inputs[4] = '<col /><colgroup /><tbody /><tfoot /><thead /><tr>1</tr><caption /><tr />';
$expect[4] = '<caption /><col /><colgroup /><thead /><tfoot /><tbody /><tr>1</tr><tr />';
// duplicates of singles
// - first caption serves
// - trailing tfoots/theads get turned into tbodys
$inputs[5] = '<caption>1</caption><caption /><tbody /><tbody /><tfoot>1</tfoot><tfoot />';
$expect[5] = '<caption>1</caption><tfoot>1</tfoot><tbody /><tbody /><tbody />';
// errant text dropped (until bubbling is implemented)
$inputs[6] = 'foo';
$expect[6] = false;
$this->assertSeries($inputs, $expect, $config); $this->assertSeries($inputs, $expect, $config);
} }