0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-01-05 06:01:52 +00:00

Finish implementing fixNesting(). Removed security-in-depth check for optimization reasons, since the info array will never cause such a condition.

git-svn-id: http://htmlpurifier.org/svnroot/html_purifier/trunk@58 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2006-07-20 00:30:35 +00:00
parent 3e6bcb7a0f
commit ff8f24458d
3 changed files with 84 additions and 14 deletions

View File

@ -44,6 +44,10 @@ class PureHTMLDefinition
// transforms: font, menu, dir, center // transforms: font, menu, dir, center
// DON'T MONKEY AROUND THIS unless you know what you are doing
// and also know the assumptions the code makes about what this
// contains for optimization purposes (see fixNesting)
$e_special_extra = 'img'; $e_special_extra = 'img';
$e_special_basic = 'br | span | bdo'; $e_special_basic = 'br | span | bdo';
$e_special = "$e_special_basic | $e_special_extra"; $e_special = "$e_special_basic | $e_special_extra";
@ -338,14 +342,19 @@ class PureHTMLDefinition
for ($j = $i, $depth = 0; ; $j++) { for ($j = $i, $depth = 0; ; $j++) {
if ($tokens[$j]->type == 'start') { if ($tokens[$j]->type == 'start') {
$depth++; $depth++;
// skip token assignment on first iteration
if ($depth == 1) continue; if ($depth == 1) continue;
} elseif ($tokens[$j]->type == 'end') { } elseif ($tokens[$j]->type == 'end') {
$depth--; $depth--;
// skip token assignment on last iteration
if ($depth == 0) break; if ($depth == 0) break;
} }
$child_tokens[] = $tokens[$j]; $child_tokens[] = $tokens[$j];
} }
// $i is index of start token
// $j is index of end token
// have DTD child def validate children // have DTD child def validate children
$element_def = $this->info[$tokens[$i]->name]; $element_def = $this->info[$tokens[$i]->name];
$result = $element_def->child_def->validateChildren($child_tokens); $result = $element_def->child_def->validateChildren($child_tokens);
@ -353,14 +362,48 @@ class PureHTMLDefinition
// process result // process result
if ($result === true) { if ($result === true) {
// leave the nodes as is, scroll to next node // leave the nodes as is
$i++;
while ($i < $size and $tokens[$i]->type != 'start') { } elseif($result === false) {
$i++;
} // WARNING WARNING WARNING!!!
// While for the original DTD, there will never be
// cascading removal, more complex ones may have such
// a problem.
// If you modify the info array such that an element
// that requires children may contain a child that requires
// children, you need to also scroll back and re-check that
// elements parent node
$length = $j - $i + 1;
// remove entire node
array_splice($tokens, $i, $length);
// change size
$size -= $length;
// ensure that we scroll to the next node
$i--;
} else {
$length = $j - $i - 1;
// replace node with $result
array_splice($tokens, $i + 1, $length, $result);
// change size
$size -= $length;
$size += count($result);
} }
// scroll to next node
$i++;
while ($i < $size and $tokens[$i]->type != 'start') $i++;
} }
// remove implicit divs // remove implicit divs
@ -404,6 +447,7 @@ class HTMLDTD_Element
// in order to make it self correcting // in order to make it self correcting
class HTMLDTD_ChildDef class HTMLDTD_ChildDef
{ {
var $type = 'custom';
var $dtd_regex; var $dtd_regex;
var $_pcre_regex; var $_pcre_regex;
function HTMLDTD_ChildDef($dtd_regex) { function HTMLDTD_ChildDef($dtd_regex) {

View File

@ -155,12 +155,18 @@ The way, I suppose, one would check for it, is whenever a node is removed,
scroll to it's parent start, and re-evaluate it. Make sure you're able to do scroll to it's parent start, and re-evaluate it. Make sure you're able to do
that with minimal code repetition. that with minimal code repetition.
EDITOR'S NOTE: this behavior is not implemented by default, because the
default configuration has a setup that ensures that cascading node removals
will never happen. However, there will be warning signs in case someone tries
to hack it further.
The most complex case can probably be done by using some fancy regexp The most complex case can probably be done by using some fancy regexp
expressions and transformations. However, it doesn't seem right that, say, expressions and transformations. However, it doesn't seem right that, say,
a stray <b> in a <table> can cause the entire table to be removed. Fixing it, a stray <b> in a <table> can cause the entire table to be removed. Fixing it,
however, may be too difficult. however, may be too difficult (or not, see below).
This code was ripped from the PEAR class XML_DTD. It implements regexp checking. This code was excerpted from the PEAR class XML_DTD. It implements regexp
checking.
-- --
@ -259,6 +265,11 @@ So... I say delete the node when PCDATA isn't allowed (or the regex is too
complicated to determine where PCDATA could be inserted), and translate the node complicated to determine where PCDATA could be inserted), and translate the node
to text when PCDATA is allowed. to text when PCDATA is allowed.
--
Note that generic child definitions are not usually desirable: we should
implement custom handlers for each one that specify the stuff correctly.
== STAGE 4 - check attributes == == STAGE 4 - check attributes ==
While we're doing all this nesting hocus-pocus, attributes are also being While we're doing all this nesting hocus-pocus, attributes are also being

View File

@ -413,14 +413,29 @@ class Test_PureHTMLDefinition extends UnitTestCase
new MF_EndTag('b'), new MF_EndTag('b'),
); );
// need test of empty set that's required, resulting in removal of node // test of empty set that's required, resulting in removal of node
$inputs[3] = array(
new MF_StartTag('ul'),
new MF_EndTag('ul')
);
$expect[3] = array();
// need test of cascading removal (if possible) // test illegal text which gets removed
$inputs[4] = array(
// ! cover all child element conditions new MF_StartTag('ul'),
new MF_Text('Illegal Text'),
// execute only one test at a time: new MF_StartTag('li'),
$inputs = array( $inputs[0] ); new MF_Text('Legal item'),
new MF_EndTag('li'),
new MF_EndTag('ul')
);
$expect[4] = array(
new MF_StartTag('ul'),
new MF_StartTag('li'),
new MF_Text('Legal item'),
new MF_EndTag('li'),
new MF_EndTag('ul')
);
foreach ($inputs as $i => $input) { foreach ($inputs as $i => $input) {
$result = $this->def->fixNesting($input); $result = $this->def->fixNesting($input);