From ff8f24458d86fa7794efecf1e5e9687e87716122 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Thu, 20 Jul 2006 00:30:35 +0000 Subject: [PATCH] Finish implementing fixNesting(). Removed security-in-depth check for optimization reasons, since the info array will never cause such a condition. git-svn-id: http://htmlpurifier.org/svnroot/html_purifier/trunk@58 48356398-32a2-884e-a903-53898d9a118a --- PureHTMLDefinition.php | 54 ++++++++++++++++++++++++++++++++---- docs/spec.txt | 15 ++++++++-- tests/PureHTMLDefinition.php | 29 ++++++++++++++----- 3 files changed, 84 insertions(+), 14 deletions(-) diff --git a/PureHTMLDefinition.php b/PureHTMLDefinition.php index db4bfc77..d9ae2f30 100644 --- a/PureHTMLDefinition.php +++ b/PureHTMLDefinition.php @@ -44,6 +44,10 @@ class PureHTMLDefinition // transforms: font, menu, dir, center + // DON'T MONKEY AROUND THIS unless you know what you are doing + // and also know the assumptions the code makes about what this + // contains for optimization purposes (see fixNesting) + $e_special_extra = 'img'; $e_special_basic = 'br | span | bdo'; $e_special = "$e_special_basic | $e_special_extra"; @@ -338,14 +342,19 @@ class PureHTMLDefinition for ($j = $i, $depth = 0; ; $j++) { if ($tokens[$j]->type == 'start') { $depth++; + // skip token assignment on first iteration if ($depth == 1) continue; } elseif ($tokens[$j]->type == 'end') { $depth--; + // skip token assignment on last iteration if ($depth == 0) break; } $child_tokens[] = $tokens[$j]; } + // $i is index of start token + // $j is index of end token + // have DTD child def validate children $element_def = $this->info[$tokens[$i]->name]; $result = $element_def->child_def->validateChildren($child_tokens); @@ -353,14 +362,48 @@ class PureHTMLDefinition // process result if ($result === true) { - // leave the nodes as is, scroll to next node - $i++; - while ($i < $size and $tokens[$i]->type != 'start') { - $i++; - } + // leave the nodes as is + + } elseif($result === false) { + + // WARNING WARNING WARNING!!! + // While for the original DTD, there will never be + // cascading removal, more complex ones may have such + // a problem. + + // If you modify the info array such that an element + // that requires children may contain a child that requires + // children, you need to also scroll back and re-check that + // elements parent node + + $length = $j - $i + 1; + + // remove entire node + array_splice($tokens, $i, $length); + + // change size + $size -= $length; + + // ensure that we scroll to the next node + $i--; + + } else { + + $length = $j - $i - 1; + + // replace node with $result + array_splice($tokens, $i + 1, $length, $result); + + // change size + $size -= $length; + $size += count($result); } + // scroll to next node + $i++; + while ($i < $size and $tokens[$i]->type != 'start') $i++; + } // remove implicit divs @@ -404,6 +447,7 @@ class HTMLDTD_Element // in order to make it self correcting class HTMLDTD_ChildDef { + var $type = 'custom'; var $dtd_regex; var $_pcre_regex; function HTMLDTD_ChildDef($dtd_regex) { diff --git a/docs/spec.txt b/docs/spec.txt index 5b70e26c..834078a6 100644 --- a/docs/spec.txt +++ b/docs/spec.txt @@ -155,12 +155,18 @@ The way, I suppose, one would check for it, is whenever a node is removed, scroll to it's parent start, and re-evaluate it. Make sure you're able to do that with minimal code repetition. +EDITOR'S NOTE: this behavior is not implemented by default, because the +default configuration has a setup that ensures that cascading node removals +will never happen. However, there will be warning signs in case someone tries +to hack it further. + The most complex case can probably be done by using some fancy regexp expressions and transformations. However, it doesn't seem right that, say, a stray in a can cause the entire table to be removed. Fixing it, -however, may be too difficult. +however, may be too difficult (or not, see below). -This code was ripped from the PEAR class XML_DTD. It implements regexp checking. +This code was excerpted from the PEAR class XML_DTD. It implements regexp +checking. -- @@ -259,6 +265,11 @@ So... I say delete the node when PCDATA isn't allowed (or the regex is too complicated to determine where PCDATA could be inserted), and translate the node to text when PCDATA is allowed. +-- + +Note that generic child definitions are not usually desirable: we should +implement custom handlers for each one that specify the stuff correctly. + == STAGE 4 - check attributes == While we're doing all this nesting hocus-pocus, attributes are also being diff --git a/tests/PureHTMLDefinition.php b/tests/PureHTMLDefinition.php index a66ae83e..5e9c6511 100644 --- a/tests/PureHTMLDefinition.php +++ b/tests/PureHTMLDefinition.php @@ -413,14 +413,29 @@ class Test_PureHTMLDefinition extends UnitTestCase new MF_EndTag('b'), ); - // need test of empty set that's required, resulting in removal of node + // test of empty set that's required, resulting in removal of node + $inputs[3] = array( + new MF_StartTag('ul'), + new MF_EndTag('ul') + ); + $expect[3] = array(); - // need test of cascading removal (if possible) - - // ! cover all child element conditions - - // execute only one test at a time: - $inputs = array( $inputs[0] ); + // test illegal text which gets removed + $inputs[4] = array( + new MF_StartTag('ul'), + new MF_Text('Illegal Text'), + new MF_StartTag('li'), + new MF_Text('Legal item'), + new MF_EndTag('li'), + new MF_EndTag('ul') + ); + $expect[4] = array( + new MF_StartTag('ul'), + new MF_StartTag('li'), + new MF_Text('Legal item'), + new MF_EndTag('li'), + new MF_EndTag('ul') + ); foreach ($inputs as $i => $input) { $result = $this->def->fixNesting($input);