Finish implementing fixNesting(). Removed security-in-depth check for optimization reasons, since the info array will never cause such a condition.

git-svn-id: http://htmlpurifier.org/svnroot/html_purifier/trunk@58 48356398-32a2-884e-a903-53898d9a118a
2025-01-03 05:11:52 +00:00 · 2006-07-20 00:30:35 +00:00 · 2006-07-20 00:30:35 +00:00 · ff8f24458d
commit ff8f24458d
parent 3e6bcb7a0f
3 changed files with 84 additions and 14 deletions
--- a/PureHTMLDefinition.php
+++ b/PureHTMLDefinition.php
@ -44,6 +44,10 @@ class PureHTMLDefinition
        // transforms: font, menu, dir, center
        // DON'T MONKEY AROUND THIS unless you know what you are doing
        // and also know the assumptions the code makes about what this
        // contains for optimization purposes (see fixNesting)
        $e_special_extra = 'img';
        $e_special_basic = 'br | span | bdo';
        $e_special = "$e_special_basic | $e_special_extra";
@ -338,14 +342,19 @@ class PureHTMLDefinition
            for ($j = $i, $depth = 0; ; $j++) {
                if ($tokens[$j]->type == 'start') {
                    $depth++;
                    // skip token assignment on first iteration
                    if ($depth == 1) continue;
                } elseif ($tokens[$j]->type == 'end') {
                    $depth--;
                    // skip token assignment on last iteration
                    if ($depth == 0) break;
                }
                $child_tokens[] = $tokens[$j];
            }
            // $i is index of start token
            // $j is index of end token
            // have DTD child def validate children
            $element_def = $this->info[$tokens[$i]->name];
            $result = $element_def->child_def->validateChildren($child_tokens);
@ -353,14 +362,48 @@ class PureHTMLDefinition
            // process result
            if ($result === true) {
-                // leave the nodes as is, scroll to next node
+                // leave the nodes as is
-                $i++;
+                
-                while ($i < $size and $tokens[$i]->type != 'start') {
+            } elseif($result === false) {
-                    $i++;
+                
-                }
+                // WARNING WARNING WARNING!!!
                // While for the original DTD, there will never be
                // cascading removal, more complex ones may have such
                // a problem.
                // If you modify the info array such that an element
                // that requires children may contain a child that requires
                // children, you need to also scroll back and re-check that
                // elements parent node
                $length = $j - $i + 1;
                // remove entire node
                array_splice($tokens, $i, $length);
                // change size
                $size -= $length;
                // ensure that we scroll to the next node
                $i--;
            } else {
                $length = $j - $i - 1;
                // replace node with $result
                array_splice($tokens, $i + 1, $length, $result);
                // change size
                $size -= $length;
                $size += count($result);
            }
            // scroll to next node
            $i++;
            while ($i < $size and $tokens[$i]->type != 'start') $i++;
        }
        // remove implicit divs
@ -404,6 +447,7 @@ class HTMLDTD_Element
 // in order to make it self correcting
 class HTMLDTD_ChildDef
 {
    var $type = 'custom';
    var $dtd_regex;
    var $_pcre_regex;
    function HTMLDTD_ChildDef($dtd_regex) {
--- a/docs/spec.txt
+++ b/docs/spec.txt
@ -155,12 +155,18 @@ The way, I suppose, one would check for it, is whenever a node is removed,
 scroll to it's parent start, and re-evaluate it. Make sure you're able to do
 that with minimal code repetition.
 EDITOR'S NOTE: this behavior is not implemented by default, because the
 default configuration has a setup that ensures that cascading node removals
 will never happen. However, there will be warning signs in case someone tries
 to hack it further.
 The most complex case can probably be done by using some fancy regexp
 expressions and transformations. However, it doesn't seem right that, say,
 a stray <b> in a <table> can cause the entire table to be removed. Fixing it,
-however, may be too difficult.
+however, may be too difficult (or not, see below).
-This code was ripped from the PEAR class XML_DTD. It implements regexp checking.
+This code was excerpted from the PEAR class XML_DTD. It implements regexp
 checking.
 --
@ -259,6 +265,11 @@ So... I say delete the node when PCDATA isn't allowed (or the regex is too
 complicated to determine where PCDATA could be inserted), and translate the node
 to text when PCDATA is allowed.
 --
 Note that generic child definitions are not usually desirable: we should
 implement custom handlers for each one that specify the stuff correctly.
 == STAGE 4 - check attributes ==
 While we're doing all this nesting hocus-pocus, attributes are also being
--- a/tests/PureHTMLDefinition.php
+++ b/tests/PureHTMLDefinition.php
@ -413,14 +413,29 @@ class Test_PureHTMLDefinition extends UnitTestCase
            new MF_EndTag('b'),
            );
-        // need test of empty set that's required, resulting in removal of node
+        // test of empty set that's required, resulting in removal of node
        $inputs[3] = array(
            new MF_StartTag('ul'),
            new MF_EndTag('ul')
            );
        $expect[3] = array();
-        // need test of cascading removal (if possible)
+        // test illegal text which gets removed
-        
+        $inputs[4] = array(
-        // ! cover all child element conditions
+            new MF_StartTag('ul'),
-        
+                new MF_Text('Illegal Text'),
-        // execute only one test at a time:
+                new MF_StartTag('li'),
-        $inputs = array( $inputs[0] );
+                    new MF_Text('Legal item'),
                new MF_EndTag('li'),
            new MF_EndTag('ul')
            );
        $expect[4] = array(
            new MF_StartTag('ul'),
                new MF_StartTag('li'),
                    new MF_Text('Legal item'),
                new MF_EndTag('li'),
            new MF_EndTag('ul')
            );
        foreach ($inputs as $i => $input) {
            $result = $this->def->fixNesting($input);