Finish implementing fixNesting(). Removed security-in-depth check for optimization reasons, since the info array will never cause such a condition.

git-svn-id: http://htmlpurifier.org/svnroot/html_purifier/trunk@58 48356398-32a2-884e-a903-53898d9a118a
2025-01-03 05:11:52 +00:00 · 2006-07-20 00:30:35 +00:00 · 2006-07-20 00:30:35 +00:00 · ff8f24458d
commit ff8f24458d
parent 3e6bcb7a0f
3 changed files with 84 additions and 14 deletions
--- a/PureHTMLDefinition.php
+++ b/PureHTMLDefinition.php
@ -44,6 +44,10 @@ class PureHTMLDefinition
        
        // transforms: font, menu, dir, center
        
+        // DON'T MONKEY AROUND THIS unless you know what you are doing
+        // and also know the assumptions the code makes about what this
+        // contains for optimization purposes (see fixNesting)
+        
        $e_special_extra = 'img';
        $e_special_basic = 'br | span | bdo';
        $e_special = "$e_special_basic | $e_special_extra";
@ -338,14 +342,19 @@ class PureHTMLDefinition
            for ($j = $i, $depth = 0; ; $j++) {
                if ($tokens[$j]->type == 'start') {
                    $depth++;
+                    // skip token assignment on first iteration
                    if ($depth == 1) continue;
                } elseif ($tokens[$j]->type == 'end') {
                    $depth--;
+                    // skip token assignment on last iteration
                    if ($depth == 0) break;
                }
                $child_tokens[] = $tokens[$j];
            }
            
+            // $i is index of start token
+            // $j is index of end token
+            
            // have DTD child def validate children
            $element_def = $this->info[$tokens[$i]->name];
            $result = $element_def->child_def->validateChildren($child_tokens);
@ -353,14 +362,48 @@ class PureHTMLDefinition
            // process result
            if ($result === true) {
                
-                // leave the nodes as is, scroll to next node
-                $i++;
-                while ($i < $size and $tokens[$i]->type != 'start') {
-                    $i++;
-                }
+                // leave the nodes as is
+                
+            } elseif($result === false) {
+                
+                // WARNING WARNING WARNING!!!
+                // While for the original DTD, there will never be
+                // cascading removal, more complex ones may have such
+                // a problem.
+                
+                // If you modify the info array such that an element
+                // that requires children may contain a child that requires
+                // children, you need to also scroll back and re-check that
+                // element's parent node
+                
+                $length = $j - $i + 1;
+                
+                // remove entire node
+                array_splice($tokens, $i, $length);
+                
+                // change size
+                $size -= $length;
+                
+                // ensure that we scroll to the next node
+                $i--;
+                
+            } else {
+                
+                $length = $j - $i - 1;
+                
+                // replace node with $result
+                array_splice($tokens, $i + 1, $length, $result);
+                
+                // change size
+                $size -= $length;
+                $size += count($result);
                
            }
            
+            // scroll to next node
+            $i++;
+            while ($i < $size and $tokens[$i]->type != 'start') $i++;
+            
        }
        
        // remove implicit divs
@ -404,6 +447,7 @@ class HTMLDTD_Element
 // in order to make it self correcting
 class HTMLDTD_ChildDef
 {
+    var $type = 'custom';
    var $dtd_regex;
    var $_pcre_regex;
    function HTMLDTD_ChildDef($dtd_regex) {
--- a/docs/spec.txt
+++ b/docs/spec.txt
@ -155,12 +155,18 @@ The way, I suppose, one would check for it, is whenever a node is removed,
 scroll to its parent start, and re-evaluate it. Make sure you're able to do
 that with minimal code repetition.

+EDITOR'S NOTE: this behavior is not implemented by default, because the
+default configuration has a setup that ensures that cascading node removals
+will never happen. However, there will be warning signs in case someone tries
+to hack it further.
+
 The most complex case can probably be done by using some fancy regexp
 expressions and transformations. However, it doesn't seem right that, say,
 a stray <b> in a <table> can cause the entire table to be removed. Fixing it,
-however, may be too difficult.
+however, may be too difficult (or not, see below).

-This code was ripped from the PEAR class XML_DTD. It implements regexp checking.
+This code was excerpted from the PEAR class XML_DTD. It implements regexp
+checking.

 --

@ -259,6 +265,11 @@ So... I say delete the node when PCDATA isn't allowed (or the regex is too
 complicated to determine where PCDATA could be inserted), and translate the node
 to text when PCDATA is allowed.

+--
+
+Note that generic child definitions are not usually desirable: we should
+implement custom handlers for each one that specify the stuff correctly.
+
 == STAGE 4 - check attributes ==

 While we're doing all this nesting hocus-pocus, attributes are also being
--- a/tests/PureHTMLDefinition.php
+++ b/tests/PureHTMLDefinition.php
@ -413,14 +413,29 @@ class Test_PureHTMLDefinition extends UnitTestCase
            new MF_EndTag('b'),
            );
        
-        // need test of empty set that's required, resulting in removal of node
+        // test of empty set that's required, resulting in removal of node
+        $inputs[3] = array(
+            new MF_StartTag('ul'),
+            new MF_EndTag('ul')
+            );
+        $expect[3] = array();
        
-        // need test of cascading removal (if possible)
-        
-        // ! cover all child element conditions
-        
-        // execute only one test at a time:
-        $inputs = array( $inputs[0] );
+        // test illegal text which gets removed
+        $inputs[4] = array(
+            new MF_StartTag('ul'),
+                new MF_Text('Illegal Text'),
+                new MF_StartTag('li'),
+                    new MF_Text('Legal item'),
+                new MF_EndTag('li'),
+            new MF_EndTag('ul')
+            );
+        $expect[4] = array(
+            new MF_StartTag('ul'),
+                new MF_StartTag('li'),
+                    new MF_Text('Legal item'),
+                new MF_EndTag('li'),
+            new MF_EndTag('ul')
+            );
        
        foreach ($inputs as $i => $input) {
            $result = $this->def->fixNesting($input);