From 83f735ea7eb06ecb8bf701d3c1f660d084320316 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <edwardzyang@thewritingpot.com>
Date: Mon, 17 Apr 2006 00:49:15 +0000
Subject: [PATCH] Finish HTMLDTD_ChildDef_Required.

Fix bug in HTML_Generator that caused attribute-less empty elements to be generated with an extra space in them.

Add an is_whitespace flag to MF_Text and stop trimming its data.

git-svn-id: http://htmlpurifier.org/svnroot/html_purifier/trunk@44 48356398-32a2-884e-a903-53898d9a118a
---
 HTML_Generator.php           |   2 +-
 MarkupFragment.php           |   5 +-
 PureHTMLDefinition.php       |  69 ++++++++++++++++++-
 tests/HTML_Generator.php     |   3 +
 tests/PureHTMLDefinition.php | 124 +++++++++++++++++++++++++++++++++++
 5 files changed, 199 insertions(+), 4 deletions(-)

diff --git a/HTML_Generator.php b/HTML_Generator.php
index b8aeef23..9726267d 100644
--- a/HTML_Generator.php
+++ b/HTML_Generator.php
@@ -21,7 +21,7 @@ class HTML_Generator
             
         } elseif (is_a($token, 'MF_EmptyTag')) {
             $attr = $this->generateAttributes($token->attributes);
-             return '<' . $token->name . ' ' . $attr . ' />';
+             return '<' . $token->name . ($attr ? ' ' : '') . $attr . ' />';
             
         } elseif (is_a($token, 'MF_Text')) {
             return htmlentities($token->data, ENT_COMPAT, 'UTF-8');
diff --git a/MarkupFragment.php b/MarkupFragment.php
index 3d99d4dd..f94ecb27 100644
--- a/MarkupFragment.php
+++ b/MarkupFragment.php
@@ -31,9 +31,10 @@ class MF_Text extends MF
 {
     var $name = '#PCDATA';
     var $data;
+    var $is_whitespace = false;
     function MF_Text($data) {
-        $this->data = trim($data); // fairly certain trimming it's okay
-                                   // but it's not default SAX behavior
+        $this->data = $data;
+        if (trim($data, " \n\r\t") === '') $this->is_whitespace = true;
     }
     function append($mf_text) {
         return new MF_Text($this->data . $mf_text->data);
diff --git a/PureHTMLDefinition.php b/PureHTMLDefinition.php
index 80bb530e..1e189550 100644
--- a/PureHTMLDefinition.php
+++ b/PureHTMLDefinition.php
@@ -342,6 +342,10 @@ class HTMLDTD_Element
     
 }
 
+// HTMLDTD_ChildDef and its subclasses produce one of three results:
+// true = leave the child nodes as is
+// false = delete the parent node and all of its children
+// array(...) = replace the child nodes with these tokens
 class HTMLDTD_ChildDef
 {
     var $dtd_regex;
@@ -354,13 +358,76 @@ class HTMLDTD_ChildDef_Simple extends HTMLDTD_ChildDef
 {
     var $elements = array();
     function HTMLDTD_ChildDef_Simple($elements) {
+        if (is_string($elements)) {
+            $elements = str_replace(' ', '', $elements);
+            $elements = explode('|', $elements);
+        }
+        $elements = array_flip($elements);
+        foreach ($elements as $i => $x) $elements[$i] = true;
         $this->elements = $elements;
+        $this->gen = new HTML_Generator();
     }
 }
 class HTMLDTD_ChildDef_Required extends HTMLDTD_ChildDef_Simple
 {
     function validateChildren($tokens_of_children) {
-    
+        // if there are no tokens, delete parent node
+        if (empty($tokens_of_children)) return false;
+        
+        // the new set of children
+        $result = array();
+        
+        // current depth into the nest
+        $nesting = 0;
+        
+        // whether or not we're deleting a node
+        $is_deleting = false;
+        
+        // whether or not parsed character data is allowed
+        // this controls whether or not we silently drop a tag
+        // or generate escaped HTML from it
+        $pcdata_allowed = isset($this->elements['#PCDATA']);
+        
+        // a little sanity check to make sure it's not ALL whitespace
+        $all_whitespace = true;
+        
+        foreach ($tokens_of_children as $token) {
+            if (!empty($token->is_whitespace)) {
+                $result[] = $token;
+                continue;
+            }
+            $all_whitespace = false; // phew, we're not talking about whitespace
+            
+            $is_child = ($nesting == 0);
+            
+            if (is_a($token, 'MF_StartTag')) {
+                $nesting++;
+            } elseif (is_a($token, 'MF_EndTag')) {
+                $nesting--;
+            }
+            
+            if ($is_child) {
+                $is_deleting = false;
+                if (!isset($this->elements[$token->name])) {
+                    $is_deleting = true;
+                    if ($pcdata_allowed) {
+                        $result[] = new MF_Text($this->gen->generateFromToken($token));
+                    }
+                    continue;
+                }
+            }
+            if (!$is_deleting) {
+                $result[] = $token;
+            } elseif ($pcdata_allowed) {
+                $result[] = new MF_Text($this->gen->generateFromToken($token));
+            } else {
+                // drop silently
+            }
+        }
+        if (empty($result)) return false;
+        if ($all_whitespace) return false;
+        if ($tokens_of_children == $result) return true;
+        return $result;
     }
 }
 class HTMLDTD_ChildDef_Optional extends HTMLDTD_ChildDef_Simple
diff --git a/tests/HTML_Generator.php b/tests/HTML_Generator.php
index 97d77dd2..f867ea22 100644
--- a/tests/HTML_Generator.php
+++ b/tests/HTML_Generator.php
@@ -30,6 +30,9 @@ class Test_HTML_Generator extends UnitTestCase
         $inputs[4] = new MF_StartTag('asdf');
         $expect[4] = '<asdf>';
         
+        $inputs[5] = new MF_EmptyTag('br');
+        $expect[5] = '<br />';
+        
         foreach ($inputs as $i => $input) {
             $result = $this->gen->generateFromToken($input);
             $this->assertEqual($result, $expect[$i]);
diff --git a/tests/PureHTMLDefinition.php b/tests/PureHTMLDefinition.php
index 2c15fefa..56fe3d85 100644
--- a/tests/PureHTMLDefinition.php
+++ b/tests/PureHTMLDefinition.php
@@ -154,20 +154,26 @@ class Test_PureHTMLDefinition extends UnitTestCase
         
         $inputs[9] = array(
             new MF_StartTag('ol')
+            
            ,new MF_StartTag('li')
            ,new MF_Text('Item 1')
+           
            ,new MF_StartTag('li')
            ,new MF_Text('Item 2')
+           
            ,new MF_EndTag('ol')
             );
         $expect[9] = array(
             new MF_StartTag('ol')
+            
            ,new MF_StartTag('li')
            ,new MF_Text('Item 1')
            ,new MF_EndTag('li')
+           
            ,new MF_StartTag('li')
            ,new MF_Text('Item 2')
            ,new MF_EndTag('li')
+           
            ,new MF_EndTag('ol')
             );
         
@@ -181,4 +187,122 @@ class Test_PureHTMLDefinition extends UnitTestCase
     
 }
 
+class Test_HTMLDTD_ChildDef extends UnitTestCase
+{
+    
+    function test_simple() {
+        
+        $def = new HTMLDTD_ChildDef_Simple('foobar | bang |gizmo');
+        $this->assertEqual($def->elements,
+          array(
+            'foobar' => true
+           ,'bang'   => true
+           ,'gizmo'  => true
+          ));
+        
+        $def = new HTMLDTD_ChildDef_Simple(array('href', 'src'));
+        $this->assertEqual($def->elements,
+          array(
+            'href' => true
+           ,'src'  => true
+          ));
+    }
+    
+    function test_required_pcdata_forbidden() {
+        
+        $def = new HTMLDTD_ChildDef_Required('dt | dd');
+        
+        $inputs[0] = array();
+        $expect[0] = false;
+        
+        $inputs[1] = array(
+            new MF_StartTag('dt')
+           ,new MF_Text('Term')
+           ,new MF_EndTag('dt')
+           
+           ,new MF_Text('Text in an illegal location')
+           
+           ,new MF_StartTag('dd')
+           ,new MF_Text('Definition')
+           ,new MF_EndTag('dd')
+           
+           ,new MF_StartTag('b') // test tag removal too
+           ,new MF_EndTag('b')
+            );
+        $expect[1] = array(
+            new MF_StartTag('dt')
+           ,new MF_Text('Term')
+           ,new MF_EndTag('dt')
+           
+           ,new MF_StartTag('dd')
+           ,new MF_Text('Definition')
+           ,new MF_EndTag('dd')
+            );
+        
+        $inputs[2] = array(new MF_Text('How do you do!'));
+        $expect[2] = false;
+        
+        // whitespace-only text around valid children shouldn't trigger any changes
+        $inputs[3] = array(
+            new MF_Text("\n")
+           ,new MF_StartTag('dd')
+           ,new MF_Text('Definition')
+           ,new MF_EndTag('dd')
+           ,new MF_Text('       ')
+            );
+        $expect[3] = true;
+        
+        $inputs[4] = array(
+            new MF_StartTag('dd')
+           ,new MF_Text('Definition')
+           ,new MF_EndTag('dd')
+           ,new MF_Text('       ')
+           ,new MF_StartTag('b')
+           ,new MF_EndTag('b')
+           ,new MF_Text('       ')
+            );
+        $expect[4] = array(
+            new MF_StartTag('dd')
+           ,new MF_Text('Definition')
+           ,new MF_EndTag('dd')
+           ,new MF_Text('       ')
+           ,new MF_Text('       ')
+            );
+        $inputs[5] = array(
+            new MF_Text('       ')
+           ,new MF_Text("\t")
+            );
+        $expect[5] = false;
+        
+        foreach ($inputs as $i => $input) {
+            $result = $def->validateChildren($input);
+            if (is_bool($expect[$i])) {
+                $this->assertIdentical($expect[$i], $result);
+            } else {
+                $this->assertEqual($expect[$i], $result);
+                paintIf($result, $result != $expect[$i]);
+            }
+        }
+        
+    }
+    
+    function test_required_pcdata_allowed() {
+        $def = new HTMLDTD_ChildDef_Required('#PCDATA | b');
+        $input = array(
+            new MF_StartTag('b')
+           ,new MF_Text('Bold text')
+           ,new MF_EndTag('b')
+           ,new MF_EmptyTag('img') // illegal tag
+            );
+        $expect = array(
+            new MF_StartTag('b')
+           ,new MF_Text('Bold text')
+           ,new MF_EndTag('b')
+           ,new MF_Text('<img />')
+            );
+        $this->assertEqual($expect, $def->validateChildren($input));
+    }
+    
+}
+
 ?>
\ No newline at end of file