From 4bf3305dffd68db38ad5d73e364fdc6d410c6443 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <edwardzyang@thewritingpot.com>
Date: Sat, 22 Jul 2006 13:50:05 +0000
Subject: [PATCH] Build another lexer from PHP5's DOM library. Extremely fast!

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@80 48356398-32a2-884e-a903-53898d9a118a
---
 Lexer/DOMLex.php | 77 ++++++++++++++++++++++++++++++++++++++++++++++++
 Token.php        |  4 +++
 tests/Lexer.php  | 59 ++++++++++++++++++++++++++++++-------
 3 files changed, 130 insertions(+), 10 deletions(-)
 create mode 100644 Lexer/DOMLex.php
diff --git a/Lexer/DOMLex.php b/Lexer/DOMLex.php
new file mode 100644
index 00000000..8b72aa24
--- /dev/null
+++ b/Lexer/DOMLex.php
@@ -0,0 +1,77 @@
+<?php
+
+require_once 'HTMLPurifier/Lexer.php';
+
+// PHP5 only!
+
+/** Lexer built on PHP5's DOM extension: libxml parses, we walk the tree. */
+class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
+{
+    
+    /**
+     * Lexes an HTML fragment by parsing it with DOMDocument::loadHTML().
+     * @param $string HTML fragment to lex
+     * @return array of HTMLPurifier_Token objects, in document order
+     */
+    public function tokenizeHTML($string) {
+        $doc = new DOMDocument();
+        // wrap the fragment in a known container so it can be found again
+        $string = '<html><body><div>'.$string.'</div></body></html>';
+        @$doc->loadHTML($string); // mute all errors, handle it transparently
+        // find the wrapper by name, not by child index; null => no tokens
+        return $this->tokenizeDOM($doc->getElementsByTagName('div')->item(0));
+    }
+    
+    /**
+     * Recursively flattens a DOM node into tokens (accumulator style).
+     * @param $node    node to convert; non-DOMNode values add nothing
+     * @param $tokens  array of tokens gathered so far
+     * @param $collect emit tags for $node itself? false for the wrapper
+     * @return array   $tokens with this node's tokens appended
+     */
+    protected function tokenizeDOM($node, $tokens = array(), $collect = false) {
+        // comments and text become tokens, other non-elements are skipped
+        if (!($node instanceof DOMElement)) {
+            if ($node instanceof DOMComment) {
+                $tokens[] = new HTMLPurifier_Token_Comment($node->data);
+            } elseif ($node instanceof DOMText) {
+                $tokens[] = new HTMLPurifier_Token_Text($node->data);
+            }
+            return $tokens;
+        }
+        if ($collect) {
+            $attr = $this->transformAttrToAssoc($node->attributes);
+            // childless elements collapse into a single empty-tag token
+            if (!$node->hasChildNodes()) {
+                $tokens[] = new HTMLPurifier_Token_Empty($node->tagName, $attr);
+                return $tokens;
+            }
+            $tokens[] = new HTMLPurifier_Token_Start($node->tagName, $attr);
+        }
+        // a separate loop variable keeps $node intact for the end tag
+        foreach ($node->childNodes as $child) {
+            $tokens = $this->tokenizeDOM($child, $tokens, true);
+        }
+        if ($collect) {
+            $tokens[] = new HTMLPurifier_Token_End($node->tagName);
+        }
+        return $tokens;
+    }
+    
+    /**
+     * Converts a DOMNamedNodeMap of attributes to a name => value array.
+     * @param $attribute_list DOMNamedNodeMap from DOMElement->attributes
+     * @return array          attribute values keyed by attribute name
+     */
+    protected function transformAttrToAssoc($attribute_list) {
+        $attribute_array = array();
+        // undocumented: iterating the map yields name => DOMAttr pairs
+        foreach ($attribute_list as $key => $attr) {
+            $attribute_array[$key] = $attr->value;
+        }
+        return $attribute_array;
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/Token.php b/Token.php
index 1ca8a1db..90b0e3e4 100644
--- a/Token.php
+++ b/Token.php
@@ -9,6 +9,8 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
     var $is_tag = true;
     var $name;
     function HTMLPurifier_Token_Tag($name) {
+        // watch out, actually XML is case-sensitive, while HTML
+        // is case insensitive, which means we can't use this for XML
         $this->name = strtolower($name); // for some reason, the SAX parser
                                          // uses uppercase. Investigate?
     }
@@ -24,6 +26,8 @@ class HTMLPurifier_Token_RichTag extends HTMLPurifier_Token_Tag // abstract
     }
 }
 
+// start CONCRETE ones
+
 class HTMLPurifier_Token_Start extends HTMLPurifier_Token_RichTag
 {
     var $type = 'start';
diff --git a/tests/Lexer.php b/tests/Lexer.php
index 18c7ca18..8c632745 100644
--- a/tests/Lexer.php
+++ b/tests/Lexer.php
@@ -2,16 +2,20 @@
 
 require_once 'HTMLPurifier/Lexer/DirectLex.php';
 require_once 'HTMLPurifier/Lexer/PEARSax3.php';
+require_once 'HTMLPurifier/Lexer/DOMLex.php';
 
 class Test_HTMLPurifier_Lexer extends UnitTestCase
 {
     
-    var $DirectLex;
-    var $PEARSax3;
+    var $DirectLex, $PEARSax3, $DOMLex;
+    var $_has_dom;
     
     function setUp() {
-        $this->DirectLex =& new HTMLPurifier_Lexer_DirectLex();
-        $this->PEARSax3  =& new HTMLPurifier_Lexer_PEARSax3();
+        $this->DirectLex = new HTMLPurifier_Lexer_DirectLex();
+        $this->PEARSax3  = new HTMLPurifier_Lexer_PEARSax3();
+        $this->DOMLex    = new HTMLPurifier_Lexer_DOMLex();
+        
+        $this->_has_dom = version_compare(PHP_VERSION, '5', '>=');
     }
     
     function test_nextWhiteSpace() {
@@ -67,6 +71,7 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase
            ,new HTMLPurifier_Token_End('div')
             );
         
+        // [XML-INVALID]
         $input[4] = '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>';
         $expect[4] = array(
             new HTMLPurifier_Token_Start('asdf')
@@ -79,6 +84,17 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase
            ,new HTMLPurifier_Token_End('asdf')
            ,new HTMLPurifier_Token_End('ASDF')
             );
+        // DOM differs: it condenses childless tag pairs into REAL empty tags
+        // and repairs the nesting so that the output is well-formed
+        $dom_expect[4] = array(
+            new HTMLPurifier_Token_Empty('asdf')
+           ,new HTMLPurifier_Token_Empty('d')
+           ,new HTMLPurifier_Token_Start('pooloka')
+           ,new HTMLPurifier_Token_Start('poolasdf')
+           ,new HTMLPurifier_Token_Empty('ds')
+           ,new HTMLPurifier_Token_End('poolasdf')
+           ,new HTMLPurifier_Token_End('pooloka')
+            );
         
         $input[5] = '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>';
         $expect[5] = array(
@@ -95,7 +111,7 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase
             new HTMLPurifier_Token_Empty('br')
             );
         
-        // [INVALID] [RECOVERABLE]
+        // [SGML-INVALID] [RECOVERABLE]
         $input[7] = '<!-- Comment --> <!-- not so well formed --->';
         $expect[7] = array(
             new HTMLPurifier_Token_Comment(' Comment ')
@@ -104,7 +120,7 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase
             );
         $sax_expect[7] = false; // we need to figure out proper comment output
         
-        // [INVALID]
+        // [SGML-INVALID]
         $input[8] = '<a href=""';
         $expect[8] = array(
             new HTMLPurifier_Token_Text('<a href=""')
@@ -113,6 +129,10 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase
         $sax_expect[8] = array(
             new HTMLPurifier_Token_Start('a', array('href'=>''))
             ); 
+        // DOM parses it into an empty tag
+        $dom_expect[8] = array(
+            new HTMLPurifier_Token_Empty('a', array('href'=>''))
+            ); 
         
         $input[9] = '&lt;b&gt;';
         $expect[9] = array(
@@ -126,11 +146,15 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase
         // note that SAX can clump text nodes together. We won't be
         // too picky though
         
-        // [INVALID]
+        // [SGML-INVALID]
         $input[10] = '<a "=>';
         $expect[10] = array(
             new HTMLPurifier_Token_Start('a', array('"' => ''))
             );
+        // DOM doesn't register an invalid attribute
+        $dom_expect[10] = array(
+            new HTMLPurifier_Token_Empty('a')
+            );
         
         // [INVALID] [RECOVERABLE]
         $input[11] = '"';
@@ -144,27 +168,42 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase
         
         foreach($input as $i => $discard) {
             $result = $this->DirectLex->tokenizeHTML($input[$i]);
-            $this->assertEqual($expect[$i], $result);
+            $this->assertEqual($expect[$i], $result, 'Test '.$i.': %s');
             paintIf($result, $expect[$i] != $result);
             
             // assert unless I say otherwise
             $sax_result = $this->PEARSax3->tokenizeHTML($input[$i]);
             if (!isset($sax_expect[$i])) {
                 // by default, assert with normal result
-                $this->assertEqual($expect[$i], $sax_result);
+                $this->assertEqual($expect[$i], $sax_result, 'Test '.$i.': %s');
                 paintIf($sax_result, $expect[$i] != $sax_result);
             } elseif ($sax_expect[$i] === false) {
                 // assertions were turned off, optionally dump
                 // paintIf($sax_expect, $i == NUMBER);
             } else {
                 // match with a custom SAX result array
-                $this->assertEqual($sax_expect[$i], $sax_result);
+                $this->assertEqual($sax_expect[$i], $sax_result, 'Test '.$i.': %s');
                 paintIf($sax_result, $sax_expect[$i] != $sax_result);
             }
+            if ($this->_has_dom) {
+                $dom_result = $this->DOMLex->tokenizeHTML($input[$i]);
+                // same structure as SAX
+                if (!isset($dom_expect[$i])) {
+                    $this->assertEqual($expect[$i], $dom_result, 'Test '.$i.': %s');
+                    paintIf($dom_result, $expect[$i] != $dom_result);
+                } elseif ($dom_expect[$i] === false) {
+                    // paintIf($dom_result, $i == NUMBER);
+                } else {
+                    $this->assertEqual($dom_expect[$i], $dom_result, 'Test '.$i.': %s');
+                    paintIf($dom_result, $dom_expect[$i] != $dom_result);
+                }
+            }
+            
         }
         
     }
     
+    // internals testing
     function test_tokenizeAttributeString() {
         
         $input[] = 'href="asdf" boom="assdf"';