Build another lexer from PHP5's DOM library. Extremely fast!

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@80 48356398-32a2-884e-a903-53898d9a118a
2024-12-22 08:21:52 +00:00 · 2006-07-22 13:50:05 +00:00 · 2006-07-22 13:50:05 +00:00 · 4bf3305dff
commit 4bf3305dff
parent 3e982c7f2c
3 changed files with 130 additions and 10 deletions
--- a/Lexer/DOMLex.php
+++ b/Lexer/DOMLex.php
@ -0,0 +1,77 @@
+<?php
+
+require_once 'HTMLPurifier/Lexer.php';
+
+// PHP5 only!
+
+/**
+ * Lexer that delegates parsing to PHP5's DOM extension and then flattens
+ * the resulting node tree into a list of HTMLPurifier tokens.
+ */
+class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
+{
+    
+    /**
+     * Tokenizes an HTML fragment.
+     *
+     * The fragment is wrapped in an html/body/div scaffold, parsed with
+     * DOMDocument::loadHTML(), and the children of the wrapper div are
+     * converted to tokens (the wrapper itself is never emitted).
+     *
+     * @param string $string HTML fragment to tokenize
+     * @return array list of HTMLPurifier_Token_* objects; empty if the
+     *               wrapper element could not be located after parsing
+     */
+    public function tokenizeHTML($string) {
+        $doc = new DOMDocument();
+        // preprocess string: give the parser a known scaffold to hang
+        // the fragment on
+        $string = '<html><body><div>'.$string.'</div></body></html>';
+        // NOTE(review): loadHTML() is commonly reported to assume
+        // ISO-8859-1 when the markup declares no charset -- confirm what
+        // encoding callers pass in.
+        @$doc->loadHTML($string); // mute all errors, handle it transparently
+        
+        // Look the scaffold up by tag name instead of by child index:
+        // positional access silently depends on whether the parser put a
+        // DOCTYPE node in front of <html>.
+        $html = $doc->getElementsByTagName('html')->item(0);
+        if (!$html) return array();
+        $body = $html->getElementsByTagName('body')->item(0);
+        if (!$body) return array();
+        $div = $body->getElementsByTagName('div')->item(0);
+        if (!$div) return array();
+        
+        return $this->tokenizeDOM($div);
+    }
+    
+    /**
+     * Recursively converts a DOM node (and its descendants) into tokens.
+     *
+     * @param DOMNode $node    node to convert
+     * @param array   $tokens  accumulator of tokens gathered so far
+     * @param bool    $collect whether to emit tokens for $node itself;
+     *                         false on the initial call so the wrapper
+     *                         element is skipped and only its children
+     *                         are emitted
+     * @return array $tokens with the tokens for this subtree appended
+     */
+    protected function tokenizeDOM($node, $tokens = array(), $collect = false) {
+        // recursive goodness!
+        
+        // intercept non element nodes
+        if ( !($node instanceof DOMElement) ) {
+            if ($node instanceof DOMComment) {
+                $tokens[] = new HTMLPurifier_Token_Comment($node->data);
+            } elseif ($node instanceof DOMText) {
+                $tokens[] = new HTMLPurifier_Token_Text($node->data);
+            }
+            // quite possibly, the object wasn't handled, that's fine
+            return $tokens;
+        }
+        
+        // We still have to make sure that the element actually IS empty
+        if (!$node->hasChildNodes()) {
+            if ($collect) {
+                $tokens[] = new HTMLPurifier_Token_Empty(
+                    $node->tagName,
+                    $this->transformAttrToAssoc($node->attributes)
+                );
+            }
+            return $tokens;
+        }
+        
+        if ($collect) { // don't wrap on first iteration
+            $tokens[] = new HTMLPurifier_Token_Start(
+                $node->tagName,
+                $this->transformAttrToAssoc($node->attributes)
+            );
+        }
+        // use a separate loop variable: reusing $node here would clobber
+        // the element whose end tag we still have to emit below
+        foreach ($node->childNodes as $child) {
+            // remember, it's an accumulator. Otherwise, we'd have
+            // to use array_merge
+            $tokens = $this->tokenizeDOM($child, $tokens, true);
+        }
+        if ($collect) {
+            $tokens[] = new HTMLPurifier_Token_End($node->tagName);
+        }
+        
+        return $tokens;
+        
+    }
+    
+    /**
+     * Converts a DOM attribute collection into an associative array.
+     *
+     * @param DOMNamedNodeMap $attribute_list attributes of an element
+     * @return array map of attribute name => attribute value
+     */
+    protected function transformAttrToAssoc($attribute_list) {
+        $attribute_array = array();
+        // key on the attribute's own name rather than on the iteration
+        // key of the node map, which is undocumented behavior
+        foreach ($attribute_list as $attr) {
+            $attribute_array[$attr->name] = $attr->value;
+        }
+        return $attribute_array;
+    }
+    
+}
+
+?>
--- a/Token.php
+++ b/Token.php
@ -9,6 +9,8 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
    var $is_tag = true;
    var $name;
    function HTMLPurifier_Token_Tag($name) {
+        // watch out: XML is case-sensitive while HTML is case-insensitive,
+        // so lowercasing the name here means we can't use this for XML
        $this->name = strtolower($name); // for some reason, the SAX parser
                                         // uses uppercase. Investigate?
    }
@ -24,6 +26,8 @@ class HTMLPurifier_Token_RichTag extends HTMLPurifier_Token_Tag // abstract
    }
 }

+// start CONCRETE ones
+
 class HTMLPurifier_Token_Start extends HTMLPurifier_Token_RichTag
 {
    var $type = 'start';
--- a/tests/Lexer.php
+++ b/tests/Lexer.php
@ -2,16 +2,20 @@

 require_once 'HTMLPurifier/Lexer/DirectLex.php';
 require_once 'HTMLPurifier/Lexer/PEARSax3.php';
+require_once 'HTMLPurifier/Lexer/DOMLex.php';

 class Test_HTMLPurifier_Lexer extends UnitTestCase
 {
    
-    var $DirectLex;
-    var $PEARSax3;
+    var $DirectLex, $PEARSax3, $DOMLex;
+    var $_has_dom;
    
    function setUp() {
-        $this->DirectLex =& new HTMLPurifier_Lexer_DirectLex();
-        $this->PEARSax3  =& new HTMLPurifier_Lexer_PEARSax3();
+        $this->DirectLex = new HTMLPurifier_Lexer_DirectLex();
+        $this->PEARSax3  = new HTMLPurifier_Lexer_PEARSax3();
+        $this->DOMLex    = new HTMLPurifier_Lexer_DOMLex();
+        
+        $this->_has_dom = version_compare(PHP_VERSION, '5', '>=');
    }
    
    function test_nextWhiteSpace() {
@ -67,6 +71,7 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase
           ,new HTMLPurifier_Token_End('div')
            );
        
+        // [XML-INVALID]
        $input[4] = '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>';
        $expect[4] = array(
            new HTMLPurifier_Token_Start('asdf')
@ -79,6 +84,17 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase
           ,new HTMLPurifier_Token_End('asdf')
           ,new HTMLPurifier_Token_End('ASDF')
            );
+        // DOM is different because it condenses empty tags into REAL empty ones
+        // and also makes the markup well-formed
+        $dom_expect[4] = array(
+            new HTMLPurifier_Token_Empty('asdf')
+           ,new HTMLPurifier_Token_Empty('d')
+           ,new HTMLPurifier_Token_Start('pooloka')
+           ,new HTMLPurifier_Token_Start('poolasdf')
+           ,new HTMLPurifier_Token_Empty('ds')
+           ,new HTMLPurifier_Token_End('poolasdf')
+           ,new HTMLPurifier_Token_End('pooloka')
+            );
        
        $input[5] = '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>';
        $expect[5] = array(
@ -95,7 +111,7 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase
            new HTMLPurifier_Token_Empty('br')
            );
        
-        // [INVALID] [RECOVERABLE]
+        // [SGML-INVALID] [RECOVERABLE]
        $input[7] = '<!-- Comment --> <!-- not so well formed --->';
        $expect[7] = array(
            new HTMLPurifier_Token_Comment(' Comment ')
@ -104,7 +120,7 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase
            );
        $sax_expect[7] = false; // we need to figure out proper comment output
        
-        // [INVALID]
+        // [SGML-INVALID]
        $input[8] = '<a href=""';
        $expect[8] = array(
            new HTMLPurifier_Token_Text('<a href=""')
@ -113,6 +129,10 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase
        $sax_expect[8] = array(
            new HTMLPurifier_Token_Start('a', array('href'=>''))
            ); 
+        // DOM parses it into an empty tag
+        $dom_expect[8] = array(
+            new HTMLPurifier_Token_Empty('a', array('href'=>''))
+            ); 
        
        $input[9] = '&lt;b&gt;';
        $expect[9] = array(
@ -126,11 +146,15 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase
        // note that SAX can clump text nodes together. We won't be
        // too picky though
        
-        // [INVALID]
+        // [SGML-INVALID]
        $input[10] = '<a "=>';
        $expect[10] = array(
            new HTMLPurifier_Token_Start('a', array('"' => ''))
            );
+        // DOM doesn't register an invalid attribute
+        $dom_expect[10] = array(
+            new HTMLPurifier_Token_Empty('a')
+            );
        
        // [INVALID] [RECOVERABLE]
        $input[11] = '"';
@ -144,27 +168,42 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase
        
        foreach($input as $i => $discard) {
            $result = $this->DirectLex->tokenizeHTML($input[$i]);
-            $this->assertEqual($expect[$i], $result);
+            $this->assertEqual($expect[$i], $result, 'Test '.$i.': %s');
            paintIf($result, $expect[$i] != $result);
            
            // assert unless I say otherwise
            $sax_result = $this->PEARSax3->tokenizeHTML($input[$i]);
            if (!isset($sax_expect[$i])) {
                // by default, assert with normal result
-                $this->assertEqual($expect[$i], $sax_result);
+                $this->assertEqual($expect[$i], $sax_result, 'Test '.$i.': %s');
                paintIf($sax_result, $expect[$i] != $sax_result);
            } elseif ($sax_expect[$i] === false) {
                // assertions were turned off, optionally dump
                // paintIf($sax_expect, $i == NUMBER);
            } else {
                // match with a custom SAX result array
-                $this->assertEqual($sax_expect[$i], $sax_result);
+                $this->assertEqual($sax_expect[$i], $sax_result, 'Test '.$i.': %s');
                paintIf($sax_result, $sax_expect[$i] != $sax_result);
            }
+            if ($this->_has_dom) {
+                $dom_result = $this->DOMLex->tokenizeHTML($input[$i]);
+                // same structure as SAX
+                if (!isset($dom_expect[$i])) {
+                    $this->assertEqual($expect[$i], $dom_result, 'Test '.$i.': %s');
+                    paintIf($dom_result, $expect[$i] != $dom_result);
+                } elseif ($dom_expect[$i] === false) {
+                    // paintIf($dom_result, $i == NUMBER);
+                } else {
+                    $this->assertEqual($dom_expect[$i], $dom_result, 'Test '.$i.': %s');
+                    paintIf($dom_result, $dom_expect[$i] != $dom_result);
+                }
+            }
+            
        }
        
    }
    
+    // internals testing
    function test_tokenizeAttributeString() {
        
        $input[] = 'href="asdf" boom="assdf"';