Build another lexer from PHP5's DOM library. Extremely fast!

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@80 48356398-32a2-884e-a903-53898d9a118a
2024-12-23 00:41:52 +00:00 · 2006-07-22 13:50:05 +00:00 · 2006-07-22 13:50:05 +00:00 · 4bf3305dff
commit 4bf3305dff
parent 3e982c7f2c
3 changed files with 130 additions and 10 deletions
--- a/Lexer/DOMLex.php
+++ b/Lexer/DOMLex.php
@ -0,0 +1,77 @@
 <?php
 require_once 'HTMLPurifier/Lexer.php';
 // PHP5 only!
 /**
  * Lexer that builds the token stream by letting PHP5's DOM extension
  * parse the markup and then walking the resulting node tree.
  *
  * Because the tokens come from an already-parsed tree, an element that
  * ends up with no child nodes is emitted as a single Empty token rather
  * than as a Start/End pair.
  */
 class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
 {
    /**
     * Tokenizes an HTML fragment.
     *
     * The fragment is wrapped in a html/body/div skeleton so the parser
     * always sees a full document; the wrapper div itself is never emitted.
     *
     * NOTE(review): loadHTML() is given no charset hint here; presumably
     * non-ASCII input needs an encoding declaration -- confirm.
     *
     * @param string $string HTML fragment to tokenize
     * @return array list of HTMLPurifier_Token_* objects in document order
     */
    public function tokenizeHTML($string) {
        $doc = new DOMDocument();
        // preprocess string: give the parser a complete document to chew on
        $string = '<html><body><div>'.$string.'</div></body></html>';
        @$doc->loadHTML($string); // mute all errors, handle it transparently
        // Look the wrapper up by name instead of by fixed child positions:
        // positional access breaks (fatal error on null) whenever the
        // document does not start with exactly one node before <html>.
        // Our wrapper is the first div in document order.
        $wrapper = $doc->getElementsByTagName('div')->item(0);
        if ($wrapper === null) {
            // nothing parseable came back, so there is nothing to tokenize
            return array();
        }
        return $this->tokenizeDOM($wrapper);
    }
    /**
     * Recursively converts a DOM node and its descendants into tokens.
     *
     * @param DOMNode $node    node to convert
     * @param array   $tokens  accumulator holding the tokens collected so
     *                         far; the extended list is returned
     * @param bool    $collect whether to emit tokens for $node itself;
     *                         false on the first call so that the wrapper
     *                         element is skipped and only its children
     *                         are tokenized
     * @return array the accumulator with the new tokens appended
     */
    protected function tokenizeDOM($node, $tokens = array(), $collect = false) {
        // intercept non element nodes
        if ( !($node instanceof DOMElement) ) {
            if ($node instanceof DOMComment) {
                $tokens[] = new HTMLPurifier_Token_Comment($node->data);
            } elseif ($node instanceof DOMText) {
                $tokens[] = new HTMLPurifier_Token_Text($node->data);
            }
            // quite possibly, the object wasn't handled, that's fine
            return $tokens;
        }
        // an element without children becomes a single Empty token
        if (!$node->hasChildNodes()) {
            if ($collect) {
                $tokens[] = new HTMLPurifier_Token_Empty(
                    $node->tagName,
                    $this->transformAttrToAssoc($node->attributes)
                );
            }
            return $tokens;
        }
        if ($collect) { // don't wrap on first iteration
            $tokens[] = new HTMLPurifier_Token_Start(
                $node->tagName,
                $this->transformAttrToAssoc($node->attributes)
            );
        }
        // Use a dedicated loop variable: iterating "as $node" overwrote the
        // current element, which is why its tag name used to get dropped
        // before the End token could be built.
        foreach ($node->childNodes as $child) {
            // remember, it's an accumulator. Otherwise, we'd have
            // to use array_merge
            $tokens = $this->tokenizeDOM($child, $tokens, true);
        }
        if ($collect) {
            $tokens[] = new HTMLPurifier_Token_End($node->tagName);
        }
        return $tokens;
    }
    /**
     * Flattens an element's attribute collection into a plain array.
     *
     * @param DOMNamedNodeMap $attribute_list attributes of an element, as
     *                                        found in DOMElement::$attributes
     * @return array map of iteration key => attribute value
     */
    protected function transformAttrToAssoc($attribute_list) {
        $attribute_array = array();
        // undocumented behavior: iterating the map yields usable keys
        // NOTE(review): keys are presumably the attribute names -- confirm
        foreach ($attribute_list as $key => $attr) {
            $attribute_array[$key] = $attr->value;
        }
        return $attribute_array;
    }
 }
 ?>
--- a/Token.php
+++ b/Token.php
@ -9,6 +9,8 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
    var $is_tag = true;
    var $name;
    function HTMLPurifier_Token_Tag($name) {
        // watch out, actually XML is case-sensitive, while HTML
        // is case insensitive, which means we can't use this for XML
        $this->name = strtolower($name); // for some reason, the SAX parser
                                         // uses uppercase. Investigate?
    }
@ -24,6 +26,8 @@ class HTMLPurifier_Token_RichTag extends HTMLPurifier_Token_Tag // abstract
    }
 }
 // start CONCRETE ones
 class HTMLPurifier_Token_Start extends HTMLPurifier_Token_RichTag
 {
    var $type = 'start';
--- a/tests/Lexer.php
+++ b/tests/Lexer.php
@ -2,16 +2,20 @@
 require_once 'HTMLPurifier/Lexer/DirectLex.php';
 require_once 'HTMLPurifier/Lexer/PEARSax3.php';
 require_once 'HTMLPurifier/Lexer/DOMLex.php';
 class Test_HTMLPurifier_Lexer extends UnitTestCase
 {
-    var $DirectLex;
+    var $DirectLex, $PEARSax3, $DOMLex;
-    var $PEARSax3;
+    var $_has_dom;
    function setUp() {
-        $this->DirectLex =& new HTMLPurifier_Lexer_DirectLex();
+        $this->DirectLex = new HTMLPurifier_Lexer_DirectLex();
-        $this->PEARSax3  =& new HTMLPurifier_Lexer_PEARSax3();
+        $this->PEARSax3  = new HTMLPurifier_Lexer_PEARSax3();
        $this->DOMLex    = new HTMLPurifier_Lexer_DOMLex();
        $this->_has_dom = version_compare(PHP_VERSION, '5', '>=');
    }
    function test_nextWhiteSpace() {
@ -67,6 +71,7 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase
           ,new HTMLPurifier_Token_End('div')
            );
        // [XML-INVALID]
        $input[4] = '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>';
        $expect[4] = array(
            new HTMLPurifier_Token_Start('asdf')
@ -79,6 +84,17 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase
           ,new HTMLPurifier_Token_End('asdf')
           ,new HTMLPurifier_Token_End('ASDF')
            );
        // DOM is different because it condenses empty tags into REAL empty ones
        // as well as makes it well-formed
        $dom_expect[4] = array(
            new HTMLPurifier_Token_Empty('asdf')
           ,new HTMLPurifier_Token_Empty('d')
           ,new HTMLPurifier_Token_Start('pooloka')
           ,new HTMLPurifier_Token_Start('poolasdf')
           ,new HTMLPurifier_Token_Empty('ds')
           ,new HTMLPurifier_Token_End('poolasdf')
           ,new HTMLPurifier_Token_End('pooloka')
            );
        $input[5] = '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>';
        $expect[5] = array(
@ -95,7 +111,7 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase
            new HTMLPurifier_Token_Empty('br')
            );
-        // [INVALID] [RECOVERABLE]
+        // [SGML-INVALID] [RECOVERABLE]
        $input[7] = '<!-- Comment --> <!-- not so well formed --->';
        $expect[7] = array(
            new HTMLPurifier_Token_Comment(' Comment ')
@ -104,7 +120,7 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase
            );
        $sax_expect[7] = false; // we need to figure out proper comment output
-        // [INVALID]
+        // [SGML-INVALID]
        $input[8] = '<a href=""';
        $expect[8] = array(
            new HTMLPurifier_Token_Text('<a href=""')
@ -113,6 +129,10 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase
        $sax_expect[8] = array(
            new HTMLPurifier_Token_Start('a', array('href'=>''))
            ); 
        // DOM parses it into an empty tag
        $dom_expect[8] = array(
            new HTMLPurifier_Token_Empty('a', array('href'=>''))
            ); 
        $input[9] = '&lt;b&gt;';
        $expect[9] = array(
@ -126,11 +146,15 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase
        // note that SAX can clump text nodes together. We won't be
        // too picky though
-        // [INVALID]
+        // [SGML-INVALID]
        $input[10] = '<a "=>';
        $expect[10] = array(
            new HTMLPurifier_Token_Start('a', array('"' => ''))
            );
        // DOM doesn't register an invalid attribute
        $dom_expect[10] = array(
            new HTMLPurifier_Token_Empty('a')
            );
        // [INVALID] [RECOVERABLE]
        $input[11] = '"';
@ -144,27 +168,42 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase
        foreach($input as $i => $discard) {
            $result = $this->DirectLex->tokenizeHTML($input[$i]);
-            $this->assertEqual($expect[$i], $result);
+            $this->assertEqual($expect[$i], $result, 'Test '.$i.': %s');
            paintIf($result, $expect[$i] != $result);
            // assert unless I say otherwise
            $sax_result = $this->PEARSax3->tokenizeHTML($input[$i]);
            if (!isset($sax_expect[$i])) {
                // by default, assert with normal result
-                $this->assertEqual($expect[$i], $sax_result);
+                $this->assertEqual($expect[$i], $sax_result, 'Test '.$i.': %s');
                paintIf($sax_result, $expect[$i] != $sax_result);
            } elseif ($sax_expect[$i] === false) {
                // assertions were turned off, optionally dump
                // paintIf($sax_expect, $i == NUMBER);
            } else {
                // match with a custom SAX result array
-                $this->assertEqual($sax_expect[$i], $sax_result);
+                $this->assertEqual($sax_expect[$i], $sax_result, 'Test '.$i.': %s');
                paintIf($sax_result, $sax_expect[$i] != $sax_result);
            }
            if ($this->_has_dom) {
                $dom_result = $this->DOMLex->tokenizeHTML($input[$i]);
                // same structure as SAX
                if (!isset($dom_expect[$i])) {
                    $this->assertEqual($expect[$i], $dom_result, 'Test '.$i.': %s');
                    paintIf($dom_result, $expect[$i] != $dom_result);
                } elseif ($dom_expect[$i] === false) {
                    // paintIf($dom_result, $i == NUMBER);
                } else {
                    $this->assertEqual($dom_expect[$i], $dom_result, 'Test '.$i.': %s');
                    paintIf($dom_result, $dom_expect[$i] != $dom_result);
                }
            }
        }
    }
    // internals testing
    function test_tokenizeAttributeString() {
        $input[] = 'href="asdf" boom="assdf"';