Clean up the last of the renaming (I had forgotten to rename the class itself) and hook in the SAX parser. Its behavior differs slightly, so you'll have to be careful.

git-svn-id: http://htmlpurifier.org/svnroot/html_purifier/trunk@21 48356398-32a2-884e-a903-53898d9a118a
2024-12-22 16:31:53 +00:00 · 2006-04-15 01:47:12 +00:00 · 2006-04-15 01:47:12 +00:00 · 181d544b61
commit 181d544b61
parent bf331d3a13
2 changed files with 58 additions and 8 deletions
--- a/HTML_Lexer.php
+++ b/HTML_Lexer.php
@ -9,7 +9,7 @@ TODO:

 */

-class MarkupLexer
+class HTML_Lexer
 {
    
    function nextQuote($string, $offset = 0) {
@ -98,7 +98,10 @@ class MarkupLexer
                    continue;
                }
                
-                // Check if it is self closing, if so, remove trailing slash
+                // Check whether the tag is explicitly self-closing; if so,
+                // remove the trailing slash. Remember that a tag like <br> has
+                // no slash, so later token-processing code must convert such
+                // tags, misclassified here as StartTags, into EmptyTags.
                $is_self_closing = (strpos($segment,'/') === strlen($segment) - 1);
                if ($is_self_closing) {
                    $segment = substr($segment, 0, strlen($segment) - 1);
@ -189,4 +192,45 @@ class MarkupLexer
    
 }

+// Uses the PEAR class XML_HTMLSax3 to tokenize HTML.
+class HTML_Lexer_Sax extends HTML_Lexer
+{
+    
+    var $tokens = array();
+    
+    function tokenizeHTML($html) {
+        $this->tokens = array();
+        $parser=& new XML_HTMLSax3();
+        $parser->set_object($this);
+        $parser->set_element_handler('openHandler','closeHandler');
+        $parser->set_data_handler('dataHandler');
+        $parser->set_escape_handler('escapeHandler');
+        $parser->parse($html);
+        return $this->tokens;
+    }
+    
+    function openHandler(&$parser, $name, $attrs) {
+        $this->tokens[] = new MF_StartTag($name, $attrs);
+        return true;
+    }
+    
+    function closeHandler(&$parser, $name) {
+        $this->tokens[] = new MF_EndTag($name);
+        return true;
+    }
+    
+    function dataHandler(&$parser, $data) {
+        $this->tokens[] = new MF_Text($data);
+        return true;
+    }
+    
+    function escapeHandler(&$parser, $data) {
+        if (strpos($data, '-') === 0) {
+            $this->tokens[] = new MF_Comment($data);
+        }
+        return true;
+    }
+    
+}
+
 ?>
--- a/tests/HTML_Lexer.php
+++ b/tests/HTML_Lexer.php
@ -1,16 +1,18 @@
 <?php

-class TestCase_MarkupLexer extends UnitTestCase
+class TestCase_HTML_Lexer extends UnitTestCase
 {
    
-    var $MarkupLexer;
+    var $HTML_Lexer;
+    var $HTML_Lexer_Sax;
    
    function setUp() {
-        $this->MarkupLexer =& new MarkupLexer();
+        $this->HTML_Lexer     =& new HTML_Lexer();
+        $this->HTML_Lexer_Sax =& new HTML_Lexer_Sax();
    }
    
    function test_nextWhiteSpace() {
-        $HP =& $this->MarkupLexer;
+        $HP =& $this->HTML_Lexer;
        $this->assertIdentical(false, $HP->nextWhiteSpace('asdf'));
        $this->assertIdentical(0, $HP->nextWhiteSpace(' asdf'));
        $this->assertIdentical(0, $HP->nextWhiteSpace("\nasdf"));
@ -90,9 +92,13 @@ class TestCase_MarkupLexer extends UnitTestCase
        
        $size = count($input);
        for($i = 0; $i < $size; $i++) {
-            $result = $this->MarkupLexer->tokenizeHTML($input[$i]);
+            $result = $this->HTML_Lexer->tokenizeHTML($input[$i]);
            $this->assertEqual($expect[$i], $result);
            paintIf($result, $expect[$i] != $result);
+            
+            // Since I didn't write the SAX parser, I can't define its expected
+            // output; just make sure the class runs without any errors.
+            $exp_result = $this->HTML_Lexer_Sax->tokenizeHTML($input[$i]);
        }
        
    }
@ -116,7 +122,7 @@ class TestCase_MarkupLexer extends UnitTestCase
        
        $size = count($input);
        for($i = 0; $i < $size; $i++) {
-            $result = $this->MarkupLexer->tokenizeAttributeString($input[$i]);
+            $result = $this->HTML_Lexer->tokenizeAttributeString($input[$i]);
            $this->assertEqual($expect[$i], $result);
            paintIf($result, $expect[$i] != $result);
        }