Clean up the final renaming leftovers (how could I forget to rename the class?) and hook in the SAX parser. Its behavior is slightly different, so you'll have to be careful.

git-svn-id: http://htmlpurifier.org/svnroot/html_purifier/trunk@21 48356398-32a2-884e-a903-53898d9a118a
2025-03-24 14:57:01 +00:00 · 2006-04-15 01:47:12 +00:00 · 2006-04-15 01:47:12 +00:00 · 181d544b61
commit 181d544b61
parent bf331d3a13
2 changed files with 58 additions and 8 deletions
--- a/HTML_Lexer.php
+++ b/HTML_Lexer.php
@ -9,7 +9,7 @@ TODO:
 */
-class MarkupLexer
+class HTML_Lexer
 {
    function nextQuote($string, $offset = 0) {
@ -98,7 +98,10 @@ class MarkupLexer
                    continue;
                }
-                // Check if it is self closing, if so, remove trailing slash
+                // Check if it is explicitly self closing, if so, remove
                // trailing slash. Remember, we could have a tag like <br>, so
                // any later token processing scripts must convert improperly
                // classified EmptyTags from StartTags.
                $is_self_closing = (strpos($segment,'/') === strlen($segment) - 1);
                if ($is_self_closing) {
                    $segment = substr($segment, 0, strlen($segment) - 1);
@ -189,4 +192,45 @@ class MarkupLexer
 }
 // Alternative lexer that hands the markup to the PEAR class XML_HTMLSax3
 // instead of using the hand-written tokenizer in HTML_Lexer. The handler
 // methods below each append one token object to $this->tokens.
 // Note that every opening tag is recorded as an MF_StartTag; this class
 // never emits a separate "empty tag" token.
 class HTML_Lexer_Sax extends HTML_Lexer
 {
    // Tokens collected by the handler methods during the current parse;
    // reset at the start of every tokenizeHTML() call.
    var $tokens = array();
    // Tokenizes $html by running it through a fresh XML_HTMLSax3 instance.
    //   $html - markup string passed unchanged to the parser's parse().
    // Returns the array of MF_StartTag / MF_EndTag / MF_Text / MF_Comment
    // objects, in the order the handlers appended them.
    function tokenizeHTML($html) {
        // start from a clean slate so repeated calls do not accumulate
        $this->tokens = array();
        // NOTE(review): `=& new` is PHP 4 style assignment by reference; it
        // is deprecated in PHP 5 and a parse error in PHP 7+. Whether a
        // plain `=` is safe depends on which PHP versions must still be
        // supported - confirm before changing.
        $parser=& new XML_HTMLSax3();
        // register this object and the names of its handler methods;
        // presumably the parser calls them on $this while parsing - confirm
        // against the XML_HTMLSax3 documentation
        $parser->set_object($this);
        $parser->set_element_handler('openHandler','closeHandler');
        $parser->set_data_handler('dataHandler');
        $parser->set_escape_handler('escapeHandler');
        $parser->parse($html);
        return $this->tokens;
    }
    // Element-open handler: records a start tag with its attributes.
    function openHandler(&$parser, $name, $attrs) {
        $this->tokens[] = new MF_StartTag($name, $attrs);
        return true;
    }
    // Element-close handler: records an end tag.
    function closeHandler(&$parser, $name) {
        $this->tokens[] = new MF_EndTag($name);
        return true;
    }
    // Data handler: records the received data as a text token.
    function dataHandler(&$parser, $data) {
        $this->tokens[] = new MF_Text($data);
        return true;
    }
    // Escape handler: records a comment token only when $data begins with
    // '-'; any other escape data is silently dropped.
    function escapeHandler(&$parser, $data) {
        // NOTE(review): only a single leading '-' is tested, and $data is
        // stored as received. Presumably comment data arrives with its
        // surrounding dashes ("-- ... --") - verify against the parser.
        if (strpos($data, '-') === 0) {
            $this->tokens[] = new MF_Comment($data);
        }
        return true;
    }
 }
 ?>
--- a/tests/HTML_Lexer.php
+++ b/tests/HTML_Lexer.php
@ -1,16 +1,18 @@
 <?php
-class TestCase_MarkupLexer extends UnitTestCase
+class TestCase_HTML_Lexer extends UnitTestCase
 {
-    var $MarkupLexer;
+    var $HTML_Lexer;
    var $HTML_Lexer_Sax;
    function setUp() {
-        $this->MarkupLexer =& new MarkupLexer();
+        $this->HTML_Lexer     =& new HTML_Lexer();
        $this->HTML_Lexer_Sax =& new HTML_Lexer_Sax();
    }
    function test_nextWhiteSpace() {
-        $HP =& $this->MarkupLexer;
+        $HP =& $this->HTML_Lexer;
        $this->assertIdentical(false, $HP->nextWhiteSpace('asdf'));
        $this->assertIdentical(0, $HP->nextWhiteSpace(' asdf'));
        $this->assertIdentical(0, $HP->nextWhiteSpace("\nasdf"));
@ -90,9 +92,13 @@ class TestCase_MarkupLexer extends UnitTestCase
        $size = count($input);
        for($i = 0; $i < $size; $i++) {
-            $result = $this->MarkupLexer->tokenizeHTML($input[$i]);
+            $result = $this->HTML_Lexer->tokenizeHTML($input[$i]);
            $this->assertEqual($expect[$i], $result);
            paintIf($result, $expect[$i] != $result);
            // since I didn't write the parser, I can't define its behavior
            // however, make sure that the class runs without any errors
            $exp_result = $this->HTML_Lexer_Sax->tokenizeHTML($input[$i]);
        }
    }
@ -116,7 +122,7 @@ class TestCase_MarkupLexer extends UnitTestCase
        $size = count($input);
        for($i = 0; $i < $size; $i++) {
-            $result = $this->MarkupLexer->tokenizeAttributeString($input[$i]);
+            $result = $this->HTML_Lexer->tokenizeAttributeString($input[$i]);
            $this->assertEqual($expect[$i], $result);
            paintIf($result, $expect[$i] != $result);
        }