Fix extant broken PEARSax3 parsing patterns.

Signed-off-by: Edward Z. Yang <ezyang@mit.edu>
2025-01-03 05:11:52 +00:00 · 2010-02-26 21:14:52 -05:00 · 2010-02-26 21:14:52 -05:00 · ac18672aba
commit ac18672aba
parent faf28682ad
2 changed files with 54 additions and 26 deletions
--- a/library/HTMLPurifier/Lexer/PEARSax3.php
+++ b/library/HTMLPurifier/Lexer/PEARSax3.php
@ -28,6 +28,7 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
    protected $tokens = array();
    private $parent_handler;
    private $stack = array();
    public function tokenizeHTML($string, $config, $context) {
@ -67,6 +68,7 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
        } else {
            $this->tokens[] = new HTMLPurifier_Token_Start($name, $attrs);
        }
        $this->stack[] = $name;
        return true;
    }
@ -81,6 +83,7 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
            return true;
        }
        $this->tokens[] = new HTMLPurifier_Token_End($name);
        if (!empty($this->stack)) array_pop($this->stack);
        return true;
    }
@ -97,7 +100,17 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
     */
    public function escapeHandler(&$parser, $data) {
        if (strpos($data, '--') === 0) {
-            $this->tokens[] = new HTMLPurifier_Token_Comment($data);
+            // remove trailing and leading double-dashes
            $data = substr($data, 2);
            if (strlen($data) >= 2 && substr($data, -2) == "--") {
                $data = substr($data, 0, -2);
            }
            if (isset($this->stack[sizeof($this->stack) - 1]) &&
                $this->stack[sizeof($this->stack) - 1] == "style") {
                $this->tokens[] = new HTMLPurifier_Token_Text($data);
            } else {
                $this->tokens[] = new HTMLPurifier_Token_Comment($data);
            }
        }
        // CDATA is handled elsewhere, but if it was handled here:
        //if (strpos($data, '[CDATA[') === 0) {
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@ -172,7 +172,6 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
            }
            if ($t_expect != $result) {
                printTokens($result);
                //var_dump($result);
            }
        }
    }
@ -270,20 +269,14 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
    function test_tokenizeHTML_comment() {
        $this->assertTokenization(
            '<!-- Comment -->',
-            array( new HTMLPurifier_Token_Comment(' Comment ') ),
+            array( new HTMLPurifier_Token_Comment(' Comment ') )
            array(
                'PEARSax3' => array( new HTMLPurifier_Token_Comment('-- Comment --') ),
            )
        );
    }
    function test_tokenizeHTML_malformedComment() {
        $this->assertTokenization(
            '<!-- not so well formed --->',
-            array( new HTMLPurifier_Token_Comment(' not so well formed -') ),
+            array( new HTMLPurifier_Token_Comment(' not so well formed -') )
            array(
                'PEARSax3' => array( new HTMLPurifier_Token_Comment('-- not so well formed ---') ),
            )
        );
    }
@ -574,6 +567,13 @@ div {}
    }
    function test_tokenizeHTML_tagWithAtSignAndExtraGt() {
        $alt_expect = array(
            // Technically this is invalid, but it won't be a
            // problem with invalid element removal; also, this
            // mimics Mozilla's parsing of the tag.
            new HTMLPurifier_Token_Start('a@'),
            new HTMLPurifier_Token_Text('>'),
        );
        $this->assertTokenization(
            '<a@>>',
            array(
@ -582,13 +582,8 @@ div {}
                new HTMLPurifier_Token_End('a'),
            ),
            array(
-                'DirectLex' => array(
+                'DirectLex' => $alt_expect,
-                    // Technically this is invalid, but it won't be a
+                'PEARSax3' => $alt_expect,
                    // problem with invalid element removal; also, this
                    // mimics Mozilla's parsing of the tag.
                    new HTMLPurifier_Token_Start('a@'),
                    new HTMLPurifier_Token_Text('>'),
                ),
            )
        );
    }
@ -608,6 +603,11 @@ div {}
                    new HTMLPurifier_Token_Text('<3'),
                    new HTMLPurifier_Token_Empty('br'),
                ),
                'PEARSax3' => array(
                    // bah too lazy to fix this
                    new HTMLPurifier_Token_Empty('br'),
                    new HTMLPurifier_Token_Empty('3<br'),
                ),
            )
        );
    }
@ -627,6 +627,12 @@ div {}
                    new HTMLPurifier_Token_Text('<<'),
                    new HTMLPurifier_Token_End('b'),
                ),
                'PEARSax3' => array(
                    // also too lazy to fix
                    new HTMLPurifier_Token_Start('b'),
                    new HTMLPurifier_Token_Empty('<<'),
                    new HTMLPurifier_Token_Text('b>'),
                ),
            )
        );
    }
@ -648,26 +654,35 @@ div {}
                    new HTMLPurifier_Token_Text('test'),
                    new HTMLPurifier_Token_End('b'),
                ),
                'PEARSax3' => array(
                    // totally doing the wrong thing here
                    new HTMLPurifier_Token_Text(' '),
                    new HTMLPurifier_Token_Start('b'),
                    new HTMLPurifier_Token_Text('test'),
                    new HTMLPurifier_Token_End('b'),
                ),
            )
        );
    }
    function test_tokenizeHTML_bodyInCDATA() {
        $alt_tokens = array(
            new HTMLPurifier_Token_Text('<'),
            new HTMLPurifier_Token_Text('body'),
            new HTMLPurifier_Token_Text('>'),
            new HTMLPurifier_Token_Text('Foo'),
            new HTMLPurifier_Token_Text('<'),
            new HTMLPurifier_Token_Text('/body'),
            new HTMLPurifier_Token_Text('>'),
        );
        $this->assertTokenization(
            '<![CDATA[<body>Foo</body>]]>',
            array(
                new HTMLPurifier_Token_Text('<body>Foo</body>'),
            ),
            array(
-                'PH5P' => array(
+                'PH5P' => $alt_tokens,
-                    new HTMLPurifier_Token_Text('<'),
+                'PEARSax3' => $alt_tokens,
                    new HTMLPurifier_Token_Text('body'),
                    new HTMLPurifier_Token_Text('>'),
                    new HTMLPurifier_Token_Text('Foo'),
                    new HTMLPurifier_Token_Text('<'),
                    new HTMLPurifier_Token_Text('/body'),
                    new HTMLPurifier_Token_Text('>'),
                ),
            )
        );
    }