Refine Lexers for parsing stray angled brackets; %Core.AggressivelyFixLt = true

By default, the DirectLex and DOMLex behavior with stray angled brackets varied a great deal due to their implementations. A little-known directive, %Core.AggressivelyFixLt, attempted to match DOMLex's behavior with DirectLex's, but it was off by default. Now that it is on by default, users enjoy these benefits, and performance-minded users can turn it back off. Also, several refinements to stray angled bracket parsing were made. Specifically: * DirectLex: Handle each left angled bracket individually, which prevents strange behavior as reported by eon. * DOMLex: Iterate aggressive lt fix, so that stacked brackets like << are handled. Signed-off-by: Edward Z. Yang <edwardzyang@thewritingpot.com>
2025-03-24 06:47:02 +00:00 · 2008-06-28 00:43:02 -04:00 · 2008-06-28 00:43:02 -04:00 · aa0fdeee30
commit aa0fdeee30
parent ba418a1f19
7 changed files with 86 additions and 29 deletions
--- a/4
+++ b/4
@ -9,8 +9,6 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
    . Internal change
 ==========================

-3.2.0, unknown release date
-
 3.1.2, unknown release date
 ! %Output.AttrSort for when you need your attributes in alphabetical order to
  deal with a bug in FCKEditor. Requested by frank farmer.
@ -22,6 +20,8 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
  use on hand-written HTML.
 ! Add error-cases for unsupported elements in MakeWellFormed. This enables
  the strategy to be used, standalone, on untrusted input.
+! %Core.AggressivelyFixLt is on by default. This causes more sensible
+  processing of left angled brackets in smileys and other whatnot.
 - Fix two bugs in %URI.MakeAbsolute; one involving empty paths in base URLs,
  the other involving an undefined $is_folder error.
 - Throw error when %Core.Encoding is set to a spurious value. Previously,
--- a/configdoc/usage.xml
+++ b/configdoc/usage.xml
@ -10,7 +10,7 @@
  <file name="HTMLPurifier/Lexer/DirectLex.php">
   <line>50</line>
   <line>62</line>
-   <line>327</line>
+   <line>319</line>
  </file>
  <file name="HTMLPurifier/Strategy/RemoveForeignElements.php">
   <line>47</line>
--- a/library/HTMLPurifier/ConfigSchema/schema.ser
+++ b/library/HTMLPurifier/ConfigSchema/schema.ser
--- a/library/HTMLPurifier/ConfigSchema/schema/Core.AggressivelyFixLt.txt
+++ b/library/HTMLPurifier/ConfigSchema/schema/Core.AggressivelyFixLt.txt
@ -1,13 +1,17 @@
 Core.AggressivelyFixLt
 TYPE: bool
 VERSION: 2.1.0
-DEFAULT: false
+DEFAULT: true
 --DESCRIPTION--
-
+<p>
    This directive enables aggressive pre-filter fixes HTML Purifier can
    perform in order to ensure that open angled-brackets do not get killed
    during parsing stage. Enabling this will result in two preg_replace_callback
-calls and one preg_replace call for every bit of HTML passed through here.
-It is not necessary and will have no effect for PHP 4.
-
-
+    calls and at least two preg_replace calls for every HTML document parsed;
+    if your users make very well-formed HTML, you can set this directive false.
+    This has no effect when DirectLex is used.
+</p>
+<p>
+    <strong>Notice:</strong> This directive's default turned from false to true
+    in HTML Purifier 3.1.2.
+</p>
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@ -45,7 +45,10 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
            $char = '[^a-z!\/]';
            $comment = "/<!--(.*?)(-->|\z)/is";
            $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
+            do {
+                $old = $html;
                $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
+            } while ($html !== $old);
            $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
        }
        
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@ -197,20 +197,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                if (!ctype_alpha($segment[0])) {
                    // XML:  $segment[0] !== '_' && $segment[0] !== ':'
                    if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
-                    $token = new
-                        HTMLPurifier_Token_Text(
-                            '<' .
-                            $this->parseData(
-                                $segment
-                            ) . 
-                            '>'
-                        );
+                    $token = new HTMLPurifier_Token_Text('<');
                    if ($maintain_line_numbers) {
                        $token->line = $current_line;
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
-                    $cursor = $position_next_gt + 1;
                    $inside_tag = false;
                    continue;
                }
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@ -418,14 +418,13 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
    }
    
    function test_tokenizeHTML_emoticonProtection() {
-        $this->config->set('Core', 'AggressivelyFixLt', true);
        $this->assertTokenization(
            '<b>Whoa! <3 That\'s not good >.></b>',
            array(
                new HTMLPurifier_Token_Start('b'),
                new HTMLPurifier_Token_Text('Whoa! '),
-                new HTMLPurifier_Token_Text('<3 That\'s not good >'),
-                new HTMLPurifier_Token_Text('.>'),
+                new HTMLPurifier_Token_Text('<'),
+                new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
                new HTMLPurifier_Token_End('b')
            ),
            array(
@ -491,7 +490,6 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
    }
    
    function test_tokenizeHTML_entitiesInComment() {
-        $this->config->set('Core', 'AggressivelyFixLt', true);
        $this->assertTokenization(
            '<!-- This comment < &lt; & -->',
            array( new HTMLPurifier_Token_Comment(' This comment < &lt; & ') ),
@ -508,7 +506,8 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
            array(
                'DirectLex' => array(
                    new HTMLPurifier_Token_Start('a', array('href' => '')),
-                    new HTMLPurifier_Token_Text('<">'),
+                    new HTMLPurifier_Token_Text('<'),
+                    new HTMLPurifier_Token_Text('">'),
                ),
                'PEARSax3' => false,
            )
@ -556,7 +555,7 @@ div {}
        );
    }
    
-    function test_tokenizeHTML_() {
+    function test_tokenizeHTML_tagWithAtSignAndExtraGt() {
        $this->assertTokenization(
            '<a@>>',
            array(
@ -576,6 +575,65 @@ div {}
        );
    }
    
+    function test_tokenizeHTML_emoticonHeart() {
+        $this->assertTokenization(
+            '<br /><3<br />',
+            array(
+                new HTMLPurifier_Token_Empty('br'),
+                new HTMLPurifier_Token_Text('<'),
+                new HTMLPurifier_Token_Text('3'),
+                new HTMLPurifier_Token_Empty('br'),
+            ),
+            array(
+                'DOMLex' => array(
+                    new HTMLPurifier_Token_Empty('br'),
+                    new HTMLPurifier_Token_Text('<3'),
+                    new HTMLPurifier_Token_Empty('br'),
+                ),
+            )
+        );
+    }
+    
+    function test_tokenizeHTML_emoticonShiftyEyes() {
+        $this->assertTokenization(
+            '<b><<</b>',
+            array(
+                new HTMLPurifier_Token_Start('b'),
+                new HTMLPurifier_Token_Text('<'),
+                new HTMLPurifier_Token_Text('<'),
+                new HTMLPurifier_Token_End('b'),
+            ),
+            array(
+                'DOMLex' => array(
+                    new HTMLPurifier_Token_Start('b'),
+                    new HTMLPurifier_Token_Text('<<'),
+                    new HTMLPurifier_Token_End('b'),
+                ),
+            )
+        );
+    }
+    
+    function test_tokenizeHTML_eon1996() {
+        $this->assertTokenization(
+            '< <b>test</b>',
+            array(
+                new HTMLPurifier_Token_Text('<'),
+                new HTMLPurifier_Token_Text(' '),
+                new HTMLPurifier_Token_Start('b'),
+                new HTMLPurifier_Token_Text('test'),
+                new HTMLPurifier_Token_End('b'),
+            ),
+            array(
+                'DOMLex' => array(
+                    new HTMLPurifier_Token_Text('< '),
+                    new HTMLPurifier_Token_Start('b'),
+                    new HTMLPurifier_Token_Text('test'),
+                    new HTMLPurifier_Token_End('b'),
+                ),
+            )
+        );
+    }
+    
    /*
    
    function test_tokenizeHTML_() {