[2.1.0] True emoticon < fix.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1260 48356398-32a2-884e-a903-53898d9a118a
2025-03-11 17:18:44 +00:00 · 2007-06-27 16:40:18 +00:00 · 2007-06-27 16:40:18 +00:00 · a6ede3804e
commit a6ede3804e
parent 4476745003
4 changed files with 60 additions and 7 deletions
--- a/4
+++ b/4
@ -10,7 +10,9 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
 ==========================

 2.1.0, unknown release date
-(none)
+! With %Core.AggressivelyFixLt, <3 and similar emoticons no longer
+  trigger HTML removal in PHP5 (DOMLex). This directive is not necessary
+  for PHP4 (DirectLex).

 2.0.2, unknown release date
 (none)
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@ -3,6 +3,16 @@
 require_once 'HTMLPurifier/Lexer.php';
 require_once 'HTMLPurifier/TokenFactory.php';

+HTMLPurifier_ConfigSchema::define(
+    'Core', 'AggressivelyFixLt', false, 'bool', '
+This directive enables aggressive pre-filter fixes HTML Purifier can
+perform in order to ensure that open angled-brackets do not get killed
+during parsing stage. Enabling this will result in two preg_replace_callback
+calls and one preg_replace call for every bit of HTML passed through here.
+It is not necessary and will have no effect for PHP 4.
+This directive has been available since 2.1.0.
+');
+
 /**
 * Parser that uses PHP 5's DOM extension (part of the core).
 * 
@ -42,6 +52,16 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
        
        $html = $this->normalize($html, $config, $context);
        
+        // attempt to armor stray angled brackets that cannot possibly
+        // form tags and thus are probably being used as emoticons
+        if ($config->get('Core', 'AggressivelyFixLt')) {
+            $char = '[^a-z!\/]';
+            $comment = "/<!--(.*?)(-->|\z)/is";
+            $html = preg_replace_callback($comment, array('HTMLPurifier_Lexer_DOMLex', 'callbackArmorCommentEntities'), $html);
+            $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
+            $html = preg_replace_callback($comment, array('HTMLPurifier_Lexer_DOMLex', 'callbackUndoCommentSubst'), $html); // fix comments
+        }
+        
        // preprocess html, essential for UTF-8
        $html =
            '<!DOCTYPE html '.
@ -151,5 +171,21 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
     */
    public function muteErrorHandler($errno, $errstr) {}
    
+    /**
+     * Static (registered by class name) preg_replace_callback handler: decodes &amp;
+     * and &lt; in the comment body $matches[1]; $matches[2] is the comment terminator.
+     */
+    public static function callbackUndoCommentSubst($matches) {
+        return '<!--' . strtr($matches[1], array('&amp;'=>'&','&lt;'=>'<')) . $matches[2];
+    }
+    
+    /**
+     * Static (registered by class name) preg_replace_callback handler: rewrites each &
+     * in the comment body $matches[1] as &amp; so callbackUndoCommentSubst can undo it.
+     */
+    public static function callbackArmorCommentEntities($matches) {
+        return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
+    }
+    
 }

--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@ -204,7 +204,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                // Check leading character is alnum, if not, we may
                // have accidently grabbed an emoticon. Translate into
                // text and go our merry way
-                if (!ctype_alnum($segment[0])) {
+                if (!ctype_alpha($segment[0])) {
+                    // For XML names, also require: $segment[0] !== '_' && $segment[0] !== ':'
                    if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                    $token = new
                        HTMLPurifier_Token_Text(
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@ -288,16 +288,21 @@ class HTMLPurifier_LexerTest extends UnitTestCase
        $expect[18] = array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) );
        
        // test emoticon protection
-        $input[19] = '<b>Whoa! >.< That\'s not good >.></b>';
+        $input[19] = '<b>Whoa! <3 That\'s not good >.></b>';
        $expect[19] = array(
            new HTMLPurifier_Token_Start('b'),
-            new HTMLPurifier_Token_Text('Whoa! >.'),
-            new HTMLPurifier_Token_Text('< That\'s not good >'),
+            new HTMLPurifier_Token_Text('Whoa! '),
+            new HTMLPurifier_Token_Text('<3 That\'s not good >'),
            new HTMLPurifier_Token_Text('.>'),
            new HTMLPurifier_Token_End('b'),
        );
+        $dom_expect[19] = array(
+            new HTMLPurifier_Token_Start('b'),
+            new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
+            new HTMLPurifier_Token_End('b'),
+        );
        $sax_expect[19] = false; // SAX drops the < character
-        $dom_expect[19] = false; // DOM drops the entire pseudo-tag
+        $config[19] = HTMLPurifier_Config::create(array('Core.AggressivelyFixLt' => true));
        
        // test comment parsing with funky characters inside
        $input[20] = '<!-- This >< comment --><br />';
@ -306,6 +311,7 @@ class HTMLPurifier_LexerTest extends UnitTestCase
            new HTMLPurifier_Token_Empty('br')
        );
        $sax_expect[20] = false;
+        $config[20] = HTMLPurifier_Config::create(array('Core.AggressivelyFixLt' => true));
        
        // test comment parsing of missing end
        $input[21] = '<!-- This >< comment';
@ -314,6 +320,7 @@ class HTMLPurifier_LexerTest extends UnitTestCase
        );
        $sax_expect[21] = false;
        $dom_expect[21] = false;
+        $config[21] = HTMLPurifier_Config::create(array('Core.AggressivelyFixLt' => true));
        
        // test CDATA tags
        $input[22] = '<script>alert("<foo>");</script>';
@ -324,7 +331,14 @@ class HTMLPurifier_LexerTest extends UnitTestCase
        );
        $config[22] = HTMLPurifier_Config::create(array('HTML.Trusted' => true));
        $sax_expect[22] = false;
-        //$dom_expect[22] = false;
+        
+        // test escaping: comment text must survive the AggressivelyFixLt armor/undo pass
+        $input[23] = '<!-- This comment < &lt; & -->';
+        $expect[23] = array(
+            new HTMLPurifier_Token_Comment(' This comment < &lt; & ')
+        );
+        $sax_expect[23] = false;
+        $config[23] = HTMLPurifier_Config::create(array('Core.AggressivelyFixLt' => true));
        
        $default_config = HTMLPurifier_Config::createDefault();
        $default_context = new HTMLPurifier_Context();