[2.0.1] Improve special case handling for <script>

- DirectLex now honors comments with greater than or less than signs in them
- Comments are transformed into script elements, ending comments are scrapped
- Buggy generator code rewritten to be more error-proof
- AttrValidator checks if token has attributes before processing
- Remove invalid documentation from Scripting
- "Commenting" of script elements switched to the more advanced version

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1189 48356398-32a2-884e-a903-53898d9a118a
2024-12-22 16:31:53 +00:00 · 2007-06-21 14:44:26 +00:00 · 2007-06-21 14:44:26 +00:00 · bf0d659c47
commit bf0d659c47
parent e55551ecdd
13 changed files with 179 additions and 32 deletions
--- a/3
+++ b/3
@ -9,6 +9,9 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
    . Internal change
 ==========================

+2.0.1, unknown release date
+- Clean up special case code for <script> tags
+
 2.0.0, released 2007-06-20
 # Completely refactored HTMLModuleManager, decentralizing safety
  information
--- a/library/HTMLPurifier/AttrValidator.php
+++ b/library/HTMLPurifier/AttrValidator.php
@ -8,6 +8,8 @@ class HTMLPurifier_AttrValidator
            
        $definition = $config->getHTMLDefinition();
        
+        if ($token->type !== 'start' && $token->type !== 'empty') return $token;
+        
        // create alias to global definition array, see also $defs
        // DEFINITION CALL
        $d_defs = $definition->info_global_attr;
--- a/library/HTMLPurifier/Generator.php
+++ b/library/HTMLPurifier/Generator.php
@ -4,7 +4,7 @@ HTMLPurifier_ConfigSchema::define(
    'Output', 'CommentScriptContents', true, 'bool',
    'Determines whether or not HTML Purifier should attempt to fix up '.
    'the contents of script tags for legacy browsers with comments. This '.
-    'directive was available since 1.7.'
+    'directive was available since 2.0.0.'
 );
 HTMLPurifier_ConfigSchema::defineAlias('Core', 'CommentScriptContents', 'Output', 'CommentScriptContents');

@ -76,13 +76,17 @@ class HTMLPurifier_Generator
        
        if (!$tokens) return '';
        for ($i = 0, $size = count($tokens); $i < $size; $i++) {
-            if ($this->_scriptFix && $tokens[$i]->name === 'script') {
+            if ($this->_scriptFix && $tokens[$i]->name === 'script'
+                && $i + 2 < $size && $tokens[$i+2]->type == 'end') {
                // script special case
+                // the contents of the script block must be ONE token
+                // for this to work
                $html .= $this->generateFromToken($tokens[$i++]);
                $html .= $this->generateScriptFromToken($tokens[$i++]);
-                while ($tokens[$i]->name != 'script') {
-                    $html .= $this->generateScriptFromToken($tokens[$i++]);
-                }
+                // We're not going to do this: it wouldn't be valid anyway
+                //while ($tokens[$i]->name != 'script') {
+                //    $html .= $this->generateScriptFromToken($tokens[$i++]);
+                //}
            }
            $html .= $this->generateFromToken($tokens[$i]);
        }
@ -148,10 +152,12 @@ class HTMLPurifier_Generator
     *          --> somewhere inside the script contents.
     */
    function generateScriptFromToken($token) {
-        if (!$token->type == 'text') return $this->generateFromToken($token);
-        return '<!--' . PHP_EOL . $token->data . PHP_EOL . '// -->';
+        if ($token->type != 'text') return $this->generateFromToken($token);
+        // return '<!--' . PHP_EOL . trim($token->data) . PHP_EOL . '// -->';
        // more advanced version:
-        // return '<!--//--><![CDATA[//><!--' . PHP_EOL . $token->data . PHP_EOL . '//--><!]]>';
+        // thanks <http://lachy.id.au/log/2005/05/script-comments>
+        $data = preg_replace('#//\s*$#', '', $token->data);
+        return '<!--//--><![CDATA[//><!--' . PHP_EOL . trim($data) . PHP_EOL . '//--><!]]>';
    }
    
    /**
--- a/library/HTMLPurifier/HTMLModule/Scripting.php
+++ b/library/HTMLPurifier/HTMLModule/Scripting.php
@ -5,14 +5,6 @@
 WARNING: THIS MODULE IS EXTREMELY DANGEROUS AS IT ENABLES INLINE SCRIPTING
 INSIDE HTML PURIFIER DOCUMENTS. USE ONLY WITH TRUSTED USER INPUT!!!

-Usage:
-
-require_once 'HTMLPurifier/HTMLModule/Scripting.php';
-$def =& $config->getHTMLDefinition(true); // get the raw version
-$def->manager->addModule('Scripting');
-
-This must come before any other calls to getHTMLDefinition()
-
 */

 /**
@ -63,6 +55,7 @@ class HTMLPurifier_HTMLModule_Scripting extends HTMLPurifier_HTMLModule
        );
        $this->info['script']->content_model = '#PCDATA';
        $this->info['script']->content_model_type = 'optional';
+        $this->info['script']->attr_transform_pre['type'] =
        $this->info['script']->attr_transform_post['type'] =
            new HTMLPurifier_AttrTransform_ScriptRequired();
    }
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@ -259,7 +259,19 @@ class HTMLPurifier_Lexer
     */
    function escapeCDATA($string) {
        return preg_replace_callback(
-            '/<!\[CDATA\[(.+?)\]\]>/',
+            '/<!\[CDATA\[(.+?)\]\]>/s',
+            array('HTMLPurifier_Lexer', 'CDATACallback'),
+            $string
+        );
+    }
+    
+    /**
+     * Special CDATA case that is especially convoluted for <script>
+     */
+    function escapeCommentedCDATA($string) {
+        // <!--//--><![CDATA[//><!--
+        return preg_replace_callback(
+            '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
@ -291,6 +303,11 @@ class HTMLPurifier_Lexer
            $html = $this->extractBody($html);
        }
        
+        if ($config->get('HTML', 'Trusted')) {
+            // escape convoluted CDATA
+            $html = $this->escapeCommentedCDATA($html);
+        }
+        
        // escape CDATA
        $html = $this->escapeCDATA($html);
        
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@ -83,10 +83,13 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
        // intercept non element nodes. WE MUST catch all of them,
        // but we're not getting the character reference nodes because
        // those should have been preprocessed
-        if ($node->nodeType === XML_TEXT_NODE ||
-                  $node->nodeType === XML_CDATA_SECTION_NODE) {
+        if ($node->nodeType === XML_TEXT_NODE) {
            $tokens[] = $this->factory->createText($node->data);
            return;
+        } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
+            // undo DOM's special treatment of <script> tags
+            $tokens[] = $this->factory->createText($this->parseData($node->data));
+            return;
        } elseif ($node->nodeType === XML_COMMENT_NODE) {
            $tokens[] = $this->factory->createComment($node->data);
            return;
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@ -126,22 +126,34 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                
                // Check if it's a comment
                if (
-                    substr($segment, 0, 3) == '!--' &&
-                    substr($segment, $strlen_segment-2, 2) == '--'
+                    substr($segment, 0, 3) == '!--'
                ) {
+                    // re-determine segment length, looking for -->
+                    $position_comment_end = strpos($html, '-->', $cursor);
+                    if ($position_comment_end === false) {
+                        // uh oh, we have a comment that extends to
+                        // infinity. Can't be helped: set comment
+                        // end position to end of string
+                        $position_comment_end = strlen($html);
+                        $end = true;
+                    } else {
+                        $end = false;
+                    }
+                    $strlen_segment = $position_comment_end - $cursor;
+                    $segment = substr($html, $cursor, $strlen_segment);
                    $token = new
                        HTMLPurifier_Token_Comment(
                            substr(
-                                $segment, 3, $strlen_segment - 5
+                                $segment, 3, $strlen_segment - 3
                            )
                        );
                    if ($maintain_line_numbers) {
                        $token->line = $current_line;
-                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
+                        $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                    }
                    $array[] = $token;
+                    $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                    $inside_tag = false;
-                    $cursor = $position_next_gt + 1;
                    continue;
                }
                
--- a/library/HTMLPurifier/Strategy/RemoveForeignElements.php
+++ b/library/HTMLPurifier/Strategy/RemoveForeignElements.php
@ -17,9 +17,11 @@ HTMLPurifier_ConfigSchema::define(

 HTMLPurifier_ConfigSchema::define(
    'Core', 'RemoveScriptContents', true, 'bool', '
-This directive enables HTML Purifier to remove not only script tags
-but all of their contents. This directive has been available since 2.0.0,
-revert to pre-2.0.0 behavior by setting to false.
+<p>
+  This directive enables HTML Purifier to remove not only script tags
+  but all of their contents. This directive has been available since 2.0.0,
+  revert to pre-2.0.0 behavior by setting to false.
+</p>
 '
 );

@ -48,6 +50,9 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
        // removes tokens until it reaches a closing tag with its value
        $remove_until = false;
        
+        // converts comments into text tokens when this is equal to a tag name
+        $textify_comments = false;
+        
        foreach($tokens as $token) {
            if ($remove_until) {
                if (empty($token->is_tag) || $token->name !== $remove_until) {
@ -88,6 +93,13 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
                        $token->armor['ValidateAttributes'] = true;
                    }
                    
+                    // CAN BE GENERICIZED
+                    if ($token->name == 'script' && $token->type == 'start') {
+                        $textify_comments = $token->name;
+                    } elseif ($token->name === $textify_comments && $token->type == 'end') {
+                        $textify_comments = false;
+                    }
+                    
                } elseif ($escape_invalid_tags) {
                    // invalid tag, generate HTML and insert in
                    $token = new HTMLPurifier_Token_Text(
@ -108,8 +120,14 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
                    continue;
                }
            } elseif ($token->type == 'comment') {
-                // strip comments
-                continue;
+                // textify comments in script tags when they are allowed
+                if ($textify_comments !== false) {
+                    $data = $token->data;
+                    $token = new HTMLPurifier_Token_Text($data);
+                } else {
+                    // strip comments
+                    continue;
+                }
            } elseif ($token->type == 'text') {
            } else {
                continue;
--- a/tests/HTMLPurifier/GeneratorTest.php
+++ b/tests/HTMLPurifier/GeneratorTest.php
@ -141,9 +141,31 @@ class HTMLPurifier_GeneratorTest extends HTMLPurifier_Harness
                new HTMLPurifier_Token_Text('alert(3 < 5);'),
                new HTMLPurifier_Token_End('script')
            ),
-            "<script><!--\nalert(3 < 5);\n// --></script>"
+            "<script><!--//--><![CDATA[//><!--\nalert(3 < 5);\n//--><!]]></script>"
        );
        
+        // if missing close tag, don't do anything
+        $this->assertGeneration(
+            array(
+                new HTMLPurifier_Token_Start('script'),
+                new HTMLPurifier_Token_Text('alert(3 < 5);'),
+            ),
+            "<script>alert(3 &lt; 5);"
+        );
+        
+        // if two script blocks, don't do anything
+        $this->assertGeneration(
+            array(
+                new HTMLPurifier_Token_Start('script'),
+                new HTMLPurifier_Token_Text('alert(3 < 5);'),
+                new HTMLPurifier_Token_Text('foo();'),
+                new HTMLPurifier_Token_End('script')
+            ),
+            "<script>alert(3 &lt; 5);foo();</script>"
+        );
+        
+        
+        
        $this->config = HTMLPurifier_Config::createDefault();
        $this->config->set('Core', 'CommentScriptContents', false);
        
--- a/tests/HTMLPurifier/HTMLModule/ScriptingTest.php
+++ b/tests/HTMLPurifier/HTMLModule/ScriptingTest.php
@ -18,6 +18,14 @@ class HTMLPurifier_HTMLModule_ScriptingTest extends HTMLPurifier_HTMLModuleHarne
            array('HTML.Trusted' => true)
        );
        
+        // CDATA
+        $this->assertResult(
+'//<![CDATA[
+alert("<This is compatible with XHTML>");
+//]]> ', true,
+            array('HTML.Trusted' => true)
+        );
+        
        // max
        $this->assertResult(
            '<script
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@ -299,6 +299,22 @@ class HTMLPurifier_LexerTest extends UnitTestCase
        $sax_expect[19] = false; // SAX drops the < character
        $dom_expect[19] = false; // DOM drops the entire pseudo-tag
        
+        // test comment parsing with funky characters inside
+        $input[20] = '<!-- This >< comment --><br />';
+        $expect[20] = array(
+            new HTMLPurifier_Token_Comment(' This >< comment '),
+            new HTMLPurifier_Token_Empty('br')
+        );
+        $sax_expect[20] = false;
+        
+        // test comment parsing of missing end
+        $input[21] = '<!-- This >< comment';
+        $expect[21] = array(
+            new HTMLPurifier_Token_Comment(' This >< comment')
+        );
+        $sax_expect[21] = false;
+        $dom_expect[21] = false;
+        
        $default_config = HTMLPurifier_Config::createDefault();
        $default_context = new HTMLPurifier_Context();
        foreach($input as $i => $discard) {
--- a/tests/HTMLPurifier/Strategy/RemoveForeignElementsTest.php
+++ b/tests/HTMLPurifier/Strategy/RemoveForeignElementsTest.php
@ -79,6 +79,18 @@ class HTMLPurifier_Strategy_RemoveForeignElementsTest
            array('HTML.Allowed' => 'div')
        );
        
+        // text-ify commented script contents ( the trailing comment gets
+        // removed during generation )
+        $this->assertResult(
+'<script type="text/javascript"><!--
+alert(<b>bold</b>);
+// --></script>',
+'<script type="text/javascript">
+alert(&lt;b&gt;bold&lt;/b&gt;);
+// </script>',
+            array('HTML.Trusted' => true, 'Output.CommentScriptContents' => false)
+        );
+        
    }
    
 }
--- a/tests/HTMLPurifierTest.php
+++ b/tests/HTMLPurifierTest.php
@ -12,9 +12,9 @@ class HTMLPurifierTest extends UnitTestCase
        $this->purifier = new HTMLPurifier();
    }
    
-    function assertPurification($input, $expect = null) {
+    function assertPurification($input, $expect = null, $config = array()) {
        if ($expect === null) $expect = $input;
-        $result = $this->purifier->purify($input);
+        $result = $this->purifier->purify($input, $config);
        $this->assertIdentical($expect, $result);
    }
    
@ -97,6 +97,41 @@ class HTMLPurifierTest extends UnitTestCase
        
    }
    
+    function testScript() {
+        $this->purifier = new HTMLPurifier(array('HTML.Trusted' => true));
+        $ideal = '<script type="text/javascript"><!--//--><![CDATA[//><!--
+alert("<This is compatible with XHTML>");
+//--><!]]></script>';
+        
+        $this->assertPurification($ideal);
+        
+        $this->assertPurification(
+            '<script type="text/javascript"><![CDATA[
+alert("<This is compatible with XHTML>");
+]]></script>',
+            $ideal
+        );
+        
+        $this->assertPurification(
+            '<script type="text/javascript">alert("<This is compatible with XHTML>");</script>',
+            $ideal
+        );
+        
+        $this->assertPurification(
+            '<script type="text/javascript"><!--
+alert("<This is compatible with XHTML>");
+//--></script>',
+            $ideal
+        );
+        
+        $this->assertPurification(
+            '<script type="text/javascript"><![CDATA[
+alert("<This is compatible with XHTML>");
+//]]></script>',
+            $ideal
+        );
+    }
+    
 }

 ?>