0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-01-18 11:41:52 +00:00

[2.0.1] Improve special case handling for <script>

- DirectLex now honors comments with greater than or less than signs in them
- Comments are transformed into script elements, ending comments are scrapped
- Buggy generator code rewritten to be more error-proof
- AttrValidator checks if token has attributes before processing
- Remove invalid documentation from Scripting
- "Commenting" of script elements switched to the more advanced version

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1189 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2007-06-21 14:44:26 +00:00
parent e55551ecdd
commit bf0d659c47
13 changed files with 179 additions and 32 deletions

3
NEWS
View File

@ -9,6 +9,9 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
. Internal change . Internal change
========================== ==========================
2.0.1, unknown release date
- Clean up special case code for <script> tags
2.0.0, released 2007-06-20 2.0.0, released 2007-06-20
# Completely refactored HTMLModuleManager, decentralizing safety # Completely refactored HTMLModuleManager, decentralizing safety
information information

View File

@ -8,6 +8,8 @@ class HTMLPurifier_AttrValidator
$definition = $config->getHTMLDefinition(); $definition = $config->getHTMLDefinition();
if ($token->type !== 'start' && $token->type !== 'empty') return $token;
// create alias to global definition array, see also $defs // create alias to global definition array, see also $defs
// DEFINITION CALL // DEFINITION CALL
$d_defs = $definition->info_global_attr; $d_defs = $definition->info_global_attr;

View File

@ -4,7 +4,7 @@ HTMLPurifier_ConfigSchema::define(
'Output', 'CommentScriptContents', true, 'bool', 'Output', 'CommentScriptContents', true, 'bool',
'Determines whether or not HTML Purifier should attempt to fix up '. 'Determines whether or not HTML Purifier should attempt to fix up '.
'the contents of script tags for legacy browsers with comments. This '. 'the contents of script tags for legacy browsers with comments. This '.
'directive was available since 1.7.' 'directive was available since 2.0.0.'
); );
HTMLPurifier_ConfigSchema::defineAlias('Core', 'CommentScriptContents', 'Output', 'CommentScriptContents'); HTMLPurifier_ConfigSchema::defineAlias('Core', 'CommentScriptContents', 'Output', 'CommentScriptContents');
@ -76,13 +76,17 @@ class HTMLPurifier_Generator
if (!$tokens) return ''; if (!$tokens) return '';
for ($i = 0, $size = count($tokens); $i < $size; $i++) { for ($i = 0, $size = count($tokens); $i < $size; $i++) {
if ($this->_scriptFix && $tokens[$i]->name === 'script') { if ($this->_scriptFix && $tokens[$i]->name === 'script'
&& $i + 2 < $size && $tokens[$i+2]->type == 'end') {
// script special case // script special case
// the contents of the script block must be ONE token
// for this to work
$html .= $this->generateFromToken($tokens[$i++]); $html .= $this->generateFromToken($tokens[$i++]);
$html .= $this->generateScriptFromToken($tokens[$i++]); $html .= $this->generateScriptFromToken($tokens[$i++]);
while ($tokens[$i]->name != 'script') { // We're not going to do this: it wouldn't be valid anyway
$html .= $this->generateScriptFromToken($tokens[$i++]); //while ($tokens[$i]->name != 'script') {
} // $html .= $this->generateScriptFromToken($tokens[$i++]);
//}
} }
$html .= $this->generateFromToken($tokens[$i]); $html .= $this->generateFromToken($tokens[$i]);
} }
@ -148,10 +152,12 @@ class HTMLPurifier_Generator
* --> somewhere inside the script contents. * --> somewhere inside the script contents.
*/ */
function generateScriptFromToken($token) { function generateScriptFromToken($token) {
if (!$token->type == 'text') return $this->generateFromToken($token); if ($token->type != 'text') return $this->generateFromToken($token);
return '<!--' . PHP_EOL . $token->data . PHP_EOL . '// -->'; // return '<!--' . PHP_EOL . trim($token->data) . PHP_EOL . '// -->';
// more advanced version: // more advanced version:
// return '<!--//--><![CDATA[//><!--' . PHP_EOL . $token->data . PHP_EOL . '//--><!]]>'; // thanks <http://lachy.id.au/log/2005/05/script-comments>
$data = preg_replace('#//\s*$#', '', $token->data);
return '<!--//--><![CDATA[//><!--' . PHP_EOL . trim($data) . PHP_EOL . '//--><!]]>';
} }
/** /**

View File

@ -5,14 +5,6 @@
WARNING: THIS MODULE IS EXTREMELY DANGEROUS AS IT ENABLES INLINE SCRIPTING WARNING: THIS MODULE IS EXTREMELY DANGEROUS AS IT ENABLES INLINE SCRIPTING
INSIDE HTML PURIFIER DOCUMENTS. USE ONLY WITH TRUSTED USER INPUT!!! INSIDE HTML PURIFIER DOCUMENTS. USE ONLY WITH TRUSTED USER INPUT!!!
Usage:
require_once 'HTMLPurifier/HTMLModule/Scripting.php';
$def =& $config->getHTMLDefinition(true); // get the raw version
$def->manager->addModule('Scripting');
This must come before any other calls to getHTMLDefinition()
*/ */
/** /**
@ -63,6 +55,7 @@ class HTMLPurifier_HTMLModule_Scripting extends HTMLPurifier_HTMLModule
); );
$this->info['script']->content_model = '#PCDATA'; $this->info['script']->content_model = '#PCDATA';
$this->info['script']->content_model_type = 'optional'; $this->info['script']->content_model_type = 'optional';
$this->info['script']->attr_transform_pre['type'] =
$this->info['script']->attr_transform_post['type'] = $this->info['script']->attr_transform_post['type'] =
new HTMLPurifier_AttrTransform_ScriptRequired(); new HTMLPurifier_AttrTransform_ScriptRequired();
} }

View File

@ -259,7 +259,19 @@ class HTMLPurifier_Lexer
*/ */
function escapeCDATA($string) { function escapeCDATA($string) {
return preg_replace_callback( return preg_replace_callback(
'/<!\[CDATA\[(.+?)\]\]>/', '/<!\[CDATA\[(.+?)\]\]>/s',
array('HTMLPurifier_Lexer', 'CDATACallback'),
$string
);
}
/**
* Special CDATA case that is especially convoluted for <script>
*/
function escapeCommentedCDATA($string) {
// <!--//--><![CDATA[//><!--
return preg_replace_callback(
'#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
array('HTMLPurifier_Lexer', 'CDATACallback'), array('HTMLPurifier_Lexer', 'CDATACallback'),
$string $string
); );
@ -291,6 +303,11 @@ class HTMLPurifier_Lexer
$html = $this->extractBody($html); $html = $this->extractBody($html);
} }
if ($config->get('HTML', 'Trusted')) {
// escape convoluted CDATA
$html = $this->escapeCommentedCDATA($html);
}
// escape CDATA // escape CDATA
$html = $this->escapeCDATA($html); $html = $this->escapeCDATA($html);

View File

@ -83,10 +83,13 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
// intercept non element nodes. WE MUST catch all of them, // intercept non element nodes. WE MUST catch all of them,
// but we're not getting the character reference nodes because // but we're not getting the character reference nodes because
// those should have been preprocessed // those should have been preprocessed
if ($node->nodeType === XML_TEXT_NODE || if ($node->nodeType === XML_TEXT_NODE) {
$node->nodeType === XML_CDATA_SECTION_NODE) {
$tokens[] = $this->factory->createText($node->data); $tokens[] = $this->factory->createText($node->data);
return; return;
} elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
// undo DOM's special treatment of <script> tags
$tokens[] = $this->factory->createText($this->parseData($node->data));
return;
} elseif ($node->nodeType === XML_COMMENT_NODE) { } elseif ($node->nodeType === XML_COMMENT_NODE) {
$tokens[] = $this->factory->createComment($node->data); $tokens[] = $this->factory->createComment($node->data);
return; return;

View File

@ -126,22 +126,34 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// Check if it's a comment // Check if it's a comment
if ( if (
substr($segment, 0, 3) == '!--' && substr($segment, 0, 3) == '!--'
substr($segment, $strlen_segment-2, 2) == '--'
) { ) {
// re-determine segment length, looking for -->
$position_comment_end = strpos($html, '-->', $cursor);
if ($position_comment_end === false) {
// uh oh, we have a comment that extends to
// infinity. Can't be helped: set comment
// end position to end of string
$position_comment_end = strlen($html);
$end = true;
} else {
$end = false;
}
$strlen_segment = $position_comment_end - $cursor;
$segment = substr($html, $cursor, $strlen_segment);
$token = new $token = new
HTMLPurifier_Token_Comment( HTMLPurifier_Token_Comment(
substr( substr(
$segment, 3, $strlen_segment - 5 $segment, 3, $strlen_segment - 3
) )
); );
if ($maintain_line_numbers) { if ($maintain_line_numbers) {
$token->line = $current_line; $token->line = $current_line;
$current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor); $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
} }
$array[] = $token; $array[] = $token;
$cursor = $end ? $position_comment_end : $position_comment_end + 3;
$inside_tag = false; $inside_tag = false;
$cursor = $position_next_gt + 1;
continue; continue;
} }

View File

@ -17,9 +17,11 @@ HTMLPurifier_ConfigSchema::define(
HTMLPurifier_ConfigSchema::define( HTMLPurifier_ConfigSchema::define(
'Core', 'RemoveScriptContents', true, 'bool', ' 'Core', 'RemoveScriptContents', true, 'bool', '
This directive enables HTML Purifier to remove not only script tags <p>
but all of their contents. This directive has been available since 2.0.0, This directive enables HTML Purifier to remove not only script tags
revert to pre-2.0.0 behavior by setting to false. but all of their contents. This directive has been available since 2.0.0,
revert to pre-2.0.0 behavior by setting to false.
</p>
' '
); );
@ -48,6 +50,9 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
// removes tokens until it reaches a closing tag with its value // removes tokens until it reaches a closing tag with its value
$remove_until = false; $remove_until = false;
// converts comments into text tokens when this is equal to a tag name
$textify_comments = false;
foreach($tokens as $token) { foreach($tokens as $token) {
if ($remove_until) { if ($remove_until) {
if (empty($token->is_tag) || $token->name !== $remove_until) { if (empty($token->is_tag) || $token->name !== $remove_until) {
@ -88,6 +93,13 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
$token->armor['ValidateAttributes'] = true; $token->armor['ValidateAttributes'] = true;
} }
// CAN BE GENERICIZED
if ($token->name == 'script' && $token->type == 'start') {
$textify_comments = $token->name;
} elseif ($token->name === $textify_comments && $token->type == 'end') {
$textify_comments = false;
}
} elseif ($escape_invalid_tags) { } elseif ($escape_invalid_tags) {
// invalid tag, generate HTML and insert in // invalid tag, generate HTML and insert in
$token = new HTMLPurifier_Token_Text( $token = new HTMLPurifier_Token_Text(
@ -108,8 +120,14 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
continue; continue;
} }
} elseif ($token->type == 'comment') { } elseif ($token->type == 'comment') {
// textify comments in script tags when they are allowed
if ($textify_comments !== false) {
$data = $token->data;
$token = new HTMLPurifier_Token_Text($data);
} else {
// strip comments // strip comments
continue; continue;
}
} elseif ($token->type == 'text') { } elseif ($token->type == 'text') {
} else { } else {
continue; continue;

View File

@ -141,9 +141,31 @@ class HTMLPurifier_GeneratorTest extends HTMLPurifier_Harness
new HTMLPurifier_Token_Text('alert(3 < 5);'), new HTMLPurifier_Token_Text('alert(3 < 5);'),
new HTMLPurifier_Token_End('script') new HTMLPurifier_Token_End('script')
), ),
"<script><!--\nalert(3 < 5);\n// --></script>" "<script><!--//--><![CDATA[//><!--\nalert(3 < 5);\n//--><!]]></script>"
); );
// if missing close tag, don't do anything
$this->assertGeneration(
array(
new HTMLPurifier_Token_Start('script'),
new HTMLPurifier_Token_Text('alert(3 < 5);'),
),
"<script>alert(3 &lt; 5);"
);
// if two script blocks, don't do anything
$this->assertGeneration(
array(
new HTMLPurifier_Token_Start('script'),
new HTMLPurifier_Token_Text('alert(3 < 5);'),
new HTMLPurifier_Token_Text('foo();'),
new HTMLPurifier_Token_End('script')
),
"<script>alert(3 &lt; 5);foo();</script>"
);
$this->config = HTMLPurifier_Config::createDefault(); $this->config = HTMLPurifier_Config::createDefault();
$this->config->set('Core', 'CommentScriptContents', false); $this->config->set('Core', 'CommentScriptContents', false);

View File

@ -18,6 +18,14 @@ class HTMLPurifier_HTMLModule_ScriptingTest extends HTMLPurifier_HTMLModuleHarne
array('HTML.Trusted' => true) array('HTML.Trusted' => true)
); );
// CDATA
$this->assertResult(
'//<![CDATA[
alert("<This is compatible with XHTML>");
//]]> ', true,
array('HTML.Trusted' => true)
);
// max // max
$this->assertResult( $this->assertResult(
'<script '<script

View File

@ -299,6 +299,22 @@ class HTMLPurifier_LexerTest extends UnitTestCase
$sax_expect[19] = false; // SAX drops the < character $sax_expect[19] = false; // SAX drops the < character
$dom_expect[19] = false; // DOM drops the entire pseudo-tag $dom_expect[19] = false; // DOM drops the entire pseudo-tag
// test comment parsing with funky characters inside
$input[20] = '<!-- This >< comment --><br />';
$expect[20] = array(
new HTMLPurifier_Token_Comment(' This >< comment '),
new HTMLPurifier_Token_Empty('br')
);
$sax_expect[20] = false;
// test comment parsing of missing end
$input[21] = '<!-- This >< comment';
$expect[21] = array(
new HTMLPurifier_Token_Comment(' This >< comment')
);
$sax_expect[21] = false;
$dom_expect[21] = false;
$default_config = HTMLPurifier_Config::createDefault(); $default_config = HTMLPurifier_Config::createDefault();
$default_context = new HTMLPurifier_Context(); $default_context = new HTMLPurifier_Context();
foreach($input as $i => $discard) { foreach($input as $i => $discard) {

View File

@ -79,6 +79,18 @@ class HTMLPurifier_Strategy_RemoveForeignElementsTest
array('HTML.Allowed' => 'div') array('HTML.Allowed' => 'div')
); );
// text-ify commented script contents ( the trailing comment gets
// removed during generation )
$this->assertResult(
'<script type="text/javascript"><!--
alert(<b>bold</b>);
// --></script>',
'<script type="text/javascript">
alert(&lt;b&gt;bold&lt;/b&gt;);
// </script>',
array('HTML.Trusted' => true, 'Output.CommentScriptContents' => false)
);
} }
} }

View File

@ -12,9 +12,9 @@ class HTMLPurifierTest extends UnitTestCase
$this->purifier = new HTMLPurifier(); $this->purifier = new HTMLPurifier();
} }
function assertPurification($input, $expect = null) { function assertPurification($input, $expect = null, $config = array()) {
if ($expect === null) $expect = $input; if ($expect === null) $expect = $input;
$result = $this->purifier->purify($input); $result = $this->purifier->purify($input, $config);
$this->assertIdentical($expect, $result); $this->assertIdentical($expect, $result);
} }
@ -97,6 +97,41 @@ class HTMLPurifierTest extends UnitTestCase
} }
/**
 * Checks that several different spellings of the same inline script are
 * all purified into one canonical form ($ideal).
 */
function testScript() {
// replace the default purifier with one configured with HTML.Trusted = true
$this->purifier = new HTMLPurifier(array('HTML.Trusted' => true));
// canonical form: script body enclosed in the combined comment/CDATA markers
$ideal = '<script type="text/javascript"><!--//--><![CDATA[//><!--
alert("<This is compatible with XHTML>");
//--><!]]></script>';
// no expected value is passed, so the output is compared against the
// input itself: the canonical form must survive purification unchanged
$this->assertPurification($ideal);
// body wrapped in a bare CDATA section
$this->assertPurification(
'<script type="text/javascript"><![CDATA[
alert("<This is compatible with XHTML>");
]]></script>',
$ideal
);
// body with no wrapping at all
$this->assertPurification(
'<script type="text/javascript">alert("<This is compatible with XHTML>");</script>',
$ideal
);
// body wrapped in an HTML comment whose closing marker is prefixed with //
$this->assertPurification(
'<script type="text/javascript"><!--
alert("<This is compatible with XHTML>");
//--></script>',
$ideal
);
// body wrapped in a CDATA section whose closing marker is prefixed with //
$this->assertPurification(
'<script type="text/javascript"><![CDATA[
alert("<This is compatible with XHTML>");
//]]></script>',
$ideal
);
}
} }
?> ?>