0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-01-05 06:01:52 +00:00

[2.1.0] True emoticon < fix.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1260 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2007-06-27 16:40:18 +00:00
parent 4476745003
commit a6ede3804e
4 changed files with 60 additions and 7 deletions

4
NEWS
View File

@ -10,7 +10,9 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
========================== ==========================
2.1.0, unknown release date 2.1.0, unknown release date
(none) ! With %Core.AggressivelyFixLt, <3 and similar emoticons no longer
trigger HTML removal in PHP5 (DOMLex). This directive is not necessary
for PHP4 (DirectLex).
2.0.2, unknown release date 2.0.2, unknown release date
(none) (none)

View File

@ -3,6 +3,16 @@
require_once 'HTMLPurifier/Lexer.php'; require_once 'HTMLPurifier/Lexer.php';
require_once 'HTMLPurifier/TokenFactory.php'; require_once 'HTMLPurifier/TokenFactory.php';
// Registers the AggressivelyFixLt directive in the Core namespace.
// NOTE(review): the `false` and 'bool' arguments are presumably the default
// value and the value type -- confirm against HTMLPurifier_ConfigSchema::define().
// The last argument is the human-readable directive description.
HTMLPurifier_ConfigSchema::define(
'Core', 'AggressivelyFixLt', false, 'bool', '
This directive enables aggressive pre-filter fixes HTML Purifier can
perform in order to ensure that open angled-brackets do not get killed
during parsing stage. Enabling this will result in two preg_replace_callback
calls and one preg_replace call for every bit of HTML passed through here.
It is not necessary and will have no effect for PHP 4.
This directive has been available since 2.1.0.
');
/** /**
* Parser that uses PHP 5's DOM extension (part of the core). * Parser that uses PHP 5's DOM extension (part of the core).
* *
@ -42,6 +52,16 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
$html = $this->normalize($html, $config, $context); $html = $this->normalize($html, $config, $context);
// attempt to armor stray angled brackets that cannot possibly
// form tags and thus are probably being used as emoticons
if ($config->get('Core', 'AggressivelyFixLt')) {
$char = '[^a-z!\/]';
$comment = "/<!--(.*?)(-->|\z)/is";
$html = preg_replace_callback($comment, array('HTMLPurifier_Lexer_DOMLex', 'callbackArmorCommentEntities'), $html);
$html = preg_replace("/<($char)/i", '&lt;\\1', $html);
$html = preg_replace_callback($comment, array('HTMLPurifier_Lexer_DOMLex', 'callbackUndoCommentSubst'), $html); // fix comments
}
// preprocess html, essential for UTF-8 // preprocess html, essential for UTF-8
$html = $html =
'<!DOCTYPE html '. '<!DOCTYPE html '.
@ -151,5 +171,21 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
*/ */
public function muteErrorHandler($errno, $errstr) {} public function muteErrorHandler($errno, $errstr) {}
/**
 * Regex callback that reverses entity escaping inside a comment:
 * every '&amp;' becomes '&' and every '&lt;' becomes '<'.
 *
 * @param array $matches Match array; [1] is the comment body and [2] is
 *                       whatever terminated it ('-->' or an empty string
 *                       when the comment ran to the end of the input).
 * @return string The comment rebuilt with its '<!--' opener.
 */
function callbackUndoCommentSubst($matches) {
    // strtr() with a map substitutes in a single pass, so text produced by
    // one replacement is never re-scanned by the other.
    $entityMap = array('&amp;'=>'&','&lt;'=>'<');
    $restoredBody = strtr($matches[1], $entityMap);
    return '<!--' . $restoredBody . $matches[2];
}
/**
 * Regex callback that protects ampersands inside a comment by turning
 * each '&' into '&amp;', so that a later callbackUndoCommentSubst pass
 * restores the original text instead of clobbering existing entities.
 *
 * @param array $matches Match array; [1] is the comment body and [2] is
 *                       whatever terminated it ('-->' or an empty string).
 * @return string The comment rebuilt with its '<!--' opener.
 */
function callbackArmorCommentEntities($matches) {
    $armoredBody = str_replace('&', '&amp;', $matches[1]);
    $terminator = $matches[2];
    return '<!--' . $armoredBody . $terminator;
}
} }

View File

@ -204,7 +204,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// Check leading character is alnum, if not, we may // Check leading character is alnum, if not, we may
// have accidentally grabbed an emoticon. Translate into // have accidentally grabbed an emoticon. Translate into
// text and go our merry way // text and go our merry way
if (!ctype_alnum($segment[0])) { if (!ctype_alpha($segment[0])) {
// XML: $segment[0] !== '_' && $segment[0] !== ':'
if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt'); if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
$token = new $token = new
HTMLPurifier_Token_Text( HTMLPurifier_Token_Text(

View File

@ -288,16 +288,21 @@ class HTMLPurifier_LexerTest extends UnitTestCase
$expect[18] = array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) ); $expect[18] = array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) );
// test emoticon protection // test emoticon protection
$input[19] = '<b>Whoa! >.< That\'s not good >.></b>'; $input[19] = '<b>Whoa! <3 That\'s not good >.></b>';
$expect[19] = array( $expect[19] = array(
new HTMLPurifier_Token_Start('b'), new HTMLPurifier_Token_Start('b'),
new HTMLPurifier_Token_Text('Whoa! >.'), new HTMLPurifier_Token_Text('Whoa! '),
new HTMLPurifier_Token_Text('< That\'s not good >'), new HTMLPurifier_Token_Text('<3 That\'s not good >'),
new HTMLPurifier_Token_Text('.>'), new HTMLPurifier_Token_Text('.>'),
new HTMLPurifier_Token_End('b'), new HTMLPurifier_Token_End('b'),
); );
$dom_expect[19] = array(
new HTMLPurifier_Token_Start('b'),
new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
new HTMLPurifier_Token_End('b'),
);
$sax_expect[19] = false; // SAX drops the < character $sax_expect[19] = false; // SAX drops the < character
$dom_expect[19] = false; // DOM drops the entire pseudo-tag $config[19] = HTMLPurifier_Config::create(array('Core.AggressivelyFixLt' => true));
// test comment parsing with funky characters inside // test comment parsing with funky characters inside
$input[20] = '<!-- This >< comment --><br />'; $input[20] = '<!-- This >< comment --><br />';
@ -306,6 +311,7 @@ class HTMLPurifier_LexerTest extends UnitTestCase
new HTMLPurifier_Token_Empty('br') new HTMLPurifier_Token_Empty('br')
); );
$sax_expect[20] = false; $sax_expect[20] = false;
$config[20] = HTMLPurifier_Config::create(array('Core.AggressivelyFixLt' => true));
// test comment parsing of missing end // test comment parsing of missing end
$input[21] = '<!-- This >< comment'; $input[21] = '<!-- This >< comment';
@ -314,6 +320,7 @@ class HTMLPurifier_LexerTest extends UnitTestCase
); );
$sax_expect[21] = false; $sax_expect[21] = false;
$dom_expect[21] = false; $dom_expect[21] = false;
$config[21] = HTMLPurifier_Config::create(array('Core.AggressivelyFixLt' => true));
// test CDATA tags // test CDATA tags
$input[22] = '<script>alert("<foo>");</script>'; $input[22] = '<script>alert("<foo>");</script>';
@ -324,7 +331,14 @@ class HTMLPurifier_LexerTest extends UnitTestCase
); );
$config[22] = HTMLPurifier_Config::create(array('HTML.Trusted' => true)); $config[22] = HTMLPurifier_Config::create(array('HTML.Trusted' => true));
$sax_expect[22] = false; $sax_expect[22] = false;
//$dom_expect[22] = false;
// test escaping
$input[23] = '<!-- This comment < &lt; & -->';
$expect[23] = array(
new HTMLPurifier_Token_Comment(' This comment < &lt; & ')
);
$sax_expect[23] = false;
$config[21] = HTMLPurifier_Config::create(array('Core.AggressivelyFixLt' => true));
$default_config = HTMLPurifier_Config::createDefault(); $default_config = HTMLPurifier_Config::createDefault();
$default_context = new HTMLPurifier_Context(); $default_context = new HTMLPurifier_Context();