0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-09-18 18:25:18 +00:00

[2.1.0] True emoticon < fix.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1260 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2007-06-27 16:40:18 +00:00
parent 4476745003
commit a6ede3804e
4 changed files with 60 additions and 7 deletions

4
NEWS
View File

@ -10,7 +10,9 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
==========================
2.1.0, unknown release date
(none)
! With %Core.AggressivelyFixLt, <3 and similar emoticons no longer
trigger HTML removal in PHP5 (DOMLex). This directive is not necessary
for PHP4 (DirectLex).
2.0.2, unknown release date
(none)

View File

@ -3,6 +3,16 @@
require_once 'HTMLPurifier/Lexer.php';
require_once 'HTMLPurifier/TokenFactory.php';
HTMLPurifier_ConfigSchema::define(
'Core', 'AggressivelyFixLt', false, 'bool', '
This directive enables aggressive pre-filter fixes HTML Purifier can
perform in order to ensure that open angled-brackets do not get killed
during parsing stage. Enabling this will result in two preg_replace_callback
calls and one preg_replace call for every bit of HTML passed through here.
It is not necessary and will have no effect for PHP 4.
This directive has been available since 2.1.0.
');
/**
* Parser that uses PHP 5's DOM extension (part of the core).
*
@ -42,6 +52,16 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
$html = $this->normalize($html, $config, $context);
// attempt to armor stray angled brackets that cannot possibly
// form tags and thus are probably being used as emoticons
if ($config->get('Core', 'AggressivelyFixLt')) {
$char = '[^a-z!\/]';
$comment = "/<!--(.*?)(-->|\z)/is";
$html = preg_replace_callback($comment, array('HTMLPurifier_Lexer_DOMLex', 'callbackArmorCommentEntities'), $html);
$html = preg_replace("/<($char)/i", '&lt;\\1', $html);
$html = preg_replace_callback($comment, array('HTMLPurifier_Lexer_DOMLex', 'callbackUndoCommentSubst'), $html); // fix comments
}
// preprocess html, essential for UTF-8
$html =
'<!DOCTYPE html '.
@ -151,5 +171,21 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
*/
public function muteErrorHandler($errno, $errstr) {}
/**
* Callback function for undoing escaping of stray angled brackets
* in comments
*/
function callbackUndoCommentSubst($matches) {
return '<!--' . strtr($matches[1], array('&amp;'=>'&','&lt;'=>'<')) . $matches[2];
}
/**
* Callback function that entity-izes ampersands in comments so that
* callbackUndoCommentSubst doesn't clobber them
*/
function callbackArmorCommentEntities($matches) {
return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
}
}

View File

@ -204,7 +204,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// Check leading character is alnum, if not, we may
// have accidently grabbed an emoticon. Translate into
// text and go our merry way
if (!ctype_alnum($segment[0])) {
if (!ctype_alpha($segment[0])) {
// XML: $segment[0] !== '_' && $segment[0] !== ':'
if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
$token = new
HTMLPurifier_Token_Text(

View File

@ -288,16 +288,21 @@ class HTMLPurifier_LexerTest extends UnitTestCase
$expect[18] = array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) );
// test emoticon protection
$input[19] = '<b>Whoa! >.< That\'s not good >.></b>';
$input[19] = '<b>Whoa! <3 That\'s not good >.></b>';
$expect[19] = array(
new HTMLPurifier_Token_Start('b'),
new HTMLPurifier_Token_Text('Whoa! >.'),
new HTMLPurifier_Token_Text('< That\'s not good >'),
new HTMLPurifier_Token_Text('Whoa! '),
new HTMLPurifier_Token_Text('<3 That\'s not good >'),
new HTMLPurifier_Token_Text('.>'),
new HTMLPurifier_Token_End('b'),
);
$dom_expect[19] = array(
new HTMLPurifier_Token_Start('b'),
new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
new HTMLPurifier_Token_End('b'),
);
$sax_expect[19] = false; // SAX drops the < character
$dom_expect[19] = false; // DOM drops the entire pseudo-tag
$config[19] = HTMLPurifier_Config::create(array('Core.AggressivelyFixLt' => true));
// test comment parsing with funky characters inside
$input[20] = '<!-- This >< comment --><br />';
@ -306,6 +311,7 @@ class HTMLPurifier_LexerTest extends UnitTestCase
new HTMLPurifier_Token_Empty('br')
);
$sax_expect[20] = false;
$config[20] = HTMLPurifier_Config::create(array('Core.AggressivelyFixLt' => true));
// test comment parsing of missing end
$input[21] = '<!-- This >< comment';
@ -314,6 +320,7 @@ class HTMLPurifier_LexerTest extends UnitTestCase
);
$sax_expect[21] = false;
$dom_expect[21] = false;
$config[21] = HTMLPurifier_Config::create(array('Core.AggressivelyFixLt' => true));
// test CDATA tags
$input[22] = '<script>alert("<foo>");</script>';
@ -324,7 +331,14 @@ class HTMLPurifier_LexerTest extends UnitTestCase
);
$config[22] = HTMLPurifier_Config::create(array('HTML.Trusted' => true));
$sax_expect[22] = false;
//$dom_expect[22] = false;
// test escaping
$input[23] = '<!-- This comment < &lt; & -->';
$expect[23] = array(
new HTMLPurifier_Token_Comment(' This comment < &lt; & ')
);
$sax_expect[23] = false;
$config[21] = HTMLPurifier_Config::create(array('Core.AggressivelyFixLt' => true));
$default_config = HTMLPurifier_Config::createDefault();
$default_context = new HTMLPurifier_Context();