0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-01-05 06:01:52 +00:00

Refine Lexers for parsing stray angled brackets; %Core.AggressivelyFixLt = true

By default, the DirectLex and DOMLex behavior with stray angled brackets
varied a great deal due to their implementations. A little-known directive,
%Core.AggressivelyFixLt, attempted to match DOMLex's behavior with DirectLex's,
but it was off by default. Now that it is on by default, users enjoy these
benefits, and performance-minded users can turn it back off.

Also, several refinements to stray angled bracket parsing were made. Specifically:

* DirectLex: Handle each left angled bracket individually, which prevents
  strange behavior as reported by eon.
* DOMLex: Iterate aggressive lt fix, so that stacked brackets like << are
  handled.

Signed-off-by: Edward Z. Yang <edwardzyang@thewritingpot.com>
This commit is contained in:
Edward Z. Yang 2008-06-28 00:43:02 -04:00
parent ba418a1f19
commit aa0fdeee30
7 changed files with 86 additions and 29 deletions

4
NEWS
View File

@ -9,8 +9,6 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
. Internal change . Internal change
========================== ==========================
3.2.0, unknown release date
3.1.2, unknown release date 3.1.2, unknown release date
! %Output.AttrSort for when you need your attributes in alphabetical order to ! %Output.AttrSort for when you need your attributes in alphabetical order to
deal with a bug in FCKEditor. Requested by frank farmer. deal with a bug in FCKEditor. Requested by frank farmer.
@ -22,6 +20,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
use on hand-written HTML. use on hand-written HTML.
! Add error-cases for unsupported elements in MakeWellFormed. This enables ! Add error-cases for unsupported elements in MakeWellFormed. This enables
the strategy to be used, standalone, on untrusted input. the strategy to be used, standalone, on untrusted input.
! %Core.AggressivelyFixLt is on by default. This causes more sensible
processing of left angled brackets in smileys and other whatnot.
- Fix two bugs in %URI.MakeAbsolute; one involving empty paths in base URLs, - Fix two bugs in %URI.MakeAbsolute; one involving empty paths in base URLs,
the other involving an undefined $is_folder error. the other involving an undefined $is_folder error.
- Throw error when %Core.Encoding is set to a spurious value. Previously, - Throw error when %Core.Encoding is set to a spurious value. Previously,

View File

@ -10,7 +10,7 @@
<file name="HTMLPurifier/Lexer/DirectLex.php"> <file name="HTMLPurifier/Lexer/DirectLex.php">
<line>50</line> <line>50</line>
<line>62</line> <line>62</line>
<line>327</line> <line>319</line>
</file> </file>
<file name="HTMLPurifier/Strategy/RemoveForeignElements.php"> <file name="HTMLPurifier/Strategy/RemoveForeignElements.php">
<line>47</line> <line>47</line>

File diff suppressed because one or more lines are too long

View File

@ -1,13 +1,17 @@
Core.AggressivelyFixLt Core.AggressivelyFixLt
TYPE: bool TYPE: bool
VERSION: 2.1.0 VERSION: 2.1.0
DEFAULT: false DEFAULT: true
--DESCRIPTION-- --DESCRIPTION--
<p>
This directive enables aggressive pre-filter fixes HTML Purifier can This directive enables aggressive pre-filter fixes HTML Purifier can
perform in order to ensure that open angled-brackets do not get killed perform in order to ensure that open angled-brackets do not get killed
during parsing stage. Enabling this will result in two preg_replace_callback during parsing stage. Enabling this will result in two preg_replace_callback
calls and one preg_replace call for every bit of HTML passed through here. calls and at least two preg_replace calls for every HTML document parsed;
It is not necessary and will have no effect for PHP 4. if your users make very well-formed HTML, you can set this directive false.
This has no effect when DirectLex is used.
</p>
<p>
<strong>Notice:</strong> This directive's default turned from false to true
in HTML Purifier 3.1.2.
</p>

View File

@ -45,7 +45,10 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
$char = '[^a-z!\/]'; $char = '[^a-z!\/]';
$comment = "/<!--(.*?)(-->|\z)/is"; $comment = "/<!--(.*?)(-->|\z)/is";
$html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html); $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
do {
$old = $html;
$html = preg_replace("/<($char)/i", '&lt;\\1', $html); $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
} while ($html !== $old);
$html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
} }

View File

@ -197,20 +197,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
if (!ctype_alpha($segment[0])) { if (!ctype_alpha($segment[0])) {
// XML: $segment[0] !== '_' && $segment[0] !== ':' // XML: $segment[0] !== '_' && $segment[0] !== ':'
if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt'); if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
$token = new $token = new HTMLPurifier_Token_Text('<');
HTMLPurifier_Token_Text(
'<' .
$this->parseData(
$segment
) .
'>'
);
if ($maintain_line_numbers) { if ($maintain_line_numbers) {
$token->line = $current_line; $token->line = $current_line;
$current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor); $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
} }
$array[] = $token; $array[] = $token;
$cursor = $position_next_gt + 1;
$inside_tag = false; $inside_tag = false;
continue; continue;
} }

View File

@ -418,14 +418,13 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
} }
function test_tokenizeHTML_emoticonProtection() { function test_tokenizeHTML_emoticonProtection() {
$this->config->set('Core', 'AggressivelyFixLt', true);
$this->assertTokenization( $this->assertTokenization(
'<b>Whoa! <3 That\'s not good >.></b>', '<b>Whoa! <3 That\'s not good >.></b>',
array( array(
new HTMLPurifier_Token_Start('b'), new HTMLPurifier_Token_Start('b'),
new HTMLPurifier_Token_Text('Whoa! '), new HTMLPurifier_Token_Text('Whoa! '),
new HTMLPurifier_Token_Text('<3 That\'s not good >'), new HTMLPurifier_Token_Text('<'),
new HTMLPurifier_Token_Text('.>'), new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
new HTMLPurifier_Token_End('b') new HTMLPurifier_Token_End('b')
), ),
array( array(
@ -491,7 +490,6 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
} }
function test_tokenizeHTML_entitiesInComment() { function test_tokenizeHTML_entitiesInComment() {
$this->config->set('Core', 'AggressivelyFixLt', true);
$this->assertTokenization( $this->assertTokenization(
'<!-- This comment < &lt; & -->', '<!-- This comment < &lt; & -->',
array( new HTMLPurifier_Token_Comment(' This comment < &lt; & ') ), array( new HTMLPurifier_Token_Comment(' This comment < &lt; & ') ),
@ -508,7 +506,8 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
array( array(
'DirectLex' => array( 'DirectLex' => array(
new HTMLPurifier_Token_Start('a', array('href' => '')), new HTMLPurifier_Token_Start('a', array('href' => '')),
new HTMLPurifier_Token_Text('<">'), new HTMLPurifier_Token_Text('<'),
new HTMLPurifier_Token_Text('">'),
), ),
'PEARSax3' => false, 'PEARSax3' => false,
) )
@ -556,7 +555,7 @@ div {}
); );
} }
function test_tokenizeHTML_() { function test_tokenizeHTML_tagWithAtSignAndExtraGt() {
$this->assertTokenization( $this->assertTokenization(
'<a@>>', '<a@>>',
array( array(
@ -576,6 +575,65 @@ div {}
); );
} }
function test_tokenizeHTML_emoticonHeart() {
$this->assertTokenization(
'<br /><3<br />',
array(
new HTMLPurifier_Token_Empty('br'),
new HTMLPurifier_Token_Text('<'),
new HTMLPurifier_Token_Text('3'),
new HTMLPurifier_Token_Empty('br'),
),
array(
'DOMLex' => array(
new HTMLPurifier_Token_Empty('br'),
new HTMLPurifier_Token_Text('<3'),
new HTMLPurifier_Token_Empty('br'),
),
)
);
}
function test_tokenizeHTML_emoticonShiftyEyes() {
$this->assertTokenization(
'<b><<</b>',
array(
new HTMLPurifier_Token_Start('b'),
new HTMLPurifier_Token_Text('<'),
new HTMLPurifier_Token_Text('<'),
new HTMLPurifier_Token_End('b'),
),
array(
'DOMLex' => array(
new HTMLPurifier_Token_Start('b'),
new HTMLPurifier_Token_Text('<<'),
new HTMLPurifier_Token_End('b'),
),
)
);
}
function test_tokenizeHTML_eon1996() {
$this->assertTokenization(
'< <b>test</b>',
array(
new HTMLPurifier_Token_Text('<'),
new HTMLPurifier_Token_Text(' '),
new HTMLPurifier_Token_Start('b'),
new HTMLPurifier_Token_Text('test'),
new HTMLPurifier_Token_End('b'),
),
array(
'DOMLex' => array(
new HTMLPurifier_Token_Text('< '),
new HTMLPurifier_Token_Start('b'),
new HTMLPurifier_Token_Text('test'),
new HTMLPurifier_Token_End('b'),
),
)
);
}
/* /*
function test_tokenizeHTML_() { function test_tokenizeHTML_() {