mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-01-21 13:01:53 +00:00
Refine Lexers for parsing stray angled brackets; %Core.AggressivelyFixLt = true
By default, the DirectLex and DOMLex behavior with stray angled brackets varied a great deal due to their implementations. A little-known directive %Core.AggressivelyFixLt attempted to match DOMLex's behavior with DirectLex's, but it was off by default. By turning it on by default, users now enjoy these benefits, and performance-minded users can turn it back off. Also, several refinements to stray angled bracket parsing were made. Specifically: * DirectLex: Handle each left angled bracket individually, which prevents strange behavior as reported by eon. * DOMLex: Iterate aggressive lt fix, so that stacked brackets like << are handled. Signed-off-by: Edward Z. Yang <edwardzyang@thewritingpot.com>
This commit is contained in:
parent
ba418a1f19
commit
aa0fdeee30
4
NEWS
4
NEWS
@ -9,8 +9,6 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
|||||||
. Internal change
|
. Internal change
|
||||||
==========================
|
==========================
|
||||||
|
|
||||||
3.2.0, unknown release date
|
|
||||||
|
|
||||||
3.1.2, unknown release date
|
3.1.2, unknown release date
|
||||||
! %Output.AttrSort for when you need your attributes in alphabetical order to
|
! %Output.AttrSort for when you need your attributes in alphabetical order to
|
||||||
deal with a bug in FCKEditor. Requested by frank farmer.
|
deal with a bug in FCKEditor. Requested by frank farmer.
|
||||||
@ -22,6 +20,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
|||||||
use on hand-written HTML.
|
use on hand-written HTML.
|
||||||
! Add error-cases for unsupported elements in MakeWellFormed. This enables
|
! Add error-cases for unsupported elements in MakeWellFormed. This enables
|
||||||
the strategy to be used, standalone, on untrusted input.
|
the strategy to be used, standalone, on untrusted input.
|
||||||
|
! %Core.AggressivelyFixLt is on by default. This causes more sensible
|
||||||
|
processing of left angled brackets in smileys and other whatnot.
|
||||||
- Fix two bugs in %URI.MakeAbsolute; one involving empty paths in base URLs,
|
- Fix two bugs in %URI.MakeAbsolute; one involving empty paths in base URLs,
|
||||||
the other involving an undefined $is_folder error.
|
the other involving an undefined $is_folder error.
|
||||||
- Throw error when %Core.Encoding is set to a spurious value. Previously,
|
- Throw error when %Core.Encoding is set to a spurious value. Previously,
|
||||||
|
@ -10,7 +10,7 @@
|
|||||||
<file name="HTMLPurifier/Lexer/DirectLex.php">
|
<file name="HTMLPurifier/Lexer/DirectLex.php">
|
||||||
<line>50</line>
|
<line>50</line>
|
||||||
<line>62</line>
|
<line>62</line>
|
||||||
<line>327</line>
|
<line>319</line>
|
||||||
</file>
|
</file>
|
||||||
<file name="HTMLPurifier/Strategy/RemoveForeignElements.php">
|
<file name="HTMLPurifier/Strategy/RemoveForeignElements.php">
|
||||||
<line>47</line>
|
<line>47</line>
|
||||||
|
File diff suppressed because one or more lines are too long
@ -1,13 +1,17 @@
|
|||||||
Core.AggressivelyFixLt
|
Core.AggressivelyFixLt
|
||||||
TYPE: bool
|
TYPE: bool
|
||||||
VERSION: 2.1.0
|
VERSION: 2.1.0
|
||||||
DEFAULT: false
|
DEFAULT: true
|
||||||
--DESCRIPTION--
|
--DESCRIPTION--
|
||||||
|
<p>
|
||||||
This directive enables aggressive pre-filter fixes HTML Purifier can
|
This directive enables aggressive pre-filter fixes HTML Purifier can
|
||||||
perform in order to ensure that open angled-brackets do not get killed
|
perform in order to ensure that open angled-brackets do not get killed
|
||||||
during parsing stage. Enabling this will result in two preg_replace_callback
|
during parsing stage. Enabling this will result in two preg_replace_callback
|
||||||
calls and one preg_replace call for every bit of HTML passed through here.
|
calls and at least two preg_replace calls for every HTML document parsed;
|
||||||
It is not necessary and will have no effect for PHP 4.
|
if your users make very well-formed HTML, you can set this directive false.
|
||||||
|
This has no effect when DirectLex is used.
|
||||||
|
</p>
|
||||||
|
<p>
|
||||||
|
<strong>Notice:</strong> This directive's default turned from false to true
|
||||||
|
in HTML Purifier 3.1.2.
|
||||||
|
</p>
|
@ -45,7 +45,10 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
|||||||
$char = '[^a-z!\/]';
|
$char = '[^a-z!\/]';
|
||||||
$comment = "/<!--(.*?)(-->|\z)/is";
|
$comment = "/<!--(.*?)(-->|\z)/is";
|
||||||
$html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
|
$html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
|
||||||
$html = preg_replace("/<($char)/i", '<\\1', $html);
|
do {
|
||||||
|
$old = $html;
|
||||||
|
$html = preg_replace("/<($char)/i", '<\\1', $html);
|
||||||
|
} while ($html !== $old);
|
||||||
$html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
|
$html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -197,20 +197,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
if (!ctype_alpha($segment[0])) {
|
if (!ctype_alpha($segment[0])) {
|
||||||
// XML: $segment[0] !== '_' && $segment[0] !== ':'
|
// XML: $segment[0] !== '_' && $segment[0] !== ':'
|
||||||
if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
|
if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
|
||||||
$token = new
|
$token = new HTMLPurifier_Token_Text('<');
|
||||||
HTMLPurifier_Token_Text(
|
|
||||||
'<' .
|
|
||||||
$this->parseData(
|
|
||||||
$segment
|
|
||||||
) .
|
|
||||||
'>'
|
|
||||||
);
|
|
||||||
if ($maintain_line_numbers) {
|
if ($maintain_line_numbers) {
|
||||||
$token->line = $current_line;
|
$token->line = $current_line;
|
||||||
$current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
|
$current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
|
||||||
}
|
}
|
||||||
$array[] = $token;
|
$array[] = $token;
|
||||||
$cursor = $position_next_gt + 1;
|
|
||||||
$inside_tag = false;
|
$inside_tag = false;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -418,14 +418,13 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
}
|
}
|
||||||
|
|
||||||
function test_tokenizeHTML_emoticonProtection() {
|
function test_tokenizeHTML_emoticonProtection() {
|
||||||
$this->config->set('Core', 'AggressivelyFixLt', true);
|
|
||||||
$this->assertTokenization(
|
$this->assertTokenization(
|
||||||
'<b>Whoa! <3 That\'s not good >.></b>',
|
'<b>Whoa! <3 That\'s not good >.></b>',
|
||||||
array(
|
array(
|
||||||
new HTMLPurifier_Token_Start('b'),
|
new HTMLPurifier_Token_Start('b'),
|
||||||
new HTMLPurifier_Token_Text('Whoa! '),
|
new HTMLPurifier_Token_Text('Whoa! '),
|
||||||
new HTMLPurifier_Token_Text('<3 That\'s not good >'),
|
new HTMLPurifier_Token_Text('<'),
|
||||||
new HTMLPurifier_Token_Text('.>'),
|
new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
|
||||||
new HTMLPurifier_Token_End('b')
|
new HTMLPurifier_Token_End('b')
|
||||||
),
|
),
|
||||||
array(
|
array(
|
||||||
@ -491,7 +490,6 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
}
|
}
|
||||||
|
|
||||||
function test_tokenizeHTML_entitiesInComment() {
|
function test_tokenizeHTML_entitiesInComment() {
|
||||||
$this->config->set('Core', 'AggressivelyFixLt', true);
|
|
||||||
$this->assertTokenization(
|
$this->assertTokenization(
|
||||||
'<!-- This comment < < & -->',
|
'<!-- This comment < < & -->',
|
||||||
array( new HTMLPurifier_Token_Comment(' This comment < < & ') ),
|
array( new HTMLPurifier_Token_Comment(' This comment < < & ') ),
|
||||||
@ -508,7 +506,8 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
array(
|
array(
|
||||||
'DirectLex' => array(
|
'DirectLex' => array(
|
||||||
new HTMLPurifier_Token_Start('a', array('href' => '')),
|
new HTMLPurifier_Token_Start('a', array('href' => '')),
|
||||||
new HTMLPurifier_Token_Text('<">'),
|
new HTMLPurifier_Token_Text('<'),
|
||||||
|
new HTMLPurifier_Token_Text('">'),
|
||||||
),
|
),
|
||||||
'PEARSax3' => false,
|
'PEARSax3' => false,
|
||||||
)
|
)
|
||||||
@ -556,7 +555,7 @@ div {}
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
function test_tokenizeHTML_() {
|
function test_tokenizeHTML_tagWithAtSignAndExtraGt() {
|
||||||
$this->assertTokenization(
|
$this->assertTokenization(
|
||||||
'<a@>>',
|
'<a@>>',
|
||||||
array(
|
array(
|
||||||
@ -576,6 +575,65 @@ div {}
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function test_tokenizeHTML_emoticonHeart() {
|
||||||
|
$this->assertTokenization(
|
||||||
|
'<br /><3<br />',
|
||||||
|
array(
|
||||||
|
new HTMLPurifier_Token_Empty('br'),
|
||||||
|
new HTMLPurifier_Token_Text('<'),
|
||||||
|
new HTMLPurifier_Token_Text('3'),
|
||||||
|
new HTMLPurifier_Token_Empty('br'),
|
||||||
|
),
|
||||||
|
array(
|
||||||
|
'DOMLex' => array(
|
||||||
|
new HTMLPurifier_Token_Empty('br'),
|
||||||
|
new HTMLPurifier_Token_Text('<3'),
|
||||||
|
new HTMLPurifier_Token_Empty('br'),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
function test_tokenizeHTML_emoticonShiftyEyes() {
|
||||||
|
$this->assertTokenization(
|
||||||
|
'<b><<</b>',
|
||||||
|
array(
|
||||||
|
new HTMLPurifier_Token_Start('b'),
|
||||||
|
new HTMLPurifier_Token_Text('<'),
|
||||||
|
new HTMLPurifier_Token_Text('<'),
|
||||||
|
new HTMLPurifier_Token_End('b'),
|
||||||
|
),
|
||||||
|
array(
|
||||||
|
'DOMLex' => array(
|
||||||
|
new HTMLPurifier_Token_Start('b'),
|
||||||
|
new HTMLPurifier_Token_Text('<<'),
|
||||||
|
new HTMLPurifier_Token_End('b'),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
function test_tokenizeHTML_eon1996() {
|
||||||
|
$this->assertTokenization(
|
||||||
|
'< <b>test</b>',
|
||||||
|
array(
|
||||||
|
new HTMLPurifier_Token_Text('<'),
|
||||||
|
new HTMLPurifier_Token_Text(' '),
|
||||||
|
new HTMLPurifier_Token_Start('b'),
|
||||||
|
new HTMLPurifier_Token_Text('test'),
|
||||||
|
new HTMLPurifier_Token_End('b'),
|
||||||
|
),
|
||||||
|
array(
|
||||||
|
'DOMLex' => array(
|
||||||
|
new HTMLPurifier_Token_Text('< '),
|
||||||
|
new HTMLPurifier_Token_Start('b'),
|
||||||
|
new HTMLPurifier_Token_Text('test'),
|
||||||
|
new HTMLPurifier_Token_End('b'),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
||||||
function test_tokenizeHTML_() {
|
function test_tokenizeHTML_() {
|
||||||
|
Loading…
Reference in New Issue
Block a user