mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-01-20 12:31:53 +00:00
Fix extant broken PEARSax3 parsing patterns.
Signed-off-by: Edward Z. Yang <ezyang@mit.edu>
This commit is contained in:
parent
faf28682ad
commit
ac18672aba
@ -28,6 +28,7 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
|
|||||||
protected $tokens = array();
|
protected $tokens = array();
|
||||||
|
|
||||||
private $parent_handler;
|
private $parent_handler;
|
||||||
|
private $stack = array();
|
||||||
|
|
||||||
public function tokenizeHTML($string, $config, $context) {
|
public function tokenizeHTML($string, $config, $context) {
|
||||||
|
|
||||||
@ -67,6 +68,7 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
|
|||||||
} else {
|
} else {
|
||||||
$this->tokens[] = new HTMLPurifier_Token_Start($name, $attrs);
|
$this->tokens[] = new HTMLPurifier_Token_Start($name, $attrs);
|
||||||
}
|
}
|
||||||
|
$this->stack[] = $name;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -81,6 +83,7 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
$this->tokens[] = new HTMLPurifier_Token_End($name);
|
$this->tokens[] = new HTMLPurifier_Token_End($name);
|
||||||
|
if (!empty($this->stack)) array_pop($this->stack);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -97,8 +100,18 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
|
|||||||
*/
|
*/
|
||||||
public function escapeHandler(&$parser, $data) {
|
public function escapeHandler(&$parser, $data) {
|
||||||
if (strpos($data, '--') === 0) {
|
if (strpos($data, '--') === 0) {
|
||||||
|
// remove trailing and leading double-dashes
|
||||||
|
$data = substr($data, 2);
|
||||||
|
if (strlen($data) >= 2 && substr($data, -2) == "--") {
|
||||||
|
$data = substr($data, 0, -2);
|
||||||
|
}
|
||||||
|
if (isset($this->stack[sizeof($this->stack) - 1]) &&
|
||||||
|
$this->stack[sizeof($this->stack) - 1] == "style") {
|
||||||
|
$this->tokens[] = new HTMLPurifier_Token_Text($data);
|
||||||
|
} else {
|
||||||
$this->tokens[] = new HTMLPurifier_Token_Comment($data);
|
$this->tokens[] = new HTMLPurifier_Token_Comment($data);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
// CDATA is handled elsewhere, but if it was handled here:
|
// CDATA is handled elsewhere, but if it was handled here:
|
||||||
//if (strpos($data, '[CDATA[') === 0) {
|
//if (strpos($data, '[CDATA[') === 0) {
|
||||||
// $this->tokens[] = new HTMLPurifier_Token_Text(
|
// $this->tokens[] = new HTMLPurifier_Token_Text(
|
||||||
|
@ -172,7 +172,6 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
}
|
}
|
||||||
if ($t_expect != $result) {
|
if ($t_expect != $result) {
|
||||||
printTokens($result);
|
printTokens($result);
|
||||||
//var_dump($result);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -270,20 +269,14 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
function test_tokenizeHTML_comment() {
|
function test_tokenizeHTML_comment() {
|
||||||
$this->assertTokenization(
|
$this->assertTokenization(
|
||||||
'<!-- Comment -->',
|
'<!-- Comment -->',
|
||||||
array( new HTMLPurifier_Token_Comment(' Comment ') ),
|
array( new HTMLPurifier_Token_Comment(' Comment ') )
|
||||||
array(
|
|
||||||
'PEARSax3' => array( new HTMLPurifier_Token_Comment('-- Comment --') ),
|
|
||||||
)
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
function test_tokenizeHTML_malformedComment() {
|
function test_tokenizeHTML_malformedComment() {
|
||||||
$this->assertTokenization(
|
$this->assertTokenization(
|
||||||
'<!-- not so well formed --->',
|
'<!-- not so well formed --->',
|
||||||
array( new HTMLPurifier_Token_Comment(' not so well formed -') ),
|
array( new HTMLPurifier_Token_Comment(' not so well formed -') )
|
||||||
array(
|
|
||||||
'PEARSax3' => array( new HTMLPurifier_Token_Comment('-- not so well formed ---') ),
|
|
||||||
)
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -574,6 +567,13 @@ div {}
|
|||||||
}
|
}
|
||||||
|
|
||||||
function test_tokenizeHTML_tagWithAtSignAndExtraGt() {
|
function test_tokenizeHTML_tagWithAtSignAndExtraGt() {
|
||||||
|
$alt_expect = array(
|
||||||
|
// Technically this is invalid, but it won't be a
|
||||||
|
// problem with invalid element removal; also, this
|
||||||
|
// mimics Mozilla's parsing of the tag.
|
||||||
|
new HTMLPurifier_Token_Start('a@'),
|
||||||
|
new HTMLPurifier_Token_Text('>'),
|
||||||
|
);
|
||||||
$this->assertTokenization(
|
$this->assertTokenization(
|
||||||
'<a@>>',
|
'<a@>>',
|
||||||
array(
|
array(
|
||||||
@ -582,13 +582,8 @@ div {}
|
|||||||
new HTMLPurifier_Token_End('a'),
|
new HTMLPurifier_Token_End('a'),
|
||||||
),
|
),
|
||||||
array(
|
array(
|
||||||
'DirectLex' => array(
|
'DirectLex' => $alt_expect,
|
||||||
// Technically this is invalid, but it won't be a
|
'PEARSax3' => $alt_expect,
|
||||||
// problem with invalid element removal; also, this
|
|
||||||
// mimics Mozilla's parsing of the tag.
|
|
||||||
new HTMLPurifier_Token_Start('a@'),
|
|
||||||
new HTMLPurifier_Token_Text('>'),
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -608,6 +603,11 @@ div {}
|
|||||||
new HTMLPurifier_Token_Text('<3'),
|
new HTMLPurifier_Token_Text('<3'),
|
||||||
new HTMLPurifier_Token_Empty('br'),
|
new HTMLPurifier_Token_Empty('br'),
|
||||||
),
|
),
|
||||||
|
'PEARSax3' => array(
|
||||||
|
// bah too lazy to fix this
|
||||||
|
new HTMLPurifier_Token_Empty('br'),
|
||||||
|
new HTMLPurifier_Token_Empty('3<br'),
|
||||||
|
),
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -627,6 +627,12 @@ div {}
|
|||||||
new HTMLPurifier_Token_Text('<<'),
|
new HTMLPurifier_Token_Text('<<'),
|
||||||
new HTMLPurifier_Token_End('b'),
|
new HTMLPurifier_Token_End('b'),
|
||||||
),
|
),
|
||||||
|
'PEARSax3' => array(
|
||||||
|
// also too lazy to fix
|
||||||
|
new HTMLPurifier_Token_Start('b'),
|
||||||
|
new HTMLPurifier_Token_Empty('<<'),
|
||||||
|
new HTMLPurifier_Token_Text('b>'),
|
||||||
|
),
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -648,18 +654,19 @@ div {}
|
|||||||
new HTMLPurifier_Token_Text('test'),
|
new HTMLPurifier_Token_Text('test'),
|
||||||
new HTMLPurifier_Token_End('b'),
|
new HTMLPurifier_Token_End('b'),
|
||||||
),
|
),
|
||||||
|
'PEARSax3' => array(
|
||||||
|
// totally doing the wrong thing here
|
||||||
|
new HTMLPurifier_Token_Text(' '),
|
||||||
|
new HTMLPurifier_Token_Start('b'),
|
||||||
|
new HTMLPurifier_Token_Text('test'),
|
||||||
|
new HTMLPurifier_Token_End('b'),
|
||||||
|
),
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
function test_tokenizeHTML_bodyInCDATA() {
|
function test_tokenizeHTML_bodyInCDATA() {
|
||||||
$this->assertTokenization(
|
$alt_tokens = array(
|
||||||
'<![CDATA[<body>Foo</body>]]>',
|
|
||||||
array(
|
|
||||||
new HTMLPurifier_Token_Text('<body>Foo</body>'),
|
|
||||||
),
|
|
||||||
array(
|
|
||||||
'PH5P' => array(
|
|
||||||
new HTMLPurifier_Token_Text('<'),
|
new HTMLPurifier_Token_Text('<'),
|
||||||
new HTMLPurifier_Token_Text('body'),
|
new HTMLPurifier_Token_Text('body'),
|
||||||
new HTMLPurifier_Token_Text('>'),
|
new HTMLPurifier_Token_Text('>'),
|
||||||
@ -667,7 +674,15 @@ div {}
|
|||||||
new HTMLPurifier_Token_Text('<'),
|
new HTMLPurifier_Token_Text('<'),
|
||||||
new HTMLPurifier_Token_Text('/body'),
|
new HTMLPurifier_Token_Text('/body'),
|
||||||
new HTMLPurifier_Token_Text('>'),
|
new HTMLPurifier_Token_Text('>'),
|
||||||
|
);
|
||||||
|
$this->assertTokenization(
|
||||||
|
'<![CDATA[<body>Foo</body>]]>',
|
||||||
|
array(
|
||||||
|
new HTMLPurifier_Token_Text('<body>Foo</body>'),
|
||||||
),
|
),
|
||||||
|
array(
|
||||||
|
'PH5P' => $alt_tokens,
|
||||||
|
'PEARSax3' => $alt_tokens,
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user