0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-03-23 22:37:02 +00:00
Signed-off-by: Edward Z. Yang <ezyang@cs.stanford.edu>
This commit is contained in:
Edward Z. Yang 2017-03-06 22:54:54 -08:00
parent 353c96f156
commit 5662efc936
5 changed files with 45 additions and 12 deletions

6
NEWS
View File

@ -22,6 +22,12 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
- We accidentally dropped certain Unicode characters if there was - We accidentally dropped certain Unicode characters if there was
one or more invalid characters. This has been fixed, thanks one or more invalid characters. This has been fixed, thanks
to mpyw <ryosuke_i_628@yahoo.co.jp> to mpyw <ryosuke_i_628@yahoo.co.jp>
- The fix for "Don't truncate upon encountering </div> when using DOMLex"
caused a regression with HTML 4.01 Strict parsing with libxml 2.9.1
(and maybe later versions, but known OK with libxml 2.9.4). The
new fix handles truncation a bit more cleverly: we still wrap the
input in a div (sidestepping the bug), but then slurp out the rest of
the text in case a premature </div> made it run off the end. (#78)
# By default, when a link has a target attribute associated # By default, when a link has a target attribute associated
with it, we now also add rel="noopener" in order to with it, we now also add rel="noopener" in order to
prevent the new window from being able to overwrite prevent the new window from being able to overwrite

View File

@ -72,12 +72,20 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
$doc->loadHTML($html); $doc->loadHTML($html);
restore_error_handler(); restore_error_handler();
$body = $doc->getElementsByTagName('html')->item(0)-> // <html>
getElementsByTagName('body')->item(0); // <body>
$div = $body->getElementsByTagName('div')->item(0); // <div>
$tokens = array(); $tokens = array();
$this->tokenizeDOM( $this->tokenizeDOM($div, $tokens);
$doc->getElementsByTagName('html')->item(0)-> // <html> // If the div has a sibling, that means we tripped across
getElementsByTagName('body')->item(0), // <body> // a premature </div> tag. So remove the div we parsed,
$tokens // and then tokenize the rest of body. We can't tokenize
); // the sibling directly as we'll lose the tags in that case.
if ($div->nextSibling) {
$body->removeChild($div);
$this->tokenizeDOM($body, $tokens);
}
return $tokens; return $tokens;
} }
@ -252,7 +260,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
* @param HTMLPurifier_Context $context * @param HTMLPurifier_Context $context
* @return string * @return string
*/ */
protected function wrapHTML($html, $config, $context) protected function wrapHTML($html, $config, $context, $use_div = true)
{ {
$def = $config->getDefinition('HTML'); $def = $config->getDefinition('HTML');
$ret = ''; $ret = '';
@ -271,7 +279,11 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
$ret .= '<html><head>'; $ret .= '<html><head>';
$ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'; $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
// No protection if $html contains a stray </div>! // No protection if $html contains a stray </div>!
$ret .= '</head><body>' . $html . '</body></html>'; $ret .= '</head><body>';
if ($use_div) $ret .= '<div>';
$ret .= $html;
if ($use_div) $ret .= '</div>';
$ret .= '</body></html>';
return $ret; return $ret;
} }
} }

View File

@ -21,7 +21,7 @@ class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
public function tokenizeHTML($html, $config, $context) public function tokenizeHTML($html, $config, $context)
{ {
$new_html = $this->normalize($html, $config, $context); $new_html = $this->normalize($html, $config, $context);
$new_html = $this->wrapHTML($new_html, $config, $context); $new_html = $this->wrapHTML($new_html, $config, $context, false /* no div */);
try { try {
$parser = new HTML5($new_html); $parser = new HTML5($new_html);
$doc = $parser->save(); $doc = $parser->save();

View File

@ -0,0 +1,7 @@
--INI--
HTML.Doctype = HTML 4.01 Strict
--HTML--
<b>Vetgedrukt</b> <i>Schuingedrukt</i> <span>Hou</span><iframe></iframe><script></script> jij ook zo van vakjesdenken?
--EXPECT--
<b>Vetgedrukt</b> <i>Schuingedrukt</i> <span>Hou</span> jij ook zo van vakjesdenken?
--# vim: et sw=4 sts=4

View File

@ -814,13 +814,21 @@ div {}
public function test_tokenizeHTML_prematureDivClose() public function test_tokenizeHTML_prematureDivClose()
{ {
$this->assertTokenization( $this->assertTokenization(
'</div>dontdie', '</div>dont<b>die</b>',
array( array(
new HTMLPurifier_Token_End('div'), new HTMLPurifier_Token_End('div'),
new HTMLPurifier_Token_Text('dontdie') new HTMLPurifier_Token_Text('dont'),
new HTMLPurifier_Token_Start('b'),
new HTMLPurifier_Token_Text('die'),
new HTMLPurifier_Token_End('b'),
), ),
array( array(
'DOMLex' => $alt = array(new HTMLPurifier_Token_Text('dontdie')), 'DOMLex' => $alt = array(
new HTMLPurifier_Token_Text('dont'),
new HTMLPurifier_Token_Start('b'),
new HTMLPurifier_Token_Text('die'),
new HTMLPurifier_Token_End('b')
),
'PH5P' => $alt 'PH5P' => $alt
) )
); );