mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2024-12-22 08:21:52 +00:00
Revamp entity decoding to be more like HTML5.
See %Core.LegacyEntityDecoder for more details. Signed-off-by: Edward Z. Yang <ezyang@cs.stanford.edu>
This commit is contained in:
parent
66bbae73a9
commit
7e11c271b9
4
NEWS
4
NEWS
@ -32,6 +32,10 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
||||
rest of the text in case it ran off the end. (#78)
|
||||
- Fix PREG_BACKTRACK_LIMIT_ERROR in HTMLPurifier_Filter_ExtractStyle.
|
||||
Thanks @breathbath for contributing the report and fix (#120)
|
||||
- Fix entity decoding algorithm to be more conservative about
|
||||
decoding entities that are missing trailing semicolon.
|
||||
To get old behavior, set %Core.LegacyEntityDecoder to true.
|
||||
(#119)
|
||||
# By default, when a link has a target attribute associated
|
||||
with it, we now also add rel="noopener" in order to
|
||||
prevent the new window from being able to overwrite
|
||||
|
@ -6,7 +6,7 @@
|
||||
</file>
|
||||
<file name="HTMLPurifier/Lexer.php">
|
||||
<line>85</line>
|
||||
<line>315</line>
|
||||
<line>322</line>
|
||||
</file>
|
||||
<file name="HTMLPurifier/Lexer/DirectLex.php">
|
||||
<line>67</line>
|
||||
@ -124,7 +124,7 @@
|
||||
<line>122</line>
|
||||
</file>
|
||||
<file name="HTMLPurifier/Lexer.php">
|
||||
<line>297</line>
|
||||
<line>304</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Output.Newline">
|
||||
@ -172,7 +172,7 @@
|
||||
<line>234</line>
|
||||
</file>
|
||||
<file name="HTMLPurifier/Lexer.php">
|
||||
<line>302</line>
|
||||
<line>309</line>
|
||||
</file>
|
||||
<file name="HTMLPurifier/HTMLModule/Image.php">
|
||||
<line>37</line>
|
||||
@ -262,12 +262,12 @@
|
||||
</directive>
|
||||
<directive id="Core.ConvertDocumentToFragment">
|
||||
<file name="HTMLPurifier/Lexer.php">
|
||||
<line>313</line>
|
||||
<line>320</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Core.RemoveProcessingInstructions">
|
||||
<file name="HTMLPurifier/Lexer.php">
|
||||
<line>334</line>
|
||||
<line>343</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="URI.">
|
||||
@ -444,12 +444,12 @@
|
||||
</directive>
|
||||
<directive id="Filter.ExtractStyleBlocks.Scope">
|
||||
<file name="HTMLPurifier/Filter/ExtractStyleBlocks.php">
|
||||
<line>122</line>
|
||||
<line>125</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Filter.ExtractStyleBlocks.Escaping">
|
||||
<file name="HTMLPurifier/Filter/ExtractStyleBlocks.php">
|
||||
<line>327</line>
|
||||
<line>330</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="HTML.SafeIframe">
|
||||
|
Binary file not shown.
@ -0,0 +1,36 @@
|
||||
Core.LegacyEntityDecoder
|
||||
TYPE: bool
|
||||
VERSION: 4.9.0
|
||||
DEFAULT: false
|
||||
--DESCRIPTION--
|
||||
<p>
|
||||
Prior to HTML Purifier 4.9.0, entities were decoded by performing
|
||||
a global search replace for all entities whose decoded versions
|
||||
did not have special meanings under HTML, and replaced them with
|
||||
their decoded versions. We would match all entities, even if they did
|
||||
not have a trailing semicolon, but only if there weren't any trailing
|
||||
alphanumeric characters.
|
||||
</p>
|
||||
<table>
|
||||
<tr><th>Original</th><th>Text</th><th>Attribute</th></tr>
|
||||
<tr><td>&yen;</td><td>¥</td><td>¥</td></tr>
|
||||
<tr><td>&yen</td><td>¥</td><td>¥</td></tr>
|
||||
<tr><td>&yena</td><td>&yena</td><td>&yena</td></tr>
|
||||
<tr><td>&yen=</td><td>¥=</td><td>¥=</td></tr>
|
||||
</table>
|
||||
<p>
|
||||
In HTML Purifier 4.9.0, we changed the behavior of entity parsing
|
||||
to match entities that had missing trailing semicolons in less
|
||||
cases, to more closely match HTML5 parsing behavior:
|
||||
</p>
|
||||
<table>
|
||||
<tr><th>Original</th><th>Text</th><th>Attribute</th></tr>
|
||||
<tr><td>&yen;</td><td>¥</td><td>¥</td></tr>
|
||||
<tr><td>&yen</td><td>¥</td><td>¥</td></tr>
|
||||
<tr><td>&yena</td><td>¥a</td><td>&yena</td></tr>
|
||||
<tr><td>&yen=</td><td>¥=</td><td>&yen=</td></tr>
|
||||
</table>
|
||||
<p>
|
||||
This flag reverts back to pre-HTML Purifier 4.9.0 behavior.
|
||||
</p>
|
||||
--# vim: et sw=4 sts=4
|
@ -16,6 +16,138 @@ class HTMLPurifier_EntityParser
|
||||
*/
|
||||
protected $_entity_lookup;
|
||||
|
||||
/**
|
||||
* Callback regex string for entities in text.
|
||||
* @type string
|
||||
*/
|
||||
protected $_textEntitiesRegex;
|
||||
|
||||
/**
|
||||
* Callback regex string for entities in attributes.
|
||||
* @type string
|
||||
*/
|
||||
protected $_attrEntitiesRegex;
|
||||
|
||||
/**
|
||||
* Tests if the beginning of a string is a semi-optional regex
|
||||
*/
|
||||
protected $_semiOptionalPrefixRegex;
|
||||
|
||||
public function __construct() {
|
||||
// From
|
||||
// http://stackoverflow.com/questions/15532252/why-is-reg-being-rendered-as-without-the-bounding-semicolon
|
||||
$semi_optional = "quot|QUOT|lt|LT|gt|GT|amp|AMP|AElig|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|Iacute|Icirc|Igrave|Iuml|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml";
|
||||
|
||||
// NB: three empty captures to put the fourth match in the right
|
||||
// place
|
||||
$this->_semiOptionalPrefixRegex = "/&()()()($semi_optional)/";
|
||||
|
||||
$this->_textEntitiesRegex =
|
||||
'/&(?:'.
|
||||
// hex
|
||||
'[#]x([a-fA-F0-9]+);?|'.
|
||||
// dec
|
||||
'[#]0*(\d+);?|'.
|
||||
// string (mandatory semicolon)
|
||||
// NB: order matters: match semicolon preferentially
|
||||
'([A-Za-z_:][A-Za-z0-9.\-_:]*);|'.
|
||||
// string (optional semicolon)
|
||||
"($semi_optional)".
|
||||
')/';
|
||||
|
||||
$this->_attrEntitiesRegex =
|
||||
'/&(?:'.
|
||||
// hex
|
||||
'[#]x([a-fA-F0-9]+);?|'.
|
||||
// dec
|
||||
'[#]0*(\d+);?|'.
|
||||
// string (mandatory semicolon)
|
||||
// NB: order matters: match semicolon preferentially
|
||||
'([A-Za-z_:][A-Za-z0-9.\-_:]*);|'.
|
||||
// string (optional semicolon)
|
||||
// don't match if trailing is equals or alphanumeric (URL
|
||||
// like)
|
||||
"($semi_optional)(?![=;A-Za-z0-9])".
|
||||
')/';
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Substitute entities with the parsed equivalents. Use this on
|
||||
* textual data in an HTML document (as opposed to attributes.)
|
||||
*
|
||||
* @param string $string String to have entities parsed.
|
||||
* @return string Parsed string.
|
||||
*/
|
||||
public function substituteTextEntities($string)
|
||||
{
|
||||
return preg_replace_callback(
|
||||
$this->_textEntitiesRegex,
|
||||
array($this, 'entityCallback'),
|
||||
$string
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Substitute entities with the parsed equivalents. Use this on
|
||||
* attribute contents in documents.
|
||||
*
|
||||
* @param string $string String to have entities parsed.
|
||||
* @return string Parsed string.
|
||||
*/
|
||||
public function substituteAttrEntities($string)
|
||||
{
|
||||
return preg_replace_callback(
|
||||
$this->_attrEntitiesRegex,
|
||||
array($this, 'entityCallback'),
|
||||
$string
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Callback function for substituteNonSpecialEntities() that does the work.
|
||||
*
|
||||
* @param array $matches PCRE matches array, with 0 the entire match, and
|
||||
* either index 1, 2 or 3 set with a hex value, dec value,
|
||||
* or string (respectively).
|
||||
* @return string Replacement string.
|
||||
*/
|
||||
|
||||
protected function entityCallback($matches)
|
||||
{
|
||||
$entity = $matches[0];
|
||||
$hex_part = @$matches[1];
|
||||
$dec_part = @$matches[2];
|
||||
$named_part = empty($matches[3]) ? @$matches[4] : $matches[3];
|
||||
if ($hex_part) {
|
||||
return HTMLPurifier_Encoder::unichr(hexdec($hex_part));
|
||||
} elseif ($dec_part) {
|
||||
return HTMLPurifier_Encoder((int) $dec_part);
|
||||
} else {
|
||||
if (!$this->_entity_lookup) {
|
||||
$this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
|
||||
}
|
||||
if (isset($this->_entity_lookup->table[$named_part])) {
|
||||
return $this->_entity_lookup->table[$named_part];
|
||||
} else {
|
||||
// exact match didn't match anything, so test if
|
||||
// any of the semicolon optional match the prefix.
|
||||
// Test that this is an EXACT match is important to
|
||||
// prevent infinite loop
|
||||
if (!empty($matches[3])) {
|
||||
return preg_replace_callback(
|
||||
$this->_semiOptionalPrefixRegex,
|
||||
array($this, 'entityCallback'),
|
||||
$entity
|
||||
);
|
||||
}
|
||||
return $entity;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// LEGACY CODE BELOW
|
||||
|
||||
/**
|
||||
* Callback regex string for parsing entities.
|
||||
* @type string
|
||||
@ -144,7 +276,7 @@ class HTMLPurifier_EntityParser
|
||||
$entity;
|
||||
} else {
|
||||
return isset($this->_special_ent2dec[$matches[3]]) ?
|
||||
$this->_special_ent2dec[$matches[3]] :
|
||||
$this->_special_dec2str[$this->_special_ent2dec[$matches[3]]] :
|
||||
$entity;
|
||||
}
|
||||
}
|
||||
|
@ -169,21 +169,24 @@ class HTMLPurifier_Lexer
|
||||
''' => "'"
|
||||
);
|
||||
|
||||
public function parseText($string, $config) {
|
||||
return $this->parseData($string, false, $config);
|
||||
}
|
||||
|
||||
public function parseAttr($string, $config) {
|
||||
return $this->parseData($string, true, $config);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses special entities into the proper characters.
|
||||
*
|
||||
* This string will translate escaped versions of the special characters
|
||||
* into the correct ones.
|
||||
*
|
||||
* @warning
|
||||
* You should be able to treat the output of this function as
|
||||
* completely parsed, but that's only because all other entities should
|
||||
* have been handled previously in substituteNonSpecialEntities()
|
||||
*
|
||||
* @param string $string String character data to be parsed.
|
||||
* @return string Parsed character data.
|
||||
*/
|
||||
public function parseData($string)
|
||||
public function parseData($string, $is_attr, $config)
|
||||
{
|
||||
// following functions require at least one character
|
||||
if ($string === '') {
|
||||
@ -209,7 +212,15 @@ class HTMLPurifier_Lexer
|
||||
}
|
||||
|
||||
// hmm... now we have some uncommon entities. Use the callback.
|
||||
$string = $this->_entity_parser->substituteSpecialEntities($string);
|
||||
if ($config->get('Core.LegacyEntityDecoder')) {
|
||||
$string = $this->_entity_parser->substituteSpecialEntities($string);
|
||||
} else {
|
||||
if ($is_attr) {
|
||||
$string = $this->_entity_parser->substituteAttrEntities($string);
|
||||
} else {
|
||||
$string = $this->_entity_parser->substituteTextEntities($string);
|
||||
}
|
||||
}
|
||||
return $string;
|
||||
}
|
||||
|
||||
@ -323,7 +334,9 @@ class HTMLPurifier_Lexer
|
||||
}
|
||||
|
||||
// expand entities that aren't the big five
|
||||
$html = $this->_entity_parser->substituteNonSpecialEntities($html);
|
||||
if ($config->get('Core.LegacyEntityDecoder')) {
|
||||
$html = $this->_entity_parser->substituteNonSpecialEntities($html);
|
||||
}
|
||||
|
||||
// clean into wellformed UTF-8 string for an SGML context: this has
|
||||
// to be done after entity expansion because the entities sometimes
|
||||
|
@ -77,14 +77,14 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
||||
|
||||
$div = $body->getElementsByTagName('div')->item(0); // <div>
|
||||
$tokens = array();
|
||||
$this->tokenizeDOM($div, $tokens);
|
||||
$this->tokenizeDOM($div, $tokens, $config);
|
||||
// If the div has a sibling, that means we tripped across
|
||||
// a premature </div> tag. So remove the div we parsed,
|
||||
// and then tokenize the rest of body. We can't tokenize
|
||||
// the sibling directly as we'll lose the tags in that case.
|
||||
if ($div->nextSibling) {
|
||||
$body->removeChild($div);
|
||||
$this->tokenizeDOM($body, $tokens);
|
||||
$this->tokenizeDOM($body, $tokens, $config);
|
||||
}
|
||||
return $tokens;
|
||||
}
|
||||
@ -96,7 +96,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
||||
* @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
|
||||
* @return HTMLPurifier_Token of node appended to previously passed tokens.
|
||||
*/
|
||||
protected function tokenizeDOM($node, &$tokens)
|
||||
protected function tokenizeDOM($node, &$tokens, $config)
|
||||
{
|
||||
$level = 0;
|
||||
$nodes = array($level => new HTMLPurifier_Queue(array($node)));
|
||||
@ -105,7 +105,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
||||
while (!$nodes[$level]->isEmpty()) {
|
||||
$node = $nodes[$level]->shift(); // FIFO
|
||||
$collect = $level > 0 ? true : false;
|
||||
$needEndingTag = $this->createStartNode($node, $tokens, $collect);
|
||||
$needEndingTag = $this->createStartNode($node, $tokens, $collect, $config);
|
||||
if ($needEndingTag) {
|
||||
$closingNodes[$level][] = $node;
|
||||
}
|
||||
@ -135,7 +135,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
||||
* @return bool if the token needs an endtoken
|
||||
* @todo data and tagName properties don't seem to exist in DOMNode?
|
||||
*/
|
||||
protected function createStartNode($node, &$tokens, $collect)
|
||||
protected function createStartNode($node, &$tokens, $collect, $config)
|
||||
{
|
||||
// intercept non element nodes. WE MUST catch all of them,
|
||||
// but we're not getting the character reference nodes because
|
||||
@ -159,7 +159,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
||||
}
|
||||
}
|
||||
}
|
||||
$tokens[] = $this->factory->createText($this->parseData($data));
|
||||
$tokens[] = $this->factory->createText($this->parseText($data, $config));
|
||||
return false;
|
||||
} elseif ($node->nodeType === XML_COMMENT_NODE) {
|
||||
// this is code is only invoked for comments in script/style in versions
|
||||
|
@ -129,12 +129,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
// We are not inside tag and there still is another tag to parse
|
||||
$token = new
|
||||
HTMLPurifier_Token_Text(
|
||||
$this->parseData(
|
||||
$this->parseText(
|
||||
substr(
|
||||
$html,
|
||||
$cursor,
|
||||
$position_next_lt - $cursor
|
||||
)
|
||||
), $config
|
||||
)
|
||||
);
|
||||
if ($maintain_line_numbers) {
|
||||
@ -154,11 +154,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
// Create Text of rest of string
|
||||
$token = new
|
||||
HTMLPurifier_Token_Text(
|
||||
$this->parseData(
|
||||
$this->parseText(
|
||||
substr(
|
||||
$html,
|
||||
$cursor
|
||||
)
|
||||
), $config
|
||||
)
|
||||
);
|
||||
if ($maintain_line_numbers) {
|
||||
@ -324,8 +324,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
$token = new
|
||||
HTMLPurifier_Token_Text(
|
||||
'<' .
|
||||
$this->parseData(
|
||||
substr($html, $cursor)
|
||||
$this->parseText(
|
||||
substr($html, $cursor), $config
|
||||
)
|
||||
);
|
||||
if ($maintain_line_numbers) {
|
||||
@ -429,7 +429,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
if ($value === false) {
|
||||
$value = '';
|
||||
}
|
||||
return array($key => $this->parseData($value));
|
||||
return array($key => $this->parseAttr($value, $config));
|
||||
}
|
||||
|
||||
// setup loop environment
|
||||
@ -518,7 +518,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
if ($value === false) {
|
||||
$value = '';
|
||||
}
|
||||
$array[$key] = $this->parseData($value);
|
||||
$array[$key] = $this->parseAttr($value, $config);
|
||||
$cursor++;
|
||||
} else {
|
||||
// boolattr
|
||||
|
@ -36,7 +36,7 @@ class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
|
||||
$doc->getElementsByTagName('html')->item(0)-> // <html>
|
||||
getElementsByTagName('body')->item(0) // <body>
|
||||
,
|
||||
$tokens
|
||||
$tokens, $config
|
||||
);
|
||||
return $tokens;
|
||||
}
|
||||
@ -1515,6 +1515,7 @@ class HTML5
|
||||
// Consume the maximum number of characters possible, with the
|
||||
// consumed characters case-sensitively matching one of the
|
||||
// identifiers in the first column of the entities table.
|
||||
|
||||
$e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
|
||||
$len = strlen($e_name);
|
||||
|
||||
@ -1547,7 +1548,7 @@ class HTML5
|
||||
|
||||
// Return a character token for the character corresponding to the
|
||||
// entity name (as given by the second column of the entities table).
|
||||
return html_entity_decode('&' . $entity . ';', ENT_QUOTES, 'UTF-8');
|
||||
return html_entity_decode('&' . rtrim($entity, ';') . ';', ENT_QUOTES, 'UTF-8');
|
||||
}
|
||||
|
||||
private function emitToken($token)
|
||||
|
@ -46,11 +46,11 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
||||
|
||||
// HTMLPurifier_Lexer->parseData() -----------------------------------------
|
||||
|
||||
public function assertParseData($input, $expect = true)
|
||||
public function assertParseData($input, $expect = true, $is_attr = false)
|
||||
{
|
||||
if ($expect === true) $expect = $input;
|
||||
$lexer = new HTMLPurifier_Lexer();
|
||||
$this->assertIdentical($expect, $lexer->parseData($input));
|
||||
$this->assertIdentical($expect, $lexer->parseData($input, $is_attr, $this->config));
|
||||
}
|
||||
|
||||
public function test_parseData_plainText()
|
||||
@ -95,7 +95,58 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
||||
|
||||
public function test_parseData_improperEntityFaultToleranceTest()
|
||||
{
|
||||
$this->assertParseData('-');
|
||||
$this->assertParseData('-', '-');
|
||||
}
|
||||
|
||||
public function test_parseData_noTrailingSemi()
|
||||
{
|
||||
$this->assertParseData('&A', '&A');
|
||||
}
|
||||
|
||||
public function test_parseData_noTrailingSemiAttr()
|
||||
{
|
||||
$this->assertParseData('&A', '&A', true);
|
||||
}
|
||||
|
||||
public function test_parseData_T119()
|
||||
{
|
||||
$this->assertParseData('&A', '&A', true);
|
||||
}
|
||||
|
||||
public function test_parseData_T119b()
|
||||
{
|
||||
$this->assertParseData('&trade=', true, true);
|
||||
}
|
||||
|
||||
public function test_parseData_legacy1()
|
||||
{
|
||||
$this->config->set('Core.LegacyEntityDecoder', true);
|
||||
$this->assertParseData('&a', true);
|
||||
$this->assertParseData('&=', "&=");
|
||||
$this->assertParseData('&a', true, true);
|
||||
$this->assertParseData('&=', "&=", true);
|
||||
$this->assertParseData('<a', true);
|
||||
$this->assertParseData('<=', "<=");
|
||||
$this->assertParseData('<a', true, true);
|
||||
$this->assertParseData('<=', "<=", true);
|
||||
}
|
||||
|
||||
public function test_parseData_nonlegacy1()
|
||||
{
|
||||
$this->assertParseData('&a', "&a");
|
||||
$this->assertParseData('&=', "&=");
|
||||
$this->assertParseData('&a', true, true);
|
||||
$this->assertParseData('&=', true, true);
|
||||
$this->assertParseData('<a', "<a");
|
||||
$this->assertParseData('<=', "<=");
|
||||
$this->assertParseData('<a', true, true);
|
||||
$this->assertParseData('<=', true, true);
|
||||
$this->assertParseData('<a;', "<a;");
|
||||
}
|
||||
|
||||
public function test_parseData_noTrailingSemiNever()
|
||||
{
|
||||
$this->assertParseData('&imath');
|
||||
}
|
||||
|
||||
// HTMLPurifier_Lexer->extractBody() ---------------------------------------
|
||||
|
Loading…
Reference in New Issue
Block a user