diff --git a/library/HTMLPurifier/ConfigSchema/schema.ser b/library/HTMLPurifier/ConfigSchema/schema.ser
index 40b870b3..df8c5c46 100644
Binary files a/library/HTMLPurifier/ConfigSchema/schema.ser and b/library/HTMLPurifier/ConfigSchema/schema.ser differ
diff --git a/library/HTMLPurifier/ConfigSchema/schema/Core.LegacyEntityDecoder.txt b/library/HTMLPurifier/ConfigSchema/schema/Core.LegacyEntityDecoder.txt
new file mode 100644
index 00000000..392b4364
--- /dev/null
+++ b/library/HTMLPurifier/ConfigSchema/schema/Core.LegacyEntityDecoder.txt
@@ -0,0 +1,36 @@
+Core.LegacyEntityDecoder
+TYPE: bool
+VERSION: 4.9.0
+DEFAULT: false
+--DESCRIPTION--
+
+ Prior to HTML Purifier 4.9.0, entities were decoded by performing
+ a global search replace for all entities whose decoded versions
+ did not have special meanings under HTML, and replaced them with
+ their decoded versions. We would match all entities, even if they did
+ not have a trailing semicolon, but only if there weren't any trailing
+ alphanumeric characters.
+
+
+Original | Text | Attribute |
+¥ | ¥ | ¥ |
+¥ | ¥ | ¥ |
+¥a | ¥a | ¥a |
+¥= | ¥= | ¥= |
+
+
+ In HTML Purifier 4.9.0, we changed the behavior of entity parsing
+ to match entities that had missing trailing semicolons in less
+ cases, to more closely match HTML5 parsing behavior:
+
+
+Original | Text | Attribute |
+¥ | ¥ | ¥ |
+¥ | ¥ | ¥ |
+¥a | ¥a | ¥a |
+¥= | ¥= | ¥= |
+
+
+ This flag reverts back to pre-HTML Purifier 4.9.0 behavior.
+
+--# vim: et sw=4 sts=4
diff --git a/library/HTMLPurifier/EntityParser.php b/library/HTMLPurifier/EntityParser.php
index 61529dcd..50e07bb2 100644
--- a/library/HTMLPurifier/EntityParser.php
+++ b/library/HTMLPurifier/EntityParser.php
@@ -16,6 +16,138 @@ class HTMLPurifier_EntityParser
*/
protected $_entity_lookup;
+ /**
+ * Callback regex string for entities in text.
+ * @type string
+ */
+ protected $_textEntitiesRegex;
+
+ /**
+ * Callback regex string for entities in attributes.
+ * @type string
+ */
+ protected $_attrEntitiesRegex;
+
+ /**
+ * Tests if the beginning of a string is a semi-optional regex
+ */
+ protected $_semiOptionalPrefixRegex;
+
+ public function __construct() {
+ // From
+ // http://stackoverflow.com/questions/15532252/why-is-reg-being-rendered-as-without-the-bounding-semicolon
+ $semi_optional = "quot|QUOT|lt|LT|gt|GT|amp|AMP|AElig|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|Iacute|Icirc|Igrave|Iuml|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml";
+
+ // NB: three empty captures to put the fourth match in the right
+ // place
+ $this->_semiOptionalPrefixRegex = "/&()()()($semi_optional)/";
+
+ $this->_textEntitiesRegex =
+ '/&(?:'.
+ // hex
+ '[#]x([a-fA-F0-9]+);?|'.
+ // dec
+ '[#]0*(\d+);?|'.
+ // string (mandatory semicolon)
+ // NB: order matters: match semicolon preferentially
+ '([A-Za-z_:][A-Za-z0-9.\-_:]*);|'.
+ // string (optional semicolon)
+ "($semi_optional)".
+ ')/';
+
+ $this->_attrEntitiesRegex =
+ '/&(?:'.
+ // hex
+ '[#]x([a-fA-F0-9]+);?|'.
+ // dec
+ '[#]0*(\d+);?|'.
+ // string (mandatory semicolon)
+ // NB: order matters: match semicolon preferentially
+ '([A-Za-z_:][A-Za-z0-9.\-_:]*);|'.
+ // string (optional semicolon)
+ // don't match if trailing is equals or alphanumeric (URL
+ // like)
+ "($semi_optional)(?![=;A-Za-z0-9])".
+ ')/';
+
+ }
+
+ /**
+ * Substitute entities with the parsed equivalents. Use this on
+ * textual data in an HTML document (as opposed to attributes.)
+ *
+ * @param string $string String to have entities parsed.
+ * @return string Parsed string.
+ */
+ public function substituteTextEntities($string)
+ {
+ return preg_replace_callback(
+ $this->_textEntitiesRegex,
+ array($this, 'entityCallback'),
+ $string
+ );
+ }
+
+ /**
+ * Substitute entities with the parsed equivalents. Use this on
+ * attribute contents in documents.
+ *
+ * @param string $string String to have entities parsed.
+ * @return string Parsed string.
+ */
+ public function substituteAttrEntities($string)
+ {
+ return preg_replace_callback(
+ $this->_attrEntitiesRegex,
+ array($this, 'entityCallback'),
+ $string
+ );
+ }
+
+ /**
+ * Callback function for substituteNonSpecialEntities() that does the work.
+ *
+ * @param array $matches PCRE matches array, with 0 the entire match, and
+ * either index 1, 2 or 3 set with a hex value, dec value,
+ * or string (respectively).
+ * @return string Replacement string.
+ */
+
+ protected function entityCallback($matches)
+ {
+ $entity = $matches[0];
+ $hex_part = @$matches[1];
+ $dec_part = @$matches[2];
+ $named_part = empty($matches[3]) ? @$matches[4] : $matches[3];
+ if ($hex_part) {
+ return HTMLPurifier_Encoder::unichr(hexdec($hex_part));
+ } elseif ($dec_part) {
+ return HTMLPurifier_Encoder((int) $dec_part);
+ } else {
+ if (!$this->_entity_lookup) {
+ $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
+ }
+ if (isset($this->_entity_lookup->table[$named_part])) {
+ return $this->_entity_lookup->table[$named_part];
+ } else {
+ // exact match didn't match anything, so test if
+ // any of the semicolon optional match the prefix.
+ // Test that this is an EXACT match is important to
+ // prevent infinite loop
+ if (!empty($matches[3])) {
+ return preg_replace_callback(
+ $this->_semiOptionalPrefixRegex,
+ array($this, 'entityCallback'),
+ $entity
+ );
+ }
+ return $entity;
+ }
+ }
+ }
+
+ // LEGACY CODE BELOW
+
/**
* Callback regex string for parsing entities.
* @type string
@@ -144,7 +276,7 @@ class HTMLPurifier_EntityParser
$entity;
} else {
return isset($this->_special_ent2dec[$matches[3]]) ?
- $this->_special_ent2dec[$matches[3]] :
+ $this->_special_dec2str[$this->_special_ent2dec[$matches[3]]] :
$entity;
}
}
diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php
index 44c5c659..37174eae 100644
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@@ -169,21 +169,24 @@ class HTMLPurifier_Lexer
''' => "'"
);
+ public function parseText($string, $config) {
+ return $this->parseData($string, false, $config);
+ }
+
+ public function parseAttr($string, $config) {
+ return $this->parseData($string, true, $config);
+ }
+
/**
* Parses special entities into the proper characters.
*
* This string will translate escaped versions of the special characters
* into the correct ones.
*
- * @warning
- * You should be able to treat the output of this function as
- * completely parsed, but that's only because all other entities should
- * have been handled previously in substituteNonSpecialEntities()
- *
* @param string $string String character data to be parsed.
* @return string Parsed character data.
*/
- public function parseData($string)
+ public function parseData($string, $is_attr, $config)
{
// following functions require at least one character
if ($string === '') {
@@ -209,7 +212,15 @@ class HTMLPurifier_Lexer
}
// hmm... now we have some uncommon entities. Use the callback.
- $string = $this->_entity_parser->substituteSpecialEntities($string);
+ if ($config->get('Core.LegacyEntityDecoder')) {
+ $string = $this->_entity_parser->substituteSpecialEntities($string);
+ } else {
+ if ($is_attr) {
+ $string = $this->_entity_parser->substituteAttrEntities($string);
+ } else {
+ $string = $this->_entity_parser->substituteTextEntities($string);
+ }
+ }
return $string;
}
@@ -323,7 +334,9 @@ class HTMLPurifier_Lexer
}
// expand entities that aren't the big five
- $html = $this->_entity_parser->substituteNonSpecialEntities($html);
+ if ($config->get('Core.LegacyEntityDecoder')) {
+ $html = $this->_entity_parser->substituteNonSpecialEntities($html);
+ }
// clean into wellformed UTF-8 string for an SGML context: this has
// to be done after entity expansion because the entities sometimes
diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php
index 1406c506..22ab5820 100644
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@@ -77,14 +77,14 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
$div = $body->getElementsByTagName('div')->item(0); //
$tokens = array();
- $this->tokenizeDOM($div, $tokens);
+ $this->tokenizeDOM($div, $tokens, $config);
// If the div has a sibling, that means we tripped across
// a premature
tag. So remove the div we parsed,
// and then tokenize the rest of body. We can't tokenize
// the sibling directly as we'll lose the tags in that case.
if ($div->nextSibling) {
$body->removeChild($div);
- $this->tokenizeDOM($body, $tokens);
+ $this->tokenizeDOM($body, $tokens, $config);
}
return $tokens;
}
@@ -96,7 +96,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
* @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
* @return HTMLPurifier_Token of node appended to previously passed tokens.
*/
- protected function tokenizeDOM($node, &$tokens)
+ protected function tokenizeDOM($node, &$tokens, $config)
{
$level = 0;
$nodes = array($level => new HTMLPurifier_Queue(array($node)));
@@ -105,7 +105,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
while (!$nodes[$level]->isEmpty()) {
$node = $nodes[$level]->shift(); // FIFO
$collect = $level > 0 ? true : false;
- $needEndingTag = $this->createStartNode($node, $tokens, $collect);
+ $needEndingTag = $this->createStartNode($node, $tokens, $collect, $config);
if ($needEndingTag) {
$closingNodes[$level][] = $node;
}
@@ -135,7 +135,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
* @return bool if the token needs an endtoken
* @todo data and tagName properties don't seem to exist in DOMNode?
*/
- protected function createStartNode($node, &$tokens, $collect)
+ protected function createStartNode($node, &$tokens, $collect, $config)
{
// intercept non element nodes. WE MUST catch all of them,
// but we're not getting the character reference nodes because
@@ -159,7 +159,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
}
}
}
- $tokens[] = $this->factory->createText($this->parseData($data));
+ $tokens[] = $this->factory->createText($this->parseText($data, $config));
return false;
} elseif ($node->nodeType === XML_COMMENT_NODE) {
// this is code is only invoked for comments in script/style in versions
diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php
index 746b6e31..6f130896 100644
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@@ -129,12 +129,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// We are not inside tag and there still is another tag to parse
$token = new
HTMLPurifier_Token_Text(
- $this->parseData(
+ $this->parseText(
substr(
$html,
$cursor,
$position_next_lt - $cursor
- )
+ ), $config
)
);
if ($maintain_line_numbers) {
@@ -154,11 +154,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// Create Text of rest of string
$token = new
HTMLPurifier_Token_Text(
- $this->parseData(
+ $this->parseText(
substr(
$html,
$cursor
- )
+ ), $config
)
);
if ($maintain_line_numbers) {
@@ -324,8 +324,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
$token = new
HTMLPurifier_Token_Text(
'<' .
- $this->parseData(
- substr($html, $cursor)
+ $this->parseText(
+ substr($html, $cursor), $config
)
);
if ($maintain_line_numbers) {
@@ -429,7 +429,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
if ($value === false) {
$value = '';
}
- return array($key => $this->parseData($value));
+ return array($key => $this->parseAttr($value, $config));
}
// setup loop environment
@@ -518,7 +518,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
if ($value === false) {
$value = '';
}
- $array[$key] = $this->parseData($value);
+ $array[$key] = $this->parseAttr($value, $config);
$cursor++;
} else {
// boolattr
diff --git a/library/HTMLPurifier/Lexer/PH5P.php b/library/HTMLPurifier/Lexer/PH5P.php
index 39a677da..0b452d17 100644
--- a/library/HTMLPurifier/Lexer/PH5P.php
+++ b/library/HTMLPurifier/Lexer/PH5P.php
@@ -36,7 +36,7 @@ class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
$doc->getElementsByTagName('html')->item(0)-> //
getElementsByTagName('body')->item(0) //
,
- $tokens
+ $tokens, $config
);
return $tokens;
}
@@ -1515,6 +1515,7 @@ class HTML5
// Consume the maximum number of characters possible, with the
// consumed characters case-sensitively matching one of the
// identifiers in the first column of the entities table.
+
$e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
$len = strlen($e_name);
@@ -1547,7 +1548,7 @@ class HTML5
// Return a character token for the character corresponding to the
// entity name (as given by the second column of the entities table).
- return html_entity_decode('&' . $entity . ';', ENT_QUOTES, 'UTF-8');
+ return html_entity_decode('&' . rtrim($entity, ';') . ';', ENT_QUOTES, 'UTF-8');
}
private function emitToken($token)
diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php
index e28dc9e9..12b46dd4 100644
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@@ -46,11 +46,11 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
// HTMLPurifier_Lexer->parseData() -----------------------------------------
- public function assertParseData($input, $expect = true)
+ public function assertParseData($input, $expect = true, $is_attr = false)
{
if ($expect === true) $expect = $input;
$lexer = new HTMLPurifier_Lexer();
- $this->assertIdentical($expect, $lexer->parseData($input));
+ $this->assertIdentical($expect, $lexer->parseData($input, $is_attr, $this->config));
}
public function test_parseData_plainText()
@@ -95,7 +95,58 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
public function test_parseData_improperEntityFaultToleranceTest()
{
- $this->assertParseData('-');
+ $this->assertParseData('-', '-');
+ }
+
+ public function test_parseData_noTrailingSemi()
+ {
+ $this->assertParseData('&A', '&A');
+ }
+
+ public function test_parseData_noTrailingSemiAttr()
+ {
+ $this->assertParseData('&A', '&A', true);
+ }
+
+ public function test_parseData_T119()
+ {
+ $this->assertParseData('&A', '&A', true);
+ }
+
+ public function test_parseData_T119b()
+ {
+ $this->assertParseData('&trade=', true, true);
+ }
+
+ public function test_parseData_legacy1()
+ {
+ $this->config->set('Core.LegacyEntityDecoder', true);
+ $this->assertParseData('&a', true);
+ $this->assertParseData('&=', "&=");
+ $this->assertParseData('&a', true, true);
+ $this->assertParseData('&=', "&=", true);
+ $this->assertParseData('<a', true);
+ $this->assertParseData('<=', "<=");
+ $this->assertParseData('<a', true, true);
+ $this->assertParseData('<=', "<=", true);
+ }
+
+ public function test_parseData_nonlegacy1()
+ {
+ $this->assertParseData('&a', "&a");
+ $this->assertParseData('&=', "&=");
+ $this->assertParseData('&a', true, true);
+ $this->assertParseData('&=', true, true);
+ $this->assertParseData('<a', "assertParseData('<=', "<=");
+ $this->assertParseData('<a', true, true);
+ $this->assertParseData('<=', true, true);
+ $this->assertParseData('<a;', "assertParseData('&imath');
}
// HTMLPurifier_Lexer->extractBody() ---------------------------------------