mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2024-12-22 16:31:53 +00:00
Revamp entity decoding to be more like HTML5.
See %Core.LegacyEntityDecoder for more details. Signed-off-by: Edward Z. Yang <ezyang@cs.stanford.edu>
This commit is contained in:
parent
66bbae73a9
commit
7e11c271b9
4
NEWS
4
NEWS
@ -32,6 +32,10 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
|||||||
rest of the text in case it ran off the end. (#78)
|
rest of the text in case it ran off the end. (#78)
|
||||||
- Fix PREG_BACKTRACK_LIMIT_ERROR in HTMLPurifier_Filter_ExtractStyle.
|
- Fix PREG_BACKTRACK_LIMIT_ERROR in HTMLPurifier_Filter_ExtractStyle.
|
||||||
Thanks @breathbath for contributing the report and fix (#120)
|
Thanks @breathbath for contributing the report and fix (#120)
|
||||||
|
- Fix entity decoding algorithm to be more conservative about
|
||||||
|
decoding entities that are missing trailing semicolon.
|
||||||
|
To get old behavior, set %Core.LegacyEntityDecoder to true.
|
||||||
|
(#119)
|
||||||
# By default, when a link has a target attribute associated
|
# By default, when a link has a target attribute associated
|
||||||
with it, we now also add rel="noopener" in order to
|
with it, we now also add rel="noopener" in order to
|
||||||
prevent the new window from being able to overwrite
|
prevent the new window from being able to overwrite
|
||||||
|
@ -6,7 +6,7 @@
|
|||||||
</file>
|
</file>
|
||||||
<file name="HTMLPurifier/Lexer.php">
|
<file name="HTMLPurifier/Lexer.php">
|
||||||
<line>85</line>
|
<line>85</line>
|
||||||
<line>315</line>
|
<line>322</line>
|
||||||
</file>
|
</file>
|
||||||
<file name="HTMLPurifier/Lexer/DirectLex.php">
|
<file name="HTMLPurifier/Lexer/DirectLex.php">
|
||||||
<line>67</line>
|
<line>67</line>
|
||||||
@ -124,7 +124,7 @@
|
|||||||
<line>122</line>
|
<line>122</line>
|
||||||
</file>
|
</file>
|
||||||
<file name="HTMLPurifier/Lexer.php">
|
<file name="HTMLPurifier/Lexer.php">
|
||||||
<line>297</line>
|
<line>304</line>
|
||||||
</file>
|
</file>
|
||||||
</directive>
|
</directive>
|
||||||
<directive id="Output.Newline">
|
<directive id="Output.Newline">
|
||||||
@ -172,7 +172,7 @@
|
|||||||
<line>234</line>
|
<line>234</line>
|
||||||
</file>
|
</file>
|
||||||
<file name="HTMLPurifier/Lexer.php">
|
<file name="HTMLPurifier/Lexer.php">
|
||||||
<line>302</line>
|
<line>309</line>
|
||||||
</file>
|
</file>
|
||||||
<file name="HTMLPurifier/HTMLModule/Image.php">
|
<file name="HTMLPurifier/HTMLModule/Image.php">
|
||||||
<line>37</line>
|
<line>37</line>
|
||||||
@ -262,12 +262,12 @@
|
|||||||
</directive>
|
</directive>
|
||||||
<directive id="Core.ConvertDocumentToFragment">
|
<directive id="Core.ConvertDocumentToFragment">
|
||||||
<file name="HTMLPurifier/Lexer.php">
|
<file name="HTMLPurifier/Lexer.php">
|
||||||
<line>313</line>
|
<line>320</line>
|
||||||
</file>
|
</file>
|
||||||
</directive>
|
</directive>
|
||||||
<directive id="Core.RemoveProcessingInstructions">
|
<directive id="Core.RemoveProcessingInstructions">
|
||||||
<file name="HTMLPurifier/Lexer.php">
|
<file name="HTMLPurifier/Lexer.php">
|
||||||
<line>334</line>
|
<line>343</line>
|
||||||
</file>
|
</file>
|
||||||
</directive>
|
</directive>
|
||||||
<directive id="URI.">
|
<directive id="URI.">
|
||||||
@ -444,12 +444,12 @@
|
|||||||
</directive>
|
</directive>
|
||||||
<directive id="Filter.ExtractStyleBlocks.Scope">
|
<directive id="Filter.ExtractStyleBlocks.Scope">
|
||||||
<file name="HTMLPurifier/Filter/ExtractStyleBlocks.php">
|
<file name="HTMLPurifier/Filter/ExtractStyleBlocks.php">
|
||||||
<line>122</line>
|
<line>125</line>
|
||||||
</file>
|
</file>
|
||||||
</directive>
|
</directive>
|
||||||
<directive id="Filter.ExtractStyleBlocks.Escaping">
|
<directive id="Filter.ExtractStyleBlocks.Escaping">
|
||||||
<file name="HTMLPurifier/Filter/ExtractStyleBlocks.php">
|
<file name="HTMLPurifier/Filter/ExtractStyleBlocks.php">
|
||||||
<line>327</line>
|
<line>330</line>
|
||||||
</file>
|
</file>
|
||||||
</directive>
|
</directive>
|
||||||
<directive id="HTML.SafeIframe">
|
<directive id="HTML.SafeIframe">
|
||||||
|
Binary file not shown.
@ -0,0 +1,36 @@
|
|||||||
|
Core.LegacyEntityDecoder
|
||||||
|
TYPE: bool
|
||||||
|
VERSION: 4.9.0
|
||||||
|
DEFAULT: false
|
||||||
|
--DESCRIPTION--
|
||||||
|
<p>
|
||||||
|
Prior to HTML Purifier 4.9.0, entities were decoded by performing
|
||||||
|
a global search replace for all entities whose decoded versions
|
||||||
|
did not have special meanings under HTML, and replaced them with
|
||||||
|
their decoded versions. We would match all entities, even if they did
|
||||||
|
not have a trailing semicolon, but only if there weren't any trailing
|
||||||
|
alphanumeric characters.
|
||||||
|
</p>
|
||||||
|
<table>
|
||||||
|
<tr><th>Original</th><th>Text</th><th>Attribute</th></tr>
|
||||||
|
<tr><td>&yen;</td><td>¥</td><td>¥</td></tr>
|
||||||
|
<tr><td>&yen</td><td>¥</td><td>¥</td></tr>
|
||||||
|
<tr><td>&yena</td><td>&yena</td><td>&yena</td></tr>
|
||||||
|
<tr><td>&yen=</td><td>¥=</td><td>¥=</td></tr>
|
||||||
|
</table>
|
||||||
|
<p>
|
||||||
|
In HTML Purifier 4.9.0, we changed the behavior of entity parsing
|
||||||
|
to match entities that had missing trailing semicolons in fewer
|
||||||
|
cases, to more closely match HTML5 parsing behavior:
|
||||||
|
</p>
|
||||||
|
<table>
|
||||||
|
<tr><th>Original</th><th>Text</th><th>Attribute</th></tr>
|
||||||
|
<tr><td>&yen;</td><td>¥</td><td>¥</td></tr>
|
||||||
|
<tr><td>&yen</td><td>¥</td><td>¥</td></tr>
|
||||||
|
<tr><td>&yena</td><td>¥a</td><td>&yena</td></tr>
|
||||||
|
<tr><td>&yen=</td><td>¥=</td><td>&yen=</td></tr>
|
||||||
|
</table>
|
||||||
|
<p>
|
||||||
|
This flag reverts back to pre-HTML Purifier 4.9.0 behavior.
|
||||||
|
</p>
|
||||||
|
--# vim: et sw=4 sts=4
|
@ -16,6 +16,138 @@ class HTMLPurifier_EntityParser
|
|||||||
*/
|
*/
|
||||||
protected $_entity_lookup;
|
protected $_entity_lookup;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Callback regex string for entities in text.
|
||||||
|
* @type string
|
||||||
|
*/
|
||||||
|
protected $_textEntitiesRegex;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Callback regex string for entities in attributes.
|
||||||
|
* @type string
|
||||||
|
*/
|
||||||
|
protected $_attrEntitiesRegex;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Regex string matching an ampersand followed by a semicolon-optional entity name.
|
||||||
|
*/
|
||||||
|
protected $_semiOptionalPrefixRegex;
|
||||||
|
|
||||||
|
public function __construct()
{
    // Entity names for which the trailing semicolon is optional. From
    // http://stackoverflow.com/questions/15532252/why-is-reg-being-rendered-as-without-the-bounding-semicolon
    $semi_optional = "quot|QUOT|lt|LT|gt|GT|amp|AMP|AElig|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|Iacute|Icirc|Igrave|Iuml|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml";

    // The first three alternatives are identical for text and attributes:
    //   group 1: hexadecimal numeric reference
    //   group 2: decimal numeric reference (leading zeroes skipped)
    //   group 3: named reference with a mandatory semicolon
    // NB: order matters: the semicolon-terminated name must be tried
    // before the semicolon-optional name (group 4) appended below.
    $shared_alternatives =
        '[#]x([a-fA-F0-9]+);?|' .
        '[#]0*(\d+);?|' .
        '([A-Za-z_:][A-Za-z0-9.\-_:]*);|';

    // NB: three empty captures to put the fourth match in the right
    // place
    $this->_semiOptionalPrefixRegex = "/&()()()($semi_optional)/";

    // Text: a semicolon-optional name matches regardless of what follows.
    $this->_textEntitiesRegex =
        '/&(?:' . $shared_alternatives . "($semi_optional)" . ')/';

    // Attributes: don't match a semicolon-optional name if the trailing
    // character is an equals sign, semicolon or alphanumeric (URL like).
    $this->_attrEntitiesRegex =
        '/&(?:' . $shared_alternatives . "($semi_optional)(?![=;A-Za-z0-9])" . ')/';
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Substitute entities with the parsed equivalents. Use this on
|
||||||
|
* textual data in an HTML document (as opposed to attributes.)
|
||||||
|
*
|
||||||
|
* @param string $string String to have entities parsed.
|
||||||
|
* @return string Parsed string.
|
||||||
|
*/
|
||||||
|
public function substituteTextEntities($string)
{
    // Every match of the text-context regex is handed to entityCallback(),
    // whose return value replaces the matched entity.
    $callback = array($this, 'entityCallback');
    $decoded = preg_replace_callback($this->_textEntitiesRegex, $callback, $string);
    return $decoded;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Substitute entities with the parsed equivalents. Use this on
|
||||||
|
* attribute contents in documents.
|
||||||
|
*
|
||||||
|
* @param string $string String to have entities parsed.
|
||||||
|
* @return string Parsed string.
|
||||||
|
*/
|
||||||
|
public function substituteAttrEntities($string)
{
    // Same replacement callback as for text, but driven by the
    // attribute-context regex.
    $callback = array($this, 'entityCallback');
    $decoded = preg_replace_callback($this->_attrEntitiesRegex, $callback, $string);
    return $decoded;
}
|
||||||
|
|
||||||
|
/**
 * Callback function for substituteTextEntities() and
 * substituteAttrEntities() that does the work.
 *
 * @param array $matches  PCRE matches array, with 0 the entire match, and
 *                  index 1, 2, 3 or 4 set with a hex value, a dec value,
 *                  a semicolon-terminated name, or a semicolon-optional
 *                  name (respectively).
 * @return string Replacement string; the original entity text is
 *                  returned unchanged when it cannot be decoded.
 */
protected function entityCallback($matches)
{
    $entity = $matches[0];
    // Capture groups that did not participate in the match may be absent
    // from $matches, so default each one instead of suppressing notices.
    $hex_part = isset($matches[1]) ? $matches[1] : '';
    $dec_part = isset($matches[2]) ? $matches[2] : '';
    $strict_name = isset($matches[3]) ? $matches[3] : '';
    $loose_name = isset($matches[4]) ? $matches[4] : '';
    $named_part = empty($strict_name) ? $loose_name : $strict_name;

    // Truthiness (not emptiness) is tested on purpose: a captured "0"
    // falls through and the entity is ultimately returned as-is.
    if ($hex_part) {
        return HTMLPurifier_Encoder::unichr(hexdec($hex_part));
    }
    if ($dec_part) {
        // The original called HTMLPurifier_Encoder(...) as a function,
        // which is a fatal error; the decoder is the static unichr().
        return HTMLPurifier_Encoder::unichr((int) $dec_part);
    }

    if (!$this->_entity_lookup) {
        $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
    }
    if (isset($this->_entity_lookup->table[$named_part])) {
        return $this->_entity_lookup->table[$named_part];
    }

    // exact match didn't match anything, so test if
    // any of the semicolon optional match the prefix.
    // Test that this is an EXACT match is important to
    // prevent infinite loop: the prefix regex only ever fills
    // group 4, so the nested callback cannot reach this branch.
    if (!empty($strict_name)) {
        return preg_replace_callback(
            $this->_semiOptionalPrefixRegex,
            array($this, 'entityCallback'),
            $entity
        );
    }
    return $entity;
}
|
||||||
|
|
||||||
|
// LEGACY CODE BELOW
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Callback regex string for parsing entities.
|
* Callback regex string for parsing entities.
|
||||||
* @type string
|
* @type string
|
||||||
@ -144,7 +276,7 @@ class HTMLPurifier_EntityParser
|
|||||||
$entity;
|
$entity;
|
||||||
} else {
|
} else {
|
||||||
return isset($this->_special_ent2dec[$matches[3]]) ?
|
return isset($this->_special_ent2dec[$matches[3]]) ?
|
||||||
$this->_special_ent2dec[$matches[3]] :
|
$this->_special_dec2str[$this->_special_ent2dec[$matches[3]]] :
|
||||||
$entity;
|
$entity;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -169,21 +169,24 @@ class HTMLPurifier_Lexer
|
|||||||
''' => "'"
|
''' => "'"
|
||||||
);
|
);
|
||||||
|
|
||||||
|
/**
 * Parses character data found in text; delegates to parseData()
 * with the attribute flag off.
 *
 * @param string $string String character data to be parsed.
 * @param mixed $config Configuration object forwarded to parseData().
 * @return string Parsed character data.
 */
public function parseText($string, $config)
{
    $is_attr = false;
    return $this->parseData($string, $is_attr, $config);
}
|
||||||
|
|
||||||
|
/**
 * Parses character data found in an attribute value; delegates to
 * parseData() with the attribute flag on.
 *
 * @param string $string String character data to be parsed.
 * @param mixed $config Configuration object forwarded to parseData().
 * @return string Parsed character data.
 */
public function parseAttr($string, $config)
{
    $is_attr = true;
    return $this->parseData($string, $is_attr, $config);
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parses special entities into the proper characters.
|
* Parses special entities into the proper characters.
|
||||||
*
|
*
|
||||||
* This string will translate escaped versions of the special characters
|
* This string will translate escaped versions of the special characters
|
||||||
* into the correct ones.
|
* into the correct ones.
|
||||||
*
|
*
|
||||||
* @warning
|
|
||||||
* You should be able to treat the output of this function as
|
|
||||||
* completely parsed, but that's only because all other entities should
|
|
||||||
* have been handled previously in substituteNonSpecialEntities()
|
|
||||||
*
|
|
||||||
* @param string $string String character data to be parsed.
|
* @param string $string String character data to be parsed.
|
||||||
* @return string Parsed character data.
|
* @return string Parsed character data.
|
||||||
*/
|
*/
|
||||||
public function parseData($string)
|
public function parseData($string, $is_attr, $config)
|
||||||
{
|
{
|
||||||
// following functions require at least one character
|
// following functions require at least one character
|
||||||
if ($string === '') {
|
if ($string === '') {
|
||||||
@ -209,7 +212,15 @@ class HTMLPurifier_Lexer
|
|||||||
}
|
}
|
||||||
|
|
||||||
// hmm... now we have some uncommon entities. Use the callback.
|
// hmm... now we have some uncommon entities. Use the callback.
|
||||||
$string = $this->_entity_parser->substituteSpecialEntities($string);
|
if ($config->get('Core.LegacyEntityDecoder')) {
|
||||||
|
$string = $this->_entity_parser->substituteSpecialEntities($string);
|
||||||
|
} else {
|
||||||
|
if ($is_attr) {
|
||||||
|
$string = $this->_entity_parser->substituteAttrEntities($string);
|
||||||
|
} else {
|
||||||
|
$string = $this->_entity_parser->substituteTextEntities($string);
|
||||||
|
}
|
||||||
|
}
|
||||||
return $string;
|
return $string;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -323,7 +334,9 @@ class HTMLPurifier_Lexer
|
|||||||
}
|
}
|
||||||
|
|
||||||
// expand entities that aren't the big five
|
// expand entities that aren't the big five
|
||||||
$html = $this->_entity_parser->substituteNonSpecialEntities($html);
|
if ($config->get('Core.LegacyEntityDecoder')) {
|
||||||
|
$html = $this->_entity_parser->substituteNonSpecialEntities($html);
|
||||||
|
}
|
||||||
|
|
||||||
// clean into wellformed UTF-8 string for an SGML context: this has
|
// clean into wellformed UTF-8 string for an SGML context: this has
|
||||||
// to be done after entity expansion because the entities sometimes
|
// to be done after entity expansion because the entities sometimes
|
||||||
|
@ -77,14 +77,14 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
|||||||
|
|
||||||
$div = $body->getElementsByTagName('div')->item(0); // <div>
|
$div = $body->getElementsByTagName('div')->item(0); // <div>
|
||||||
$tokens = array();
|
$tokens = array();
|
||||||
$this->tokenizeDOM($div, $tokens);
|
$this->tokenizeDOM($div, $tokens, $config);
|
||||||
// If the div has a sibling, that means we tripped across
|
// If the div has a sibling, that means we tripped across
|
||||||
// a premature </div> tag. So remove the div we parsed,
|
// a premature </div> tag. So remove the div we parsed,
|
||||||
// and then tokenize the rest of body. We can't tokenize
|
// and then tokenize the rest of body. We can't tokenize
|
||||||
// the sibling directly as we'll lose the tags in that case.
|
// the sibling directly as we'll lose the tags in that case.
|
||||||
if ($div->nextSibling) {
|
if ($div->nextSibling) {
|
||||||
$body->removeChild($div);
|
$body->removeChild($div);
|
||||||
$this->tokenizeDOM($body, $tokens);
|
$this->tokenizeDOM($body, $tokens, $config);
|
||||||
}
|
}
|
||||||
return $tokens;
|
return $tokens;
|
||||||
}
|
}
|
||||||
@ -96,7 +96,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
|||||||
* @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
|
* @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
|
||||||
* @return HTMLPurifier_Token of node appended to previously passed tokens.
|
* @return HTMLPurifier_Token of node appended to previously passed tokens.
|
||||||
*/
|
*/
|
||||||
protected function tokenizeDOM($node, &$tokens)
|
protected function tokenizeDOM($node, &$tokens, $config)
|
||||||
{
|
{
|
||||||
$level = 0;
|
$level = 0;
|
||||||
$nodes = array($level => new HTMLPurifier_Queue(array($node)));
|
$nodes = array($level => new HTMLPurifier_Queue(array($node)));
|
||||||
@ -105,7 +105,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
|||||||
while (!$nodes[$level]->isEmpty()) {
|
while (!$nodes[$level]->isEmpty()) {
|
||||||
$node = $nodes[$level]->shift(); // FIFO
|
$node = $nodes[$level]->shift(); // FIFO
|
||||||
$collect = $level > 0 ? true : false;
|
$collect = $level > 0 ? true : false;
|
||||||
$needEndingTag = $this->createStartNode($node, $tokens, $collect);
|
$needEndingTag = $this->createStartNode($node, $tokens, $collect, $config);
|
||||||
if ($needEndingTag) {
|
if ($needEndingTag) {
|
||||||
$closingNodes[$level][] = $node;
|
$closingNodes[$level][] = $node;
|
||||||
}
|
}
|
||||||
@ -135,7 +135,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
|||||||
* @return bool if the token needs an endtoken
|
* @return bool if the token needs an endtoken
|
||||||
* @todo data and tagName properties don't seem to exist in DOMNode?
|
* @todo data and tagName properties don't seem to exist in DOMNode?
|
||||||
*/
|
*/
|
||||||
protected function createStartNode($node, &$tokens, $collect)
|
protected function createStartNode($node, &$tokens, $collect, $config)
|
||||||
{
|
{
|
||||||
// intercept non element nodes. WE MUST catch all of them,
|
// intercept non element nodes. WE MUST catch all of them,
|
||||||
// but we're not getting the character reference nodes because
|
// but we're not getting the character reference nodes because
|
||||||
@ -159,7 +159,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
$tokens[] = $this->factory->createText($this->parseData($data));
|
$tokens[] = $this->factory->createText($this->parseText($data, $config));
|
||||||
return false;
|
return false;
|
||||||
} elseif ($node->nodeType === XML_COMMENT_NODE) {
|
} elseif ($node->nodeType === XML_COMMENT_NODE) {
|
||||||
// this is code is only invoked for comments in script/style in versions
|
// this is code is only invoked for comments in script/style in versions
|
||||||
|
@ -129,12 +129,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
// We are not inside tag and there still is another tag to parse
|
// We are not inside tag and there still is another tag to parse
|
||||||
$token = new
|
$token = new
|
||||||
HTMLPurifier_Token_Text(
|
HTMLPurifier_Token_Text(
|
||||||
$this->parseData(
|
$this->parseText(
|
||||||
substr(
|
substr(
|
||||||
$html,
|
$html,
|
||||||
$cursor,
|
$cursor,
|
||||||
$position_next_lt - $cursor
|
$position_next_lt - $cursor
|
||||||
)
|
), $config
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
if ($maintain_line_numbers) {
|
if ($maintain_line_numbers) {
|
||||||
@ -154,11 +154,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
// Create Text of rest of string
|
// Create Text of rest of string
|
||||||
$token = new
|
$token = new
|
||||||
HTMLPurifier_Token_Text(
|
HTMLPurifier_Token_Text(
|
||||||
$this->parseData(
|
$this->parseText(
|
||||||
substr(
|
substr(
|
||||||
$html,
|
$html,
|
||||||
$cursor
|
$cursor
|
||||||
)
|
), $config
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
if ($maintain_line_numbers) {
|
if ($maintain_line_numbers) {
|
||||||
@ -324,8 +324,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
$token = new
|
$token = new
|
||||||
HTMLPurifier_Token_Text(
|
HTMLPurifier_Token_Text(
|
||||||
'<' .
|
'<' .
|
||||||
$this->parseData(
|
$this->parseText(
|
||||||
substr($html, $cursor)
|
substr($html, $cursor), $config
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
if ($maintain_line_numbers) {
|
if ($maintain_line_numbers) {
|
||||||
@ -429,7 +429,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
if ($value === false) {
|
if ($value === false) {
|
||||||
$value = '';
|
$value = '';
|
||||||
}
|
}
|
||||||
return array($key => $this->parseData($value));
|
return array($key => $this->parseAttr($value, $config));
|
||||||
}
|
}
|
||||||
|
|
||||||
// setup loop environment
|
// setup loop environment
|
||||||
@ -518,7 +518,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
if ($value === false) {
|
if ($value === false) {
|
||||||
$value = '';
|
$value = '';
|
||||||
}
|
}
|
||||||
$array[$key] = $this->parseData($value);
|
$array[$key] = $this->parseAttr($value, $config);
|
||||||
$cursor++;
|
$cursor++;
|
||||||
} else {
|
} else {
|
||||||
// boolattr
|
// boolattr
|
||||||
|
@ -36,7 +36,7 @@ class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
|
|||||||
$doc->getElementsByTagName('html')->item(0)-> // <html>
|
$doc->getElementsByTagName('html')->item(0)-> // <html>
|
||||||
getElementsByTagName('body')->item(0) // <body>
|
getElementsByTagName('body')->item(0) // <body>
|
||||||
,
|
,
|
||||||
$tokens
|
$tokens, $config
|
||||||
);
|
);
|
||||||
return $tokens;
|
return $tokens;
|
||||||
}
|
}
|
||||||
@ -1515,6 +1515,7 @@ class HTML5
|
|||||||
// Consume the maximum number of characters possible, with the
|
// Consume the maximum number of characters possible, with the
|
||||||
// consumed characters case-sensitively matching one of the
|
// consumed characters case-sensitively matching one of the
|
||||||
// identifiers in the first column of the entities table.
|
// identifiers in the first column of the entities table.
|
||||||
|
|
||||||
$e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
|
$e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
|
||||||
$len = strlen($e_name);
|
$len = strlen($e_name);
|
||||||
|
|
||||||
@ -1547,7 +1548,7 @@ class HTML5
|
|||||||
|
|
||||||
// Return a character token for the character corresponding to the
|
// Return a character token for the character corresponding to the
|
||||||
// entity name (as given by the second column of the entities table).
|
// entity name (as given by the second column of the entities table).
|
||||||
return html_entity_decode('&' . $entity . ';', ENT_QUOTES, 'UTF-8');
|
return html_entity_decode('&' . rtrim($entity, ';') . ';', ENT_QUOTES, 'UTF-8');
|
||||||
}
|
}
|
||||||
|
|
||||||
private function emitToken($token)
|
private function emitToken($token)
|
||||||
|
@ -46,11 +46,11 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
|
|
||||||
// HTMLPurifier_Lexer->parseData() -----------------------------------------
|
// HTMLPurifier_Lexer->parseData() -----------------------------------------
|
||||||
|
|
||||||
public function assertParseData($input, $expect = true)
|
public function assertParseData($input, $expect = true, $is_attr = false)
|
||||||
{
|
{
|
||||||
if ($expect === true) $expect = $input;
|
if ($expect === true) $expect = $input;
|
||||||
$lexer = new HTMLPurifier_Lexer();
|
$lexer = new HTMLPurifier_Lexer();
|
||||||
$this->assertIdentical($expect, $lexer->parseData($input));
|
$this->assertIdentical($expect, $lexer->parseData($input, $is_attr, $this->config));
|
||||||
}
|
}
|
||||||
|
|
||||||
public function test_parseData_plainText()
|
public function test_parseData_plainText()
|
||||||
@ -95,7 +95,58 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
|
|
||||||
public function test_parseData_improperEntityFaultToleranceTest()
|
public function test_parseData_improperEntityFaultToleranceTest()
|
||||||
{
|
{
|
||||||
$this->assertParseData('-');
|
$this->assertParseData('-', '-');
|
||||||
|
}
|
||||||
|
|
||||||
|
public function test_parseData_noTrailingSemi()
|
||||||
|
{
|
||||||
|
$this->assertParseData('&A', '&A');
|
||||||
|
}
|
||||||
|
|
||||||
|
public function test_parseData_noTrailingSemiAttr()
|
||||||
|
{
|
||||||
|
$this->assertParseData('&A', '&A', true);
|
||||||
|
}
|
||||||
|
|
||||||
|
public function test_parseData_T119()
|
||||||
|
{
|
||||||
|
$this->assertParseData('&A', '&A', true);
|
||||||
|
}
|
||||||
|
|
||||||
|
public function test_parseData_T119b()
|
||||||
|
{
|
||||||
|
$this->assertParseData('&trade=', true, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
public function test_parseData_legacy1()
|
||||||
|
{
|
||||||
|
$this->config->set('Core.LegacyEntityDecoder', true);
|
||||||
|
$this->assertParseData('&a', true);
|
||||||
|
$this->assertParseData('&=', "&=");
|
||||||
|
$this->assertParseData('&a', true, true);
|
||||||
|
$this->assertParseData('&=', "&=", true);
|
||||||
|
$this->assertParseData('<a', true);
|
||||||
|
$this->assertParseData('<=', "<=");
|
||||||
|
$this->assertParseData('<a', true, true);
|
||||||
|
$this->assertParseData('<=', "<=", true);
|
||||||
|
}
|
||||||
|
|
||||||
|
public function test_parseData_nonlegacy1()
|
||||||
|
{
|
||||||
|
$this->assertParseData('&a', "&a");
|
||||||
|
$this->assertParseData('&=', "&=");
|
||||||
|
$this->assertParseData('&a', true, true);
|
||||||
|
$this->assertParseData('&=', true, true);
|
||||||
|
$this->assertParseData('<a', "<a");
|
||||||
|
$this->assertParseData('<=', "<=");
|
||||||
|
$this->assertParseData('<a', true, true);
|
||||||
|
$this->assertParseData('<=', true, true);
|
||||||
|
$this->assertParseData('<a;', "<a;");
|
||||||
|
}
|
||||||
|
|
||||||
|
public function test_parseData_noTrailingSemiNever()
|
||||||
|
{
|
||||||
|
$this->assertParseData('&imath');
|
||||||
}
|
}
|
||||||
|
|
||||||
// HTMLPurifier_Lexer->extractBody() ---------------------------------------
|
// HTMLPurifier_Lexer->extractBody() ---------------------------------------
|
||||||
|
Loading…
Reference in New Issue
Block a user