Revamp entity decoding to be more like HTML5.

See %Core.LegacyEntityDecoder for more details. Signed-off-by: Edward Z. Yang <ezyang@cs.stanford.edu>
2024-12-22 08:21:52 +00:00 · 2017-03-07 13:34:55 -08:00 · 2017-03-07 13:34:55 -08:00 · 7e11c271b9
commit 7e11c271b9
parent 66bbae73a9
10 changed files with 272 additions and 35 deletions
--- a/4
+++ b/4
@ -32,6 +32,10 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
  rest of the text in case it ran off the end.  (#78)
 - Fix PREG_BACKTRACK_LIMIT_ERROR in HTMLPurifier_Filter_ExtractStyle.
  Thanks @breathbath for contributing the report and fix (#120)
 - Fix entity decoding algorithm to be more conservative about
  decoding entities that are missing a trailing semicolon.
  To get old behavior, set %Core.LegacyEntityDecoder to true.
  (#119)
 # By default, when a link has a target attribute associated
  with it, we now also add rel="noopener" in order to
  prevent the new window from being able to overwrite
--- a/configdoc/usage.xml
+++ b/configdoc/usage.xml
@ -6,7 +6,7 @@
  </file>
  <file name="HTMLPurifier/Lexer.php">
   <line>85</line>
-   <line>315</line>
+   <line>322</line>
  </file>
  <file name="HTMLPurifier/Lexer/DirectLex.php">
   <line>67</line>
@ -124,7 +124,7 @@
   <line>122</line>
  </file>
  <file name="HTMLPurifier/Lexer.php">
-   <line>297</line>
+   <line>304</line>
  </file>
 </directive>
 <directive id="Output.Newline">
@ -172,7 +172,7 @@
   <line>234</line>
  </file>
  <file name="HTMLPurifier/Lexer.php">
-   <line>302</line>
+   <line>309</line>
  </file>
  <file name="HTMLPurifier/HTMLModule/Image.php">
   <line>37</line>
@ -262,12 +262,12 @@
 </directive>
 <directive id="Core.ConvertDocumentToFragment">
  <file name="HTMLPurifier/Lexer.php">
-   <line>313</line>
+   <line>320</line>
  </file>
 </directive>
 <directive id="Core.RemoveProcessingInstructions">
  <file name="HTMLPurifier/Lexer.php">
-   <line>334</line>
+   <line>343</line>
  </file>
 </directive>
 <directive id="URI.">
@ -444,12 +444,12 @@
 </directive>
 <directive id="Filter.ExtractStyleBlocks.Scope">
  <file name="HTMLPurifier/Filter/ExtractStyleBlocks.php">
-   <line>122</line>
+   <line>125</line>
  </file>
 </directive>
 <directive id="Filter.ExtractStyleBlocks.Escaping">
  <file name="HTMLPurifier/Filter/ExtractStyleBlocks.php">
-   <line>327</line>
+   <line>330</line>
  </file>
 </directive>
 <directive id="HTML.SafeIframe">
--- a/library/HTMLPurifier/ConfigSchema/schema.ser
+++ b/library/HTMLPurifier/ConfigSchema/schema.ser
--- a/library/HTMLPurifier/ConfigSchema/schema/Core.LegacyEntityDecoder.txt
+++ b/library/HTMLPurifier/ConfigSchema/schema/Core.LegacyEntityDecoder.txt
@ -0,0 +1,36 @@
 Core.LegacyEntityDecoder
 TYPE: bool
 VERSION: 4.9.0
 DEFAULT: false
 --DESCRIPTION--
 <p>
    Prior to HTML Purifier 4.9.0, entities were decoded by performing
    a global search replace for all entities whose decoded versions
    did not have special meanings under HTML, and replaced them with
    their decoded versions.  We would match all entities, even if they did
    not have a trailing semicolon, but only if there weren't any trailing
    alphanumeric characters.
 </p>
 <table>
 <tr><th>Original</th><th>Text</th><th>Attribute</th></tr>
 <tr><td>&amp;yen;</td><td>&yen;</td><td>&yen;</td></tr>
 <tr><td>&amp;yen</td><td>&yen;</td><td>&yen;</td></tr>
 <tr><td>&amp;yena</td><td>&amp;yena</td><td>&amp;yena</td></tr>
 <tr><td>&amp;yen=</td><td>&yen;=</td><td>&yen;=</td></tr>
 </table>
 <p>
    In HTML Purifier 4.9.0, we changed the behavior of entity parsing
    to match entities that had missing trailing semicolons in fewer
    cases, to more closely match HTML5 parsing behavior:
 </p>
 <table>
 <tr><th>Original</th><th>Text</th><th>Attribute</th></tr>
 <tr><td>&amp;yen;</td><td>&yen;</td><td>&yen;</td></tr>
 <tr><td>&amp;yen</td><td>&yen;</td><td>&yen;</td></tr>
 <tr><td>&amp;yena</td><td>&yen;a</td><td>&amp;yena</td></tr>
 <tr><td>&amp;yen=</td><td>&yen;=</td><td>&amp;yen=</td></tr>
 </table>
 <p>
    This flag reverts back to pre-HTML Purifier 4.9.0 behavior.
 </p>
 --# vim: et sw=4 sts=4
--- a/library/HTMLPurifier/EntityParser.php
+++ b/library/HTMLPurifier/EntityParser.php
@ -16,6 +16,138 @@ class HTMLPurifier_EntityParser
     */
    protected $_entity_lookup;
    /**
     * Callback regex string for entities in text.
     * Built in __construct() and used by substituteTextEntities().
     * @type string
     */
    protected $_textEntitiesRegex;
    /**
     * Callback regex string for entities in attributes.
     * Built in __construct() and used by substituteAttrEntities().
     * @type string
     */
    protected $_attrEntitiesRegex;
    /**
     * Regex matching "&" followed by one of the entity names whose
     * trailing semicolon is optional.  The name is captured in group 4
     * (groups 1-3 are deliberately empty) so that entityCallback() can
     * treat a match exactly like one from the two regexes above.
     * @type string
     */
    protected $_semiOptionalPrefixRegex;
    /**
     * Builds the regular expressions used for entity substitution in
     * text and in attribute values.
     */
    public function __construct()
    {
        // Names that are matched even without a trailing semicolon.
        // List taken from
        // http://stackoverflow.com/questions/15532252/why-is-reg-being-rendered-as-without-the-bounding-semicolon
        $semi_optional = "quot|QUOT|lt|LT|gt|GT|amp|AMP|AElig|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|Iacute|Icirc|Igrave|Iuml|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml";

        // Alternatives shared by both patterns.  Capture groups:
        //   1 = hex digits, 2 = decimal digits (leading zeros dropped),
        //   3 = name that is followed by a semicolon.
        // NB: order matters: the semicolon-terminated name has to be
        // tried before the semicolon-optional name (group 4) below.
        $hex = '[#]x([a-fA-F0-9]+);?|';
        $dec = '[#]0*(\d+);?|';
        $named = '([A-Za-z_:][A-Za-z0-9.\-_:]*);|';
        $shared = '/&(?:' . $hex . $dec . $named;

        // In text a semicolon-optional name always matches.
        $this->_textEntitiesRegex = $shared . "($semi_optional)" . ')/';

        // In attributes a semicolon-optional name does not match when
        // the next character is an equals sign, a semicolon or
        // alphanumeric (URL like).
        $this->_attrEntitiesRegex =
            $shared . "($semi_optional)(?![=;A-Za-z0-9])" . ')/';

        // NB: three empty captures to put the name in the fourth group,
        // the same position it has in the two patterns above.
        $this->_semiOptionalPrefixRegex = "/&()()()($semi_optional)/";
    }
    /**
     * Replaces entities found in textual data of an HTML document
     * (as opposed to attribute values) with their parsed equivalents.
     *
     * @param string $string String to have entities parsed.
     * @return string Parsed string.
     */
    public function substituteTextEntities($string)
    {
        $pattern = $this->_textEntitiesRegex;
        $handler = array($this, 'entityCallback');
        return preg_replace_callback($pattern, $handler, $string);
    }
    /**
     * Replaces entities found in attribute contents with their parsed
     * equivalents.
     *
     * @param string $string String to have entities parsed.
     * @return string Parsed string.
     */
    public function substituteAttrEntities($string)
    {
        $pattern = $this->_attrEntitiesRegex;
        $handler = array($this, 'entityCallback');
        return preg_replace_callback($pattern, $handler, $string);
    }
    /**
     * Callback function for substituteTextEntities() and
     * substituteAttrEntities() that does the work of turning one matched
     * entity into its replacement.
     *
     * @param array $matches  PCRE matches array, with 0 the entire match, and
     *                  one of index 1, 2, 3 or 4 set with a hex value, dec
     *                  value, name that was followed by a semicolon, or
     *                  semicolon-optional name (respectively).
     * @return string Replacement string; the original match when the
     *                entity cannot be resolved.
     */
    protected function entityCallback($matches)
    {
        $entity = $matches[0];
        // Groups after the last one that participated in the match are
        // absent from $matches, hence the isset() checks.
        $hex_part = isset($matches[1]) ? $matches[1] : null;
        $dec_part = isset($matches[2]) ? $matches[2] : null;
        $named_part = empty($matches[3])
            ? (isset($matches[4]) ? $matches[4] : '')
            : $matches[3];

        // NB: these are truthiness tests, so a captured "0" is not
        // handled here and falls through to the named lookup below.
        if ($hex_part) {
            return HTMLPurifier_Encoder::unichr(hexdec($hex_part));
        }
        if ($dec_part) {
            return HTMLPurifier_Encoder::unichr((int) $dec_part);
        }

        if (!$this->_entity_lookup) {
            $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        }
        if (isset($this->_entity_lookup->table[$named_part])) {
            return $this->_entity_lookup->table[$named_part];
        }

        // exact match didn't match anything, so test if
        // any of the semicolon optional match the prefix.
        // Only do this when the name came from group 3: the nested
        // callback receives the name in group 4, so it never recurses
        // again, which prevents an infinite loop.
        if (!empty($matches[3])) {
            return preg_replace_callback(
                $this->_semiOptionalPrefixRegex,
                array($this, 'entityCallback'),
                $entity
            );
        }
        return $entity;
    }
    // LEGACY CODE BELOW
    /**
     * Callback regex string for parsing entities.
     * @type string
@ -144,7 +276,7 @@ class HTMLPurifier_EntityParser
                $entity;
        } else {
            return isset($this->_special_ent2dec[$matches[3]]) ?
-                $this->_special_ent2dec[$matches[3]] :
+                $this->_special_dec2str[$this->_special_ent2dec[$matches[3]]] :
                $entity;
        }
    }
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@ -169,21 +169,24 @@ class HTMLPurifier_Lexer
            '&#x27;' => "'"
        );
    /**
     * Parses entities in character data that comes from text content.
     * Delegates to parseData() with $is_attr set to false.
     *
     * @param string $string String character data to be parsed.
     * @param mixed $config Configuration object, forwarded to parseData().
     * @return string Parsed character data.
     */
    public function parseText($string, $config)
    {
        $is_attr = false;
        return $this->parseData($string, $is_attr, $config);
    }
    /**
     * Parses entities in character data that comes from an attribute
     * value.  Delegates to parseData() with $is_attr set to true.
     *
     * @param string $string String character data to be parsed.
     * @param mixed $config Configuration object, forwarded to parseData().
     * @return string Parsed character data.
     */
    public function parseAttr($string, $config)
    {
        $is_attr = true;
        return $this->parseData($string, $is_attr, $config);
    }
    /**
     * Parses special entities into the proper characters.
     *
     * This string will translate escaped versions of the special characters
     * into the correct ones.
     *
     * @warning
     * You should be able to treat the output of this function as
     * completely parsed, but that's only because all other entities should
     * have been handled previously in substituteNonSpecialEntities()
     *
     * @param string $string String character data to be parsed.
     * @return string Parsed character data.
     */
-    public function parseData($string)
+    public function parseData($string, $is_attr, $config)
    {
        // following functions require at least one character
        if ($string === '') {
@ -209,7 +212,15 @@ class HTMLPurifier_Lexer
        }
        // hmm... now we have some uncommon entities. Use the callback.
-        $string = $this->_entity_parser->substituteSpecialEntities($string);
+        if ($config->get('Core.LegacyEntityDecoder')) {
            $string = $this->_entity_parser->substituteSpecialEntities($string);
        } else {
            if ($is_attr) {
                $string = $this->_entity_parser->substituteAttrEntities($string);
            } else {
                $string = $this->_entity_parser->substituteTextEntities($string);
            }
        }
        return $string;
    }
@ -323,7 +334,9 @@ class HTMLPurifier_Lexer
        }
        // expand entities that aren't the big five
-        $html = $this->_entity_parser->substituteNonSpecialEntities($html);
+        if ($config->get('Core.LegacyEntityDecoder')) {
            $html = $this->_entity_parser->substituteNonSpecialEntities($html);
        }
        // clean into wellformed UTF-8 string for an SGML context: this has
        // to be done after entity expansion because the entities sometimes
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@ -77,14 +77,14 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
        $div = $body->getElementsByTagName('div')->item(0); // <div>
        $tokens = array();
-        $this->tokenizeDOM($div, $tokens);
+        $this->tokenizeDOM($div, $tokens, $config);
        // If the div has a sibling, that means we tripped across
        // a premature </div> tag.  So remove the div we parsed,
        // and then tokenize the rest of body.  We can't tokenize
        // the sibling directly as we'll lose the tags in that case.
        if ($div->nextSibling) {
            $body->removeChild($div);
-            $this->tokenizeDOM($body, $tokens);
+            $this->tokenizeDOM($body, $tokens, $config);
        }
        return $tokens;
    }
@ -96,7 +96,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
     * @param HTMLPurifier_Token[] $tokens   Array-list of already tokenized tokens.
     * @return HTMLPurifier_Token of node appended to previously passed tokens.
     */
-    protected function tokenizeDOM($node, &$tokens)
+    protected function tokenizeDOM($node, &$tokens, $config)
    {
        $level = 0;
        $nodes = array($level => new HTMLPurifier_Queue(array($node)));
@ -105,7 +105,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
            while (!$nodes[$level]->isEmpty()) {
                $node = $nodes[$level]->shift(); // FIFO
                $collect = $level > 0 ? true : false;
-                $needEndingTag = $this->createStartNode($node, $tokens, $collect);
+                $needEndingTag = $this->createStartNode($node, $tokens, $collect, $config);
                if ($needEndingTag) {
                    $closingNodes[$level][] = $node;
                }
@ -135,7 +135,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
     * @return bool if the token needs an endtoken
     * @todo data and tagName properties don't seem to exist in DOMNode?
     */
-    protected function createStartNode($node, &$tokens, $collect)
+    protected function createStartNode($node, &$tokens, $collect, $config)
    {
        // intercept non element nodes. WE MUST catch all of them,
        // but we're not getting the character reference nodes because
@ -159,7 +159,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
                    }
                }
            }
-            $tokens[] = $this->factory->createText($this->parseData($data));
+            $tokens[] = $this->factory->createText($this->parseText($data, $config));
            return false;
        } elseif ($node->nodeType === XML_COMMENT_NODE) {
            // this is code is only invoked for comments in script/style in versions
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@ -129,12 +129,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                // We are not inside tag and there still is another tag to parse
                $token = new
                HTMLPurifier_Token_Text(
-                    $this->parseData(
+                    $this->parseText(
                        substr(
                            $html,
                            $cursor,
                            $position_next_lt - $cursor
-                        )
+                        ), $config
                    )
                );
                if ($maintain_line_numbers) {
@ -154,11 +154,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                // Create Text of rest of string
                $token = new
                HTMLPurifier_Token_Text(
-                    $this->parseData(
+                    $this->parseText(
                        substr(
                            $html,
                            $cursor
-                        )
+                        ), $config
                    )
                );
                if ($maintain_line_numbers) {
@ -324,8 +324,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                $token = new
                HTMLPurifier_Token_Text(
                    '<' .
-                    $this->parseData(
+                    $this->parseText(
-                        substr($html, $cursor)
+                        substr($html, $cursor), $config
                    )
                );
                if ($maintain_line_numbers) {
@ -429,7 +429,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
            if ($value === false) {
                $value = '';
            }
-            return array($key => $this->parseData($value));
+            return array($key => $this->parseAttr($value, $config));
        }
        // setup loop environment
@ -518,7 +518,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                if ($value === false) {
                    $value = '';
                }
-                $array[$key] = $this->parseData($value);
+                $array[$key] = $this->parseAttr($value, $config);
                $cursor++;
            } else {
                // boolattr
--- a/library/HTMLPurifier/Lexer/PH5P.php
+++ b/library/HTMLPurifier/Lexer/PH5P.php
@ -36,7 +36,7 @@ class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
            $doc->getElementsByTagName('html')->item(0)-> // <html>
                  getElementsByTagName('body')->item(0) //   <body>
            ,
-            $tokens
+            $tokens, $config
        );
        return $tokens;
    }
@ -1515,6 +1515,7 @@ class HTML5
                // Consume the maximum number of characters possible, with the
                // consumed characters case-sensitively matching one of the
                // identifiers in the first column of the entities table.
                $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
                $len = strlen($e_name);
@ -1547,7 +1548,7 @@ class HTML5
        // Return a character token for the character corresponding to the
        // entity name (as given by the second column of the entities table).
-        return html_entity_decode('&' . $entity . ';', ENT_QUOTES, 'UTF-8');
+        return html_entity_decode('&' . rtrim($entity, ';') . ';', ENT_QUOTES, 'UTF-8');
    }
    private function emitToken($token)
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@ -46,11 +46,11 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
    // HTMLPurifier_Lexer->parseData() -----------------------------------------
-    public function assertParseData($input, $expect = true)
+    public function assertParseData($input, $expect = true, $is_attr = false)
    {
        if ($expect === true) $expect = $input;
        $lexer = new HTMLPurifier_Lexer();
-        $this->assertIdentical($expect, $lexer->parseData($input));
+        $this->assertIdentical($expect, $lexer->parseData($input, $is_attr, $this->config));
    }
    public function test_parseData_plainText()
@ -95,7 +95,58 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
    public function test_parseData_improperEntityFaultToleranceTest()
    {
-        $this->assertParseData('&#x2D;');
+        $this->assertParseData('&#x2D;', '-');
    }
    /**
     * In text, "&amp" without a semicolon is decoded even when a letter
     * follows it.
     */
    public function test_parseData_noTrailingSemi()
    {
        $input = '&ampA';
        $expected = '&A';
        $this->assertParseData($input, $expected);
    }
    /**
     * In an attribute, "&amp" without a semicolon is left alone when a
     * letter follows it.
     */
    public function test_parseData_noTrailingSemiAttr()
    {
        $input = '&ampA';
        $expected = '&ampA';
        $is_attr = true;
        $this->assertParseData($input, $expected, $is_attr);
    }
    /**
     * Attribute value "&ampA" must come back unchanged.
     */
    public function test_parseData_T119()
    {
        $input = '&ampA';
        $expected = '&ampA';
        $is_attr = true;
        $this->assertParseData($input, $expected, $is_attr);
    }
    /**
     * Attribute value "&trade=" must come back unchanged
     * (an expectation of true means "same as the input").
     */
    public function test_parseData_T119b()
    {
        $input = '&trade=';
        $same_as_input = true;
        $is_attr = true;
        $this->assertParseData($input, $same_as_input, $is_attr);
    }
    /**
     * With %Core.LegacyEntityDecoder enabled, text and attribute data
     * are expected to decode identically.
     */
    public function test_parseData_legacy1()
    {
        $this->config->set('Core.LegacyEntityDecoder', true);

        // array(input, expected (true = same as input), is attribute?)
        $cases = array(
            array('&ampa', true, false),
            array('&amp=', "&=", false),
            array('&ampa', true, true),
            array('&amp=', "&=", true),
            array('&lta', true, false),
            array('&lt=', "<=", false),
            array('&lta', true, true),
            array('&lt=', "<=", true),
        );
        foreach ($cases as $case) {
            list($input, $expect, $is_attr) = $case;
            $this->assertParseData($input, $expect, $is_attr);
        }
    }
    /**
     * With the default (non-legacy) decoder, semicolon-less entities are
     * decoded in text but left alone in attributes when followed by a
     * letter or an equals sign.
     */
    public function test_parseData_nonlegacy1()
    {
        // array(input, expected (true = same as input), is attribute?)
        $cases = array(
            array('&ampa', "&a", false),
            array('&amp=', "&=", false),
            array('&ampa', true, true),
            array('&amp=', true, true),
            array('&lta', "<a", false),
            array('&lt=', "<=", false),
            array('&lta', true, true),
            array('&lt=', true, true),
            array('&lta;', "<a;", false),
        );
        foreach ($cases as $case) {
            list($input, $expect, $is_attr) = $case;
            $this->assertParseData($input, $expect, $is_attr);
        }
    }
    /**
     * "&imath" without a semicolon must come back unchanged in text.
     */
    public function test_parseData_noTrailingSemiNever()
    {
        $input = '&imath';
        $this->assertParseData($input);
    }
    // HTMLPurifier_Lexer->extractBody() ---------------------------------------