Revamp entity decoding to be more like HTML5.

See %Core.LegacyEntityDecoder for more details. Signed-off-by: Edward Z. Yang <ezyang@cs.stanford.edu>
2024-12-22 08:21:52 +00:00 · 2017-03-07 13:34:55 -08:00 · 2017-03-07 13:34:55 -08:00 · 7e11c271b9
commit 7e11c271b9
parent 66bbae73a9
10 changed files with 272 additions and 35 deletions
--- a/4
+++ b/4
@ -32,6 +32,10 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
  rest of the text in case it ran off the end.  (#78)
 - Fix PREG_BACKTRACK_LIMIT_ERROR in HTMLPurifier_Filter_ExtractStyle.
  Thanks @breathbath for contributing the report and fix (#120)
+- Fix entity decoding algorithm to be more conservative about
+  decoding entities that are missing trailing semicolon.
+  To get old behavior, set %Core.LegacyEntityDecoder to true.
+  (#119)
 # By default, when a link has a target attribute associated
  with it, we now also add rel="noopener" in order to
  prevent the new window from being able to overwrite
--- a/configdoc/usage.xml
+++ b/configdoc/usage.xml
@ -6,7 +6,7 @@
  </file>
  <file name="HTMLPurifier/Lexer.php">
   <line>85</line>
-   <line>315</line>
+   <line>322</line>
  </file>
  <file name="HTMLPurifier/Lexer/DirectLex.php">
   <line>67</line>
@ -124,7 +124,7 @@
   <line>122</line>
  </file>
  <file name="HTMLPurifier/Lexer.php">
-   <line>297</line>
+   <line>304</line>
  </file>
 </directive>
 <directive id="Output.Newline">
@ -172,7 +172,7 @@
   <line>234</line>
  </file>
  <file name="HTMLPurifier/Lexer.php">
-   <line>302</line>
+   <line>309</line>
  </file>
  <file name="HTMLPurifier/HTMLModule/Image.php">
   <line>37</line>
@ -262,12 +262,12 @@
 </directive>
 <directive id="Core.ConvertDocumentToFragment">
  <file name="HTMLPurifier/Lexer.php">
-   <line>313</line>
+   <line>320</line>
  </file>
 </directive>
 <directive id="Core.RemoveProcessingInstructions">
  <file name="HTMLPurifier/Lexer.php">
-   <line>334</line>
+   <line>343</line>
  </file>
 </directive>
 <directive id="URI.">
@ -444,12 +444,12 @@
 </directive>
 <directive id="Filter.ExtractStyleBlocks.Scope">
  <file name="HTMLPurifier/Filter/ExtractStyleBlocks.php">
-   <line>122</line>
+   <line>125</line>
  </file>
 </directive>
 <directive id="Filter.ExtractStyleBlocks.Escaping">
  <file name="HTMLPurifier/Filter/ExtractStyleBlocks.php">
-   <line>327</line>
+   <line>330</line>
  </file>
 </directive>
 <directive id="HTML.SafeIframe">
--- a/library/HTMLPurifier/ConfigSchema/schema.ser
+++ b/library/HTMLPurifier/ConfigSchema/schema.ser
--- a/library/HTMLPurifier/ConfigSchema/schema/Core.LegacyEntityDecoder.txt
+++ b/library/HTMLPurifier/ConfigSchema/schema/Core.LegacyEntityDecoder.txt
@ -0,0 +1,36 @@
+Core.LegacyEntityDecoder
+TYPE: bool
+VERSION: 4.9.0
+DEFAULT: false
+--DESCRIPTION--
+<p>
+    Prior to HTML Purifier 4.9.0, entities were decoded by performing
+    a global search replace for all entities whose decoded versions
+    did not have special meanings under HTML, and replaced them with
+    their decoded versions.  We would match all entities, even if they did
+    not have a trailing semicolon, but only if there weren't any trailing
+    alphanumeric characters.
+</p>
+<table>
+<tr><th>Original</th><th>Text</th><th>Attribute</th></tr>
+<tr><td>&amp;yen;</td><td>&yen;</td><td>&yen;</td></tr>
+<tr><td>&amp;yen</td><td>&yen;</td><td>&yen;</td></tr>
+<tr><td>&amp;yena</td><td>&amp;yena</td><td>&amp;yena</td></tr>
+<tr><td>&amp;yen=</td><td>&yen;=</td><td>&yen;=</td></tr>
+</table>
+<p>
+    In HTML Purifier 4.9.0, we changed the behavior of entity parsing
+    to match entities that had missing trailing semicolons in fewer
+    cases, to more closely match HTML5 parsing behavior:
+</p>
+<table>
+<tr><th>Original</th><th>Text</th><th>Attribute</th></tr>
+<tr><td>&amp;yen;</td><td>&yen;</td><td>&yen;</td></tr>
+<tr><td>&amp;yen</td><td>&yen;</td><td>&yen;</td></tr>
+<tr><td>&amp;yena</td><td>&yen;a</td><td>&amp;yena</td></tr>
+<tr><td>&amp;yen=</td><td>&yen;=</td><td>&amp;yen=</td></tr>
+</table>
+<p>
+    This flag reverts back to pre-HTML Purifier 4.9.0 behavior.
+</p>
+--# vim: et sw=4 sts=4
--- a/library/HTMLPurifier/EntityParser.php
+++ b/library/HTMLPurifier/EntityParser.php
@ -16,6 +16,138 @@ class HTMLPurifier_EntityParser
     */
    protected $_entity_lookup;

+    /**
+     * Callback regex string for entities in text.
+     * @type string
+     */
+    protected $_textEntitiesRegex;
+
+    /**
+     * Callback regex string for entities in attributes.
+     * @type string
+     */
+    protected $_attrEntitiesRegex;
+
+    /**
+     * Regex matching an ampersand followed by a semicolon-optional entity name.
+     */
+    protected $_semiOptionalPrefixRegex;
+
+    public function __construct() {
+        // From
+        // http://stackoverflow.com/questions/15532252/why-is-reg-being-rendered-as-without-the-bounding-semicolon
+        $semi_optional = "quot|QUOT|lt|LT|gt|GT|amp|AMP|AElig|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|Iacute|Icirc|Igrave|Iuml|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml";
+
+        // NB: three empty captures to put the fourth match in the right
+        // place
+        $this->_semiOptionalPrefixRegex = "/&()()()($semi_optional)/";
+
+        $this->_textEntitiesRegex =
+            '/&(?:'.
+            // hex
+            '[#]x([a-fA-F0-9]+);?|'.
+            // dec
+            '[#]0*(\d+);?|'.
+            // string (mandatory semicolon)
+            // NB: order matters: match semicolon preferentially
+            '([A-Za-z_:][A-Za-z0-9.\-_:]*);|'.
+            // string (optional semicolon)
+            "($semi_optional)".
+            ')/';
+
+        $this->_attrEntitiesRegex =
+            '/&(?:'.
+            // hex
+            '[#]x([a-fA-F0-9]+);?|'.
+            // dec
+            '[#]0*(\d+);?|'.
+            // string (mandatory semicolon)
+            // NB: order matters: match semicolon preferentially
+            '([A-Za-z_:][A-Za-z0-9.\-_:]*);|'.
+            // string (optional semicolon)
+            // don't match if trailing is equals or alphanumeric (URL
+            // like)
+            "($semi_optional)(?![=;A-Za-z0-9])".
+            ')/';
+
+    }
+
+    /**
+     * Substitute entities with the parsed equivalents.  Use this on
+     * textual data in an HTML document (as opposed to attributes.)
+     *
+     * @param string $string String to have entities parsed.
+     * @return string Parsed string.
+     */
+    public function substituteTextEntities($string)
+    {
+        return preg_replace_callback(
+            $this->_textEntitiesRegex,
+            array($this, 'entityCallback'),
+            $string
+        );
+    }
+
+    /**
+     * Substitute entities with the parsed equivalents.  Use this on
+     * attribute contents in documents.
+     *
+     * @param string $string String to have entities parsed.
+     * @return string Parsed string.
+     */
+    public function substituteAttrEntities($string)
+    {
+        return preg_replace_callback(
+            $this->_attrEntitiesRegex,
+            array($this, 'entityCallback'),
+            $string
+        );
+    }
+
+    /**
+     * Callback for substituteTextEntities() and substituteAttrEntities().
+     *
+     * @param array $matches  PCRE matches array, with 0 the entire match, and
+     *                  one of index 1 (hex value), 2 (dec value), 3 (name with
+     *                  semicolon) or 4 (semicolon-optional name) set.
+     * @return string Replacement string.
+     */
+
+    protected function entityCallback($matches)
+    {
+        $entity = $matches[0];
+        $hex_part = @$matches[1];
+        $dec_part = @$matches[2];
+        $named_part = empty($matches[3]) ? @$matches[4] : $matches[3];
+        if ($hex_part) {
+            return HTMLPurifier_Encoder::unichr(hexdec($hex_part));
+        } elseif ($dec_part) {
+            return HTMLPurifier_Encoder::unichr((int) $dec_part);
+        } else {
+            if (!$this->_entity_lookup) {
+                $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
+            }
+            if (isset($this->_entity_lookup->table[$named_part])) {
+                return $this->_entity_lookup->table[$named_part];
+            } else {
+                // exact match didn't match anything, so test if
+                // any of the semicolon optional match the prefix.
+                // Only retry when group 3 (the mandatory-semicolon
+                // form) matched, to prevent an infinite loop
+                if (!empty($matches[3])) {
+                    return preg_replace_callback(
+                        $this->_semiOptionalPrefixRegex,
+                        array($this, 'entityCallback'),
+                        $entity
+                    );
+                }
+                return $entity;
+            }
+        }
+    }
+
+    // LEGACY CODE BELOW
+
    /**
     * Callback regex string for parsing entities.
     * @type string
@ -144,7 +276,7 @@ class HTMLPurifier_EntityParser
                $entity;
        } else {
            return isset($this->_special_ent2dec[$matches[3]]) ?
-                $this->_special_ent2dec[$matches[3]] :
+                $this->_special_dec2str[$this->_special_ent2dec[$matches[3]]] :
                $entity;
        }
    }
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@ -169,21 +169,24 @@ class HTMLPurifier_Lexer
            '&#x27;' => "'"
        );

+    public function parseText($string, $config) {
+        return $this->parseData($string, false, $config);
+    }
+
+    public function parseAttr($string, $config) {
+        return $this->parseData($string, true, $config);
+    }
+
    /**
     * Parses special entities into the proper characters.
     *
     * This string will translate escaped versions of the special characters
     * into the correct ones.
     *
-     * @warning
-     * You should be able to treat the output of this function as
-     * completely parsed, but that's only because all other entities should
-     * have been handled previously in substituteNonSpecialEntities()
-     *
     * @param string $string String character data to be parsed.
     * @return string Parsed character data.
     */
-    public function parseData($string)
+    public function parseData($string, $is_attr, $config)
    {
        // following functions require at least one character
        if ($string === '') {
@ -209,7 +212,15 @@ class HTMLPurifier_Lexer
        }

        // hmm... now we have some uncommon entities. Use the callback.
-        $string = $this->_entity_parser->substituteSpecialEntities($string);
+        if ($config->get('Core.LegacyEntityDecoder')) {
+            $string = $this->_entity_parser->substituteSpecialEntities($string);
+        } else {
+            if ($is_attr) {
+                $string = $this->_entity_parser->substituteAttrEntities($string);
+            } else {
+                $string = $this->_entity_parser->substituteTextEntities($string);
+            }
+        }
        return $string;
    }

@ -323,7 +334,9 @@ class HTMLPurifier_Lexer
        }

        // expand entities that aren't the big five
-        $html = $this->_entity_parser->substituteNonSpecialEntities($html);
+        if ($config->get('Core.LegacyEntityDecoder')) {
+            $html = $this->_entity_parser->substituteNonSpecialEntities($html);
+        }

        // clean into wellformed UTF-8 string for an SGML context: this has
        // to be done after entity expansion because the entities sometimes
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@ -77,14 +77,14 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer

        $div = $body->getElementsByTagName('div')->item(0); // <div>
        $tokens = array();
-        $this->tokenizeDOM($div, $tokens);
+        $this->tokenizeDOM($div, $tokens, $config);
        // If the div has a sibling, that means we tripped across
        // a premature </div> tag.  So remove the div we parsed,
        // and then tokenize the rest of body.  We can't tokenize
        // the sibling directly as we'll lose the tags in that case.
        if ($div->nextSibling) {
            $body->removeChild($div);
-            $this->tokenizeDOM($body, $tokens);
+            $this->tokenizeDOM($body, $tokens, $config);
        }
        return $tokens;
    }
@ -96,7 +96,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
     * @param HTMLPurifier_Token[] $tokens   Array-list of already tokenized tokens.
     * @return HTMLPurifier_Token of node appended to previously passed tokens.
     */
-    protected function tokenizeDOM($node, &$tokens)
+    protected function tokenizeDOM($node, &$tokens, $config)
    {
        $level = 0;
        $nodes = array($level => new HTMLPurifier_Queue(array($node)));
@ -105,7 +105,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
            while (!$nodes[$level]->isEmpty()) {
                $node = $nodes[$level]->shift(); // FIFO
                $collect = $level > 0 ? true : false;
-                $needEndingTag = $this->createStartNode($node, $tokens, $collect);
+                $needEndingTag = $this->createStartNode($node, $tokens, $collect, $config);
                if ($needEndingTag) {
                    $closingNodes[$level][] = $node;
                }
@ -135,7 +135,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
     * @return bool if the token needs an endtoken
     * @todo data and tagName properties don't seem to exist in DOMNode?
     */
-    protected function createStartNode($node, &$tokens, $collect)
+    protected function createStartNode($node, &$tokens, $collect, $config)
    {
        // intercept non element nodes. WE MUST catch all of them,
        // but we're not getting the character reference nodes because
@ -159,7 +159,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
                    }
                }
            }
-            $tokens[] = $this->factory->createText($this->parseData($data));
+            $tokens[] = $this->factory->createText($this->parseText($data, $config));
            return false;
        } elseif ($node->nodeType === XML_COMMENT_NODE) {
            // this is code is only invoked for comments in script/style in versions
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@ -129,12 +129,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                // We are not inside tag and there still is another tag to parse
                $token = new
                HTMLPurifier_Token_Text(
-                    $this->parseData(
+                    $this->parseText(
                        substr(
                            $html,
                            $cursor,
                            $position_next_lt - $cursor
-                        )
+                        ), $config
                    )
                );
                if ($maintain_line_numbers) {
@ -154,11 +154,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                // Create Text of rest of string
                $token = new
                HTMLPurifier_Token_Text(
-                    $this->parseData(
+                    $this->parseText(
                        substr(
                            $html,
                            $cursor
-                        )
+                        ), $config
                    )
                );
                if ($maintain_line_numbers) {
@ -324,8 +324,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                $token = new
                HTMLPurifier_Token_Text(
                    '<' .
-                    $this->parseData(
-                        substr($html, $cursor)
+                    $this->parseText(
+                        substr($html, $cursor), $config
                    )
                );
                if ($maintain_line_numbers) {
@ -429,7 +429,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
            if ($value === false) {
                $value = '';
            }
-            return array($key => $this->parseData($value));
+            return array($key => $this->parseAttr($value, $config));
        }

        // setup loop environment
@ -518,7 +518,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                if ($value === false) {
                    $value = '';
                }
-                $array[$key] = $this->parseData($value);
+                $array[$key] = $this->parseAttr($value, $config);
                $cursor++;
            } else {
                // boolattr
--- a/library/HTMLPurifier/Lexer/PH5P.php
+++ b/library/HTMLPurifier/Lexer/PH5P.php
@ -36,7 +36,7 @@ class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
            $doc->getElementsByTagName('html')->item(0)-> // <html>
                  getElementsByTagName('body')->item(0) //   <body>
            ,
-            $tokens
+            $tokens, $config
        );
        return $tokens;
    }
@ -1515,6 +1515,7 @@ class HTML5
                // Consume the maximum number of characters possible, with the
                // consumed characters case-sensitively matching one of the
                // identifiers in the first column of the entities table.
+
                $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
                $len = strlen($e_name);

@ -1547,7 +1548,7 @@ class HTML5

        // Return a character token for the character corresponding to the
        // entity name (as given by the second column of the entities table).
-        return html_entity_decode('&' . $entity . ';', ENT_QUOTES, 'UTF-8');
+        return html_entity_decode('&' . rtrim($entity, ';') . ';', ENT_QUOTES, 'UTF-8');
    }

    private function emitToken($token)
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@ -46,11 +46,11 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness

    // HTMLPurifier_Lexer->parseData() -----------------------------------------

-    public function assertParseData($input, $expect = true)
+    public function assertParseData($input, $expect = true, $is_attr = false)
    {
        if ($expect === true) $expect = $input;
        $lexer = new HTMLPurifier_Lexer();
-        $this->assertIdentical($expect, $lexer->parseData($input));
+        $this->assertIdentical($expect, $lexer->parseData($input, $is_attr, $this->config));
    }

    public function test_parseData_plainText()
@ -95,7 +95,58 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness

    public function test_parseData_improperEntityFaultToleranceTest()
    {
-        $this->assertParseData('&#x2D;');
+        $this->assertParseData('&#x2D;', '-');
+    }
+
+    public function test_parseData_noTrailingSemi()
+    {
+        $this->assertParseData('&ampA', '&A');
+    }
+
+    public function test_parseData_noTrailingSemiAttr()
+    {
+        $this->assertParseData('&ampA', '&ampA', true);
+    }
+
+    public function test_parseData_T119()
+    {
+        $this->assertParseData('&ampA', '&ampA', true);
+    }
+
+    public function test_parseData_T119b()
+    {
+        $this->assertParseData('&trade=', true, true);
+    }
+
+    public function test_parseData_legacy1()
+    {
+        $this->config->set('Core.LegacyEntityDecoder', true);
+        $this->assertParseData('&ampa', true);
+        $this->assertParseData('&amp=', "&=");
+        $this->assertParseData('&ampa', true, true);
+        $this->assertParseData('&amp=', "&=", true);
+        $this->assertParseData('&lta', true);
+        $this->assertParseData('&lt=', "<=");
+        $this->assertParseData('&lta', true, true);
+        $this->assertParseData('&lt=', "<=", true);
+    }
+
+    public function test_parseData_nonlegacy1()
+    {
+        $this->assertParseData('&ampa', "&a");
+        $this->assertParseData('&amp=', "&=");
+        $this->assertParseData('&ampa', true, true);
+        $this->assertParseData('&amp=', true, true);
+        $this->assertParseData('&lta', "<a");
+        $this->assertParseData('&lt=', "<=");
+        $this->assertParseData('&lta', true, true);
+        $this->assertParseData('&lt=', true, true);
+        $this->assertParseData('&lta;', "<a;");
+    }
+
+    public function test_parseData_noTrailingSemiNever()
+    {
+        $this->assertParseData('&imath');
    }

    // HTMLPurifier_Lexer->extractBody() ---------------------------------------