From 5ce0ae7056ba2d2dd01e766545234e7f53c45203 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Sun, 23 Jul 2006 21:07:30 +0000 Subject: [PATCH] Implement EntityLookup and put in the Lexer. Some behavior was migrated, since it looks like it will have to be used in all Lexers, not just DirectLex (which is the only one that uses it). git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@105 48356398-32a2-884e-a903-53898d9a118a --- library/HTMLPurifier/EntityLookup.php | 25 ++++++++ library/HTMLPurifier/EntityLookup/data.txt | 1 + library/HTMLPurifier/Lexer.php | 62 ++++++++++++++++++++ library/HTMLPurifier/Lexer/DirectLex.php | 51 ---------------- maintenance/.htaccess | 1 + maintenance/generate-entity-file.php | 68 ++++++++++++++++++++++ tests/HTMLPurifier/EntityLookupTest.php | 29 +++++++++ tests/HTMLPurifier/LexerTest.php | 12 +++- tests/index.php | 1 + 9 files changed, 196 insertions(+), 54 deletions(-) create mode 100644 library/HTMLPurifier/EntityLookup.php create mode 100644 library/HTMLPurifier/EntityLookup/data.txt create mode 100644 maintenance/.htaccess create mode 100644 maintenance/generate-entity-file.php create mode 100644 tests/HTMLPurifier/EntityLookupTest.php diff --git a/library/HTMLPurifier/EntityLookup.php b/library/HTMLPurifier/EntityLookup.php new file mode 100644 index 00000000..ce3f4f01 --- /dev/null +++ b/library/HTMLPurifier/EntityLookup.php @@ -0,0 +1,25 @@ +table = unserialize(file_get_contents($file)); + } + + function instance() { + // no references, since PHP doesn't copy unless modified + static $instance = null; + if (!$instance) { + $instance = new HTMLPurifier_EntityLookup(); + } + return $instance; + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/EntityLookup/data.txt b/library/HTMLPurifier/EntityLookup/data.txt new file mode 100644 index 00000000..f2b8b8f2 --- /dev/null +++ b/library/HTMLPurifier/EntityLookup/data.txt @@ -0,0 +1 @@ +a:246:{s:4:"nbsp";s:2:" ";s:5:"iexcl";s:2:"¡";s:4:"cent";s:2:"¢";s:5:"pound";s:2:"£";s:6:"curren";s:2:"¤";s:3:"yen";s:2:"¥";s:6:"brvbar";s:2:"¦";s:4:"sect";s:2:"§";s:3:"uml";s:2:"¨";s:4:"copy";s:2:"©";s:4:"ordf";s:2:"ª";s:5:"laquo";s:2:"«";s:3:"not";s:2:"¬";s:3:"shy";s:2:"­";s:3:"reg";s:2:"®";s:4:"macr";s:2:"¯";s:3:"deg";s:2:"°";s:6:"plusmn";s:2:"±";s:5:"acute";s:2:"´";s:5:"micro";s:2:"µ";s:4:"para";s:2:"¶";s:6:"middot";s:2:"·";s:5:"cedil";s:2:"¸";s:4:"ordm";s:2:"º";s:5:"raquo";s:2:"»";s:6:"iquest";s:2:"¿";s:6:"Agrave";s:2:"À";s:6:"Aacute";s:2:"Á";s:5:"Acirc";s:2:"Â";s:6:"Atilde";s:2:"Ã";s:4:"Auml";s:2:"Ä";s:5:"Aring";s:2:"Å";s:5:"AElig";s:2:"Æ";s:6:"Ccedil";s:2:"Ç";s:6:"Egrave";s:2:"È";s:6:"Eacute";s:2:"É";s:5:"Ecirc";s:2:"Ê";s:4:"Euml";s:2:"Ë";s:6:"Igrave";s:2:"Ì";s:6:"Iacute";s:2:"Í";s:5:"Icirc";s:2:"Î";s:4:"Iuml";s:2:"Ï";s:3:"ETH";s:2:"Ð";s:6:"Ntilde";s:2:"Ñ";s:6:"Ograve";s:2:"Ò";s:6:"Oacute";s:2:"Ó";s:5:"Ocirc";s:2:"Ô";s:6:"Otilde";s:2:"Õ";s:4:"Ouml";s:2:"Ö";s:5:"times";s:2:"×";s:6:"Oslash";s:2:"Ø";s:6:"Ugrave";s:2:"Ù";s:6:"Uacute";s:2:"Ú";s:5:"Ucirc";s:2:"Û";s:4:"Uuml";s:2:"Ü";s:6:"Yacute";s:2:"Ý";s:5:"THORN";s:2:"Þ";s:5:"szlig";s:2:"ß";s:6:"agrave";s:2:"à";s:6:"aacute";s:2:"á";s:5:"acirc";s:2:"â";s:6:"atilde";s:2:"ã";s:4:"auml";s:2:"ä";s:5:"aring";s:2:"å";s:5:"aelig";s:2:"æ";s:6:"ccedil";s:2:"ç";s:6:"egrave";s:2:"è";s:6:"eacute";s:2:"é";s:5:"ecirc";s:2:"ê";s:4:"euml";s:2:"ë";s:6:"igrave";s:2:"ì";s:6:"iacute";s:2:"í";s:5:"icirc";s:2:"î";s:4:"iuml";s:2:"ï";s:3:"eth";s:2:"ð";s:6:"ntilde";s:2:"ñ";s:6:"ograve";s:2:"ò";s:6:"oacute";s:2:"ó";s:5:"ocirc";s:2:"ô";s:6:"otilde";s:2:"õ";s:4:"ouml";s:2:"ö";s:6:"divide";s:2:"÷";s:6:"oslash";s:2:"ø";s:6:"ugrave";s:2:"ù";s:6:"uacute";s:2:"ú";s:5:"ucirc";s:2:"û";s:4:"uuml";s:2:"ü";s:6:"yacute";s:2:"ý";s:5:"thorn";s:2:"þ";s:4:"yuml";s:2:"ÿ";s:4:"quot";s:1:""";s:3:"amp";s:1:"&";s:2:"lt";s:1:"<";s:2:"gt";s:1:">";s:4:"apos";s:1:"'";s:5:"OElig";s:2:"Œ";s:5:"oelig";s:2:"œ";s:6:"Scaron";s:2:"Š";s:6:"scaron";s:2:"š";s:4:"Yuml";s:2:"Ÿ";s:4:"circ";s:2:"ˆ";s:5:"tilde";s:2:"˜";s:4:"ensp";s:3:" ";s:4:"emsp";s:3:" ";s:6:"thinsp";s:3:" ";s:4:"zwnj";s:3:"‌";s:3:"zwj";s:3:"‍";s:3:"lrm";s:3:"‎";s:3:"rlm";s:3:"‏";s:5:"ndash";s:3:"–";s:5:"mdash";s:3:"—";s:5:"lsquo";s:3:"‘";s:5:"rsquo";s:3:"’";s:5:"sbquo";s:3:"‚";s:5:"ldquo";s:3:"“";s:5:"rdquo";s:3:"”";s:5:"bdquo";s:3:"„";s:6:"dagger";s:3:"†";s:6:"Dagger";s:3:"‡";s:6:"permil";s:3:"‰";s:6:"lsaquo";s:3:"‹";s:6:"rsaquo";s:3:"›";s:4:"euro";s:3:"€";s:4:"fnof";s:2:"ƒ";s:5:"Alpha";s:2:"Α";s:4:"Beta";s:2:"Β";s:5:"Gamma";s:2:"Γ";s:5:"Delta";s:2:"Δ";s:7:"Epsilon";s:2:"Ε";s:4:"Zeta";s:2:"Ζ";s:3:"Eta";s:2:"Η";s:5:"Theta";s:2:"Θ";s:4:"Iota";s:2:"Ι";s:5:"Kappa";s:2:"Κ";s:6:"Lambda";s:2:"Λ";s:2:"Mu";s:2:"Μ";s:2:"Nu";s:2:"Ν";s:2:"Xi";s:2:"Ξ";s:7:"Omicron";s:2:"Ο";s:2:"Pi";s:2:"Π";s:3:"Rho";s:2:"Ρ";s:5:"Sigma";s:2:"Σ";s:3:"Tau";s:2:"Τ";s:7:"Upsilon";s:2:"Υ";s:3:"Phi";s:2:"Φ";s:3:"Chi";s:2:"Χ";s:3:"Psi";s:2:"Ψ";s:5:"Omega";s:2:"Ω";s:5:"alpha";s:2:"α";s:4:"beta";s:2:"β";s:5:"gamma";s:2:"γ";s:5:"delta";s:2:"δ";s:7:"epsilon";s:2:"ε";s:4:"zeta";s:2:"ζ";s:3:"eta";s:2:"η";s:5:"theta";s:2:"θ";s:4:"iota";s:2:"ι";s:5:"kappa";s:2:"κ";s:6:"lambda";s:2:"λ";s:2:"mu";s:2:"μ";s:2:"nu";s:2:"ν";s:2:"xi";s:2:"ξ";s:7:"omicron";s:2:"ο";s:2:"pi";s:2:"π";s:3:"rho";s:2:"ρ";s:6:"sigmaf";s:2:"ς";s:5:"sigma";s:2:"σ";s:3:"tau";s:2:"τ";s:7:"upsilon";s:2:"υ";s:3:"phi";s:2:"φ";s:3:"chi";s:2:"χ";s:3:"psi";s:2:"ψ";s:5:"omega";s:2:"ω";s:8:"thetasym";s:2:"ϑ";s:5:"upsih";s:2:"ϒ";s:3:"piv";s:2:"ϖ";s:4:"bull";s:3:"•";s:6:"hellip";s:3:"…";s:5:"prime";s:3:"′";s:5:"Prime";s:3:"″";s:5:"oline";s:3:"‾";s:5:"frasl";s:3:"⁄";s:6:"weierp";s:3:"℘";s:5:"image";s:3:"ℑ";s:4:"real";s:3:"ℜ";s:5:"trade";s:3:"™";s:7:"alefsym";s:3:"ℵ";s:4:"larr";s:3:"←";s:4:"uarr";s:3:"↑";s:4:"rarr";s:3:"→";s:4:"darr";s:3:"↓";s:4:"harr";s:3:"↔";s:5:"crarr";s:3:"↵";s:4:"lArr";s:3:"⇐";s:4:"uArr";s:3:"⇑";s:4:"rArr";s:3:"⇒";s:4:"dArr";s:3:"⇓";s:4:"hArr";s:3:"⇔";s:6:"forall";s:3:"∀";s:4:"part";s:3:"∂";s:5:"exist";s:3:"∃";s:5:"empty";s:3:"∅";s:5:"nabla";s:3:"∇";s:4:"isin";s:3:"∈";s:5:"notin";s:3:"∉";s:2:"ni";s:3:"∋";s:4:"prod";s:3:"∏";s:3:"sum";s:3:"∑";s:5:"minus";s:3:"−";s:6:"lowast";s:3:"∗";s:5:"radic";s:3:"√";s:4:"prop";s:3:"∝";s:5:"infin";s:3:"∞";s:3:"ang";s:3:"∠";s:3:"and";s:3:"∧";s:2:"or";s:3:"∨";s:3:"cap";s:3:"∩";s:3:"cup";s:3:"∪";s:3:"int";s:3:"∫";s:3:"sim";s:3:"∼";s:4:"cong";s:3:"≅";s:5:"asymp";s:3:"≈";s:2:"ne";s:3:"≠";s:5:"equiv";s:3:"≡";s:2:"le";s:3:"≤";s:2:"ge";s:3:"≥";s:3:"sub";s:3:"⊂";s:3:"sup";s:3:"⊃";s:4:"nsub";s:3:"⊄";s:4:"sube";s:3:"⊆";s:4:"supe";s:3:"⊇";s:5:"oplus";s:3:"⊕";s:6:"otimes";s:3:"⊗";s:4:"perp";s:3:"⊥";s:4:"sdot";s:3:"⋅";s:5:"lceil";s:3:"⌈";s:5:"rceil";s:3:"⌉";s:6:"lfloor";s:3:"⌊";s:6:"rfloor";s:3:"⌋";s:4:"lang";s:3:"〈";s:4:"rang";s:3:"〉";s:3:"loz";s:3:"◊";s:6:"spades";s:3:"♠";s:5:"clubs";s:3:"♣";s:6:"hearts";s:3:"♥";s:5:"diams";s:3:"♦";} \ No newline at end of file diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php index 34c489ed..a079df82 100644 --- a/library/HTMLPurifier/Lexer.php +++ b/library/HTMLPurifier/Lexer.php @@ -85,6 +85,68 @@ class HTMLPurifier_Lexer return $lexer; } + + /** + * Callback regex string for parsing entities. + * @protected + */ + var $_substituteEntitiesRegex = + // 1. hex 2. dec 3. string + '/&[#](?:x([a-fA-F0-9]+)|0*(\d+)|([A-Za-z]+));?/'; + + /** + * Substitutes non-special entities with their parsed equivalents. Since + * running this whenever you have parsed character is t3h 5uck, we run + * it before everything else. + * + * @protected + * @param $string String to have non-special entities parsed. + * @returns Parsed string. + */ + function substituteNonSpecialEntities($string) { + // it will try to detect missing semicolons, but don't rely on it + return preg_replace_callback( + $this->_substituteEntitiesRegex, + array('HTMLPurifier_Lexer_DirectLex', 'nonSpecialEntityCallback'), + $string); + } + + /** + * Callback function for substituteNonSpecialEntities() that does the work. + * + * @warning Though this is public in order to let the callback happen, + * calling it directly is not recommended. + * @param $matches PCRE-style matches array, with 0 the entire match, and + * either index 1, 2 or 3 set with a hex value, dec value, + * or string (respectively). + * @returns Replacement string. + * @todo Implement string translations + */ + function nonSpecialEntityCallback($matches) { + // replaces all but big five + $entity = $matches[0]; + $is_num = (@$matches[0][1] === '#'); + if ($is_num) { + $is_hex = (@$entity[2] === 'x'); + $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2]; + if (isset($this->_special_dec2str[$int])) return $entity; + return chr($int); + } else { + if (isset($this->_special_ent2dec[$matches[3]])) return $entity; + if (!$this->_entity_lookup) { + require_once 'HTMLPurifier/EntityLookup.php'; + $this->_entity_lookup = EntityLookup::instance(); + } + if (isset($this->_entity_lookup->table[$matches[3]])) { + return $this->_entity_lookup->table[$matches[3]]; + } else { + return $entity; + } + } + } + + var $_entity_lookup; + } ?> \ No newline at end of file diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php index 91706370..587f1928 100644 --- a/library/HTMLPurifier/Lexer/DirectLex.php +++ b/library/HTMLPurifier/Lexer/DirectLex.php @@ -11,7 +11,6 @@ require_once 'HTMLPurifier/Lexer.php'; * pales in comparison to HTMLPurifier_Lexer_DOMLex. It will support UTF-8 * completely eventually. * - * @todo Implement non-special string entity conversion. * @todo Reread XML spec and document differences. * @todo Add support for CDATA sections. * @todo Determine correct behavior in outputting comment data. (preserve dashes?) @@ -99,56 +98,6 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer ''' => "'", ); - /** - * Callback regex string for parsing entities. - * @protected - */ - var $_substituteEntitiesRegex = - // 1. hex 2. dec 3. string - '/&[#](?:x([a-fA-F0-9]+)|0*(\d+)|([A-Za-z]+));?/'; - - /** - * Substitutes non-special entities with their parsed equivalents. - * - * @protected - * @param $string String to have non-special entities parsed. - * @returns Parsed string. - */ - function substituteNonSpecialEntities($string) { - // it will try to detect missing semicolons, but don't rely on it - return preg_replace_callback( - $this->_substituteEntitiesRegex, - array('HTMLPurifier_Lexer_DirectLex', 'nonSpecialEntityCallback'), - $string); - } - - /** - * Callback function for substituteNonSpecialEntities() that does the work. - * - * @warning Though this is public in order to let the callback happen, - * calling it directly is not recommended. - * @param $matches PCRE-style matches array, with 0 the entire match, and - * either index 1, 2 or 3 set with a hex value, dec value, - * or string (respectively). - * @returns Replacement string. - * @todo Implement string translations - */ - function nonSpecialEntityCallback($matches) { - // replaces all but big five - $entity = $matches[0]; - $is_num = (@$matches[0][1] === '#'); - if ($is_num) { - $is_hex = (@$entity[2] === 'x'); - $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2]; - if (isset($this->_special_dec2str[$int])) return $entity; - return chr($int); - } else { - if (isset($this->_special_ent2dec[$matches[3]])) return $entity; - // translate $matches[3] - return ''; - } - } - /** * Substitutes only special entities with their parsed equivalents. * diff --git a/maintenance/.htaccess b/maintenance/.htaccess new file mode 100644 index 00000000..03688ee9 --- /dev/null +++ b/maintenance/.htaccess @@ -0,0 +1 @@ +Deny from all diff --git a/maintenance/generate-entity-file.php b/maintenance/generate-entity-file.php new file mode 100644 index 00000000..b46586b5 --- /dev/null +++ b/maintenance/generate-entity-file.php @@ -0,0 +1,68 @@ +#!/usr/bin/php +/'; + +foreach ( $entity_files as $file ) { + $contents = file_get_contents($entity_dir . $file); + $matches = array(); + preg_match_all($regexp, $contents, $matches, PREG_SET_ORDER); + foreach ($matches as $match) { + $entity_table[$match[1]] = unichr($match[2]); + } +} + +$output = serialize($entity_table); + +$fh = fopen($output_file, 'w'); +fwrite($fh, $output); +fclose($fh); + +echo "Completed successfully."; + +?> \ No newline at end of file diff --git a/tests/HTMLPurifier/EntityLookupTest.php b/tests/HTMLPurifier/EntityLookupTest.php new file mode 100644 index 00000000..c490d55a --- /dev/null +++ b/tests/HTMLPurifier/EntityLookupTest.php @@ -0,0 +1,29 @@ +assertIdentical('â', $lookup->table['acirc']); + + // special char + $this->assertIdentical('"', $lookup->table['quot']); + $this->assertIdentical('“', $lookup->table['ldquo']); + $this->assertIdentical('<', $lookup->table['lt']); //expressed strangely + + // symbol char + $this->assertIdentical('θ', $lookup->table['theta']); + + } + +} + +?> \ No newline at end of file diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php index 2693b825..0f988a6b 100644 --- a/tests/HTMLPurifier/LexerTest.php +++ b/tests/HTMLPurifier/LexerTest.php @@ -7,6 +7,7 @@ class HTMLPurifier_LexerTest extends UnitTestCase { var $DirectLex, $PEARSax3, $DOMLex; + var $_entity_lookup; var $_has_dom; function setUp() { @@ -14,12 +15,13 @@ class HTMLPurifier_LexerTest extends UnitTestCase $this->PEARSax3 = new HTMLPurifier_Lexer_PEARSax3(); $this->_has_dom = version_compare(PHP_VERSION, '5', '>='); - if ($this->_has_dom) { require_once 'HTMLPurifier/Lexer/DOMLex.php'; $this->DOMLex = new HTMLPurifier_Lexer_DOMLex(); } + $this->_entity_lookup = HTMLPurifier_EntityLookup::instance(); + } function test_tokenizeHTML() { @@ -152,8 +154,12 @@ class HTMLPurifier_LexerTest extends UnitTestCase // compare with this valid one: $input[12] = '"'; $expect[12] = array( new HTMLPurifier_Token_Text('"') ); - $sax_expect[12] = false; - // SAX chokes on this? We do have entity parsing on, so it should work! + $sax_expect[12] = false; // choked! + + // DOM and SAX choke on this + //$char_circ = $this->_entity_lookup->table['circ']; + //$input[13] = 'ˆ'; + //$expect[13] = array( new HTMLPurifier_Token_Text($char_circ) ); foreach($input as $i => $discard) { $result = $this->DirectLex->tokenizeHTML($input[$i]); diff --git a/tests/index.php b/tests/index.php index 593023ec..2dcd7694 100644 --- a/tests/index.php +++ b/tests/index.php @@ -19,6 +19,7 @@ $test->addTestFile('HTMLPurifier/Lexer/DirectLexTest.php'); $test->addTestFile('HTMLPurifier/DefinitionTest.php'); $test->addTestFile('HTMLPurifier/ChildDefTest.php'); $test->addTestFile('HTMLPurifier/GeneratorTest.php'); +$test->addTestFile('HTMLPurifier/EntityLookupTest.php'); $test->run( new HtmlReporter() );