diff --git a/library/HTMLPurifier/EntityLookup.php b/library/HTMLPurifier/EntityLookup.php new file mode 100644 index 00000000..ce3f4f01 --- /dev/null +++ b/library/HTMLPurifier/EntityLookup.php @@ -0,0 +1,25 @@ +table = unserialize(file_get_contents($file)); + } + + function instance() { + // no references, since PHP doesn't copy unless modified + static $instance = null; + if (!$instance) { + $instance = new HTMLPurifier_EntityLookup(); + } + return $instance; + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/EntityLookup/data.txt b/library/HTMLPurifier/EntityLookup/data.txt new file mode 100644 index 00000000..f2b8b8f2 --- /dev/null +++ b/library/HTMLPurifier/EntityLookup/data.txt @@ -0,0 +1 @@ +a:246:{s:4:"nbsp";s:2:" ";s:5:"iexcl";s:2:"¡";s:4:"cent";s:2:"¢";s:5:"pound";s:2:"£";s:6:"curren";s:2:"¤";s:3:"yen";s:2:"¥";s:6:"brvbar";s:2:"¦";s:4:"sect";s:2:"§";s:3:"uml";s:2:"¨";s:4:"copy";s:2:"©";s:4:"ordf";s:2:"ª";s:5:"laquo";s:2:"«";s:3:"not";s:2:"¬";s:3:"shy";s:2:"­";s:3:"reg";s:2:"®";s:4:"macr";s:2:"¯";s:3:"deg";s:2:"°";s:6:"plusmn";s:2:"±";s:5:"acute";s:2:"´";s:5:"micro";s:2:"µ";s:4:"para";s:2:"¶";s:6:"middot";s:2:"·";s:5:"cedil";s:2:"¸";s:4:"ordm";s:2:"º";s:5:"raquo";s:2:"»";s:6:"iquest";s:2:"¿";s:6:"Agrave";s:2:"À";s:6:"Aacute";s:2:"Á";s:5:"Acirc";s:2:"Â";s:6:"Atilde";s:2:"Ã";s:4:"Auml";s:2:"Ä";s:5:"Aring";s:2:"Å";s:5:"AElig";s:2:"Æ";s:6:"Ccedil";s:2:"Ç";s:6:"Egrave";s:2:"È";s:6:"Eacute";s:2:"É";s:5:"Ecirc";s:2:"Ê";s:4:"Euml";s:2:"Ë";s:6:"Igrave";s:2:"Ì";s:6:"Iacute";s:2:"Í";s:5:"Icirc";s:2:"Î";s:4:"Iuml";s:2:"Ï";s:3:"ETH";s:2:"Ð";s:6:"Ntilde";s:2:"Ñ";s:6:"Ograve";s:2:"Ò";s:6:"Oacute";s:2:"Ó";s:5:"Ocirc";s:2:"Ô";s:6:"Otilde";s:2:"Õ";s:4:"Ouml";s:2:"Ö";s:5:"times";s:2:"×";s:6:"Oslash";s:2:"Ø";s:6:"Ugrave";s:2:"Ù";s:6:"Uacute";s:2:"Ú";s:5:"Ucirc";s:2:"Û";s:4:"Uuml";s:2:"Ü";s:6:"Yacute";s:2:"Ý";s:5:"THORN";s:2:"Þ";s:5:"szlig";s:2:"ß";s:6:"agrave";s:2:"à";s:6:"aacute";s:2:"á";s:5:"acirc";s:2:"â";s:6:"atilde";s:2:"ã";s:4:"auml";s:2:"ä";s:
5:"aring";s:2:"å";s:5:"aelig";s:2:"æ";s:6:"ccedil";s:2:"ç";s:6:"egrave";s:2:"è";s:6:"eacute";s:2:"é";s:5:"ecirc";s:2:"ê";s:4:"euml";s:2:"ë";s:6:"igrave";s:2:"ì";s:6:"iacute";s:2:"í";s:5:"icirc";s:2:"î";s:4:"iuml";s:2:"ï";s:3:"eth";s:2:"ð";s:6:"ntilde";s:2:"ñ";s:6:"ograve";s:2:"ò";s:6:"oacute";s:2:"ó";s:5:"ocirc";s:2:"ô";s:6:"otilde";s:2:"õ";s:4:"ouml";s:2:"ö";s:6:"divide";s:2:"÷";s:6:"oslash";s:2:"ø";s:6:"ugrave";s:2:"ù";s:6:"uacute";s:2:"ú";s:5:"ucirc";s:2:"û";s:4:"uuml";s:2:"ü";s:6:"yacute";s:2:"ý";s:5:"thorn";s:2:"þ";s:4:"yuml";s:2:"ÿ";s:4:"quot";s:1:""";s:3:"amp";s:1:"&";s:2:"lt";s:1:"<";s:2:"gt";s:1:">";s:4:"apos";s:1:"'";s:5:"OElig";s:2:"Œ";s:5:"oelig";s:2:"œ";s:6:"Scaron";s:2:"Š";s:6:"scaron";s:2:"š";s:4:"Yuml";s:2:"Ÿ";s:4:"circ";s:2:"ˆ";s:5:"tilde";s:2:"˜";s:4:"ensp";s:3:" ";s:4:"emsp";s:3:" ";s:6:"thinsp";s:3:" ";s:4:"zwnj";s:3:"‌";s:3:"zwj";s:3:"‍";s:3:"lrm";s:3:"‎";s:3:"rlm";s:3:"‏";s:5:"ndash";s:3:"–";s:5:"mdash";s:3:"—";s:5:"lsquo";s:3:"‘";s:5:"rsquo";s:3:"’";s:5:"sbquo";s:3:"‚";s:5:"ldquo";s:3:"“";s:5:"rdquo";s:3:"”";s:5:"bdquo";s:3:"„";s:6:"dagger";s:3:"†";s:6:"Dagger";s:3:"‡";s:6:"permil";s:3:"‰";s:6:"lsaquo";s:3:"‹";s:6:"rsaquo";s:3:"›";s:4:"euro";s:3:"€";s:4:"fnof";s:2:"ƒ";s:5:"Alpha";s:2:"Α";s:4:"Beta";s:2:"Β";s:5:"Gamma";s:2:"Γ";s:5:"Delta";s:2:"Δ";s:7:"Epsilon";s:2:"Ε";s:4:"Zeta";s:2:"Ζ";s:3:"Eta";s:2:"Η";s:5:"Theta";s:2:"Θ";s:4:"Iota";s:2:"Ι";s:5:"Kappa";s:2:"Κ";s:6:"Lambda";s:2:"Λ";s:2:"Mu";s:2:"Μ";s:2:"Nu";s:2:"Ν";s:2:"Xi";s:2:"Ξ";s:7:"Omicron";s:2:"Ο";s:2:"Pi";s:2:"Π";s:3:"Rho";s:2:"Ρ";s:5:"Sigma";s:2:"Σ";s:3:"Tau";s:2:"Τ";s:7:"Upsilon";s:2:"Υ";s:3:"Phi";s:2:"Φ";s:3:"Chi";s:2:"Χ";s:3:"Psi";s:2:"Ψ";s:5:"Omega";s:2:"Ω";s:5:"alpha";s:2:"α";s:4:"beta";s:2:"β";s:5:"gamma";s:2:"γ";s:5:"delta";s:2:"δ";s:7:"epsilon";s:2:"ε";s:4:"zeta";s:2:"ζ";s:3:"eta";s:2:"η";s:5:"theta";s:2:"θ";s:4:"iota";s:2:"ι";s:5:"kappa";s:2:"κ";s:6:"lambda";s:2:"λ";s:2:"mu";s:2:"μ";s:2:"nu";s:2:"ν";s:2:"xi";s:2:"ξ";s:7:"omicron";s:2:"ο";s:2:"pi";s:2:"π";s:3:"rho";s:2:"ρ";s:6:
"sigmaf";s:2:"ς";s:5:"sigma";s:2:"σ";s:3:"tau";s:2:"τ";s:7:"upsilon";s:2:"υ";s:3:"phi";s:2:"φ";s:3:"chi";s:2:"χ";s:3:"psi";s:2:"ψ";s:5:"omega";s:2:"ω";s:8:"thetasym";s:2:"ϑ";s:5:"upsih";s:2:"ϒ";s:3:"piv";s:2:"ϖ";s:4:"bull";s:3:"•";s:6:"hellip";s:3:"…";s:5:"prime";s:3:"′";s:5:"Prime";s:3:"″";s:5:"oline";s:3:"‾";s:5:"frasl";s:3:"⁄";s:6:"weierp";s:3:"℘";s:5:"image";s:3:"ℑ";s:4:"real";s:3:"ℜ";s:5:"trade";s:3:"™";s:7:"alefsym";s:3:"ℵ";s:4:"larr";s:3:"←";s:4:"uarr";s:3:"↑";s:4:"rarr";s:3:"→";s:4:"darr";s:3:"↓";s:4:"harr";s:3:"↔";s:5:"crarr";s:3:"↵";s:4:"lArr";s:3:"⇐";s:4:"uArr";s:3:"⇑";s:4:"rArr";s:3:"⇒";s:4:"dArr";s:3:"⇓";s:4:"hArr";s:3:"⇔";s:6:"forall";s:3:"∀";s:4:"part";s:3:"∂";s:5:"exist";s:3:"∃";s:5:"empty";s:3:"∅";s:5:"nabla";s:3:"∇";s:4:"isin";s:3:"∈";s:5:"notin";s:3:"∉";s:2:"ni";s:3:"∋";s:4:"prod";s:3:"∏";s:3:"sum";s:3:"∑";s:5:"minus";s:3:"−";s:6:"lowast";s:3:"∗";s:5:"radic";s:3:"√";s:4:"prop";s:3:"∝";s:5:"infin";s:3:"∞";s:3:"ang";s:3:"∠";s:3:"and";s:3:"∧";s:2:"or";s:3:"∨";s:3:"cap";s:3:"∩";s:3:"cup";s:3:"∪";s:3:"int";s:3:"∫";s:3:"sim";s:3:"∼";s:4:"cong";s:3:"≅";s:5:"asymp";s:3:"≈";s:2:"ne";s:3:"≠";s:5:"equiv";s:3:"≡";s:2:"le";s:3:"≤";s:2:"ge";s:3:"≥";s:3:"sub";s:3:"⊂";s:3:"sup";s:3:"⊃";s:4:"nsub";s:3:"⊄";s:4:"sube";s:3:"⊆";s:4:"supe";s:3:"⊇";s:5:"oplus";s:3:"⊕";s:6:"otimes";s:3:"⊗";s:4:"perp";s:3:"⊥";s:4:"sdot";s:3:"⋅";s:5:"lceil";s:3:"⌈";s:5:"rceil";s:3:"⌉";s:6:"lfloor";s:3:"⌊";s:6:"rfloor";s:3:"⌋";s:4:"lang";s:3:"〈";s:4:"rang";s:3:"〉";s:3:"loz";s:3:"◊";s:6:"spades";s:3:"♠";s:5:"clubs";s:3:"♣";s:6:"hearts";s:3:"♥";s:5:"diams";s:3:"♦";} \ No newline at end of file diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php index 34c489ed..a079df82 100644 --- a/library/HTMLPurifier/Lexer.php +++ b/library/HTMLPurifier/Lexer.php @@ -85,6 +85,68 @@ class HTMLPurifier_Lexer return $lexer; } + + /** + * Callback regex string for parsing entities. + * @protected + */ + var $_substituteEntitiesRegex = + // 1. hex 2. dec 3. 
string + '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z]+));?/'; + + /** + * Substitutes non-special entities with their parsed equivalents. Since + * running this on every parsed character would be wasteful, we run + * it before everything else. + * + * @protected + * @param $string String to have non-special entities parsed. + * @returns Parsed string. + */ + function substituteNonSpecialEntities($string) { + // it will try to detect missing semicolons, but don't rely on it + return preg_replace_callback( + $this->_substituteEntitiesRegex, + array($this, 'nonSpecialEntityCallback'), + $string); + } + + /** + * Callback function for substituteNonSpecialEntities() that does the work. + * + * @warning Though this is public in order to let the callback happen, + * calling it directly is not recommended. + * @param $matches PCRE-style matches array, with 0 the entire match, and + * either index 1, 2 or 3 set with a hex value, dec value, + * or string (respectively). + * @returns Replacement string. + * @todo Emit multi-byte UTF-8 for numeric entities above 127; chr() only yields a single byte + */ + function nonSpecialEntityCallback($matches) { + // replaces all but big five + $entity = $matches[0]; + $is_num = (@$matches[0][1] === '#'); + if ($is_num) { + $is_hex = (@$entity[2] === 'x'); + $int = $is_hex ? 
hexdec($matches[1]) : (int) $matches[2]; + if (isset($this->_special_dec2str[$int])) return $entity; + return chr($int); + } else { + if (isset($this->_special_ent2dec[$matches[3]])) return $entity; + if (!$this->_entity_lookup) { + require_once 'HTMLPurifier/EntityLookup.php'; + $this->_entity_lookup = HTMLPurifier_EntityLookup::instance(); + } + if (isset($this->_entity_lookup->table[$matches[3]])) { + return $this->_entity_lookup->table[$matches[3]]; + } else { + return $entity; + } + } + } + + var $_entity_lookup; + } ?> \ No newline at end of file diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php index 91706370..587f1928 100644 --- a/library/HTMLPurifier/Lexer/DirectLex.php +++ b/library/HTMLPurifier/Lexer/DirectLex.php @@ -11,7 +11,6 @@ require_once 'HTMLPurifier/Lexer.php'; * pales in comparison to HTMLPurifier_Lexer_DOMLex. It will support UTF-8 * completely eventually. * - * @todo Implement non-special string entity conversion. * @todo Reread XML spec and document differences. * @todo Add support for CDATA sections. * @todo Determine correct behavior in outputting comment data. (preserve dashes?) @@ -99,56 +98,6 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer ''' => "'", ); - /** - * Callback regex string for parsing entities. - * @protected - */ - var $_substituteEntitiesRegex = - // 1. hex 2. dec 3. string - '/&[#](?:x([a-fA-F0-9]+)|0*(\d+)|([A-Za-z]+));?/'; - - /** - * Substitutes non-special entities with their parsed equivalents. - * - * @protected - * @param $string String to have non-special entities parsed. - * @returns Parsed string. - */ - function substituteNonSpecialEntities($string) { - // it will try to detect missing semicolons, but don't rely on it - return preg_replace_callback( - $this->_substituteEntitiesRegex, - array('HTMLPurifier_Lexer_DirectLex', 'nonSpecialEntityCallback'), - $string); - } - - /** - * Callback function for substituteNonSpecialEntities() that does the work. 
- * - * @warning Though this is public in order to let the callback happen, - * calling it directly is not recommended. - * @param $matches PCRE-style matches array, with 0 the entire match, and - * either index 1, 2 or 3 set with a hex value, dec value, - * or string (respectively). - * @returns Replacement string. - * @todo Implement string translations - */ - function nonSpecialEntityCallback($matches) { - // replaces all but big five - $entity = $matches[0]; - $is_num = (@$matches[0][1] === '#'); - if ($is_num) { - $is_hex = (@$entity[2] === 'x'); - $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2]; - if (isset($this->_special_dec2str[$int])) return $entity; - return chr($int); - } else { - if (isset($this->_special_ent2dec[$matches[3]])) return $entity; - // translate $matches[3] - return ''; - } - } - /** * Substitutes only special entities with their parsed equivalents. * diff --git a/maintenance/.htaccess b/maintenance/.htaccess new file mode 100644 index 00000000..03688ee9 --- /dev/null +++ b/maintenance/.htaccess @@ -0,0 +1 @@ +Deny from all diff --git a/maintenance/generate-entity-file.php b/maintenance/generate-entity-file.php new file mode 100644 index 00000000..b46586b5 --- /dev/null +++ b/maintenance/generate-entity-file.php @@ -0,0 +1,68 @@ +#!/usr/bin/php +/'; + +foreach ( $entity_files as $file ) { + $contents = file_get_contents($entity_dir . 
$file); + $matches = array(); + preg_match_all($regexp, $contents, $matches, PREG_SET_ORDER); + foreach ($matches as $match) { + $entity_table[$match[1]] = unichr($match[2]); + } +} + +$output = serialize($entity_table); + +$fh = fopen($output_file, 'w'); +fwrite($fh, $output); +fclose($fh); + +echo "Completed successfully."; + +?> \ No newline at end of file diff --git a/tests/HTMLPurifier/EntityLookupTest.php b/tests/HTMLPurifier/EntityLookupTest.php new file mode 100644 index 00000000..c490d55a --- /dev/null +++ b/tests/HTMLPurifier/EntityLookupTest.php @@ -0,0 +1,29 @@ +assertIdentical('â', $lookup->table['acirc']); + + // special char + $this->assertIdentical('"', $lookup->table['quot']); + $this->assertIdentical('“', $lookup->table['ldquo']); + $this->assertIdentical('<', $lookup->table['lt']); //expressed strangely + + // symbol char + $this->assertIdentical('θ', $lookup->table['theta']); + + } + +} + +?> \ No newline at end of file diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php index 2693b825..0f988a6b 100644 --- a/tests/HTMLPurifier/LexerTest.php +++ b/tests/HTMLPurifier/LexerTest.php @@ -7,6 +7,7 @@ class HTMLPurifier_LexerTest extends UnitTestCase { var $DirectLex, $PEARSax3, $DOMLex; + var $_entity_lookup; var $_has_dom; function setUp() { @@ -14,12 +15,13 @@ class HTMLPurifier_LexerTest extends UnitTestCase $this->PEARSax3 = new HTMLPurifier_Lexer_PEARSax3(); $this->_has_dom = version_compare(PHP_VERSION, '5', '>='); - if ($this->_has_dom) { require_once 'HTMLPurifier/Lexer/DOMLex.php'; $this->DOMLex = new HTMLPurifier_Lexer_DOMLex(); } + $this->_entity_lookup = HTMLPurifier_EntityLookup::instance(); + } function test_tokenizeHTML() { @@ -152,8 +154,12 @@ class HTMLPurifier_LexerTest extends UnitTestCase // compare with this valid one: $input[12] = '"'; $expect[12] = array( new HTMLPurifier_Token_Text('"') ); - $sax_expect[12] = false; - // SAX chokes on this? We do have entity parsing on, so it should work! 
+ $sax_expect[12] = false; // choked! + + // DOM and SAX choke on this + //$char_circ = $this->_entity_lookup->table['circ']; + //$input[13] = 'ˆ'; + //$expect[13] = array( new HTMLPurifier_Token_Text($char_circ) ); foreach($input as $i => $discard) { $result = $this->DirectLex->tokenizeHTML($input[$i]); diff --git a/tests/index.php b/tests/index.php index 593023ec..2dcd7694 100644 --- a/tests/index.php +++ b/tests/index.php @@ -19,6 +19,7 @@ $test->addTestFile('HTMLPurifier/Lexer/DirectLexTest.php'); $test->addTestFile('HTMLPurifier/DefinitionTest.php'); $test->addTestFile('HTMLPurifier/ChildDefTest.php'); $test->addTestFile('HTMLPurifier/GeneratorTest.php'); +$test->addTestFile('HTMLPurifier/EntityLookupTest.php'); $test->run( new HtmlReporter() );