diff --git a/library/HTMLPurifier/EntityLookup.php b/library/HTMLPurifier/EntityLookup.php
new file mode 100644
index 00000000..ce3f4f01
--- /dev/null
+++ b/library/HTMLPurifier/EntityLookup.php
@@ -0,0 +1,25 @@
+table = unserialize(file_get_contents($file));
+ }
+
+ function instance() {
+ // no references, since PHP doesn't copy unless modified
+ static $instance = null;
+ if (!$instance) {
+ $instance = new HTMLPurifier_EntityLookup();
+ }
+ return $instance;
+ }
+
+}
+
+?>
\ No newline at end of file
diff --git a/library/HTMLPurifier/EntityLookup/data.txt b/library/HTMLPurifier/EntityLookup/data.txt
new file mode 100644
index 00000000..f2b8b8f2
--- /dev/null
+++ b/library/HTMLPurifier/EntityLookup/data.txt
@@ -0,0 +1 @@
+a:246:{s:4:"nbsp";s:2:" ";s:5:"iexcl";s:2:"¡";s:4:"cent";s:2:"¢";s:5:"pound";s:2:"£";s:6:"curren";s:2:"¤";s:3:"yen";s:2:"¥";s:6:"brvbar";s:2:"¦";s:4:"sect";s:2:"§";s:3:"uml";s:2:"¨";s:4:"copy";s:2:"©";s:4:"ordf";s:2:"ª";s:5:"laquo";s:2:"«";s:3:"not";s:2:"¬";s:3:"shy";s:2:"";s:3:"reg";s:2:"®";s:4:"macr";s:2:"¯";s:3:"deg";s:2:"°";s:6:"plusmn";s:2:"±";s:5:"acute";s:2:"´";s:5:"micro";s:2:"µ";s:4:"para";s:2:"¶";s:6:"middot";s:2:"·";s:5:"cedil";s:2:"¸";s:4:"ordm";s:2:"º";s:5:"raquo";s:2:"»";s:6:"iquest";s:2:"¿";s:6:"Agrave";s:2:"À";s:6:"Aacute";s:2:"Á";s:5:"Acirc";s:2:"Â";s:6:"Atilde";s:2:"Ã";s:4:"Auml";s:2:"Ä";s:5:"Aring";s:2:"Å";s:5:"AElig";s:2:"Æ";s:6:"Ccedil";s:2:"Ç";s:6:"Egrave";s:2:"È";s:6:"Eacute";s:2:"É";s:5:"Ecirc";s:2:"Ê";s:4:"Euml";s:2:"Ë";s:6:"Igrave";s:2:"Ì";s:6:"Iacute";s:2:"Í";s:5:"Icirc";s:2:"Î";s:4:"Iuml";s:2:"Ï";s:3:"ETH";s:2:"Ð";s:6:"Ntilde";s:2:"Ñ";s:6:"Ograve";s:2:"Ò";s:6:"Oacute";s:2:"Ó";s:5:"Ocirc";s:2:"Ô";s:6:"Otilde";s:2:"Õ";s:4:"Ouml";s:2:"Ö";s:5:"times";s:2:"×";s:6:"Oslash";s:2:"Ø";s:6:"Ugrave";s:2:"Ù";s:6:"Uacute";s:2:"Ú";s:5:"Ucirc";s:2:"Û";s:4:"Uuml";s:2:"Ü";s:6:"Yacute";s:2:"Ý";s:5:"THORN";s:2:"Þ";s:5:"szlig";s:2:"ß";s:6:"agrave";s:2:"à";s:6:"aacute";s:2:"á";s:5:"acirc";s:2:"â";s:6:"atilde";s:2:"ã";s:4:"auml";s:2:"ä";s:5:"aring";s:2:"å";s:5:"aelig";s:2:"æ";s:6:"ccedil";s:2:"ç";s:6:"egrave";s:2:"è";s:6:"eacute";s:2:"é";s:5:"ecirc";s:2:"ê";s:4:"euml";s:2:"ë";s:6:"igrave";s:2:"ì";s:6:"iacute";s:2:"í";s:5:"icirc";s:2:"î";s:4:"iuml";s:2:"ï";s:3:"eth";s:2:"ð";s:6:"ntilde";s:2:"ñ";s:6:"ograve";s:2:"ò";s:6:"oacute";s:2:"ó";s:5:"ocirc";s:2:"ô";s:6:"otilde";s:2:"õ";s:4:"ouml";s:2:"ö";s:6:"divide";s:2:"÷";s:6:"oslash";s:2:"ø";s:6:"ugrave";s:2:"ù";s:6:"uacute";s:2:"ú";s:5:"ucirc";s:2:"û";s:4:"uuml";s:2:"ü";s:6:"yacute";s:2:"ý";s:5:"thorn";s:2:"þ";s:4:"yuml";s:2:"ÿ";s:4:"quot";s:1:""";s:3:"amp";s:1:"&";s:2:"lt";s:1:"<";s:2:"gt";s:1:">";s:4:"apos";s:1:"'";s:5:"OElig";s:2:"Œ";s:5:"oelig";s:2:"œ";s:6:"Scaron";s:2:"Š";s:6:"scaron";s:2:"š";s:4:"Yuml";s:2:"Ÿ";s:4:"circ";s:2:"ˆ";s:5:"tilde";s:2:"˜";s:4:"ensp";s:3:" ";s:4:"emsp";s:3:" ";s:6:"thinsp";s:3:" ";s:4:"zwnj";s:3:"";s:3:"zwj";s:3:"";s:3:"lrm";s:3:"";s:3:"rlm";s:3:"";s:5:"ndash";s:3:"–";s:5:"mdash";s:3:"—";s:5:"lsquo";s:3:"‘";s:5:"rsquo";s:3:"’";s:5:"sbquo";s:3:"‚";s:5:"ldquo";s:3:"“";s:5:"rdquo";s:3:"”";s:5:"bdquo";s:3:"„";s:6:"dagger";s:3:"†";s:6:"Dagger";s:3:"‡";s:6:"permil";s:3:"‰";s:6:"lsaquo";s:3:"‹";s:6:"rsaquo";s:3:"›";s:4:"euro";s:3:"€";s:4:"fnof";s:2:"ƒ";s:5:"Alpha";s:2:"Α";s:4:"Beta";s:2:"Β";s:5:"Gamma";s:2:"Γ";s:5:"Delta";s:2:"Δ";s:7:"Epsilon";s:2:"Ε";s:4:"Zeta";s:2:"Ζ";s:3:"Eta";s:2:"Η";s:5:"Theta";s:2:"Θ";s:4:"Iota";s:2:"Ι";s:5:"Kappa";s:2:"Κ";s:6:"Lambda";s:2:"Λ";s:2:"Mu";s:2:"Μ";s:2:"Nu";s:2:"Ν";s:2:"Xi";s:2:"Ξ";s:7:"Omicron";s:2:"Ο";s:2:"Pi";s:2:"Π";s:3:"Rho";s:2:"Ρ";s:5:"Sigma";s:2:"Σ";s:3:"Tau";s:2:"Τ";s:7:"Upsilon";s:2:"Υ";s:3:"Phi";s:2:"Φ";s:3:"Chi";s:2:"Χ";s:3:"Psi";s:2:"Ψ";s:5:"Omega";s:2:"Ω";s:5:"alpha";s:2:"α";s:4:"beta";s:2:"β";s:5:"gamma";s:2:"γ";s:5:"delta";s:2:"δ";s:7:"epsilon";s:2:"ε";s:4:"zeta";s:2:"ζ";s:3:"eta";s:2:"η";s:5:"theta";s:2:"θ";s:4:"iota";s:2:"ι";s:5:"kappa";s:2:"κ";s:6:"lambda";s:2:"λ";s:2:"mu";s:2:"μ";s:2:"nu";s:2:"ν";s:2:"xi";s:2:"ξ";s:7:"omicron";s:2:"ο";s:2:"pi";s:2:"π";s:3:"rho";s:2:"ρ";s:6:"sigmaf";s:2:"ς";s:5:"sigma";s:2:"σ";s:3:"tau";s:2:"τ";s:7:"upsilon";s:2:"υ";s:3:"phi";s:2:"φ";s:3:"chi";s:2:"χ";s:3:"psi";s:2:"ψ";s:5:"omega";s:2:"ω";s:8:"thetasym";s:2:"ϑ";s:5:"upsih";s:2:"ϒ";s:3:"piv";s:2:"ϖ";s:4:"bull";s:3:"•";s:6:"hellip";s:3:"…";s:5:"prime";s:3:"′";s:5:"Prime";s:3:"″";s:5:"oline";s:3:"‾";s:5:"frasl";s:3:"⁄";s:6:"weierp";s:3:"℘";s:5:"image";s:3:"ℑ";s:4:"real";s:3:"ℜ";s:5:"trade";s:3:"™";s:7:"alefsym";s:3:"ℵ";s:4:"larr";s:3:"←";s:4:"uarr";s:3:"↑";s:4:"rarr";s:3:"→";s:4:"darr";s:3:"↓";s:4:"harr";s:3:"↔";s:5:"crarr";s:3:"↵";s:4:"lArr";s:3:"⇐";s:4:"uArr";s:3:"⇑";s:4:"rArr";s:3:"⇒";s:4:"dArr";s:3:"⇓";s:4:"hArr";s:3:"⇔";s:6:"forall";s:3:"∀";s:4:"part";s:3:"∂";s:5:"exist";s:3:"∃";s:5:"empty";s:3:"∅";s:5:"nabla";s:3:"∇";s:4:"isin";s:3:"∈";s:5:"notin";s:3:"∉";s:2:"ni";s:3:"∋";s:4:"prod";s:3:"∏";s:3:"sum";s:3:"∑";s:5:"minus";s:3:"−";s:6:"lowast";s:3:"∗";s:5:"radic";s:3:"√";s:4:"prop";s:3:"∝";s:5:"infin";s:3:"∞";s:3:"ang";s:3:"∠";s:3:"and";s:3:"∧";s:2:"or";s:3:"∨";s:3:"cap";s:3:"∩";s:3:"cup";s:3:"∪";s:3:"int";s:3:"∫";s:3:"sim";s:3:"∼";s:4:"cong";s:3:"≅";s:5:"asymp";s:3:"≈";s:2:"ne";s:3:"≠";s:5:"equiv";s:3:"≡";s:2:"le";s:3:"≤";s:2:"ge";s:3:"≥";s:3:"sub";s:3:"⊂";s:3:"sup";s:3:"⊃";s:4:"nsub";s:3:"⊄";s:4:"sube";s:3:"⊆";s:4:"supe";s:3:"⊇";s:5:"oplus";s:3:"⊕";s:6:"otimes";s:3:"⊗";s:4:"perp";s:3:"⊥";s:4:"sdot";s:3:"⋅";s:5:"lceil";s:3:"⌈";s:5:"rceil";s:3:"⌉";s:6:"lfloor";s:3:"⌊";s:6:"rfloor";s:3:"⌋";s:4:"lang";s:3:"〈";s:4:"rang";s:3:"〉";s:3:"loz";s:3:"◊";s:6:"spades";s:3:"♠";s:5:"clubs";s:3:"♣";s:6:"hearts";s:3:"♥";s:5:"diams";s:3:"♦";}
\ No newline at end of file
diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php
index 34c489ed..a079df82 100644
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@@ -85,6 +85,68 @@ class HTMLPurifier_Lexer
return $lexer;
}
+
+ /**
+ * Callback regex string for parsing entities.
+ * @protected
+ */
+ var $_substituteEntitiesRegex =
+ // 1. hex 2. dec 3. string
+ '/&[#](?:x([a-fA-F0-9]+)|0*(\d+)|([A-Za-z]+));?/';
+
+ /**
+ * Substitutes non-special entities with their parsed equivalents. Since
+ * running this whenever you have parsed character is t3h 5uck, we run
+ * it before everything else.
+ *
+ * @protected
+ * @param $string String to have non-special entities parsed.
+ * @returns Parsed string.
+ */
+ function substituteNonSpecialEntities($string) {
+ // it will try to detect missing semicolons, but don't rely on it
+ return preg_replace_callback(
+ $this->_substituteEntitiesRegex,
+ array('HTMLPurifier_Lexer_DirectLex', 'nonSpecialEntityCallback'),
+ $string);
+ }
+
+ /**
+ * Callback function for substituteNonSpecialEntities() that does the work.
+ *
+ * @warning Though this is public in order to let the callback happen,
+ * calling it directly is not recommended.
+ * @param $matches PCRE-style matches array, with 0 the entire match, and
+ * either index 1, 2 or 3 set with a hex value, dec value,
+ * or string (respectively).
+ * @returns Replacement string.
+ * @todo Implement string translations
+ */
+ function nonSpecialEntityCallback($matches) {
+ // replaces all but big five
+ $entity = $matches[0];
+ $is_num = (@$matches[0][1] === '#');
+ if ($is_num) {
+ $is_hex = (@$entity[2] === 'x');
+ $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
+ if (isset($this->_special_dec2str[$int])) return $entity;
+ return chr($int);
+ } else {
+ if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
+ if (!$this->_entity_lookup) {
+ require_once 'HTMLPurifier/EntityLookup.php';
+ $this->_entity_lookup = EntityLookup::instance();
+ }
+ if (isset($this->_entity_lookup->table[$matches[3]])) {
+ return $this->_entity_lookup->table[$matches[3]];
+ } else {
+ return $entity;
+ }
+ }
+ }
+
+ var $_entity_lookup;
+
}
?>
\ No newline at end of file
diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php
index 91706370..587f1928 100644
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@@ -11,7 +11,6 @@ require_once 'HTMLPurifier/Lexer.php';
* pales in comparison to HTMLPurifier_Lexer_DOMLex. It will support UTF-8
* completely eventually.
*
- * @todo Implement non-special string entity conversion.
* @todo Reread XML spec and document differences.
* @todo Add support for CDATA sections.
* @todo Determine correct behavior in outputting comment data. (preserve dashes?)
@@ -99,56 +98,6 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
''' => "'",
);
- /**
- * Callback regex string for parsing entities.
- * @protected
- */
- var $_substituteEntitiesRegex =
- // 1. hex 2. dec 3. string
- '/&[#](?:x([a-fA-F0-9]+)|0*(\d+)|([A-Za-z]+));?/';
-
- /**
- * Substitutes non-special entities with their parsed equivalents.
- *
- * @protected
- * @param $string String to have non-special entities parsed.
- * @returns Parsed string.
- */
- function substituteNonSpecialEntities($string) {
- // it will try to detect missing semicolons, but don't rely on it
- return preg_replace_callback(
- $this->_substituteEntitiesRegex,
- array('HTMLPurifier_Lexer_DirectLex', 'nonSpecialEntityCallback'),
- $string);
- }
-
- /**
- * Callback function for substituteNonSpecialEntities() that does the work.
- *
- * @warning Though this is public in order to let the callback happen,
- * calling it directly is not recommended.
- * @param $matches PCRE-style matches array, with 0 the entire match, and
- * either index 1, 2 or 3 set with a hex value, dec value,
- * or string (respectively).
- * @returns Replacement string.
- * @todo Implement string translations
- */
- function nonSpecialEntityCallback($matches) {
- // replaces all but big five
- $entity = $matches[0];
- $is_num = (@$matches[0][1] === '#');
- if ($is_num) {
- $is_hex = (@$entity[2] === 'x');
- $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
- if (isset($this->_special_dec2str[$int])) return $entity;
- return chr($int);
- } else {
- if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
- // translate $matches[3]
- return '';
- }
- }
-
/**
* Substitutes only special entities with their parsed equivalents.
*
diff --git a/maintenance/.htaccess b/maintenance/.htaccess
new file mode 100644
index 00000000..03688ee9
--- /dev/null
+++ b/maintenance/.htaccess
@@ -0,0 +1 @@
+Deny from all
diff --git a/maintenance/generate-entity-file.php b/maintenance/generate-entity-file.php
new file mode 100644
index 00000000..b46586b5
--- /dev/null
+++ b/maintenance/generate-entity-file.php
@@ -0,0 +1,68 @@
+#!/usr/bin/php
+/';
+
+foreach ( $entity_files as $file ) {
+ $contents = file_get_contents($entity_dir . $file);
+ $matches = array();
+ preg_match_all($regexp, $contents, $matches, PREG_SET_ORDER);
+ foreach ($matches as $match) {
+ $entity_table[$match[1]] = unichr($match[2]);
+ }
+}
+
+$output = serialize($entity_table);
+
+$fh = fopen($output_file, 'w');
+fwrite($fh, $output);
+fclose($fh);
+
+echo "Completed successfully.";
+
+?>
\ No newline at end of file
diff --git a/tests/HTMLPurifier/EntityLookupTest.php b/tests/HTMLPurifier/EntityLookupTest.php
new file mode 100644
index 00000000..c490d55a
--- /dev/null
+++ b/tests/HTMLPurifier/EntityLookupTest.php
@@ -0,0 +1,29 @@
+assertIdentical('â', $lookup->table['acirc']);
+
+ // special char
+ $this->assertIdentical('"', $lookup->table['quot']);
+ $this->assertIdentical('“', $lookup->table['ldquo']);
+ $this->assertIdentical('<', $lookup->table['lt']); //expressed strangely
+
+ // symbol char
+ $this->assertIdentical('θ', $lookup->table['theta']);
+
+ }
+
+}
+
+?>
\ No newline at end of file
diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php
index 2693b825..0f988a6b 100644
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@@ -7,6 +7,7 @@ class HTMLPurifier_LexerTest extends UnitTestCase
{
var $DirectLex, $PEARSax3, $DOMLex;
+ var $_entity_lookup;
var $_has_dom;
function setUp() {
@@ -14,12 +15,13 @@ class HTMLPurifier_LexerTest extends UnitTestCase
$this->PEARSax3 = new HTMLPurifier_Lexer_PEARSax3();
$this->_has_dom = version_compare(PHP_VERSION, '5', '>=');
-
if ($this->_has_dom) {
require_once 'HTMLPurifier/Lexer/DOMLex.php';
$this->DOMLex = new HTMLPurifier_Lexer_DOMLex();
}
+ $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
+
}
function test_tokenizeHTML() {
@@ -152,8 +154,12 @@ class HTMLPurifier_LexerTest extends UnitTestCase
// compare with this valid one:
$input[12] = '"';
$expect[12] = array( new HTMLPurifier_Token_Text('"') );
- $sax_expect[12] = false;
- // SAX chokes on this? We do have entity parsing on, so it should work!
+ $sax_expect[12] = false; // choked!
+
+ // DOM and SAX choke on this
+ //$char_circ = $this->_entity_lookup->table['circ'];
+ //$input[13] = 'ˆ';
+ //$expect[13] = array( new HTMLPurifier_Token_Text($char_circ) );
foreach($input as $i => $discard) {
$result = $this->DirectLex->tokenizeHTML($input[$i]);
diff --git a/tests/index.php b/tests/index.php
index 593023ec..2dcd7694 100644
--- a/tests/index.php
+++ b/tests/index.php
@@ -19,6 +19,7 @@ $test->addTestFile('HTMLPurifier/Lexer/DirectLexTest.php');
$test->addTestFile('HTMLPurifier/DefinitionTest.php');
$test->addTestFile('HTMLPurifier/ChildDefTest.php');
$test->addTestFile('HTMLPurifier/GeneratorTest.php');
+$test->addTestFile('HTMLPurifier/EntityLookupTest.php');
$test->run( new HtmlReporter() );