Implement EntityLookup and put in the Lexer. Some behavior was migrated, since it looks like it will have to be used in all Lexers, not just DirectLex (which is the only one that uses it).

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@105 48356398-32a2-884e-a903-53898d9a118a
2024-12-22 16:31:53 +00:00 · 2006-07-23 21:07:30 +00:00 · 2006-07-23 21:07:30 +00:00 · 5ce0ae7056
commit 5ce0ae7056
parent 7268987846
9 changed files with 196 additions and 54 deletions
--- a/library/HTMLPurifier/EntityLookup.php
+++ b/library/HTMLPurifier/EntityLookup.php
@ -0,0 +1,25 @@
+<?php
+
+class HTMLPurifier_EntityLookup {
+    
+    var $table;
+    
+    function HTMLPurifier_EntityLookup($file = false) {
+        if (!$file) {
+            $file = dirname(__FILE__) . '/EntityLookup/data.txt';
+        }
+        $this->table = unserialize(file_get_contents($file));
+    }
+    
+    function instance() {
+        // no references, since PHP doesn't copy unless modified
+        static $instance = null;
+        if (!$instance) {
+            $instance = new HTMLPurifier_EntityLookup();
+        }
+        return $instance;
+    }
+    
+}
+
+?>
--- a/library/HTMLPurifier/EntityLookup/data.txt
+++ b/library/HTMLPurifier/EntityLookup/data.txt
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@ -85,6 +85,68 @@ class HTMLPurifier_Lexer
        return $lexer;
    }
    
+    
+    /**
+     * Callback regex string for parsing entities.
+     * @protected
+     */
+    var $_substituteEntitiesRegex =
+        //       1. hex          2. dec  3. string
+        '/&[#](?:x([a-fA-F0-9]+)|0*(\d+)|([A-Za-z]+));?/';
+    
+    /**
+     * Substitutes non-special entities with their parsed equivalents. Since
+     * running this whenever you have parsed character is t3h 5uck, we run
+     * it before everything else.
+     * 
+     * @protected
+     * @param $string String to have non-special entities parsed.
+     * @returns Parsed string.
+     */
+    function substituteNonSpecialEntities($string) {
+        // it will try to detect missing semicolons, but don't rely on it
+        return preg_replace_callback(
+            $this->_substituteEntitiesRegex,
+            array('HTMLPurifier_Lexer_DirectLex', 'nonSpecialEntityCallback'),
+            $string);
+    }
+    
+    /**
+     * Callback function for substituteNonSpecialEntities() that does the work.
+     * 
+     * @warning Though this is public in order to let the callback happen,
+     *          calling it directly is not recommended.
+     * @param $matches  PCRE-style matches array, with 0 the entire match, and
+     *                  either index 1, 2 or 3 set with a hex value, dec value,
+     *                  or string (respectively).
+     * @returns Replacement string.
+     * @todo Implement string translations
+     */
+    function nonSpecialEntityCallback($matches) {
+        // replaces all but big five
+        $entity = $matches[0];
+        $is_num = (@$matches[0][1] === '#');
+        if ($is_num) {
+            $is_hex = (@$entity[2] === 'x');
+            $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
+            if (isset($this->_special_dec2str[$int]))  return $entity;
+            return chr($int);
+        } else {
+            if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
+            if (!$this->_entity_lookup) {
+                require_once 'HTMLPurifier/EntityLookup.php';
+                $this->_entity_lookup = EntityLookup::instance();
+            }
+            if (isset($this->_entity_lookup->table[$matches[3]])) {
+                return $this->_entity_lookup->table[$matches[3]];
+            } else {
+                return $entity;
+            }
+        }
+    }
+    
+    var $_entity_lookup;
+    
 }

 ?>
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@ -11,7 +11,6 @@ require_once 'HTMLPurifier/Lexer.php';
 * pales in comparison to HTMLPurifier_Lexer_DOMLex.  It will support UTF-8
 * completely eventually.
 * 
- * @todo Implement non-special string entity conversion.
 * @todo Reread XML spec and document differences.
 * @todo Add support for CDATA sections.
 * @todo Determine correct behavior in outputting comment data. (preserve dashes?)
@ -99,56 +98,6 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
            '&#x27;' => "'",
        );
    
-    /**
-     * Callback regex string for parsing entities.
-     * @protected
-     */
-    var $_substituteEntitiesRegex =
-        //       1. hex          2. dec  3. string
-        '/&[#](?:x([a-fA-F0-9]+)|0*(\d+)|([A-Za-z]+));?/';
-    
-    /**
-     * Substitutes non-special entities with their parsed equivalents.
-     * 
-     * @protected
-     * @param $string String to have non-special entities parsed.
-     * @returns Parsed string.
-     */
-    function substituteNonSpecialEntities($string) {
-        // it will try to detect missing semicolons, but don't rely on it
-        return preg_replace_callback(
-            $this->_substituteEntitiesRegex,
-            array('HTMLPurifier_Lexer_DirectLex', 'nonSpecialEntityCallback'),
-            $string);
-    }
-    
-    /**
-     * Callback function for substituteNonSpecialEntities() that does the work.
-     * 
-     * @warning Though this is public in order to let the callback happen,
-     *          calling it directly is not recommended.
-     * @param $matches  PCRE-style matches array, with 0 the entire match, and
-     *                  either index 1, 2 or 3 set with a hex value, dec value,
-     *                  or string (respectively).
-     * @returns Replacement string.
-     * @todo Implement string translations
-     */
-    function nonSpecialEntityCallback($matches) {
-        // replaces all but big five
-        $entity = $matches[0];
-        $is_num = (@$matches[0][1] === '#');
-        if ($is_num) {
-            $is_hex = (@$entity[2] === 'x');
-            $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
-            if (isset($this->_special_dec2str[$int]))  return $entity;
-            return chr($int);
-        } else {
-            if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
-            // translate $matches[3]
-            return '';
-        }
-    }
-    
    /**
     * Substitutes only special entities with their parsed equivalents.
     * 
--- a/maintenance/.htaccess
+++ b/maintenance/.htaccess
@ -0,0 +1 @@
+Deny from all
--- a/maintenance/generate-entity-file.php
+++ b/maintenance/generate-entity-file.php
@ -0,0 +1,68 @@
+#!/usr/bin/php
+<?php
+
+/**
+ * Parses *.ent files into an entity lookup table, and then serializes and
+ * writes the whole kaboodle to a file. The resulting file should be versioned.
+ */
+
+chdir( dirname(__FILE__) );
+
+// here's where the entity files are located, assuming working directory
+// is the same as the location of this PHP file. Needs trailing slash.
+$entity_dir = '../docs/entities/';
+
+// defines the output file for the serialized content.
+$output_file = '../library/HTMLPurifier/EntityLookup/data.txt';
+
+function unichr($dec) {
+    if ($dec < 128) {
+        $utf  = chr($dec);
+    } else if ($dec < 2048) {
+        $utf  = chr(192 + (($dec - ($dec % 64)) / 64));
+        $utf .= chr(128 + ($dec % 64));
+    } else {
+        $utf  = chr(224 + (($dec - ($dec % 4096)) / 4096));
+        $utf .= chr(128 + ((($dec % 4096) - ($dec % 64)) / 64));
+        $utf .= chr(128 + ($dec % 64));
+    }
+    return $utf;
+}
+
+if ( !is_dir($entity_dir) ) exit("Fatal Error: Can't find entity directory.\n");
+if ( file_exists($output_file) ) exit("Fatal Error: entity-lookup.txt already exists.\n");
+
+$dh = @opendir($entity_dir);
+if ( !$dh ) exit("Fatal Error: Cannot read entity directory.\n");
+
+$entity_files = array();
+while (($file = readdir($dh)) !== false) {
+    if (@$file[0] === '.') continue;
+    if (substr(strrchr($file, "."), 1) !== 'ent') continue;
+    $entity_files[] = $file;
+}
+closedir($dh);
+
+if ( !$entity_files ) exit("Fatal Error: No entity files to parse.\n");
+
+$entity_table = array();
+$regexp = '/<!ENTITY\s+([A-Za-z]+)\s+"&#(?:38;#)?([0-9]+);">/';
+
+foreach ( $entity_files as $file ) {
+    $contents = file_get_contents($entity_dir . $file);
+    $matches = array();
+    preg_match_all($regexp, $contents, $matches, PREG_SET_ORDER);
+    foreach ($matches as $match) {
+        $entity_table[$match[1]] = unichr($match[2]);
+    }
+}
+
+$output = serialize($entity_table);
+
+$fh = fopen($output_file, 'w');
+fwrite($fh, $output);
+fclose($fh);
+
+echo "Completed successfully.";
+
+?>
--- a/tests/HTMLPurifier/EntityLookupTest.php
+++ b/tests/HTMLPurifier/EntityLookupTest.php
@ -0,0 +1,29 @@
+<?php
+
+// this page is UTF-8 encoded!
+
+require_once 'HTMLPurifier/EntityLookup.php';
+
+class HTMLPurifier_EntityLookupTest extends UnitTestCase
+{
+    
+    function test() {
+        
+        $lookup = HTMLPurifier_EntityLookup::instance();
+        
+        // latin char
+        $this->assertIdentical('â', $lookup->table['acirc']);
+        
+        // special char
+        $this->assertIdentical('"', $lookup->table['quot']);
+        $this->assertIdentical('“', $lookup->table['ldquo']);
+        $this->assertIdentical('<', $lookup->table['lt']); //expressed strangely
+        
+        // symbol char
+        $this->assertIdentical('θ', $lookup->table['theta']);
+        
+    }
+    
+}
+
+?>
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@ -7,6 +7,7 @@ class HTMLPurifier_LexerTest extends UnitTestCase
 {
    
    var $DirectLex, $PEARSax3, $DOMLex;
+    var $_entity_lookup;
    var $_has_dom;
    
    function setUp() {
@ -14,12 +15,13 @@ class HTMLPurifier_LexerTest extends UnitTestCase
        $this->PEARSax3  = new HTMLPurifier_Lexer_PEARSax3();
        
        $this->_has_dom = version_compare(PHP_VERSION, '5', '>=');
-        
        if ($this->_has_dom) {
            require_once 'HTMLPurifier/Lexer/DOMLex.php';
            $this->DOMLex    = new HTMLPurifier_Lexer_DOMLex();
        }
        
+        $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
+        
    }
    
    function test_tokenizeHTML() {
@ -152,8 +154,12 @@ class HTMLPurifier_LexerTest extends UnitTestCase
        // compare with this valid one:
        $input[12] = '&quot;';
        $expect[12] = array( new HTMLPurifier_Token_Text('"') );
-        $sax_expect[12] = false;
-        // SAX chokes on this? We do have entity parsing on, so it should work!
+        $sax_expect[12] = false; // choked!
+        
+        // DOM and SAX choke on this
+        //$char_circ = $this->_entity_lookup->table['circ'];
+        //$input[13] = '&circ;';
+        //$expect[13] = array( new HTMLPurifier_Token_Text($char_circ) );
        
        foreach($input as $i => $discard) {
            $result = $this->DirectLex->tokenizeHTML($input[$i]);
--- a/tests/index.php
+++ b/tests/index.php
@ -19,6 +19,7 @@ $test->addTestFile('HTMLPurifier/Lexer/DirectLexTest.php');
 $test->addTestFile('HTMLPurifier/DefinitionTest.php');
 $test->addTestFile('HTMLPurifier/ChildDefTest.php');
 $test->addTestFile('HTMLPurifier/GeneratorTest.php');
+$test->addTestFile('HTMLPurifier/EntityLookupTest.php');

 $test->run( new HtmlReporter() );