0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-12-22 16:31:53 +00:00

Implement EntityLookup and put in the Lexer. Some behavior was migrated, since it looks like it will have to be used in all Lexers, not just DirectLex (which is the only one that uses it).

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@105 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2006-07-23 21:07:30 +00:00
parent 7268987846
commit 5ce0ae7056
9 changed files with 196 additions and 54 deletions

View File

@ -0,0 +1,25 @@
<?php
/**
 * Holds the table that maps entity names (for example 'acirc', without the
 * leading '&' or trailing ';') to their replacement strings.
 *
 * The table is read from a file containing a serialize()'d array; by default
 * that is EntityLookup/data.txt next to this file.
 */
class HTMLPurifier_EntityLookup {

    /**
     * Associative array of entity name => replacement string.
     * Always an array: it is empty when the data file could not be loaded.
     */
    var $table;

    /**
     * Loads the lookup table.
     *
     * @param $file Path of the serialized table; false selects the default
     *              EntityLookup/data.txt located beside this file.
     */
    function HTMLPurifier_EntityLookup($file = false) {
        if (!$file) {
            $file = dirname(__FILE__) . '/EntityLookup/data.txt';
        }
        $table = false;
        $raw = file_get_contents($file);
        if ($raw !== false) {
            $table = unserialize($raw);
        }
        if (!is_array($table)) {
            // A missing or corrupt data file used to leave $table === false;
            // report it and fall back to an empty table so that lookups
            // simply find nothing.
            trigger_error(
                'Entity lookup table could not be loaded from ' . $file,
                E_USER_WARNING
            );
            $table = array();
        }
        $this->table = $table;
    }

    /**
     * Returns a shared instance built from the default data file, creating
     * it on the first call.
     *
     * @returns HTMLPurifier_EntityLookup object.
     */
    function instance() {
        // no references, since PHP doesn't copy unless modified
        static $instance = null;
        if (!$instance) {
            $instance = new HTMLPurifier_EntityLookup();
        }
        return $instance;
    }

}
?>

File diff suppressed because one or more lines are too long

View File

@ -85,6 +85,68 @@ class HTMLPurifier_Lexer
return $lexer;
}
/**
 * Regex used by substituteNonSpecialEntities() to find entity references.
 *
 * Capture groups:
 *   1. hexadecimal digits of a &#x...; reference
 *   2. decimal digits of a &#...; reference (leading zeroes are skipped)
 *   3. name of a named reference such as &eacute;
 * The trailing semicolon is optional. The '#' is part of the two numeric
 * alternatives only; when it was required in front of all three, a named
 * reference could never match.
 * @protected
 */
var $_substituteEntitiesRegex =
//         1. hex              2. dec      3. string
    '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z]+));?/';

/**
 * Substitutes non-special entities with their parsed equivalents. This is
 * run once over the whole string before everything else, so that it does
 * not have to be repeated for every piece of parsed character data.
 *
 * @protected
 * @param $string String to have non-special entities parsed.
 * @returns Parsed string.
 */
function substituteNonSpecialEntities($string) {
    // it will try to detect missing semicolons, but don't rely on it
    return preg_replace_callback(
        $this->_substituteEntitiesRegex,
        // Bind to the current object: the callback reads $this, and this
        // method belongs to the base class rather than to one subclass.
        array($this, 'nonSpecialEntityCallback'),
        $string
    );
}

/**
 * Callback function for substituteNonSpecialEntities() that does the work.
 *
 * Numeric references are returned unchanged when their code point is a key
 * of $this->_special_dec2str and converted with chr() otherwise. Named
 * references are returned unchanged when the name is a key of
 * $this->_special_ent2dec; any other name is translated through the
 * HTMLPurifier_EntityLookup table, and left unchanged if the table does
 * not know it.
 *
 * NOTE(review): $_special_dec2str and $_special_ent2dec are not declared in
 * the part of the class shown here; presumably they are supplied elsewhere
 * in this class or by the concrete lexer. Confirm.
 *
 * @warning Though this is public in order to let the callback happen,
 *          calling it directly is not recommended.
 * @param $matches PCRE-style matches array, with 0 the entire match, and
 *                 either index 1, 2 or 3 set with a hex value, dec value,
 *                 or string (respectively).
 * @returns Replacement string.
 */
function nonSpecialEntityCallback($matches) {
    // replaces all but big five
    $entity = $matches[0];
    // Every match is at least two characters long ('&' plus one more).
    if ($entity[1] === '#') {
        // A numeric match always has a third character: 'x' or a digit.
        $is_hex = ($entity[2] === 'x');
        $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
        if (isset($this->_special_dec2str[$int])) return $entity;
        // NOTE(review): chr() yields a single byte, so code points above
        // 255 wrap around instead of being encoded as multi-byte text.
        return chr($int);
    }
    $name = $matches[3];
    if (isset($this->_special_ent2dec[$name])) return $entity;
    if (!$this->_entity_lookup) {
        // Load the table only once a named entity is actually seen.
        require_once 'HTMLPurifier/EntityLookup.php';
        $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
    }
    if (isset($this->_entity_lookup->table[$name])) {
        return $this->_entity_lookup->table[$name];
    }
    // unknown name: leave the text exactly as it was written
    return $entity;
}

/**
 * HTMLPurifier_EntityLookup instance used by nonSpecialEntityCallback();
 * stays unset until the first named entity is looked up.
 * @protected
 */
var $_entity_lookup;
}
?>

View File

@ -11,7 +11,6 @@ require_once 'HTMLPurifier/Lexer.php';
* pales in comparison to HTMLPurifier_Lexer_DOMLex. It will support UTF-8
* completely eventually.
*
* @todo Implement non-special string entity conversion.
* @todo Reread XML spec and document differences.
* @todo Add support for CDATA sections.
* @todo Determine correct behavior in outputting comment data. (preserve dashes?)
@ -99,56 +98,6 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
'&#x27;' => "'",
);
/**
 * Regex handed to preg_replace_callback() by substituteNonSpecialEntities()
 * to locate entity references. The trailing semicolon is optional.
 *
 * Note that the '#' is required in front of all three alternatives, so the
 * third group only captures letters that directly follow "&#".
 * @protected
 */
var $_substituteEntitiesRegex =
// capture groups: 1. hex digits   2. decimal digits   3. letter string
'/&[#](?:x([a-fA-F0-9]+)|0*(\d+)|([A-Za-z]+));?/';
/**
 * Runs every entity reference found by $_substituteEntitiesRegex through
 * nonSpecialEntityCallback() and returns the rewritten string.
 *
 * @protected
 * @param $string String to have non-special entities parsed.
 * @returns Parsed string.
 */
function substituteNonSpecialEntities($string) {
    // the pattern tolerates a missing semicolon, but don't rely on it
    $pattern  = $this->_substituteEntitiesRegex;
    $callback = array('HTMLPurifier_Lexer_DirectLex', 'nonSpecialEntityCallback');
    return preg_replace_callback($pattern, $callback, $string);
}
/**
 * Replacement callback used by substituteNonSpecialEntities().
 *
 * A numeric reference is returned untouched when its code point is a key of
 * $this->_special_dec2str and is converted with chr() otherwise. A
 * non-numeric match is returned untouched when its name is a key of
 * $this->_special_ent2dec and is replaced by an empty string otherwise.
 *
 * @warning Public only so that it can be used as a callback; calling it
 *          directly is not recommended.
 * @param $matches PCRE-style matches array: 0 is the entire match, and
 *                 index 1, 2 or 3 carries the hex value, dec value or
 *                 string (respectively).
 * @returns Replacement string.
 * @todo Implement string translations
 */
function nonSpecialEntityCallback($matches) {
    // everything except the big five gets replaced
    $whole = $matches[0];
    if (@$whole[1] !== '#') {
        if (isset($this->_special_ent2dec[$matches[3]])) return $whole;
        // translate $matches[3]
        return '';
    }
    if (@$whole[2] === 'x') {
        $code = hexdec($matches[1]);
    } else {
        $code = (int) $matches[2];
    }
    if (isset($this->_special_dec2str[$code])) return $whole;
    return chr($code);
}
/**
* Substitutes only special entities with their parsed equivalents.
*

1
maintenance/.htaccess Normal file
View File

@ -0,0 +1 @@
Deny from all

View File

@ -0,0 +1,68 @@
#!/usr/bin/php
<?php
/**
* Parses *.ent files into an entity lookup table, and then serializes and
* writes the whole kaboodle to a file. The resulting file should be versioned.
*/
// Work from the directory that contains this script so that the relative
// paths below resolve no matter where the script was started from.
chdir( dirname(__FILE__) );
// Directory holding the *.ent entity files, relative to this script's
// directory. Needs a trailing slash because file names are appended to it.
$entity_dir = '../docs/entities/';
// File that receives the serialized lookup table.
$output_file = '../library/HTMLPurifier/EntityLookup/data.txt';
/**
 * Encodes a Unicode code point as a UTF-8 byte string.
 *
 * Produces one to four bytes. Code points of 0x10000 and above get the
 * four-byte form; previously they fell into the three-byte branch and came
 * out as invalid bytes.
 *
 * @param $dec Code point as an integer or a numeric string.
 * @returns UTF-8 encoded character.
 */
function unichr($dec) {
    $dec = (int) $dec;
    if ($dec < 0x80) {
        // plain ASCII, one byte
        return chr($dec);
    }
    if ($dec < 0x800) {
        // two bytes: 110xxxxx 10xxxxxx
        return chr(0xC0 | ($dec >> 6))
             . chr(0x80 | ($dec & 0x3F));
    }
    if ($dec < 0x10000) {
        // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
        return chr(0xE0 | ($dec >> 12))
             . chr(0x80 | (($dec >> 6) & 0x3F))
             . chr(0x80 | ($dec & 0x3F));
    }
    // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return chr(0xF0 | ($dec >> 18))
         . chr(0x80 | (($dec >> 12) & 0x3F))
         . chr(0x80 | (($dec >> 6) & 0x3F))
         . chr(0x80 | ($dec & 0x3F));
}
// Refuse to run without the input directory, and never overwrite an
// existing table: the output file has to be removed by hand first.
if ( !is_dir($entity_dir) ) exit("Fatal Error: Can't find entity directory.\n");
if ( file_exists($output_file) ) {
    exit("Fatal Error: " . basename($output_file) . " already exists.\n");
}

$dh = @opendir($entity_dir);
if ( !$dh ) exit("Fatal Error: Cannot read entity directory.\n");

// Collect the *.ent files, skipping dot files.
$entity_files = array();
while (($file = readdir($dh)) !== false) {
    if (@$file[0] === '.') continue;
    if (substr(strrchr($file, "."), 1) !== 'ent') continue;
    $entity_files[] = $file;
}
closedir($dh);

if ( !$entity_files ) exit("Fatal Error: No entity files to parse.\n");

// readdir() returns entries in no particular order; sort them so that the
// generated (and versioned) file does not depend on the filesystem.
sort($entity_files);

$entity_table = array();
// 1. entity name   2. decimal code point. The name may contain digits after
// its first letter (sup2, frac12, there4). The optional "38;#" accepts
// declarations written in the double-escaped form "&#38;#60;".
$regexp = '/<!ENTITY\s+([A-Za-z][A-Za-z0-9]*)\s+"&#(?:38;#)?([0-9]+);">/';

foreach ( $entity_files as $file ) {
    $contents = file_get_contents($entity_dir . $file);
    if ( $contents === false ) {
        exit("Fatal Error: Cannot read entity file $file.\n");
    }
    $matches = array();
    preg_match_all($regexp, $contents, $matches, PREG_SET_ORDER);
    foreach ($matches as $match) {
        $entity_table[$match[1]] = unichr((int) $match[2]);
    }
}

$output = serialize($entity_table);

$fh = fopen($output_file, 'w');
if ( !$fh ) exit("Fatal Error: Cannot open output file for writing.\n");
$written = fwrite($fh, $output);
fclose($fh);
if ( $written === false ) exit("Fatal Error: Cannot write to output file.\n");

echo "Completed successfully.\n";
?>

View File

@ -0,0 +1,29 @@
<?php
// this page is UTF-8 encoded!
require_once 'HTMLPurifier/EntityLookup.php';
/**
 * Spot-checks a few entries of the default entity lookup table.
 *
 * Expected non-ASCII values are written as explicit UTF-8 byte sequences so
 * that the assertions do not depend on the encoding this file is saved in.
 */
class HTMLPurifier_EntityLookupTest extends UnitTestCase
{

    function test() {

        $lookup = HTMLPurifier_EntityLookup::instance();

        // latin char: U+00E2, a with circumflex
        $this->assertIdentical("\xC3\xA2", $lookup->table['acirc']);

        // special char
        $this->assertIdentical('"', $lookup->table['quot']);
        // U+201C, left double quotation mark
        $this->assertIdentical("\xE2\x80\x9C", $lookup->table['ldquo']);
        $this->assertIdentical('<', $lookup->table['lt']); //expressed strangely

        // symbol char: U+03B8, Greek small letter theta
        $this->assertIdentical("\xCE\xB8", $lookup->table['theta']);

    }

}
?>

View File

@ -7,6 +7,7 @@ class HTMLPurifier_LexerTest extends UnitTestCase
{
var $DirectLex, $PEARSax3, $DOMLex;
var $_entity_lookup;
var $_has_dom;
function setUp() {
@ -14,12 +15,13 @@ class HTMLPurifier_LexerTest extends UnitTestCase
$this->PEARSax3 = new HTMLPurifier_Lexer_PEARSax3();
$this->_has_dom = version_compare(PHP_VERSION, '5', '>=');
if ($this->_has_dom) {
require_once 'HTMLPurifier/Lexer/DOMLex.php';
$this->DOMLex = new HTMLPurifier_Lexer_DOMLex();
}
$this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
}
function test_tokenizeHTML() {
@ -152,8 +154,12 @@ class HTMLPurifier_LexerTest extends UnitTestCase
// compare with this valid one:
$input[12] = '&quot;';
$expect[12] = array( new HTMLPurifier_Token_Text('"') );
$sax_expect[12] = false;
// SAX chokes on this? We do have entity parsing on, so it should work!
$sax_expect[12] = false; // choked!
// DOM and SAX choke on this
//$char_circ = $this->_entity_lookup->table['circ'];
//$input[13] = '&circ;';
//$expect[13] = array( new HTMLPurifier_Token_Text($char_circ) );
foreach($input as $i => $discard) {
$result = $this->DirectLex->tokenizeHTML($input[$i]);

View File

@ -19,6 +19,7 @@ $test->addTestFile('HTMLPurifier/Lexer/DirectLexTest.php');
$test->addTestFile('HTMLPurifier/DefinitionTest.php');
$test->addTestFile('HTMLPurifier/ChildDefTest.php');
$test->addTestFile('HTMLPurifier/GeneratorTest.php');
$test->addTestFile('HTMLPurifier/EntityLookupTest.php');
$test->run( new HtmlReporter() );