mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2024-12-23 00:41:52 +00:00
Implement EntityLookup and put in the Lexer. Some behavior was migrated, since it looks like it will have to be used in all Lexers, not just DirectLex (which is the only one that uses it).
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@105 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
parent
7268987846
commit
5ce0ae7056
25
library/HTMLPurifier/EntityLookup.php
Normal file
25
library/HTMLPurifier/EntityLookup.php
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
/**
 * Lookup table for named character entities.
 *
 * The table is read from a file containing a serialized PHP array; by
 * default that file is EntityLookup/data.txt next to this class.
 */
class HTMLPurifier_EntityLookup {

    /**
     * Associative array loaded from the data file, indexed by entity name
     * (without the leading ampersand or trailing semicolon). It is an empty
     * array when the data file could not be loaded.
     */
    var $table;

    /**
     * Loads the lookup table.
     *
     * @param $file Path to the serialized table, or false to use the
     *              bundled EntityLookup/data.txt.
     * @warning The file is passed to unserialize(), so only ever point this
     *          at a trusted file; never at user-supplied data.
     */
    function HTMLPurifier_EntityLookup($file = false) {
        if (!$file) {
            $file = dirname(__FILE__) . '/EntityLookup/data.txt';
        }

        $table = false;
        if (is_readable($file)) {
            $serialized = file_get_contents($file);
            if ($serialized !== false) {
                $table = unserialize($serialized);
            }
        }

        // A missing or corrupt data file used to leave $table === false
        // without any explanation. Report it and fall back to an empty
        // table so isset() lookups keep working on a real array.
        if (!is_array($table)) {
            trigger_error(
                'Cannot load entity lookup table from ' . $file,
                E_USER_WARNING
            );
            $table = array();
        }

        $this->table = $table;
    }

    /**
     * Returns a shared lookup object built from the default data file,
     * creating it on the first call.
     *
     * Meant to be called statically; it is declared without the static
     * keyword, matching the PHP 4-style code in the rest of this class.
     *
     * @returns HTMLPurifier_EntityLookup instance.
     */
    function instance() {
        // no references, since PHP doesn't copy unless modified
        static $instance = null;
        if (!$instance) {
            $instance = new HTMLPurifier_EntityLookup();
        }
        return $instance;
    }

}
|
||||||
|
|
||||||
|
?>
|
1
library/HTMLPurifier/EntityLookup/data.txt
Normal file
1
library/HTMLPurifier/EntityLookup/data.txt
Normal file
File diff suppressed because one or more lines are too long
@ -85,6 +85,68 @@ class HTMLPurifier_Lexer
|
|||||||
return $lexer;
|
return $lexer;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Regular expression used to find entities in a string.
 *
 * Capture groups: 1 holds the digits of a hexadecimal reference
 * (&#x...;), 2 holds the digits of a decimal reference (&#...;) with
 * leading zeroes dropped, 3 holds the name of a named entity (&...;).
 * The trailing semicolon is optional in all three forms.
 *
 * The '#' belongs to the two numeric alternatives only; requiring it in
 * front of every alternative meant named entities could never match.
 *
 * @protected
 */
var $_substituteEntitiesRegex =
//     1. hex                  2. dec        3. string
    '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z]+));?/';

/**
 * Cached HTMLPurifier_EntityLookup object; loaded the first time a named
 * entity has to be translated.
 * @protected
 */
var $_entity_lookup;

/**
 * Substitutes non-special entities with their parsed equivalents. Doing
 * this for every piece of parsed character data would be wasteful, so it
 * is meant to be run once on the whole string before anything else.
 *
 * @protected
 * @param $string String to have non-special entities parsed.
 * @returns Parsed string.
 */
function substituteNonSpecialEntities($string) {
    // it will try to detect missing semicolons, but don't rely on it
    return preg_replace_callback(
        $this->_substituteEntitiesRegex,
        // The callback reads $this, so it must be bound to this object
        // rather than registered statically against one subclass name.
        array(&$this, 'nonSpecialEntityCallback'),
        $string
    );
}

/**
 * Callback function for substituteNonSpecialEntities() that does the work.
 *
 * @warning Though this is public in order to let the callback happen,
 *          calling it directly is not recommended.
 * @param $matches PCRE-style matches array, with 0 the entire match, and
 *                 either index 1, 2 or 3 set with a hex value, dec value,
 *                 or string (respectively).
 * @returns Replacement string; the original entity text is returned
 *          unchanged when the entity is special or unknown.
 */
function nonSpecialEntityCallback($matches) {
    // replaces all but big five
    // NOTE(review): $_special_dec2str and $_special_ent2dec are not
    // declared in the part of the class visible here; presumably the
    // concrete lexer provides them. If they are missing, the isset()
    // checks below are simply false. Confirm against the subclasses.
    $entity = $matches[0];

    // substr() instead of string offsets: no notice on a short string,
    // so no error suppression is needed.
    if (substr($entity, 1, 1) === '#') {
        // numeric reference
        $is_hex = (substr($entity, 2, 1) === 'x');
        $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
        if (isset($this->_special_dec2str[$int])) return $entity;
        // NOTE(review): chr() yields a single byte (the value is taken
        // modulo 256), so references above 255 do not come out as the
        // intended character, and 128-255 come out as one raw byte
        // rather than a UTF-8 sequence. Left as is because changing it
        // alters output for existing callers.
        return chr($int);
    }

    // named entity
    $name = $matches[3];
    if (isset($this->_special_ent2dec[$name])) return $entity;

    if (!$this->_entity_lookup) {
        require_once 'HTMLPurifier/EntityLookup.php';
        $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
    }

    if (isset($this->_entity_lookup->table[$name])) {
        return $this->_entity_lookup->table[$name];
    }

    // unknown name: leave the text exactly as it was written
    return $entity;
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
?>
|
@ -11,7 +11,6 @@ require_once 'HTMLPurifier/Lexer.php';
|
|||||||
* pales in comparison to HTMLPurifier_Lexer_DOMLex. It will support UTF-8
|
* pales in comparison to HTMLPurifier_Lexer_DOMLex. It will support UTF-8
|
||||||
* completely eventually.
|
* completely eventually.
|
||||||
*
|
*
|
||||||
* @todo Implement non-special string entity conversion.
|
|
||||||
* @todo Reread XML spec and document differences.
|
* @todo Reread XML spec and document differences.
|
||||||
* @todo Add support for CDATA sections.
|
* @todo Add support for CDATA sections.
|
||||||
* @todo Determine correct behavior in outputting comment data. (preserve dashes?)
|
* @todo Determine correct behavior in outputting comment data. (preserve dashes?)
|
||||||
@ -99,56 +98,6 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
''' => "'",
|
''' => "'",
|
||||||
);
|
);
|
||||||
|
|
||||||
/**
|
|
||||||
* Callback regex string for parsing entities.
|
|
||||||
* @protected
|
|
||||||
*/
|
|
||||||
var $_substituteEntitiesRegex =
|
|
||||||
// 1. hex 2. dec 3. string
|
|
||||||
'/&[#](?:x([a-fA-F0-9]+)|0*(\d+)|([A-Za-z]+));?/';
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Substitutes non-special entities with their parsed equivalents.
|
|
||||||
*
|
|
||||||
* @protected
|
|
||||||
* @param $string String to have non-special entities parsed.
|
|
||||||
* @returns Parsed string.
|
|
||||||
*/
|
|
||||||
function substituteNonSpecialEntities($string) {
|
|
||||||
// it will try to detect missing semicolons, but don't rely on it
|
|
||||||
return preg_replace_callback(
|
|
||||||
$this->_substituteEntitiesRegex,
|
|
||||||
array('HTMLPurifier_Lexer_DirectLex', 'nonSpecialEntityCallback'),
|
|
||||||
$string);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Callback function for substituteNonSpecialEntities() that does the work.
|
|
||||||
*
|
|
||||||
* @warning Though this is public in order to let the callback happen,
|
|
||||||
* calling it directly is not recommended.
|
|
||||||
* @param $matches PCRE-style matches array, with 0 the entire match, and
|
|
||||||
* either index 1, 2 or 3 set with a hex value, dec value,
|
|
||||||
* or string (respectively).
|
|
||||||
* @returns Replacement string.
|
|
||||||
* @todo Implement string translations
|
|
||||||
*/
|
|
||||||
function nonSpecialEntityCallback($matches) {
|
|
||||||
// replaces all but big five
|
|
||||||
$entity = $matches[0];
|
|
||||||
$is_num = (@$matches[0][1] === '#');
|
|
||||||
if ($is_num) {
|
|
||||||
$is_hex = (@$entity[2] === 'x');
|
|
||||||
$int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
|
|
||||||
if (isset($this->_special_dec2str[$int])) return $entity;
|
|
||||||
return chr($int);
|
|
||||||
} else {
|
|
||||||
if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
|
|
||||||
// translate $matches[3]
|
|
||||||
return '';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Substitutes only special entities with their parsed equivalents.
|
* Substitutes only special entities with their parsed equivalents.
|
||||||
*
|
*
|
||||||
|
1
maintenance/.htaccess
Normal file
1
maintenance/.htaccess
Normal file
@ -0,0 +1 @@
|
|||||||
|
Deny from all
|
68
maintenance/generate-entity-file.php
Normal file
68
maintenance/generate-entity-file.php
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
#!/usr/bin/php
|
||||||
|
<?php
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parses *.ent files into an entity lookup table, and then serializes and
|
||||||
|
* writes the whole kaboodle to a file. The resulting file should be versioned.
|
||||||
|
*/
|
||||||
|
|
||||||
|
chdir( dirname(__FILE__) );
|
||||||
|
|
||||||
|
// here's where the entity files are located, assuming working directory
|
||||||
|
// is the same as the location of this PHP file. Needs trailing slash.
|
||||||
|
$entity_dir = '../docs/entities/';
|
||||||
|
|
||||||
|
// defines the output file for the serialized content.
|
||||||
|
$output_file = '../library/HTMLPurifier/EntityLookup/data.txt';
|
||||||
|
|
||||||
|
/**
 * Encodes a Unicode code point as a UTF-8 byte string.
 *
 * @param $dec Code point as an integer or a string of decimal digits
 *             (the entity parser passes regex captures, which are strings).
 * @returns String of one to four bytes holding the UTF-8 encoding.
 * @note No validation is performed: surrogates and values above 0x10FFFF
 *       are encoded mechanically rather than rejected.
 */
function unichr($dec) {
    $code = (int) $dec;

    if ($code < 0x80) {
        // single byte (ASCII)
        return chr($code);
    }

    if ($code < 0x800) {
        // two bytes: 110xxxxx 10xxxxxx
        return chr(0xC0 | ($code >> 6))
             . chr(0x80 | ($code & 0x3F));
    }

    if ($code < 0x10000) {
        // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
        return chr(0xE0 | ($code >> 12))
             . chr(0x80 | (($code >> 6) & 0x3F))
             . chr(0x80 | ($code & 0x3F));
    }

    // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    // (previously these fell into the three-byte branch and overflowed
    // the lead byte)
    return chr(0xF0 | ($code >> 18))
         . chr(0x80 | (($code >> 12) & 0x3F))
         . chr(0x80 | (($code >> 6) & 0x3F))
         . chr(0x80 | ($code & 0x3F));
}
|
||||||
|
|
||||||
|
// Failures print a message and exit with a non-zero status so that shell
// scripts and build tools can tell the run did not succeed.

if ( !is_dir($entity_dir) ) {
    echo "Fatal Error: Can't find entity directory.\n";
    exit(1);
}

// refuse to overwrite an existing table; delete it by hand to regenerate
if ( file_exists($output_file) ) {
    echo 'Fatal Error: ' . basename($output_file) . " already exists.\n";
    exit(1);
}

$dh = @opendir($entity_dir);
if ( !$dh ) {
    echo "Fatal Error: Cannot read entity directory.\n";
    exit(1);
}

// collect the *.ent files, ignoring hidden entries
$entity_files = array();
while (($file = readdir($dh)) !== false) {
    if (substr($file, 0, 1) === '.') continue;
    if (substr(strrchr($file, "."), 1) !== 'ent') continue;
    $entity_files[] = $file;
}
closedir($dh);

if ( !$entity_files ) {
    echo "Fatal Error: No entity files to parse.\n";
    exit(1);
}

// readdir() order depends on the filesystem; sort so that a name defined
// in more than one file always resolves the same way
sort($entity_files);

$entity_table = array();
// matches <!ENTITY name "&#NNN;"> and the double-escaped form
// <!ENTITY name "&#38;#NNN;">; captures the name and the decimal code
$regexp = '/<!ENTITY\s+([A-Za-z]+)\s+"&#(?:38;#)?([0-9]+);">/';

foreach ( $entity_files as $file ) {
    $contents = file_get_contents($entity_dir . $file);
    if ($contents === false) {
        echo "Fatal Error: Cannot read entity file $file.\n";
        exit(1);
    }
    $matches = array();
    preg_match_all($regexp, $contents, $matches, PREG_SET_ORDER);
    foreach ($matches as $match) {
        $entity_table[$match[1]] = unichr($match[2]);
    }
}

// stable key order keeps the generated (versioned) file reproducible
ksort($entity_table);

$output = serialize($entity_table);

$fh = fopen($output_file, 'wb');
if ( !$fh ) {
    echo "Fatal Error: Cannot open output file for writing.\n";
    exit(1);
}
$written = fwrite($fh, $output);
fclose($fh);

if ( $written !== strlen($output) ) {
    echo "Fatal Error: Could not write the complete output file.\n";
    exit(1);
}

echo "Completed successfully.";
|
||||||
|
|
||||||
|
?>
|
29
tests/HTMLPurifier/EntityLookupTest.php
Normal file
29
tests/HTMLPurifier/EntityLookupTest.php
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
// this page is UTF-8 encoded!
|
||||||
|
|
||||||
|
require_once 'HTMLPurifier/EntityLookup.php';
|
||||||
|
|
||||||
|
/**
 * Checks a sample of entries in the shared entity lookup table.
 */
class HTMLPurifier_EntityLookupTest extends UnitTestCase
{

    /**
     * Each listed entity name must map to exactly the given string.
     */
    function test() {

        $lookup = HTMLPurifier_EntityLookup::instance();

        // entity name => expected replacement, checked in this order
        $expected = array(
            // latin char
            'acirc' => 'â',
            // special chars
            'quot'  => '"',
            'ldquo' => '“',
            'lt'    => '<', // expressed strangely
            // symbol char
            'theta' => 'θ',
        );

        foreach ($expected as $name => $char) {
            $this->assertIdentical($char, $lookup->table[$name]);
        }

    }

}
|
||||||
|
|
||||||
|
?>
|
@ -7,6 +7,7 @@ class HTMLPurifier_LexerTest extends UnitTestCase
|
|||||||
{
|
{
|
||||||
|
|
||||||
var $DirectLex, $PEARSax3, $DOMLex;
|
var $DirectLex, $PEARSax3, $DOMLex;
|
||||||
|
var $_entity_lookup;
|
||||||
var $_has_dom;
|
var $_has_dom;
|
||||||
|
|
||||||
function setUp() {
|
function setUp() {
|
||||||
@ -14,12 +15,13 @@ class HTMLPurifier_LexerTest extends UnitTestCase
|
|||||||
$this->PEARSax3 = new HTMLPurifier_Lexer_PEARSax3();
|
$this->PEARSax3 = new HTMLPurifier_Lexer_PEARSax3();
|
||||||
|
|
||||||
$this->_has_dom = version_compare(PHP_VERSION, '5', '>=');
|
$this->_has_dom = version_compare(PHP_VERSION, '5', '>=');
|
||||||
|
|
||||||
if ($this->_has_dom) {
|
if ($this->_has_dom) {
|
||||||
require_once 'HTMLPurifier/Lexer/DOMLex.php';
|
require_once 'HTMLPurifier/Lexer/DOMLex.php';
|
||||||
$this->DOMLex = new HTMLPurifier_Lexer_DOMLex();
|
$this->DOMLex = new HTMLPurifier_Lexer_DOMLex();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function test_tokenizeHTML() {
|
function test_tokenizeHTML() {
|
||||||
@ -152,8 +154,12 @@ class HTMLPurifier_LexerTest extends UnitTestCase
|
|||||||
// compare with this valid one:
|
// compare with this valid one:
|
||||||
$input[12] = '"';
|
$input[12] = '"';
|
||||||
$expect[12] = array( new HTMLPurifier_Token_Text('"') );
|
$expect[12] = array( new HTMLPurifier_Token_Text('"') );
|
||||||
$sax_expect[12] = false;
|
$sax_expect[12] = false; // choked!
|
||||||
// SAX chokes on this? We do have entity parsing on, so it should work!
|
|
||||||
|
// DOM and SAX choke on this
|
||||||
|
//$char_circ = $this->_entity_lookup->table['circ'];
|
||||||
|
//$input[13] = 'ˆ';
|
||||||
|
//$expect[13] = array( new HTMLPurifier_Token_Text($char_circ) );
|
||||||
|
|
||||||
foreach($input as $i => $discard) {
|
foreach($input as $i => $discard) {
|
||||||
$result = $this->DirectLex->tokenizeHTML($input[$i]);
|
$result = $this->DirectLex->tokenizeHTML($input[$i]);
|
||||||
|
@ -19,6 +19,7 @@ $test->addTestFile('HTMLPurifier/Lexer/DirectLexTest.php');
|
|||||||
$test->addTestFile('HTMLPurifier/DefinitionTest.php');
|
$test->addTestFile('HTMLPurifier/DefinitionTest.php');
|
||||||
$test->addTestFile('HTMLPurifier/ChildDefTest.php');
|
$test->addTestFile('HTMLPurifier/ChildDefTest.php');
|
||||||
$test->addTestFile('HTMLPurifier/GeneratorTest.php');
|
$test->addTestFile('HTMLPurifier/GeneratorTest.php');
|
||||||
|
$test->addTestFile('HTMLPurifier/EntityLookupTest.php');
|
||||||
|
|
||||||
$test->run( new HtmlReporter() );
|
$test->run( new HtmlReporter() );
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user