mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-01-20 12:31:53 +00:00
[1.1.2]
- Documentation updated
- API docs now exclude more files that are not classes
- Fixed lack of attribute parsing in HTMLPurifier_Lexer_PEARSax3
- (internal) Refactored parseData() to general Lexer class

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@466 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
parent
d9bb97cc26
commit
37def0104b
11
Doxyfile
11
Doxyfile
@ -4,7 +4,7 @@
|
|||||||
# Project related configuration options
|
# Project related configuration options
|
||||||
#---------------------------------------------------------------------------
|
#---------------------------------------------------------------------------
|
||||||
PROJECT_NAME = HTML Purifier
|
PROJECT_NAME = HTML Purifier
|
||||||
PROJECT_NUMBER = 1.0.0
|
PROJECT_NUMBER = 1.1.1
|
||||||
OUTPUT_DIRECTORY = "C:/Documents and Settings/Edward/My Documents/My Webs/htmlpurifier/docs/doxygen"
|
OUTPUT_DIRECTORY = "C:/Documents and Settings/Edward/My Documents/My Webs/htmlpurifier/docs/doxygen"
|
||||||
CREATE_SUBDIRS = NO
|
CREATE_SUBDIRS = NO
|
||||||
OUTPUT_LANGUAGE = English
|
OUTPUT_LANGUAGE = English
|
||||||
@ -89,9 +89,12 @@ EXCLUDE =
|
|||||||
EXCLUDE_SYMLINKS = NO
|
EXCLUDE_SYMLINKS = NO
|
||||||
EXCLUDE_PATTERNS = */tests/* \
|
EXCLUDE_PATTERNS = */tests/* \
|
||||||
*/benchmarks/* \
|
*/benchmarks/* \
|
||||||
*/docs/phpdoc/* \
|
*/docs/* \
|
||||||
*/docs/doxygen/* \
|
*/test-settings.php \
|
||||||
*/test-settings.php
|
*/configdoc/* \
|
||||||
|
*/test-settings.php \
|
||||||
|
*/maintenance/* \
|
||||||
|
*/smoketests/*
|
||||||
EXAMPLE_PATH =
|
EXAMPLE_PATH =
|
||||||
EXAMPLE_PATTERNS = *
|
EXAMPLE_PATTERNS = *
|
||||||
EXAMPLE_RECURSIVE = NO
|
EXAMPLE_RECURSIVE = NO
|
||||||
|
3
NEWS
3
NEWS
@ -6,7 +6,10 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
|||||||
|
|
||||||
1.1.2, unknown projected release date
|
1.1.2, unknown projected release date
|
||||||
(bugfix release, may be merged with 1.2.0 if new features precede major bugs)
|
(bugfix release, may be merged with 1.2.0 if new features precede major bugs)
|
||||||
|
- Documentation updated
|
||||||
|
- API docs now exclude more files that are not classes
|
||||||
- Line endings standardized throughout project
|
- Line endings standardized throughout project
|
||||||
|
- Fixed lack of attribute parsing in HTMLPurifier_Lexer_PEARSax3
|
||||||
|
|
||||||
1.1.1, released 2006-09-24
|
1.1.1, released 2006-09-24
|
||||||
- Various documentation updates
|
- Various documentation updates
|
||||||
|
@ -60,6 +60,60 @@ class HTMLPurifier_Lexer
|
|||||||
$this->_entity_parser = new HTMLPurifier_EntityParser();
|
$this->_entity_parser = new HTMLPurifier_EntityParser();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Most common entity to raw value conversion table for special entities.
|
||||||
|
* @protected
|
||||||
|
*/
|
||||||
|
var $_special_entity2str =
|
||||||
|
array(
|
||||||
|
'&quot;' => '"',
|
||||||
|
'&amp;' => '&',
|
||||||
|
'&lt;' => '<',
|
||||||
|
'&gt;' => '>',
|
||||||
|
'&#39;' => "'",
|
||||||
|
'&#039;' => "'",
|
||||||
|
'&#x27;' => "'"
|
||||||
|
);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parses special entities into the proper characters.
|
||||||
|
*
|
||||||
|
* This string will translate escaped versions of the special characters
|
||||||
|
* into the correct ones.
|
||||||
|
*
|
||||||
|
* @warning
|
||||||
|
* You should be able to treat the output of this function as
|
||||||
|
* completely parsed, but that's only because all other entities should
|
||||||
|
* have been handled previously in substituteNonSpecialEntities()
|
||||||
|
*
|
||||||
|
* @param $string String character data to be parsed.
|
||||||
|
* @returns Parsed character data.
|
||||||
|
*/
|
||||||
|
function parseData($string) {
|
||||||
|
|
||||||
|
// following functions require at least one character
|
||||||
|
if ($string === '') return '';
|
||||||
|
|
||||||
|
// subtracts amps that cannot possibly be escaped
|
||||||
|
$num_amp = substr_count($string, '&') - substr_count($string, '& ') -
|
||||||
|
($string[strlen($string)-1] === '&' ? 1 : 0);
|
||||||
|
|
||||||
|
if (!$num_amp) return $string; // abort if no entities
|
||||||
|
$num_esc_amp = substr_count($string, '&amp;');
|
||||||
|
$string = strtr($string, $this->_special_entity2str);
|
||||||
|
|
||||||
|
// code duplication for sake of optimization, see above
|
||||||
|
$num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
|
||||||
|
($string[strlen($string)-1] === '&' ? 1 : 0);
|
||||||
|
|
||||||
|
if ($num_amp_2 <= $num_esc_amp) return $string;
|
||||||
|
|
||||||
|
// hmm... now we have some uncommon entities. Use the callback.
|
||||||
|
$string = $this->_entity_parser->substituteSpecialEntities($string);
|
||||||
|
return $string;
|
||||||
|
}
|
||||||
|
|
||||||
var $_encoder;
|
var $_encoder;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -12,64 +12,12 @@ require_once 'HTMLPurifier/Lexer.php';
|
|||||||
* completely eventually.
|
* completely eventually.
|
||||||
*
|
*
|
||||||
* @todo Reread XML spec and document differences.
|
* @todo Reread XML spec and document differences.
|
||||||
* @todo Add support for CDATA sections.
|
*
|
||||||
* @todo Determine correct behavior in outputting comment data. (preserve dashes?)
|
* @todo Determine correct behavior in transforming comment data. (preserve dashes?)
|
||||||
* @todo Optimize main function tokenizeHTML().
|
|
||||||
* @todo Less than sign (<) being prohibited (even as entity) in attr-values?
|
|
||||||
*/
|
*/
|
||||||
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||||
{
|
{
|
||||||
|
|
||||||
/**
|
|
||||||
* Most common entity to raw value conversion table for special entities.
|
|
||||||
* @protected
|
|
||||||
*/
|
|
||||||
var $_special_entity2str =
|
|
||||||
array(
|
|
||||||
'&quot;' => '"',
|
|
||||||
'&amp;' => '&',
|
|
||||||
'&lt;' => '<',
|
|
||||||
'&gt;' => '>',
|
|
||||||
'&#39;' => "'",
|
|
||||||
'&#039;' => "'",
|
|
||||||
'&#x27;' => "'"
|
|
||||||
);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Parses special entities into the proper characters.
|
|
||||||
*
|
|
||||||
* This string will translate escaped versions of the special characters
|
|
||||||
* into the correct ones.
|
|
||||||
*
|
|
||||||
* @warning
|
|
||||||
* You should be able to treat the output of this function as
|
|
||||||
* completely parsed, but that's only because all other entities should
|
|
||||||
* have been handled previously in substituteNonSpecialEntities()
|
|
||||||
*
|
|
||||||
* @param $string String character data to be parsed.
|
|
||||||
* @returns Parsed character data.
|
|
||||||
*/
|
|
||||||
function parseData($string) {
|
|
||||||
|
|
||||||
// subtracts amps that cannot possibly be escaped
|
|
||||||
$num_amp = substr_count($string, '&') - substr_count($string, '& ') -
|
|
||||||
($string[strlen($string)-1] === '&' ? 1 : 0);
|
|
||||||
|
|
||||||
if (!$num_amp) return $string; // abort if no entities
|
|
||||||
$num_esc_amp = substr_count($string, '&amp;');
|
|
||||||
$string = strtr($string, $this->_special_entity2str);
|
|
||||||
|
|
||||||
// code duplication for sake of optimization, see above
|
|
||||||
$num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
|
|
||||||
($string[strlen($string)-1] === '&' ? 1 : 0);
|
|
||||||
|
|
||||||
if ($num_amp_2 <= $num_esc_amp) return $string;
|
|
||||||
|
|
||||||
// hmm... now we have some uncommon entities. Use the callback.
|
|
||||||
$string = $this->_entity_parser->substituteSpecialEntities($string);
|
|
||||||
return $string;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Whitespace characters for str(c)spn.
|
* Whitespace characters for str(c)spn.
|
||||||
* @protected
|
* @protected
|
||||||
|
@ -18,6 +18,8 @@ require_once 'HTMLPurifier/Lexer.php';
|
|||||||
* whatever it does for poorly formed HTML is up to it.
|
* whatever it does for poorly formed HTML is up to it.
|
||||||
*
|
*
|
||||||
* @todo Generalize so that XML_HTMLSax is also supported.
|
* @todo Generalize so that XML_HTMLSax is also supported.
|
||||||
|
*
|
||||||
|
* @warning Entity-resolution inside attributes is broken.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
|
class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
|
||||||
@ -41,6 +43,8 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
|
|||||||
$parser->set_element_handler('openHandler','closeHandler');
|
$parser->set_element_handler('openHandler','closeHandler');
|
||||||
$parser->set_data_handler('dataHandler');
|
$parser->set_data_handler('dataHandler');
|
||||||
$parser->set_escape_handler('escapeHandler');
|
$parser->set_escape_handler('escapeHandler');
|
||||||
|
|
||||||
|
// doesn't seem to work correctly for attributes
|
||||||
$parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
|
$parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
|
||||||
|
|
||||||
$parser->parse($string);
|
$parser->parse($string);
|
||||||
@ -53,6 +57,10 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
|
|||||||
* Open tag event handler, interface is defined by PEAR package.
|
* Open tag event handler, interface is defined by PEAR package.
|
||||||
*/
|
*/
|
||||||
function openHandler(&$parser, $name, $attrs, $closed) {
|
function openHandler(&$parser, $name, $attrs, $closed) {
|
||||||
|
// entities are not resolved in attrs
|
||||||
|
foreach ($attrs as $key => $attr) {
|
||||||
|
$attrs[$key] = $this->parseData($attr);
|
||||||
|
}
|
||||||
if ($closed) {
|
if ($closed) {
|
||||||
$this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
|
$this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
|
||||||
} else {
|
} else {
|
||||||
|
@ -11,24 +11,6 @@ class HTMLPurifier_Lexer_DirectLexTest extends UnitTestCase
|
|||||||
$this->DirectLex = new HTMLPurifier_Lexer_DirectLex();
|
$this->DirectLex = new HTMLPurifier_Lexer_DirectLex();
|
||||||
}
|
}
|
||||||
|
|
||||||
function test_parseData() {
|
|
||||||
$HP =& $this->DirectLex;
|
|
||||||
|
|
||||||
$this->assertIdentical('asdf', $HP->parseData('asdf'));
|
|
||||||
$this->assertIdentical('&', $HP->parseData('&amp;'));
|
|
||||||
$this->assertIdentical('"', $HP->parseData('&quot;'));
|
|
||||||
$this->assertIdentical("'", $HP->parseData('&#039;'));
|
|
||||||
$this->assertIdentical("'", $HP->parseData('&#39;'));
|
|
||||||
$this->assertIdentical('&&&', $HP->parseData('&amp;&amp;&amp;'));
|
|
||||||
$this->assertIdentical('&&', $HP->parseData('&amp;&')); // [INVALID]
|
|
||||||
$this->assertIdentical('Procter & Gamble',
|
|
||||||
$HP->parseData('Procter & Gamble')); // [INVALID]
|
|
||||||
|
|
||||||
// This is not special, thus not converted. Test of fault tolerance,
|
|
||||||
// realistically speaking, this should never happen
|
|
||||||
$this->assertIdentical('&#x2D;', $HP->parseData('&#x2D;'));
|
|
||||||
}
|
|
||||||
|
|
||||||
// internals testing
|
// internals testing
|
||||||
function test_parseAttributeString() {
|
function test_parseAttributeString() {
|
||||||
|
|
||||||
|
@ -38,6 +38,25 @@ class HTMLPurifier_LexerTest extends UnitTestCase
|
|||||||
$this->assertIdentical($extract, $result);
|
$this->assertIdentical($extract, $result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function test_parseData() {
|
||||||
|
$HP =& $this->Lexer;
|
||||||
|
|
||||||
|
$this->assertIdentical('asdf', $HP->parseData('asdf'));
|
||||||
|
$this->assertIdentical('&', $HP->parseData('&amp;'));
|
||||||
|
$this->assertIdentical('"', $HP->parseData('&quot;'));
|
||||||
|
$this->assertIdentical("'", $HP->parseData('&#039;'));
|
||||||
|
$this->assertIdentical("'", $HP->parseData('&#39;'));
|
||||||
|
$this->assertIdentical('&&&', $HP->parseData('&amp;&amp;&amp;'));
|
||||||
|
$this->assertIdentical('&&', $HP->parseData('&amp;&')); // [INVALID]
|
||||||
|
$this->assertIdentical('Procter & Gamble',
|
||||||
|
$HP->parseData('Procter & Gamble')); // [INVALID]
|
||||||
|
|
||||||
|
// This is not special, thus not converted. Test of fault tolerance,
|
||||||
|
// realistically speaking, this should never happen
|
||||||
|
$this->assertIdentical('&#x2D;', $HP->parseData('&#x2D;'));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
function test_extractBody() {
|
function test_extractBody() {
|
||||||
$this->assertExtractBody('<b>Bold</b>');
|
$this->assertExtractBody('<b>Bold</b>');
|
||||||
$this->assertExtractBody('<html><body><b>Bold</b></body></html>', '<b>Bold</b>');
|
$this->assertExtractBody('<html><body><b>Bold</b></body></html>', '<b>Bold</b>');
|
||||||
@ -249,13 +268,16 @@ class HTMLPurifier_LexerTest extends UnitTestCase
|
|||||||
,new HTMLPurifier_Token_Text('Link')
|
,new HTMLPurifier_Token_Text('Link')
|
||||||
,new HTMLPurifier_Token_End('a')
|
,new HTMLPurifier_Token_End('a')
|
||||||
);
|
);
|
||||||
$sax_expect[16] = false; // PEARSax doesn't support it!
|
|
||||||
|
|
||||||
// test that UTF-8 is preserved
|
// test that UTF-8 is preserved
|
||||||
$char_hearts = $this->_entity_lookup->table['hearts'];
|
$char_hearts = $this->_entity_lookup->table['hearts'];
|
||||||
$input[17] = $char_hearts;
|
$input[17] = $char_hearts;
|
||||||
$expect[17] = array( new HTMLPurifier_Token_Text($char_hearts) );
|
$expect[17] = array( new HTMLPurifier_Token_Text($char_hearts) );
|
||||||
|
|
||||||
|
// test weird characters in attributes
|
||||||
|
$input[18] = '<br test="x &lt; 6" />';
|
||||||
|
$expect[18] = array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) );
|
||||||
|
|
||||||
$default_config = HTMLPurifier_Config::createDefault();
|
$default_config = HTMLPurifier_Config::createDefault();
|
||||||
foreach($input as $i => $discard) {
|
foreach($input as $i => $discard) {
|
||||||
if (!isset($config[$i])) $config[$i] = $default_config;
|
if (!isset($config[$i])) $config[$i] = $default_config;
|
||||||
|
Loading…
Reference in New Issue
Block a user