From 37def0104b261af268d0d5a41a99902929b207ed Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Wed, 27 Sep 2006 02:09:54 +0000 Subject: [PATCH] [1.1.2] - Documentation updated - API docs now exclude more files that are not classes - Fixed lack of attribute parsing in HTMLPurifier_Lexer_PEARSax3 - (internal) Refactored parseData() to general Lexer class git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@466 48356398-32a2-884e-a903-53898d9a118a --- Doxyfile | 11 +++-- NEWS | 3 ++ library/HTMLPurifier.php | 4 +- library/HTMLPurifier/Lexer.php | 54 +++++++++++++++++++++ library/HTMLPurifier/Lexer/DirectLex.php | 56 +--------------------- library/HTMLPurifier/Lexer/PEARSax3.php | 8 ++++ tests/HTMLPurifier/Lexer/DirectLexTest.php | 18 ------- tests/HTMLPurifier/LexerTest.php | 24 +++++++++- 8 files changed, 99 insertions(+), 79 deletions(-) diff --git a/Doxyfile b/Doxyfile index 8853c756..8667cae9 100644 --- a/Doxyfile +++ b/Doxyfile @@ -4,7 +4,7 @@ # Project related configuration options #--------------------------------------------------------------------------- PROJECT_NAME = HTML Purifier -PROJECT_NUMBER = 1.0.0 +PROJECT_NUMBER = 1.1.1 OUTPUT_DIRECTORY = "C:/Documents and Settings/Edward/My Documents/My Webs/htmlpurifier/docs/doxygen" CREATE_SUBDIRS = NO OUTPUT_LANGUAGE = English @@ -89,9 +89,12 @@ EXCLUDE = EXCLUDE_SYMLINKS = NO EXCLUDE_PATTERNS = */tests/* \ */benchmarks/* \ - */docs/phpdoc/* \ - */docs/doxygen/* \ - */test-settings.php + */docs/* \ + */test-settings.php \ + */configdoc/* \ + */test-settings.php \ + */maintenance/* \ + */smoketests/* EXAMPLE_PATH = EXAMPLE_PATTERNS = * EXAMPLE_RECURSIVE = NO diff --git a/NEWS b/NEWS index 151e09f0..96157e19 100644 --- a/NEWS +++ b/NEWS @@ -6,7 +6,10 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier 1.1.2, unknown projected release date (bugfix release, may be merged with 1.2.0 if new features precede major bugs) +- Documentation updated +- API docs now exclude more files that are not classes - Line endings standardized throughout project +- Fixed lack of attribute parsing in HTMLPurifier_Lexer_PEARSax3 1.1.1, released 2006-09-24 - Various documentation updates diff --git a/library/HTMLPurifier.php b/library/HTMLPurifier.php index eeb959e5..f02bf0c2 100644 --- a/library/HTMLPurifier.php +++ b/library/HTMLPurifier.php @@ -3,7 +3,7 @@ /*! * @mainpage * - * HTMLPurifier is an HTML filter that will take an arbitrary snippet of + * HTML Purifier is an HTML filter that will take an arbitrary snippet of * HTML and rigorously test, validate and filter it into a version that * is safe for output onto webpages. It achieves this by: * @@ -22,7 +22,7 @@ */ /* - HTMLPurifier - Standards Compliant HTML Filtering + HTML Purifier - Standards Compliant HTML Filtering Copyright (C) 2006 Edward Z. Yang This library is free software; you can redistribute it and/or diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php index e43c7b8d..962cb7bf 100644 --- a/library/HTMLPurifier/Lexer.php +++ b/library/HTMLPurifier/Lexer.php @@ -60,6 +60,60 @@ class HTMLPurifier_Lexer $this->_entity_parser = new HTMLPurifier_EntityParser(); } + + /** + * Most common entity to raw value conversion table for special entities. + * @protected + */ + var $_special_entity2str = + array( + '"' => '"', + '&' => '&', + '<' => '<', + '>' => '>', + ''' => "'", + ''' => "'", + ''' => "'" + ); + + /** + * Parses special entities into the proper characters. + * + * This string will translate escaped versions of the special characters + * into the correct ones. + * + * @warning + * You should be able to treat the output of this function as + * completely parsed, but that's only because all other entities should + * have been handled previously in substituteNonSpecialEntities() + * + * @param $string String character data to be parsed. + * @returns Parsed character data. + */ + function parseData($string) { + + // following functions require at least one character + if ($string === '') return ''; + + // subtracts amps that cannot possibly be escaped + $num_amp = substr_count($string, '&') - substr_count($string, '& ') - + ($string[strlen($string)-1] === '&' ? 1 : 0); + + if (!$num_amp) return $string; // abort if no entities + $num_esc_amp = substr_count($string, '&'); + $string = strtr($string, $this->_special_entity2str); + + // code duplication for sake of optimization, see above + $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') - + ($string[strlen($string)-1] === '&' ? 1 : 0); + + if ($num_amp_2 <= $num_esc_amp) return $string; + + // hmm... now we have some uncommon entities. Use the callback. + $string = $this->_entity_parser->substituteSpecialEntities($string); + return $string; + } + var $_encoder; /** diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php index c2d0a9b0..4b9bff1e 100644 --- a/library/HTMLPurifier/Lexer/DirectLex.php +++ b/library/HTMLPurifier/Lexer/DirectLex.php @@ -12,64 +12,12 @@ require_once 'HTMLPurifier/Lexer.php'; * completely eventually. * * @todo Reread XML spec and document differences. - * @todo Add support for CDATA sections. - * @todo Determine correct behavior in outputting comment data. (preserve dashes?) - * @todo Optimize main function tokenizeHTML(). - * @todo Less than sign (<) being prohibited (even as entity) in attr-values? + * + * @todo Determine correct behavior in transforming comment data. (preserve dashes?) */ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer { - /** - * Most common entity to raw value conversion table for special entities. - * @protected - */ - var $_special_entity2str = - array( - '"' => '"', - '&' => '&', - '<' => '<', - '>' => '>', - ''' => "'", - ''' => "'", - ''' => "'" - ); - - /** - * Parses special entities into the proper characters. - * - * This string will translate escaped versions of the special characters - * into the correct ones. - * - * @warning - * You should be able to treat the output of this function as - * completely parsed, but that's only because all other entities should - * have been handled previously in substituteNonSpecialEntities() - * - * @param $string String character data to be parsed. - * @returns Parsed character data. - */ - function parseData($string) { - - // subtracts amps that cannot possibly be escaped - $num_amp = substr_count($string, '&') - substr_count($string, '& ') - - ($string[strlen($string)-1] === '&' ? 1 : 0); - - if (!$num_amp) return $string; // abort if no entities - $num_esc_amp = substr_count($string, '&'); - $string = strtr($string, $this->_special_entity2str); - - // code duplication for sake of optimization, see above - $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') - - ($string[strlen($string)-1] === '&' ? 1 : 0); - - if ($num_amp_2 <= $num_esc_amp) return $string; - - // hmm... now we have some uncommon entities. Use the callback. - $string = $this->_entity_parser->substituteSpecialEntities($string); - return $string; - } - /** * Whitespace characters for str(c)spn. * @protected diff --git a/library/HTMLPurifier/Lexer/PEARSax3.php b/library/HTMLPurifier/Lexer/PEARSax3.php index d2d90a12..229b4636 100644 --- a/library/HTMLPurifier/Lexer/PEARSax3.php +++ b/library/HTMLPurifier/Lexer/PEARSax3.php @@ -18,6 +18,8 @@ require_once 'HTMLPurifier/Lexer.php'; * whatever it does for poorly formed HTML is up to it. * * @todo Generalize so that XML_HTMLSax is also supported. + * + * @warning Entity-resolution inside attributes is broken. */ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer @@ -41,6 +43,8 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer $parser->set_element_handler('openHandler','closeHandler'); $parser->set_data_handler('dataHandler'); $parser->set_escape_handler('escapeHandler'); + + // doesn't seem to work correctly for attributes $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1); $parser->parse($string); @@ -53,6 +57,10 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer * Open tag event handler, interface is defined by PEAR package. */ function openHandler(&$parser, $name, $attrs, $closed) { + // entities are not resolved in attrs + foreach ($attrs as $key => $attr) { + $attrs[$key] = $this->parseData($attr); + } if ($closed) { $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs); } else { diff --git a/tests/HTMLPurifier/Lexer/DirectLexTest.php b/tests/HTMLPurifier/Lexer/DirectLexTest.php index 2ad14476..de35c1d1 100644 --- a/tests/HTMLPurifier/Lexer/DirectLexTest.php +++ b/tests/HTMLPurifier/Lexer/DirectLexTest.php @@ -11,24 +11,6 @@ class HTMLPurifier_Lexer_DirectLexTest extends UnitTestCase $this->DirectLex = new HTMLPurifier_Lexer_DirectLex(); } - function test_parseData() { - $HP =& $this->DirectLex; - - $this->assertIdentical('asdf', $HP->parseData('asdf')); - $this->assertIdentical('&', $HP->parseData('&')); - $this->assertIdentical('"', $HP->parseData('"')); - $this->assertIdentical("'", $HP->parseData(''')); - $this->assertIdentical("'", $HP->parseData(''')); - $this->assertIdentical('&&&', $HP->parseData('&&&')); - $this->assertIdentical('&&', $HP->parseData('&&')); // [INVALID] - $this->assertIdentical('Procter & Gamble', - $HP->parseData('Procter & Gamble')); // [INVALID] - - // This is not special, thus not converted. Test of fault tolerance, - // realistically speaking, this should never happen - $this->assertIdentical('-', $HP->parseData('-')); - } - // internals testing function test_parseAttributeString() { diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php index 25fff13c..1ddc8a67 100644 --- a/tests/HTMLPurifier/LexerTest.php +++ b/tests/HTMLPurifier/LexerTest.php @@ -38,6 +38,25 @@ class HTMLPurifier_LexerTest extends UnitTestCase $this->assertIdentical($extract, $result); } + function test_parseData() { + $HP =& $this->Lexer; + + $this->assertIdentical('asdf', $HP->parseData('asdf')); + $this->assertIdentical('&', $HP->parseData('&')); + $this->assertIdentical('"', $HP->parseData('"')); + $this->assertIdentical("'", $HP->parseData(''')); + $this->assertIdentical("'", $HP->parseData(''')); + $this->assertIdentical('&&&', $HP->parseData('&&&')); + $this->assertIdentical('&&', $HP->parseData('&&')); // [INVALID] + $this->assertIdentical('Procter & Gamble', + $HP->parseData('Procter & Gamble')); // [INVALID] + + // This is not special, thus not converted. Test of fault tolerance, + // realistically speaking, this should never happen + $this->assertIdentical('-', $HP->parseData('-')); + } + + function test_extractBody() { $this->assertExtractBody('Bold'); $this->assertExtractBody('Bold', 'Bold'); @@ -249,13 +268,16 @@ class HTMLPurifier_LexerTest extends UnitTestCase ,new HTMLPurifier_Token_Text('Link') ,new HTMLPurifier_Token_End('a') ); - $sax_expect[16] = false; // PEARSax doesn't support it! // test that UTF-8 is preserved $char_hearts = $this->_entity_lookup->table['hearts']; $input[17] = $char_hearts; $expect[17] = array( new HTMLPurifier_Token_Text($char_hearts) ); + // test weird characters in attributes + $input[18] = '
'; + $expect[18] = array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) ); + $default_config = HTMLPurifier_Config::createDefault(); foreach($input as $i => $discard) { if (!isset($config[$i])) $config[$i] = $default_config;