diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php index a079df82..ee341ff8 100644 --- a/library/HTMLPurifier/Lexer.php +++ b/library/HTMLPurifier/Lexer.php @@ -15,20 +15,26 @@ require_once 'HTMLPurifier/Token.php'; * recommended, as we adhere to a subset of the specification for optimization * reasons. * - * This class cannot be directly instantiated, but you may use create() to + * This class should not be directly instantiated, but you may use create() to * retrieve a default copy of the lexer. * + * @note The unit tests will instantiate this class for testing purposes, as + * many of the utility functions require a class to be instantiated. + * Be careful when porting this class to PHP 5. + * + * @par + * * @note * We use tokens rather than create a DOM representation because DOM would: * - * @note + * @par * -# Require more processing power to create, * -# Require recursion to iterate, * -# Must be compatible with PHP 5's DOM (otherwise duplication), * -# Has the entire document structure (html and body not needed), and * -# Has unknown readability improvement. * - * @note + * @par * What the last item means is that the functions for manipulating tokens are * already fairly compact, and when well-commented, more abstraction may not * be needed. @@ -85,14 +91,53 @@ class HTMLPurifier_Lexer return $lexer; } + /** + * Decimal to parsed string conversion table for special entities. + * @protected + */ + var $_special_dec2str = + array( + 34 => '"', + 38 => '&', + 39 => "'", + 60 => '<', + 62 => '>' + ); + + /** + * Stripped entity names to decimal conversion table for special entities. + * @protected + */ + var $_special_ent2dec = + array( + 'quot' => 34, + 'amp' => 38, + 'lt' => 60, + 'gt' => 62 + ); + + /** + * Most common entity to raw value conversion table for special entities. + * @protected + */ + var $_special_entity2str = + array( + '"' => '"', + '&' => '&', + '<' => '<', + '>' => '>', + ''' => "'", + ''' => "'", + ''' => "'" + ); /** * Callback regex string for parsing entities. * @protected - */ + */ var $_substituteEntitiesRegex = - // 1. hex 2. dec 3. string - '/&[#](?:x([a-fA-F0-9]+)|0*(\d+)|([A-Za-z]+));?/'; +'/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z]+));?/'; +// 1. hex 2. dec 3. string /** * Substitutes non-special entities with their parsed equivalents. Since @@ -107,8 +152,9 @@ class HTMLPurifier_Lexer // it will try to detect missing semicolons, but don't rely on it return preg_replace_callback( $this->_substituteEntitiesRegex, - array('HTMLPurifier_Lexer_DirectLex', 'nonSpecialEntityCallback'), - $string); + array($this, 'nonSpecialEntityCallback'), + $string + ); } /** @@ -116,7 +162,7 @@ class HTMLPurifier_Lexer * * @warning Though this is public in order to let the callback happen, * calling it directly is not recommended. - * @param $matches PCRE-style matches array, with 0 the entire match, and + * @param $matches PCRE matches array, with 0 the entire match, and * either index 1, 2 or 3 set with a hex value, dec value, * or string (respectively). * @returns Replacement string. @@ -135,7 +181,7 @@ class HTMLPurifier_Lexer if (isset($this->_special_ent2dec[$matches[3]])) return $entity; if (!$this->_entity_lookup) { require_once 'HTMLPurifier/EntityLookup.php'; - $this->_entity_lookup = EntityLookup::instance(); + $this->_entity_lookup = HTMLPurifier_EntityLookup::instance(); } if (isset($this->_entity_lookup->table[$matches[3]])) { return $this->_entity_lookup->table[$matches[3]]; @@ -145,8 +191,41 @@ class HTMLPurifier_Lexer } } + /** + * Contains a copy of the EntityLookup table. + * @protected + */ var $_entity_lookup; + /** + * Translates CDATA sections into regular sections (through escaping). + * + * @protected + * @param $string HTML string to process. + * @returns HTML with CDATA sections escaped. + */ + function escapeCDATA($string) { + return preg_replace_callback( + '//', + array('HTMLPurifier_Lexer', 'CDATACallback'), + $string + ); + } + + /** + * Callback function for escapeCDATA() that does the work. + * + * @warning Though this is public in order to let the callback happen, + * calling it directly is not recommended. + * @params $matches PCRE matches array, with index 0 the entire match + * and 1 the inside of the CDATA section. + * @returns Escaped internals of the CDATA section. + */ + function CDATACallback($matches) { + // not exactly sure why the character set is needed, but whatever + return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8'); + } + } ?> \ No newline at end of file diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php index d9f41e61..69e08098 100644 --- a/library/HTMLPurifier/Lexer/DOMLex.php +++ b/library/HTMLPurifier/Lexer/DOMLex.php @@ -27,8 +27,14 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer public function tokenizeHTML($string) { $doc = new DOMDocument(); + // preprocess string $string = '
'.$string.'
'; + + // replace and escape the CDATA sections, since parsing under HTML + // mode won't get 'em. + $string = $this->escapeCDATA($string); + @$doc->loadHTML($string); // mute all errors, handle it transparently return $this->tokenizeDOM( $doc->childNodes->item(1)-> // html @@ -55,7 +61,8 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer if ( !($node instanceof DOMElement) ) { if ($node instanceof DOMComment) { $tokens[] = new HTMLPurifier_Token_Comment($node->data); - } elseif ($node instanceof DOMText) { + } elseif ($node instanceof DOMText || + $node instanceof DOMCharacterData) { $tokens[] = new HTMLPurifier_Token_Text($node->data); } // quite possibly, the object wasn't handled, that's fine diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php index 587f1928..c596f1b4 100644 --- a/library/HTMLPurifier/Lexer/DirectLex.php +++ b/library/HTMLPurifier/Lexer/DirectLex.php @@ -61,43 +61,6 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer */ var $_whitespace = "\x20\x09\x0D\x0A"; - /** - * Decimal to parsed string conversion table for special entities. - * @protected - */ - var $_special_dec2str = array( - 34 => '"', // quote - 38 => '&', // ampersand - 39 => "'", // apostrophe - 60 => '<', // less than sign - 62 => '>' // greater than sign - ); - - /** - * Stripped entity names to decimal conversion table for special entities. - * @protected - */ - var $_special_ent2dec = array( - 'quot' => 34, - 'amp' => 38, - 'lt' => 60, - 'gt' => 62, - ); - - /** - * Most common entity to raw value conversion table for special entities. - * @protected - */ - var $_special_entity2str = array( - '"' => '"', - '&' => '&', - '<' => '<', - '>' => '>', - ''' => "'", - ''' => "'", - ''' => "'", - ); - /** * Substitutes only special entities with their parsed equivalents. * @@ -153,6 +116,9 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $inside_tag = false; // whether or not we're parsing the inside of a tag $array = array(); // result array + // escape CDATA + $string = $this->escapeCDATA($string); + // expand entities THAT AREN'T THE BIG FIVE $string = $this->substituteNonSpecialEntities($string); diff --git a/library/HTMLPurifier/Lexer/PEARSax3.php b/library/HTMLPurifier/Lexer/PEARSax3.php index b012c74a..13fac96a 100644 --- a/library/HTMLPurifier/Lexer/PEARSax3.php +++ b/library/HTMLPurifier/Lexer/PEARSax3.php @@ -29,6 +29,8 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer var $tokens = array(); function tokenizeHTML($html) { + $html = $this->escapeCDATA($html); + $html = $this->substituteNonSpecialEntities($html); $parser=& new XML_HTMLSax3(); $parser->set_object($this); $parser->set_element_handler('openHandler','closeHandler'); @@ -79,9 +81,14 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer * Escaped text handler, interface is defined by PEAR package. */ function escapeHandler(&$parser, $data) { - if (strpos($data, '-') === 0) { + if (strpos($data, '--') === 0) { $this->tokens[] = new HTMLPurifier_Token_Comment($data); } + // CDATA is handled elsewhere, but if it was handled here: + //if (strpos($data, '[CDATA[') === 0) { + // $this->tokens[] = new HTMLPurifier_Token_Text( + // substr($data, 7, strlen($data) - 9) ); + //} return true; } diff --git a/tests/HTMLPurifier/Lexer/DirectLexTest.php b/tests/HTMLPurifier/Lexer/DirectLexTest.php index e13ebd18..d1e6f088 100644 --- a/tests/HTMLPurifier/Lexer/DirectLexTest.php +++ b/tests/HTMLPurifier/Lexer/DirectLexTest.php @@ -11,6 +11,13 @@ class HTMLPurifier_Lexer_DirectLexTest extends UnitTestCase $this->DirectLex = new HTMLPurifier_Lexer_DirectLex(); } + function test_specialEntityCallback() { + $HP =& $this->DirectLex; + + $this->assertIdentical("'",$HP->specialEntityCallback( + array(''', null, '39', null) )); + } + function test_parseData() { $HP =& $this->DirectLex; @@ -29,13 +36,6 @@ class HTMLPurifier_Lexer_DirectLexTest extends UnitTestCase $this->assertIdentical('-', $HP->parseData('-')); } - function test_specialEntityCallback() { - $HP =& $this->DirectLex; - - $this->assertIdentical("'",$HP->specialEntityCallback( - array(''', null, '39', null) )); - } - // internals testing function test_parseAttributeString() { diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php index 0f988a6b..543b111f 100644 --- a/tests/HTMLPurifier/LexerTest.php +++ b/tests/HTMLPurifier/LexerTest.php @@ -6,11 +6,14 @@ require_once 'HTMLPurifier/Lexer/PEARSax3.php'; class HTMLPurifier_LexerTest extends UnitTestCase { + var $Lexer; var $DirectLex, $PEARSax3, $DOMLex; var $_entity_lookup; var $_has_dom; function setUp() { + $this->Lexer = new HTMLPurifier_Lexer(); + $this->DirectLex = new HTMLPurifier_Lexer_DirectLex(); $this->PEARSax3 = new HTMLPurifier_Lexer_PEARSax3(); @@ -24,6 +27,14 @@ class HTMLPurifier_LexerTest extends UnitTestCase } + function test_substituteNonSpecialEntities() { + $char_theta = $this->_entity_lookup->table['theta']; + $this->assertIdentical($char_theta, + $this->Lexer->substituteNonSpecialEntities('θ') ); + $this->assertIdentical('"', + $this->Lexer->substituteNonSpecialEntities('"') ); + } + function test_tokenizeHTML() { $input = array(); @@ -156,40 +167,70 @@ class HTMLPurifier_LexerTest extends UnitTestCase $expect[12] = array( new HTMLPurifier_Token_Text('"') ); $sax_expect[12] = false; // choked! - // DOM and SAX choke on this - //$char_circ = $this->_entity_lookup->table['circ']; - //$input[13] = 'ˆ'; - //$expect[13] = array( new HTMLPurifier_Token_Text($char_circ) ); + // CDATA sections! + $input[13] = 'can't get me!]]>'; + $expect[13] = array( new HTMLPurifier_Token_Text( + 'You can't get me!' // raw + ) ); + $sax_expect[13] = array( // SAX has a seperate call for each entity + new HTMLPurifier_Token_Text('You '), + new HTMLPurifier_Token_Text('<'), + new HTMLPurifier_Token_Text('b'), + new HTMLPurifier_Token_Text('>'), + new HTMLPurifier_Token_Text('can'), + new HTMLPurifier_Token_Text('&'), + new HTMLPurifier_Token_Text('#39;t'), + new HTMLPurifier_Token_Text('<'), + new HTMLPurifier_Token_Text('/b'), + new HTMLPurifier_Token_Text('>'), + new HTMLPurifier_Token_Text(' get me!') + ); + + $char_theta = $this->_entity_lookup->table['theta']; + $char_rarr = $this->_entity_lookup->table['rarr']; + + // test entity replacement + $input[14] = 'θ'; + $expect[14] = array( new HTMLPurifier_Token_Text($char_theta) ); + + // test that entities aren't replaced in CDATA sections + $input[15] = 'θ '; + $expect[15] = array( new HTMLPurifier_Token_Text($char_theta . ' →') ); + $sax_expect[15] = array( + new HTMLPurifier_Token_Text($char_theta . ' '), + new HTMLPurifier_Token_Text('&'), + new HTMLPurifier_Token_Text('rarr;') + ); foreach($input as $i => $discard) { $result = $this->DirectLex->tokenizeHTML($input[$i]); - $this->assertEqual($expect[$i], $result, 'Test '.$i.': %s'); + $this->assertEqual($expect[$i], $result, 'DirectLexTest '.$i.': %s'); paintIf($result, $expect[$i] != $result); // assert unless I say otherwise $sax_result = $this->PEARSax3->tokenizeHTML($input[$i]); if (!isset($sax_expect[$i])) { // by default, assert with normal result - $this->assertEqual($expect[$i], $sax_result, 'Test '.$i.': %s'); + $this->assertEqual($expect[$i], $sax_result, 'PEARSax3Test '.$i.': %s'); paintIf($sax_result, $expect[$i] != $sax_result); } elseif ($sax_expect[$i] === false) { // assertions were turned off, optionally dump // paintIf($sax_expect, $i == NUMBER); } else { // match with a custom SAX result array - $this->assertEqual($sax_expect[$i], $sax_result, 'Test '.$i.': %s'); + $this->assertEqual($sax_expect[$i], $sax_result, 'PEARSax3Test (custom) '.$i.': %s'); paintIf($sax_result, $sax_expect[$i] != $sax_result); } if ($this->_has_dom) { $dom_result = $this->DOMLex->tokenizeHTML($input[$i]); // same structure as SAX if (!isset($dom_expect[$i])) { - $this->assertEqual($expect[$i], $dom_result, 'Test '.$i.': %s'); + $this->assertEqual($expect[$i], $dom_result, 'DOMLexTest '.$i.': %s'); paintIf($dom_result, $expect[$i] != $dom_result); } elseif ($dom_expect[$i] === false) { // paintIf($dom_result, $i == NUMBER); } else { - $this->assertEqual($dom_expect[$i], $dom_result, 'Test '.$i.': %s'); + $this->assertEqual($dom_expect[$i], $dom_result, 'DOMLexTest (custom) '.$i.': %s'); paintIf($dom_result, $dom_expect[$i] != $dom_result); } }