From d7140f2e05bbdf56f10e8f4bd5077462a9f5d189 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Tue, 15 Aug 2006 00:31:12 +0000 Subject: [PATCH] Outfit a bunch of other classes so they can accept a configuration object. Put in basic scaffolding for extractBody() functionality. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@257 48356398-32a2-884e-a903-53898d9a118a --- docs/config.txt | 7 +++++++ library/HTMLPurifier.php | 6 ++++-- library/HTMLPurifier/ChildDef.php | 4 ++-- library/HTMLPurifier/Generator.php | 14 ++++++++------ library/HTMLPurifier/Lexer.php | 18 +++++++++++++++++- library/HTMLPurifier/Lexer/DOMLex.php | 4 +++- library/HTMLPurifier/Lexer/DirectLex.php | 4 +++- library/HTMLPurifier/Lexer/PEARSax3.php | 3 ++- .../HTMLPurifier/Strategy/MakeWellFormed.php | 4 ++-- .../Strategy/RemoveForeignElements.php | 2 +- tests/HTMLPurifier/ChildDefTest.php | 4 ++-- tests/HTMLPurifier/GeneratorTest.php | 15 ++++++++++++--- tests/HTMLPurifier/LexerTest.php | 14 +++++++++++--- tests/HTMLPurifier/StrategyHarness.php | 2 +- 14 files changed, 75 insertions(+), 26 deletions(-) diff --git a/docs/config.txt b/docs/config.txt index 0562c0af..53bc6ac6 100644 --- a/docs/config.txt +++ b/docs/config.txt @@ -9,3 +9,10 @@ are nevertheless error checking and a centralized configuration object. Directives are divided into namespaces, indicating the major portion of functionality they cover (although there may be overlaps. Please consult the documentation in ConfigDef for more information on these namespaces. + +Since configuration is dependent on context, most of the internal classes +require a configuration object to be passed as a parameter. However, a few +make this optional: they will supply a default configuration object if none +are passed. These classes are: HTMLPurifier::*, Generator::generateFromTokens +and Lexer::tokenizeHTML. However, whenever a valid configuration object +is defined, that object should be used. 
diff --git a/library/HTMLPurifier.php b/library/HTMLPurifier.php index 2b9f0e95..fc4c51e0 100644 --- a/library/HTMLPurifier.php +++ b/library/HTMLPurifier.php @@ -59,8 +59,10 @@ class HTMLPurifier $generator = new HTMLPurifier_Generator(); return $generator->generateFromTokens( $strategy->execute( - $lexer->tokenizeHTML($html), $config - ) + $lexer->tokenizeHTML($html, $config), + $config + ), + $config ); } diff --git a/library/HTMLPurifier/ChildDef.php b/library/HTMLPurifier/ChildDef.php index 30664cf5..1586f075 100644 --- a/library/HTMLPurifier/ChildDef.php +++ b/library/HTMLPurifier/ChildDef.php @@ -137,7 +137,7 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef $is_deleting = true; if ($pcdata_allowed && $escape_invalid_children) { $result[] = new HTMLPurifier_Token_Text( - $this->gen->generateFromToken($token) + $this->gen->generateFromToken($token, $config) ); } continue; @@ -148,7 +148,7 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef } elseif ($pcdata_allowed && $escape_invalid_children) { $result[] = new HTMLPurifier_Token_Text( - $this->gen->generateFromToken( $token ) + $this->gen->generateFromToken( $token, $config ) ); } else { // drop silently diff --git a/library/HTMLPurifier/Generator.php b/library/HTMLPurifier/Generator.php index 1d2cdaab..d1ac6d40 100644 --- a/library/HTMLPurifier/Generator.php +++ b/library/HTMLPurifier/Generator.php @@ -5,26 +5,28 @@ class HTMLPurifier_Generator { - function generateFromTokens($tokens) { + // only unit tests may omit configuration: internals MUST pass config + function generateFromTokens($tokens, $config = null) { $html = ''; + if (!$config) $config = HTMLPurifier_Config::createDefault(); if (!$tokens) return ''; foreach ($tokens as $token) { - $html .= $this->generateFromToken($token); + $html .= $this->generateFromToken($token, $config); } return $html; } - function generateFromToken($token) { + function generateFromToken($token, $config) { if (!isset($token->type)) 
return ''; if ($token->type == 'start') { - $attr = $this->generateAttributes($token->attributes); + $attr = $this->generateAttributes($token->attributes, $config); return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>'; } elseif ($token->type == 'end') { return '</' . $token->name . '>'; } elseif ($token->type == 'empty') { - $attr = $this->generateAttributes($token->attributes); + $attr = $this->generateAttributes($token->attributes, $config); return '<' . $token->name . ($attr ? ' ' : '') . $attr . ' />'; } elseif ($token->type == 'text') { @@ -36,7 +38,7 @@ class HTMLPurifier_Generator } } - function generateAttributes($assoc_array_of_attributes) { + function generateAttributes($assoc_array_of_attributes, $config) { $html = ''; foreach ($assoc_array_of_attributes as $key => $value) { $html .= $key.'="'.htmlspecialchars($value, ENT_COMPAT, 'UTF-8').'" '; diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php index 086d7fb9..e8fbf1a2 100644 --- a/library/HTMLPurifier/Lexer.php +++ b/library/HTMLPurifier/Lexer.php @@ -2,6 +2,14 @@ require_once 'HTMLPurifier/Token.php'; +HTMLPurifier_ConfigDef::define( + 'Core', 'AcceptFullDocuments', true, + 'This parameter determines whether or not the filter should accept full '. + 'HTML documents, not just HTML fragments. When on, it will '. + 'drop all sections except the content between body. Depending on '. + 'the implementation in use, this may speed up document parse times.' +); + /** * Forgivingly lexes HTML (SGML-style) markup into tokens. * @@ -52,7 +60,7 @@ class HTMLPurifier_Lexer * @param $string String HTML. * @return HTMLPurifier_Token array representation of HTML. 
*/ - function tokenizeHTML($string) { + function tokenizeHTML($string, $config = null) { trigger_error('Call to abstract class', E_USER_ERROR); } @@ -228,6 +236,14 @@ class HTMLPurifier_Lexer return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8'); } + /** + * Takes a string of HTML (fragment or document) and returns the content + */ + function extractBody($html) { + if (strpos($html, ' \ No newline at end of file diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php index 2320f9c2..3018423b 100644 --- a/library/HTMLPurifier/Lexer/DOMLex.php +++ b/library/HTMLPurifier/Lexer/DOMLex.php @@ -25,7 +25,9 @@ require_once 'HTMLPurifier/Lexer.php'; class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer { - public function tokenizeHTML($string) { + public function tokenizeHTML($string, $config = null) { + if (!$config) $config = HTMLPurifier_Config::createDefault(); + $doc = new DOMDocument(); $doc->encoding = 'UTF-8'; // technically does nothing, but comprehensive diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php index c596f1b4..29634b69 100644 --- a/library/HTMLPurifier/Lexer/DirectLex.php +++ b/library/HTMLPurifier/Lexer/DirectLex.php @@ -106,7 +106,9 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer } } - function tokenizeHTML($string) { + function tokenizeHTML($string, $config = null) { + + if (!$config) $config = HTMLPurifier_Config::createDefault(); // some quick checking (if empty, return empty) $string = @ (string) $string; diff --git a/library/HTMLPurifier/Lexer/PEARSax3.php b/library/HTMLPurifier/Lexer/PEARSax3.php index 9eee7def..da3843b0 100644 --- a/library/HTMLPurifier/Lexer/PEARSax3.php +++ b/library/HTMLPurifier/Lexer/PEARSax3.php @@ -29,7 +29,8 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer */ var $tokens = array(); - function tokenizeHTML($html) { + function tokenizeHTML($html, $config = null) { + if (!$config) $config = 
HTMLPurifier_Config::createDefault(); $html = $this->escapeCDATA($html); $html = $this->substituteNonSpecialEntities($html); $parser=& new XML_HTMLSax3(); diff --git a/library/HTMLPurifier/Strategy/MakeWellFormed.php b/library/HTMLPurifier/Strategy/MakeWellFormed.php index 6284f2b3..af7c878e 100644 --- a/library/HTMLPurifier/Strategy/MakeWellFormed.php +++ b/library/HTMLPurifier/Strategy/MakeWellFormed.php @@ -87,7 +87,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy // make sure that we have something open if (empty($current_nesting)) { $result[] = new HTMLPurifier_Token_Text( - $this->generator->generateFromToken($token) + $this->generator->generateFromToken($token, $config) ); continue; } @@ -122,7 +122,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy // we still didn't find the tag, so translate to text if ($skipped_tags === false) { $result[] = new HTMLPurifier_Token_Text( - $this->generator->generateFromToken($token) + $this->generator->generateFromToken($token, $config) ); continue; } diff --git a/library/HTMLPurifier/Strategy/RemoveForeignElements.php b/library/HTMLPurifier/Strategy/RemoveForeignElements.php index c6f66d97..d1e96b33 100644 --- a/library/HTMLPurifier/Strategy/RemoveForeignElements.php +++ b/library/HTMLPurifier/Strategy/RemoveForeignElements.php @@ -43,7 +43,7 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy } else { // invalid tag, generate HTML and insert in $token = new HTMLPurifier_Token_Text( - $this->generator->generateFromToken($token) + $this->generator->generateFromToken($token, $config) ); } } elseif ($token->type == 'comment') { diff --git a/tests/HTMLPurifier/ChildDefTest.php b/tests/HTMLPurifier/ChildDefTest.php index 5a15526e..228898fa 100644 --- a/tests/HTMLPurifier/ChildDefTest.php +++ b/tests/HTMLPurifier/ChildDefTest.php @@ -19,7 +19,6 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase function assertSeries($inputs, $expect, 
$config, $context = array()) { foreach ($inputs as $i => $input) { - $tokens = $this->lex->tokenizeHTML($input); if (!isset($context[$i])) { $context[$i] = null; @@ -28,12 +27,13 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase $config[$i] = HTMLPurifier_Config::createDefault(); } + $tokens = $this->lex->tokenizeHTML($input, $config[$i]); $result = $this->def->validateChildren($tokens, $config[$i], $context[$i]); if (is_bool($expect[$i])) { $this->assertIdentical($expect[$i], $result, "Test $i: %s"); } else { - $result_html = $this->gen->generateFromTokens($result); + $result_html = $this->gen->generateFromTokens($result, $config[$i]); $this->assertIdentical($expect[$i], $result_html, "Test $i: %s"); paintIf($result_html, $result_html != $expect[$i]); } diff --git a/tests/HTMLPurifier/GeneratorTest.php b/tests/HTMLPurifier/GeneratorTest.php index 5806868b..d567c236 100644 --- a/tests/HTMLPurifier/GeneratorTest.php +++ b/tests/HTMLPurifier/GeneratorTest.php @@ -19,6 +19,7 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase $inputs = array(); $expect = array(); + $config = array(); $inputs[0] = new HTMLPurifier_Token_Text('Foobar.<>'); $expect[0] = 'Foobar.<>'; @@ -51,8 +52,10 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase $inputs[7] = new HTMLPurifier_Token_Text($theta_char); $expect[7] = $theta_char; + $default_config = HTMLPurifier_Config::createDefault(); foreach ($inputs as $i => $input) { - $result = $this->gen->generateFromToken($input); + if (!isset($config[$i])) $config[$i] = $default_config; + $result = $this->gen->generateFromToken($input, $config[$i]); $this->assertEqual($result, $expect[$i]); paintIf($result, $result != $expect[$i]); } @@ -63,6 +66,7 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase $inputs = array(); $expect = array(); + $config = array(); $inputs[0] = array(); $expect[0] = ''; @@ -81,8 +85,10 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase $inputs[4] = array('title' => 'Theta is ' . 
$theta_char); $expect[4] = 'title="Theta is ' . $theta_char . '"'; + $default_config = HTMLPurifier_Config::createDefault(); foreach ($inputs as $i => $input) { - $result = $this->gen->generateAttributes($input); + if (!isset($config[$i])) $config[$i] = $default_config; + $result = $this->gen->generateAttributes($input, $config[$i]); $this->assertEqual($result, $expect[$i]); paintIf($result, $result != $expect[$i]); } @@ -93,6 +99,7 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase $inputs = array(); $expect = array(); + $config = array(); $inputs[0] = array( new HTMLPurifier_Token_Start('b'), @@ -104,8 +111,10 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase $inputs[1] = array(); $expect[1] = ''; + $default_config = HTMLPurifier_Config::createDefault(); foreach ($inputs as $i => $input) { - $result = $this->gen->generateFromTokens($input); + if (!isset($config[$i])) $config[$i] = $default_config; + $result = $this->gen->generateFromTokens($input, $config[$i]); $this->assertEqual($expect[$i], $result); paintIf($result, $result != $expect[$i]); } diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php index dfc0a4d3..9f71a838 100644 --- a/tests/HTMLPurifier/LexerTest.php +++ b/tests/HTMLPurifier/LexerTest.php @@ -40,11 +40,16 @@ class HTMLPurifier_LexerTest extends UnitTestCase $this->Lexer->substituteNonSpecialEntities('"') ); } + function test_extractBody() { + + } + function test_tokenizeHTML() { $input = array(); $expect = array(); $sax_expect = array(); + $config = array(); $input[0] = ''; $expect[0] = array(); @@ -221,14 +226,17 @@ class HTMLPurifier_LexerTest extends UnitTestCase $input[17] = $char_hearts; $expect[17] = array( new HTMLPurifier_Token_Text($char_hearts) ); + $default_config = HTMLPurifier_Config::createDefault(); foreach($input as $i => $discard) { - $result = $this->DirectLex->tokenizeHTML($input[$i]); + if (!isset($config[$i])) $config[$i] = $default_config; + + $result = 
$this->DirectLex->tokenizeHTML($input[$i], $config[$i]); $this->assertEqual($expect[$i], $result, 'DirectLexTest '.$i.': %s'); paintIf($result, $expect[$i] != $result); if ($this->_has_pear) { // assert unless I say otherwise - $sax_result = $this->PEARSax3->tokenizeHTML($input[$i]); + $sax_result = $this->PEARSax3->tokenizeHTML($input[$i], $config[$i]); if (!isset($sax_expect[$i])) { // by default, assert with normal result $this->assertEqual($expect[$i], $sax_result, 'PEARSax3Test '.$i.': %s'); @@ -244,7 +252,7 @@ class HTMLPurifier_LexerTest extends UnitTestCase } if ($this->_has_dom) { - $dom_result = $this->DOMLex->tokenizeHTML($input[$i]); + $dom_result = $this->DOMLex->tokenizeHTML($input[$i], $config[$i]); // same structure as SAX if (!isset($dom_expect[$i])) { $this->assertEqual($expect[$i], $dom_result, 'DOMLexTest '.$i.': %s'); diff --git a/tests/HTMLPurifier/StrategyHarness.php b/tests/HTMLPurifier/StrategyHarness.php index 4c257aea..a39daf36 100644 --- a/tests/HTMLPurifier/StrategyHarness.php +++ b/tests/HTMLPurifier/StrategyHarness.php @@ -31,7 +31,7 @@ class HTMLPurifier_StrategyHarness extends UnitTestCase $config[$i] = HTMLPurifier_Config::createDefault(); } $result_tokens = $strategy->execute($tokens, $config[$i]); - $result = $this->gen->generateFromTokens($result_tokens); + $result = $this->gen->generateFromTokens($result_tokens, $config[$i]); $this->assertEqual($expect[$i], $result, "Test $i: %s"); paintIf($result, $result != $expect[$i]); }