0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-03-23 14:27:02 +00:00

Outfit a bunch of other classes so they can accept a configuration object. Put in basic scaffolding for extractBody() functionality.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@257 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2006-08-15 00:31:12 +00:00
parent 24c64dbbac
commit d7140f2e05
14 changed files with 75 additions and 26 deletions

View File

@ -9,3 +9,10 @@ are nevertheless error checking and a centralized configuration object.
Directives are divided into namespaces, indicating the major portion of Directives are divided into namespaces, indicating the major portion of
functionality they cover (although there may be overlaps). Please consult functionality they cover (although there may be overlaps). Please consult
the documentation in ConfigDef for more information on these namespaces. the documentation in ConfigDef for more information on these namespaces.
Since configuration is dependent on context, most of the internal classes
require a configuration object to be passed as a parameter. However, a few
methods make this optional: they will supply a default configuration object
if none is passed. These are: HTMLPurifier::*, Generator::generateFromTokens
and Lexer::tokenizeHTML. Nevertheless, whenever a valid configuration object
is available, that object should be passed.

View File

@ -59,8 +59,10 @@ class HTMLPurifier
$generator = new HTMLPurifier_Generator(); $generator = new HTMLPurifier_Generator();
return $generator->generateFromTokens( return $generator->generateFromTokens(
$strategy->execute( $strategy->execute(
$lexer->tokenizeHTML($html), $config $lexer->tokenizeHTML($html, $config),
) $config
),
$config
); );
} }

View File

@ -137,7 +137,7 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
$is_deleting = true; $is_deleting = true;
if ($pcdata_allowed && $escape_invalid_children) { if ($pcdata_allowed && $escape_invalid_children) {
$result[] = new HTMLPurifier_Token_Text( $result[] = new HTMLPurifier_Token_Text(
$this->gen->generateFromToken($token) $this->gen->generateFromToken($token, $config)
); );
} }
continue; continue;
@ -148,7 +148,7 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
} elseif ($pcdata_allowed && $escape_invalid_children) { } elseif ($pcdata_allowed && $escape_invalid_children) {
$result[] = $result[] =
new HTMLPurifier_Token_Text( new HTMLPurifier_Token_Text(
$this->gen->generateFromToken( $token ) $this->gen->generateFromToken( $token, $config )
); );
} else { } else {
// drop silently // drop silently

View File

@ -5,26 +5,28 @@
class HTMLPurifier_Generator class HTMLPurifier_Generator
{ {
function generateFromTokens($tokens) { // only unit tests may omit configuration: internals MUST pass config
function generateFromTokens($tokens, $config = null) {
$html = ''; $html = '';
if (!$config) $config = HTMLPurifier_Config::createDefault();
if (!$tokens) return ''; if (!$tokens) return '';
foreach ($tokens as $token) { foreach ($tokens as $token) {
$html .= $this->generateFromToken($token); $html .= $this->generateFromToken($token, $config);
} }
return $html; return $html;
} }
function generateFromToken($token) { function generateFromToken($token, $config) {
if (!isset($token->type)) return ''; if (!isset($token->type)) return '';
if ($token->type == 'start') { if ($token->type == 'start') {
$attr = $this->generateAttributes($token->attributes); $attr = $this->generateAttributes($token->attributes, $config);
return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>'; return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';
} elseif ($token->type == 'end') { } elseif ($token->type == 'end') {
return '</' . $token->name . '>'; return '</' . $token->name . '>';
} elseif ($token->type == 'empty') { } elseif ($token->type == 'empty') {
$attr = $this->generateAttributes($token->attributes); $attr = $this->generateAttributes($token->attributes, $config);
return '<' . $token->name . ($attr ? ' ' : '') . $attr . ' />'; return '<' . $token->name . ($attr ? ' ' : '') . $attr . ' />';
} elseif ($token->type == 'text') { } elseif ($token->type == 'text') {
@ -36,7 +38,7 @@ class HTMLPurifier_Generator
} }
} }
function generateAttributes($assoc_array_of_attributes) { function generateAttributes($assoc_array_of_attributes, $config) {
$html = ''; $html = '';
foreach ($assoc_array_of_attributes as $key => $value) { foreach ($assoc_array_of_attributes as $key => $value) {
$html .= $key.'="'.htmlspecialchars($value, ENT_COMPAT, 'UTF-8').'" '; $html .= $key.'="'.htmlspecialchars($value, ENT_COMPAT, 'UTF-8').'" ';

View File

@ -2,6 +2,14 @@
require_once 'HTMLPurifier/Token.php'; require_once 'HTMLPurifier/Token.php';
HTMLPurifier_ConfigDef::define(
'Core', 'AcceptFullDocuments', true,
'This parameter determines whether or not the filter should accept full '.
'HTML documents, not just HTML fragments. When on, it will '.
'drop all sections except the content between body. Depending on '.
'the implementation in use, this may speed up document parse times.'
);
/** /**
* Forgivingly lexes HTML (SGML-style) markup into tokens. * Forgivingly lexes HTML (SGML-style) markup into tokens.
* *
@ -52,7 +60,7 @@ class HTMLPurifier_Lexer
* @param $string String HTML. * @param $string String HTML.
* @return HTMLPurifier_Token array representation of HTML. * @return HTMLPurifier_Token array representation of HTML.
*/ */
function tokenizeHTML($string) { function tokenizeHTML($string, $config = null) {
trigger_error('Call to abstract class', E_USER_ERROR); trigger_error('Call to abstract class', E_USER_ERROR);
} }
@ -228,6 +236,14 @@ class HTMLPurifier_Lexer
return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8'); return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
} }
/**
* Takes a string of HTML (fragment or document) and returns the content.
*
* Scaffolding only. Input that does not contain the literal substring
* '<html' (the check is case-sensitive) is treated as a fragment and is
* returned unchanged. The full-document branch is not implemented yet, so
* any other input falls off the end of the function and yields null.
*
* @param $html String of HTML, either a fragment or a full document.
* @return String the unchanged input when it is a fragment; null for
*         documents until the extraction logic is written.
*/
function extractBody($html) {
if (strpos($html, '<html') === false) return $html; // already fragment
// ...
// NOTE(review): extraction of the document's body content is still missing
// here; presumably it should return the markup between the body tags.
}
} }
?> ?>

View File

@ -25,7 +25,9 @@ require_once 'HTMLPurifier/Lexer.php';
class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
{ {
public function tokenizeHTML($string) { public function tokenizeHTML($string, $config = null) {
if (!$config) $config = HTMLPurifier_Config::createDefault();
$doc = new DOMDocument(); $doc = new DOMDocument();
$doc->encoding = 'UTF-8'; // technically does nothing, but comprehensive $doc->encoding = 'UTF-8'; // technically does nothing, but comprehensive

View File

@ -106,7 +106,9 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
} }
} }
function tokenizeHTML($string) { function tokenizeHTML($string, $config = null) {
if (!$config) $config = HTMLPurifier_Config::createDefault();
// some quick checking (if empty, return empty) // some quick checking (if empty, return empty)
$string = @ (string) $string; $string = @ (string) $string;

View File

@ -29,7 +29,8 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
*/ */
var $tokens = array(); var $tokens = array();
function tokenizeHTML($html) { function tokenizeHTML($html, $config = null) {
if (!$config) $config = HTMLPurifier_Config::createDefault();
$html = $this->escapeCDATA($html); $html = $this->escapeCDATA($html);
$html = $this->substituteNonSpecialEntities($html); $html = $this->substituteNonSpecialEntities($html);
$parser=& new XML_HTMLSax3(); $parser=& new XML_HTMLSax3();

View File

@ -87,7 +87,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// make sure that we have something open // make sure that we have something open
if (empty($current_nesting)) { if (empty($current_nesting)) {
$result[] = new HTMLPurifier_Token_Text( $result[] = new HTMLPurifier_Token_Text(
$this->generator->generateFromToken($token) $this->generator->generateFromToken($token, $config)
); );
continue; continue;
} }
@ -122,7 +122,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// we still didn't find the tag, so translate to text // we still didn't find the tag, so translate to text
if ($skipped_tags === false) { if ($skipped_tags === false) {
$result[] = new HTMLPurifier_Token_Text( $result[] = new HTMLPurifier_Token_Text(
$this->generator->generateFromToken($token) $this->generator->generateFromToken($token, $config)
); );
continue; continue;
} }

View File

@ -43,7 +43,7 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
} else { } else {
// invalid tag, generate HTML and insert in // invalid tag, generate HTML and insert in
$token = new HTMLPurifier_Token_Text( $token = new HTMLPurifier_Token_Text(
$this->generator->generateFromToken($token) $this->generator->generateFromToken($token, $config)
); );
} }
} elseif ($token->type == 'comment') { } elseif ($token->type == 'comment') {

View File

@ -19,7 +19,6 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase
function assertSeries($inputs, $expect, $config, $context = array()) { function assertSeries($inputs, $expect, $config, $context = array()) {
foreach ($inputs as $i => $input) { foreach ($inputs as $i => $input) {
$tokens = $this->lex->tokenizeHTML($input);
if (!isset($context[$i])) { if (!isset($context[$i])) {
$context[$i] = null; $context[$i] = null;
@ -28,12 +27,13 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase
$config[$i] = HTMLPurifier_Config::createDefault(); $config[$i] = HTMLPurifier_Config::createDefault();
} }
$tokens = $this->lex->tokenizeHTML($input, $config[$i]);
$result = $this->def->validateChildren($tokens, $config[$i], $context[$i]); $result = $this->def->validateChildren($tokens, $config[$i], $context[$i]);
if (is_bool($expect[$i])) { if (is_bool($expect[$i])) {
$this->assertIdentical($expect[$i], $result, "Test $i: %s"); $this->assertIdentical($expect[$i], $result, "Test $i: %s");
} else { } else {
$result_html = $this->gen->generateFromTokens($result); $result_html = $this->gen->generateFromTokens($result, $config[$i]);
$this->assertIdentical($expect[$i], $result_html, "Test $i: %s"); $this->assertIdentical($expect[$i], $result_html, "Test $i: %s");
paintIf($result_html, $result_html != $expect[$i]); paintIf($result_html, $result_html != $expect[$i]);
} }

View File

@ -19,6 +19,7 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase
$inputs = array(); $inputs = array();
$expect = array(); $expect = array();
$config = array();
$inputs[0] = new HTMLPurifier_Token_Text('Foobar.<>'); $inputs[0] = new HTMLPurifier_Token_Text('Foobar.<>');
$expect[0] = 'Foobar.&lt;&gt;'; $expect[0] = 'Foobar.&lt;&gt;';
@ -51,8 +52,10 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase
$inputs[7] = new HTMLPurifier_Token_Text($theta_char); $inputs[7] = new HTMLPurifier_Token_Text($theta_char);
$expect[7] = $theta_char; $expect[7] = $theta_char;
$default_config = HTMLPurifier_Config::createDefault();
foreach ($inputs as $i => $input) { foreach ($inputs as $i => $input) {
$result = $this->gen->generateFromToken($input); if (!isset($config[$i])) $config[$i] = $default_config;
$result = $this->gen->generateFromToken($input, $config[$i]);
$this->assertEqual($result, $expect[$i]); $this->assertEqual($result, $expect[$i]);
paintIf($result, $result != $expect[$i]); paintIf($result, $result != $expect[$i]);
} }
@ -63,6 +66,7 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase
$inputs = array(); $inputs = array();
$expect = array(); $expect = array();
$config = array();
$inputs[0] = array(); $inputs[0] = array();
$expect[0] = ''; $expect[0] = '';
@ -81,8 +85,10 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase
$inputs[4] = array('title' => 'Theta is ' . $theta_char); $inputs[4] = array('title' => 'Theta is ' . $theta_char);
$expect[4] = 'title="Theta is ' . $theta_char . '"'; $expect[4] = 'title="Theta is ' . $theta_char . '"';
$default_config = HTMLPurifier_Config::createDefault();
foreach ($inputs as $i => $input) { foreach ($inputs as $i => $input) {
$result = $this->gen->generateAttributes($input); if (!isset($config[$i])) $config[$i] = $default_config;
$result = $this->gen->generateAttributes($input, $config[$i]);
$this->assertEqual($result, $expect[$i]); $this->assertEqual($result, $expect[$i]);
paintIf($result, $result != $expect[$i]); paintIf($result, $result != $expect[$i]);
} }
@ -93,6 +99,7 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase
$inputs = array(); $inputs = array();
$expect = array(); $expect = array();
$config = array();
$inputs[0] = array( $inputs[0] = array(
new HTMLPurifier_Token_Start('b'), new HTMLPurifier_Token_Start('b'),
@ -104,8 +111,10 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase
$inputs[1] = array(); $inputs[1] = array();
$expect[1] = ''; $expect[1] = '';
$default_config = HTMLPurifier_Config::createDefault();
foreach ($inputs as $i => $input) { foreach ($inputs as $i => $input) {
$result = $this->gen->generateFromTokens($input); if (!isset($config[$i])) $config[$i] = $default_config;
$result = $this->gen->generateFromTokens($input, $config[$i]);
$this->assertEqual($expect[$i], $result); $this->assertEqual($expect[$i], $result);
paintIf($result, $result != $expect[$i]); paintIf($result, $result != $expect[$i]);
} }

View File

@ -40,11 +40,16 @@ class HTMLPurifier_LexerTest extends UnitTestCase
$this->Lexer->substituteNonSpecialEntities('"') ); $this->Lexer->substituteNonSpecialEntities('"') );
} }
// Placeholder test for Lexer::extractBody(): it contains no assertions yet,
// so it exercises nothing.
function test_extractBody() {
}
function test_tokenizeHTML() { function test_tokenizeHTML() {
$input = array(); $input = array();
$expect = array(); $expect = array();
$sax_expect = array(); $sax_expect = array();
$config = array();
$input[0] = ''; $input[0] = '';
$expect[0] = array(); $expect[0] = array();
@ -221,14 +226,17 @@ class HTMLPurifier_LexerTest extends UnitTestCase
$input[17] = $char_hearts; $input[17] = $char_hearts;
$expect[17] = array( new HTMLPurifier_Token_Text($char_hearts) ); $expect[17] = array( new HTMLPurifier_Token_Text($char_hearts) );
$default_config = HTMLPurifier_Config::createDefault();
foreach($input as $i => $discard) { foreach($input as $i => $discard) {
$result = $this->DirectLex->tokenizeHTML($input[$i]); if (!isset($config[$i])) $config[$i] = $default_config;
$result = $this->DirectLex->tokenizeHTML($input[$i], $config[$i]);
$this->assertEqual($expect[$i], $result, 'DirectLexTest '.$i.': %s'); $this->assertEqual($expect[$i], $result, 'DirectLexTest '.$i.': %s');
paintIf($result, $expect[$i] != $result); paintIf($result, $expect[$i] != $result);
if ($this->_has_pear) { if ($this->_has_pear) {
// assert unless I say otherwise // assert unless I say otherwise
$sax_result = $this->PEARSax3->tokenizeHTML($input[$i]); $sax_result = $this->PEARSax3->tokenizeHTML($input[$i], $config[$i]);
if (!isset($sax_expect[$i])) { if (!isset($sax_expect[$i])) {
// by default, assert with normal result // by default, assert with normal result
$this->assertEqual($expect[$i], $sax_result, 'PEARSax3Test '.$i.': %s'); $this->assertEqual($expect[$i], $sax_result, 'PEARSax3Test '.$i.': %s');
@ -244,7 +252,7 @@ class HTMLPurifier_LexerTest extends UnitTestCase
} }
if ($this->_has_dom) { if ($this->_has_dom) {
$dom_result = $this->DOMLex->tokenizeHTML($input[$i]); $dom_result = $this->DOMLex->tokenizeHTML($input[$i], $config[$i]);
// same structure as SAX // same structure as SAX
if (!isset($dom_expect[$i])) { if (!isset($dom_expect[$i])) {
$this->assertEqual($expect[$i], $dom_result, 'DOMLexTest '.$i.': %s'); $this->assertEqual($expect[$i], $dom_result, 'DOMLexTest '.$i.': %s');

View File

@ -31,7 +31,7 @@ class HTMLPurifier_StrategyHarness extends UnitTestCase
$config[$i] = HTMLPurifier_Config::createDefault(); $config[$i] = HTMLPurifier_Config::createDefault();
} }
$result_tokens = $strategy->execute($tokens, $config[$i]); $result_tokens = $strategy->execute($tokens, $config[$i]);
$result = $this->gen->generateFromTokens($result_tokens); $result = $this->gen->generateFromTokens($result_tokens, $config[$i]);
$this->assertEqual($expect[$i], $result, "Test $i: %s"); $this->assertEqual($expect[$i], $result, "Test $i: %s");
paintIf($result, $result != $expect[$i]); paintIf($result, $result != $expect[$i]);
} }