From 619d5d9bc140f942eb53ed38b7530725e9145e89 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Mon, 24 Jul 2006 02:49:37 +0000 Subject: [PATCH] Migrate strategies to separate classes complete. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@119 48356398-32a2-884e-a903-53898d9a118a --- TODO.txt | 12 +- library/HTMLPurifier.php | 24 +- library/HTMLPurifier/Definition.php | 282 +----------------- library/HTMLPurifier/Strategy/FixNesting.php | 2 +- .../Strategy/RemoveForeignElements.php | 2 +- tests/HTMLPurifier/DefinitionTest.php | 143 --------- ...eWellFormedTest => MakeWellFormedTest.php} | 0 .../Strategy/RemoveForeignElementsTest.php | 3 +- tests/HTMLPurifier/StrategyAbstractTest.php | 119 +------- tests/index.php | 4 +- 10 files changed, 33 insertions(+), 558 deletions(-) delete mode 100644 tests/HTMLPurifier/DefinitionTest.php rename tests/HTMLPurifier/Strategy/{MakeWellFormedTest => MakeWellFormedTest.php} (100%) diff --git a/TODO.txt b/TODO.txt index ac220de8..4f5f1c7d 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,11 +1,13 @@ -TODO +Todo List Primary: -- Finish attributes fixing + - Implement attribute validation + - Implement HTMLPurifier Secondary: -- Migrate all unit tests to use the lexer and generator + - Migrate all unit tests to use the lexer and generator Code issues: -- (In Progress) Factor PureHTMLDefinition into a set of strategies -- (?) Create a TokenFactory to prevent really long lines \ No newline at end of file + - Rename AbstractTest to Harness + - Reorganize Strategy hierarchy to minimize duplication + - (?) Create a TokenFactory to prevent really long lines \ No newline at end of file diff --git a/library/HTMLPurifier.php b/library/HTMLPurifier.php index 4b3f2eee..631e7bb8 100644 --- a/library/HTMLPurifier.php +++ b/library/HTMLPurifier.php @@ -8,10 +8,11 @@ * is safe for output onto webpages. It achieves this by: * * -# Lexing (parsing into tokens) the document, - * -# Removing all elements not in the whitelist, - * -# Making the tokens well-formed, - * -# Fixing the nesting of the nodes, - * -# Validating attributes of the nodes, and + * -# Executing various strategies on the tokens: + * -# Removing all elements not in the whitelist, + * -# Making the tokens well-formed, + * -# Fixing the nesting of the nodes, and + * -# Validating attributes of the nodes; and * -# Generating HTML from the purified tokens. * * See /docs/spec.txt for more details. @@ -31,24 +32,15 @@ require_once 'HTMLPurifier/Generator.php'; class HTMLPurifier { - var $lexer; /*!< @brief Instance of HTMLPurifier_Lexer concrete - implementation. */ - var $definition; /*!< @brief Instance of HTMLPurifier_Definition. */ - var $generator; /*!< @brief Instance of HTMLPurifier_Generator. */ - /** * Initializes the purifier. * * The constructor instantiates all necessary sub-objects to do the job, * because creating some of them (esp. HTMLPurifier_Definition) can be * expensive. - * - * @todo Accept Policy object to define configuration. */ function HTMLPurifier() { - $this->lexer = new HTMLPurifier_Lexer::create(); - $this->definition = new HTMLPurifier_Definition(); - $this->generator = new HTMLPurifier_Generator(); + // unimplemented } /** @@ -58,9 +50,7 @@ class HTMLPurifier * @return Purified HTML */ function purify($html) { - $tokens = $this->lexer->tokenizeHTML($html); - $tokens = $this->definition->purifyTokens($tokens); - return $this->generator->generateFromTokens($tokens); + // unimplemented } } diff --git a/library/HTMLPurifier/Definition.php b/library/HTMLPurifier/Definition.php index 32bc96e7..27cefc80 100644 --- a/library/HTMLPurifier/Definition.php +++ b/library/HTMLPurifier/Definition.php @@ -33,8 +33,17 @@ class HTMLPurifier_Definition 'ul' => true ); + function instance() { + static $instance = null; + if (!$instance) { + $instance = new HTMLPurifier_Definition(); + } + return $instance; + } + function HTMLPurifier_Definition() { $this->generator = new HTMLPurifier_Generator(); + $this->loadData(); } function loadData() { @@ -154,279 +163,6 @@ class HTMLPurifier_Definition } - function purifyTokens($tokens) { - if (empty($this->info)) $this->loadData(); - $tokens = $this->removeForeignElements($tokens); - $tokens = $this->makeWellFormed($tokens); - $tokens = $this->fixNesting($tokens); - $tokens = $this->validateAttributes($tokens); - return $tokens; - } - - function removeForeignElements($tokens) { - if (empty($this->info)) $this->loadData(); - $result = array(); - foreach($tokens as $token) { - if (!empty( $token->is_tag )) { - if (!isset($this->info[$token->name])) { - // invalid tag, generate HTML and insert in - $token = new HTMLPurifier_Token_Text( - $this->generator->generateFromToken($token) - ); - } - } elseif ($token->type == 'comment') { - // strip comments - continue; - } elseif ($token->type == 'text') { - } else { - continue; - } - $result[] = $token; - } - return $result; - } - - function makeWellFormed($tokens) { - if (empty($this->info)) $this->loadData(); - $result = array(); - $current_nesting = array(); - foreach ($tokens as $token) { - if (empty( $token->is_tag )) { - $result[] = $token; - continue; - } - $info = $this->info[$token->name]; // assumption but valid - - // test if it claims to be a start tag but is empty - if ($info->child_def->type == 'empty' && - $token->type == 'start' ) { - - $result[] = new HTMLPurifier_Token_Empty($token->name, - $token->attributes); - continue; - } - - // test if it claims to be empty but really is a start tag - if ($info->child_def->type != 'empty' && - $token->type == 'empty' ) { - - $result[] = new HTMLPurifier_Token_Start($token->name, - $token->attributes); - $result[] = new HTMLPurifier_Token_End($token->name); - - continue; - } - - // automatically insert empty tags - if ($token->type == 'empty') { - $result[] = $token; - continue; - } - - // we give start tags precedence, so automatically accept unless... - // it's one of those special cases - if ($token->type == 'start') { - - // if there's a parent, check for special case - if (!empty($current_nesting)) { - $current_parent = array_pop($current_nesting); - - // check if we're closing a P tag - if ($current_parent->name == 'p' && - isset($this->info_closes_p[$token->name]) - ) { - $result[] = new HTMLPurifier_Token_End('p'); - $result[] = $token; - $current_nesting[] = $token; - continue; - } - - // check if we're closing a LI tag - if ($current_parent->name == 'li' && - $token->name == 'li' - ) { - $result[] = new HTMLPurifier_Token_End('li'); - $result[] = $token; - $current_nesting[] = $token; - continue; - } - - // this is more TIDY stuff - // we should also get some TABLE related code - // mismatched h# - - $current_nesting[] = $current_parent; // undo the pop - } - - $result[] = $token; - $current_nesting[] = $token; - continue; - } - - // sanity check - if ($token->type != 'end') continue; - - // okay, we're dealing with a closing tag - - // make sure that we have something open - if (empty($current_nesting)) { - $result[] = new HTMLPurifier_Token_Text( - $this->generator->generateFromToken($token) - ); - continue; - } - - // first, check for the simplest case: everything closes neatly - - // current_nesting is modified - $current_parent = array_pop($current_nesting); - if ($current_parent->name == $token->name) { - $result[] = $token; - continue; - } - - // undo the array_pop - $current_nesting[] = $current_parent; - - // okay, so we're trying to close the wrong tag - - // scroll back the entire nest, trying to find our tag - // feature could be to specify how far you'd like to go - $size = count($current_nesting); - // -2 because -1 is the last element, but we already checked that - $skipped_tags = false; - for ($i = $size - 2; $i >= 0; $i--) { - if ($current_nesting[$i]->name == $token->name) { - // current nesting is modified - $skipped_tags = array_splice($current_nesting, $i); - break; - } - } - - // we still didn't find the tag, so translate to text - if ($skipped_tags === false) { - $result[] = new HTMLPurifier_Token_Text( - $this->generator->generateFromToken($token) - ); - continue; - } - - // okay, we found it, close all the skipped tags - // note that skipped tags contains the element we need closed - $size = count($skipped_tags); - for ($i = $size - 1; $i >= 0; $i--) { - $result[] = new HTMLPurifier_Token_End($skipped_tags[$i]->name); - } - - // done! - - } - - // we're at the end now, fix all still unclosed tags - - if (!empty($current_nesting)) { - $size = count($current_nesting); - for ($i = $size - 1; $i >= 0; $i--) { - $result[] = - new HTMLPurifier_Token_End($current_nesting[$i]->name); - } - } - - return $result; - } - - function fixNesting($tokens) { - if (empty($this->info)) $this->loadData(); - - // insert implicit "parent" node, will be removed at end - array_unshift($tokens, new HTMLPurifier_Token_Start('div')); - $tokens[] = new HTMLPurifier_Token_End('div'); - - for ($i = 0, $size = count($tokens) ; $i < $size; ) { - - $child_tokens = array(); - - // scroll to the end of this node, and report number - for ($j = $i, $depth = 0; ; $j++) { - if ($tokens[$j]->type == 'start') { - $depth++; - // skip token assignment on first iteration - if ($depth == 1) continue; - } elseif ($tokens[$j]->type == 'end') { - $depth--; - // skip token assignment on last iteration - if ($depth == 0) break; - } - $child_tokens[] = $tokens[$j]; - } - - // $i is index of start token - // $j is index of end token - - // have DTD child def validate children - $element_def = $this->info[$tokens[$i]->name]; - $result = $element_def->child_def->validateChildren($child_tokens); - - // process result - if ($result === true) { - - // leave the nodes as is - - } elseif($result === false) { - - // WARNING WARNING WARNING!!! - // While for the original DTD, there will never be - // cascading removal, more complex ones may have such - // a problem. - - // If you modify the info array such that an element - // that requires children may contain a child that requires - // children, you need to also scroll back and re-check that - // elements parent node - - $length = $j - $i + 1; - - // remove entire node - array_splice($tokens, $i, $length); - - // change size - $size -= $length; - - // ensure that we scroll to the next node - $i--; - - } else { - - $length = $j - $i - 1; - - // replace node with $result - array_splice($tokens, $i + 1, $length, $result); - - // change size - $size -= $length; - $size += count($result); - - } - - // scroll to next node - $i++; - while ($i < $size and $tokens[$i]->type != 'start') $i++; - - } - - // remove implicit divs - array_shift($tokens); - array_pop($tokens); - - return $tokens; - - } - - function validateAttributes($tokens) { - if (empty($this->info)) $this->loadData(); - - } - } class HTMLPurifier_ElementDef diff --git a/library/HTMLPurifier/Strategy/FixNesting.php b/library/HTMLPurifier/Strategy/FixNesting.php index 831a1b57..b0b2ca0e 100644 --- a/library/HTMLPurifier/Strategy/FixNesting.php +++ b/library/HTMLPurifier/Strategy/FixNesting.php @@ -8,7 +8,7 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy var $definition; - function HTMLPurifier_Definition() { + function HTMLPurifier_Strategy_FixNesting() { $this->definition = HTMLPurifier_Definition::instance(); } diff --git a/library/HTMLPurifier/Strategy/RemoveForeignElements.php b/library/HTMLPurifier/Strategy/RemoveForeignElements.php index 51d73031..7418dfc6 100644 --- a/library/HTMLPurifier/Strategy/RemoveForeignElements.php +++ b/library/HTMLPurifier/Strategy/RemoveForeignElements.php @@ -10,7 +10,7 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy var $generator; var $definition; - function HTMLPurifier_Definition() { + function HTMLPurifier_Strategy_RemoveForeignElements() { $this->generator = new HTMLPurifier_Generator(); $this->definition = HTMLPurifier_Definition::instance(); } diff --git a/tests/HTMLPurifier/DefinitionTest.php b/tests/HTMLPurifier/DefinitionTest.php deleted file mode 100644 index f1c168e8..00000000 --- a/tests/HTMLPurifier/DefinitionTest.php +++ /dev/null @@ -1,143 +0,0 @@ -UnitTestCase(); - $this->def = new HTMLPurifier_Definition(); - $this->def->loadData(); - - // we can't use the DOM lexer since it does too much stuff - // automatically, however, we should be able to use it - // interchangeably if we wanted to... - - if (true) { - $this->lex = new HTMLPurifier_Lexer_DirectLex(); - } else { - require_once 'HTMLPurifier/Lexer/DOMLex.php'; - $this->lex = new HTMLPurifier_Lexer_DOMLex(); - } - - $this->gen = new HTMLPurifier_Generator(); - } - - function test_removeForeignElements() { - - $inputs = array(); - $expect = array(); - - $inputs[0] = ''; - $expect[0] = $inputs[0]; - - $inputs[1] = 'This is bold text.'; - $expect[1] = $inputs[1]; - - // [INVALID] - $inputs[2] = 'BlingBong'; - $expect[2] = htmlspecialchars($inputs[2]); - - foreach ($inputs as $i => $input) { - $tokens = $this->lex->tokenizeHTML($input); - $result_tokens = $this->def->removeForeignElements($tokens); - $result = $this->gen->generateFromTokens($result_tokens); - $this->assertEqual($expect[$i], $result, "Test $i: %s"); - paintIf($result, $result != $expect[$i]); - } - - } - - function test_makeWellFormed() { - - $inputs = array(); - $expect = array(); - - $inputs[0] = ''; - $expect[0] = $inputs[0]; - - $inputs[1] = 'This is bold text.'; - $expect[1] = $inputs[1]; - - $inputs[2] = 'Unclosed tag, gasp!'; - $expect[2] = 'Unclosed tag, gasp!'; - - $inputs[3] = 'Bold and italic?'; - $expect[3] = 'Bold and italic?'; - - // CHANGE THIS BEHAVIOR! - $inputs[4] = 'Unused end tags... recycle!'; - $expect[4] = 'Unused end tags... recycle!</b>'; - - $inputs[5] = '
'; - $expect[5] = '
'; - - $inputs[6] = '
'; - $expect[6] = '
'; - - // test automatic paragraph closing - - $inputs[7] = '

Paragraph 1

Paragraph 2'; - $expect[7] = '

Paragraph 1

Paragraph 2

'; - - $inputs[8] = '

Paragraphs

In

A

Div

'; - $expect[8] = '

Paragraphs

In

A

Div

'; - - // automatic list closing - - $inputs[9] = '
  1. Item 1
  2. Item 2
'; - $expect[9] = '
  1. Item 1
  2. Item 2
'; - - foreach ($inputs as $i => $input) { - $tokens = $this->lex->tokenizeHTML($input); - $result_tokens = $this->def->makeWellFormed($tokens); - $result = $this->gen->generateFromTokens($result_tokens); - $this->assertEqual($expect[$i], $result, "Test $i: %s"); - paintIf($result, $result != $expect[$i]); - } - - } - - function test_fixNesting() { - $inputs = array(); - $expect = array(); - - // next id = 4 - - // legal inline nesting - $inputs[0] = 'Bold text'; - $expect[0] = $inputs[0]; - - // legal inline and block - // as the parent element is considered FLOW - $inputs[1] = 'Blank
Block
'; - $expect[1] = $inputs[1]; - - // illegal block in inline, element -> text - $inputs[2] = '
Illegal div.
'; - $expect[2] = '<div>Illegal div.</div>'; - - // test of empty set that's required, resulting in removal of node - $inputs[3] = '
    '; - $expect[3] = ''; - - // test illegal text which gets removed - $inputs[4] = '
      Illegal text
    • Legal item
    '; - $expect[4] = '
    • Legal item
    '; - - foreach ($inputs as $i => $input) { - $tokens = $this->lex->tokenizeHTML($input); - $result_tokens = $this->def->fixNesting($tokens); - $result = $this->gen->generateFromTokens($result_tokens); - $this->assertEqual($expect[$i], $result, "Test $i: %s"); - paintIf($result, $result != $expect[$i]); - } - } - -} - -?> \ No newline at end of file diff --git a/tests/HTMLPurifier/Strategy/MakeWellFormedTest b/tests/HTMLPurifier/Strategy/MakeWellFormedTest.php similarity index 100% rename from tests/HTMLPurifier/Strategy/MakeWellFormedTest rename to tests/HTMLPurifier/Strategy/MakeWellFormedTest.php diff --git a/tests/HTMLPurifier/Strategy/RemoveForeignElementsTest.php b/tests/HTMLPurifier/Strategy/RemoveForeignElementsTest.php index cee1f8fc..5dee0a45 100644 --- a/tests/HTMLPurifier/Strategy/RemoveForeignElementsTest.php +++ b/tests/HTMLPurifier/Strategy/RemoveForeignElementsTest.php @@ -1,5 +1,6 @@ $input) { $tokens = $this->lex->tokenizeHTML($input); - $result_tokens = $this->strategy->execute($tokens); + $result_tokens = $strategy->execute($tokens); $result = $this->gen->generateFromTokens($result_tokens); $this->assertEqual($expect[$i], $result, "Test $i: %s"); paintIf($result, $result != $expect[$i]); diff --git a/tests/HTMLPurifier/StrategyAbstractTest.php b/tests/HTMLPurifier/StrategyAbstractTest.php index f1c168e8..28f81dce 100644 --- a/tests/HTMLPurifier/StrategyAbstractTest.php +++ b/tests/HTMLPurifier/StrategyAbstractTest.php @@ -3,15 +3,13 @@ require_once 'HTMLPurifier/Definition.php'; require_once 'HTMLPurifier/Lexer/DirectLex.php'; -class HTMLPurifier_DefinitionTest extends UnitTestCase +class HTMLPurifier_StrategyAbstractTest extends UnitTestCase { - var $def, $lex, $gen; + var $lex, $gen; - function HTMLPurifier_DefinitionTest() { + function HTMLPurifier_StrategyAbstractTest() { $this->UnitTestCase(); - $this->def = new HTMLPurifier_Definition(); - $this->def->loadData(); // we can't use the DOM lexer since it does too much stuff // automatically, however, we should be able to use it @@ -27,117 +25,6 @@ class HTMLPurifier_DefinitionTest extends UnitTestCase $this->gen = new HTMLPurifier_Generator(); } - function test_removeForeignElements() { - - $inputs = array(); - $expect = array(); - - $inputs[0] = ''; - $expect[0] = $inputs[0]; - - $inputs[1] = 'This is bold text.'; - $expect[1] = $inputs[1]; - - // [INVALID] - $inputs[2] = 'BlingBong'; - $expect[2] = htmlspecialchars($inputs[2]); - - foreach ($inputs as $i => $input) { - $tokens = $this->lex->tokenizeHTML($input); - $result_tokens = $this->def->removeForeignElements($tokens); - $result = $this->gen->generateFromTokens($result_tokens); - $this->assertEqual($expect[$i], $result, "Test $i: %s"); - paintIf($result, $result != $expect[$i]); - } - - } - - function test_makeWellFormed() { - - $inputs = array(); - $expect = array(); - - $inputs[0] = ''; - $expect[0] = $inputs[0]; - - $inputs[1] = 'This is bold text.'; - $expect[1] = $inputs[1]; - - $inputs[2] = 'Unclosed tag, gasp!'; - $expect[2] = 'Unclosed tag, gasp!'; - - $inputs[3] = 'Bold and italic?'; - $expect[3] = 'Bold and italic?'; - - // CHANGE THIS BEHAVIOR! - $inputs[4] = 'Unused end tags... recycle!'; - $expect[4] = 'Unused end tags... recycle!</b>'; - - $inputs[5] = '
    '; - $expect[5] = '
    '; - - $inputs[6] = '
    '; - $expect[6] = '
    '; - - // test automatic paragraph closing - - $inputs[7] = '

    Paragraph 1

    Paragraph 2'; - $expect[7] = '

    Paragraph 1

    Paragraph 2

    '; - - $inputs[8] = '

    Paragraphs

    In

    A

    Div

    '; - $expect[8] = '

    Paragraphs

    In

    A

    Div

    '; - - // automatic list closing - - $inputs[9] = '
    1. Item 1
    2. Item 2
    '; - $expect[9] = '
    1. Item 1
    2. Item 2
    '; - - foreach ($inputs as $i => $input) { - $tokens = $this->lex->tokenizeHTML($input); - $result_tokens = $this->def->makeWellFormed($tokens); - $result = $this->gen->generateFromTokens($result_tokens); - $this->assertEqual($expect[$i], $result, "Test $i: %s"); - paintIf($result, $result != $expect[$i]); - } - - } - - function test_fixNesting() { - $inputs = array(); - $expect = array(); - - // next id = 4 - - // legal inline nesting - $inputs[0] = 'Bold text'; - $expect[0] = $inputs[0]; - - // legal inline and block - // as the parent element is considered FLOW - $inputs[1] = 'Blank
    Block
    '; - $expect[1] = $inputs[1]; - - // illegal block in inline, element -> text - $inputs[2] = '
    Illegal div.
    '; - $expect[2] = '<div>Illegal div.</div>'; - - // test of empty set that's required, resulting in removal of node - $inputs[3] = '
      '; - $expect[3] = ''; - - // test illegal text which gets removed - $inputs[4] = '
        Illegal text
      • Legal item
      '; - $expect[4] = '
      • Legal item
      '; - - foreach ($inputs as $i => $input) { - $tokens = $this->lex->tokenizeHTML($input); - $result_tokens = $this->def->fixNesting($tokens); - $result = $this->gen->generateFromTokens($result_tokens); - $this->assertEqual($expect[$i], $result, "Test $i: %s"); - paintIf($result, $result != $expect[$i]); - } - } - } ?> \ No newline at end of file diff --git a/tests/index.php b/tests/index.php index 2dcd7694..c3283e76 100644 --- a/tests/index.php +++ b/tests/index.php @@ -16,10 +16,12 @@ $test = new GroupTest('HTMLPurifier'); $test->addTestFile('HTMLPurifier/LexerTest.php'); $test->addTestFile('HTMLPurifier/Lexer/DirectLexTest.php'); //$test->addTestFile('TokenTest.php'); -$test->addTestFile('HTMLPurifier/DefinitionTest.php'); $test->addTestFile('HTMLPurifier/ChildDefTest.php'); $test->addTestFile('HTMLPurifier/GeneratorTest.php'); $test->addTestFile('HTMLPurifier/EntityLookupTest.php'); +$test->addTestFile('HTMLPurifier/Strategy/RemoveForeignElementsTest.php'); +$test->addTestFile('HTMLPurifier/Strategy/MakeWellFormedTest.php'); +$test->addTestFile('HTMLPurifier/Strategy/FixNestingTest.php'); $test->run( new HtmlReporter() );