diff --git a/TODO.txt b/TODO.txt
index ac220de8..4f5f1c7d 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -1,11 +1,13 @@
-TODO
+Todo List
Primary:
-- Finish attributes fixing
+ - Implement attribute validation
+ - Implement HTMLPurifier
Secondary:
-- Migrate all unit tests to use the lexer and generator
+ - Migrate all unit tests to use the lexer and generator
Code issues:
-- (In Progress) Factor PureHTMLDefinition into a set of strategies
-- (?) Create a TokenFactory to prevent really long lines
\ No newline at end of file
+ - Rename AbstractTest to Harness
+ - Reorganize Strategy hierarchy to minimize duplication
+ - (?) Create a TokenFactory to prevent really long lines
\ No newline at end of file
diff --git a/library/HTMLPurifier.php b/library/HTMLPurifier.php
index 4b3f2eee..631e7bb8 100644
--- a/library/HTMLPurifier.php
+++ b/library/HTMLPurifier.php
@@ -8,10 +8,11 @@
* is safe for output onto webpages. It achieves this by:
*
* -# Lexing (parsing into tokens) the document,
- * -# Removing all elements not in the whitelist,
- * -# Making the tokens well-formed,
- * -# Fixing the nesting of the nodes,
- * -# Validating attributes of the nodes, and
+ * -# Executing various strategies on the tokens:
+ * -# Removing all elements not in the whitelist,
+ * -# Making the tokens well-formed,
+ * -# Fixing the nesting of the nodes, and
+ * -# Validating attributes of the nodes; and
* -# Generating HTML from the purified tokens.
*
* See /docs/spec.txt for more details.
@@ -31,24 +32,15 @@ require_once 'HTMLPurifier/Generator.php';
class HTMLPurifier
{
- var $lexer; /*!< @brief Instance of HTMLPurifier_Lexer concrete
- implementation. */
- var $definition; /*!< @brief Instance of HTMLPurifier_Definition. */
- var $generator; /*!< @brief Instance of HTMLPurifier_Generator. */
-
/**
* Initializes the purifier.
*
* The constructor instantiates all necessary sub-objects to do the job,
* because creating some of them (esp. HTMLPurifier_Definition) can be
* expensive.
- *
- * @todo Accept Policy object to define configuration.
*/
function HTMLPurifier() {
- $this->lexer = new HTMLPurifier_Lexer::create();
- $this->definition = new HTMLPurifier_Definition();
- $this->generator = new HTMLPurifier_Generator();
+ // unimplemented: no sub-objects are instantiated yet, despite the docblock above
}
/**
@@ -58,9 +50,7 @@ class HTMLPurifier
* @return Purified HTML
*/
function purify($html) {
- $tokens = $this->lexer->tokenizeHTML($html);
- $tokens = $this->definition->purifyTokens($tokens);
- return $this->generator->generateFromTokens($tokens);
+ // unimplemented: no return statement yet, so callers receive null instead of purified HTML
}
}
diff --git a/library/HTMLPurifier/Definition.php b/library/HTMLPurifier/Definition.php
index 32bc96e7..27cefc80 100644
--- a/library/HTMLPurifier/Definition.php
+++ b/library/HTMLPurifier/Definition.php
@@ -33,8 +33,17 @@ class HTMLPurifier_Definition
'ul' => true
);
+ function instance() {
+ static $instance = null;
+ if (!$instance) {
+ $instance = new HTMLPurifier_Definition();
+ }
+ return $instance;
+ }
+
function HTMLPurifier_Definition() {
$this->generator = new HTMLPurifier_Generator();
+ $this->loadData();
}
function loadData() {
@@ -154,279 +163,6 @@ class HTMLPurifier_Definition
}
- function purifyTokens($tokens) {
- if (empty($this->info)) $this->loadData();
- $tokens = $this->removeForeignElements($tokens);
- $tokens = $this->makeWellFormed($tokens);
- $tokens = $this->fixNesting($tokens);
- $tokens = $this->validateAttributes($tokens);
- return $tokens;
- }
-
- function removeForeignElements($tokens) {
- if (empty($this->info)) $this->loadData();
- $result = array();
- foreach($tokens as $token) {
- if (!empty( $token->is_tag )) {
- if (!isset($this->info[$token->name])) {
- // invalid tag, generate HTML and insert in
- $token = new HTMLPurifier_Token_Text(
- $this->generator->generateFromToken($token)
- );
- }
- } elseif ($token->type == 'comment') {
- // strip comments
- continue;
- } elseif ($token->type == 'text') {
- } else {
- continue;
- }
- $result[] = $token;
- }
- return $result;
- }
-
- function makeWellFormed($tokens) {
- if (empty($this->info)) $this->loadData();
- $result = array();
- $current_nesting = array();
- foreach ($tokens as $token) {
- if (empty( $token->is_tag )) {
- $result[] = $token;
- continue;
- }
- $info = $this->info[$token->name]; // assumption but valid
-
- // test if it claims to be a start tag but is empty
- if ($info->child_def->type == 'empty' &&
- $token->type == 'start' ) {
-
- $result[] = new HTMLPurifier_Token_Empty($token->name,
- $token->attributes);
- continue;
- }
-
- // test if it claims to be empty but really is a start tag
- if ($info->child_def->type != 'empty' &&
- $token->type == 'empty' ) {
-
- $result[] = new HTMLPurifier_Token_Start($token->name,
- $token->attributes);
- $result[] = new HTMLPurifier_Token_End($token->name);
-
- continue;
- }
-
- // automatically insert empty tags
- if ($token->type == 'empty') {
- $result[] = $token;
- continue;
- }
-
- // we give start tags precedence, so automatically accept unless...
- // it's one of those special cases
- if ($token->type == 'start') {
-
- // if there's a parent, check for special case
- if (!empty($current_nesting)) {
- $current_parent = array_pop($current_nesting);
-
- // check if we're closing a P tag
- if ($current_parent->name == 'p' &&
- isset($this->info_closes_p[$token->name])
- ) {
- $result[] = new HTMLPurifier_Token_End('p');
- $result[] = $token;
- $current_nesting[] = $token;
- continue;
- }
-
- // check if we're closing a LI tag
- if ($current_parent->name == 'li' &&
- $token->name == 'li'
- ) {
- $result[] = new HTMLPurifier_Token_End('li');
- $result[] = $token;
- $current_nesting[] = $token;
- continue;
- }
-
- // this is more TIDY stuff
- // we should also get some TABLE related code
- // mismatched h#
-
- $current_nesting[] = $current_parent; // undo the pop
- }
-
- $result[] = $token;
- $current_nesting[] = $token;
- continue;
- }
-
- // sanity check
- if ($token->type != 'end') continue;
-
- // okay, we're dealing with a closing tag
-
- // make sure that we have something open
- if (empty($current_nesting)) {
- $result[] = new HTMLPurifier_Token_Text(
- $this->generator->generateFromToken($token)
- );
- continue;
- }
-
- // first, check for the simplest case: everything closes neatly
-
- // current_nesting is modified
- $current_parent = array_pop($current_nesting);
- if ($current_parent->name == $token->name) {
- $result[] = $token;
- continue;
- }
-
- // undo the array_pop
- $current_nesting[] = $current_parent;
-
- // okay, so we're trying to close the wrong tag
-
- // scroll back the entire nest, trying to find our tag
- // feature could be to specify how far you'd like to go
- $size = count($current_nesting);
- // -2 because -1 is the last element, but we already checked that
- $skipped_tags = false;
- for ($i = $size - 2; $i >= 0; $i--) {
- if ($current_nesting[$i]->name == $token->name) {
- // current nesting is modified
- $skipped_tags = array_splice($current_nesting, $i);
- break;
- }
- }
-
- // we still didn't find the tag, so translate to text
- if ($skipped_tags === false) {
- $result[] = new HTMLPurifier_Token_Text(
- $this->generator->generateFromToken($token)
- );
- continue;
- }
-
- // okay, we found it, close all the skipped tags
- // note that skipped tags contains the element we need closed
- $size = count($skipped_tags);
- for ($i = $size - 1; $i >= 0; $i--) {
- $result[] = new HTMLPurifier_Token_End($skipped_tags[$i]->name);
- }
-
- // done!
-
- }
-
- // we're at the end now, fix all still unclosed tags
-
- if (!empty($current_nesting)) {
- $size = count($current_nesting);
- for ($i = $size - 1; $i >= 0; $i--) {
- $result[] =
- new HTMLPurifier_Token_End($current_nesting[$i]->name);
- }
- }
-
- return $result;
- }
-
- function fixNesting($tokens) {
- if (empty($this->info)) $this->loadData();
-
- // insert implicit "parent" node, will be removed at end
- array_unshift($tokens, new HTMLPurifier_Token_Start('div'));
- $tokens[] = new HTMLPurifier_Token_End('div');
-
- for ($i = 0, $size = count($tokens) ; $i < $size; ) {
-
- $child_tokens = array();
-
- // scroll to the end of this node, and report number
- for ($j = $i, $depth = 0; ; $j++) {
- if ($tokens[$j]->type == 'start') {
- $depth++;
- // skip token assignment on first iteration
- if ($depth == 1) continue;
- } elseif ($tokens[$j]->type == 'end') {
- $depth--;
- // skip token assignment on last iteration
- if ($depth == 0) break;
- }
- $child_tokens[] = $tokens[$j];
- }
-
- // $i is index of start token
- // $j is index of end token
-
- // have DTD child def validate children
- $element_def = $this->info[$tokens[$i]->name];
- $result = $element_def->child_def->validateChildren($child_tokens);
-
- // process result
- if ($result === true) {
-
- // leave the nodes as is
-
- } elseif($result === false) {
-
- // WARNING WARNING WARNING!!!
- // While for the original DTD, there will never be
- // cascading removal, more complex ones may have such
- // a problem.
-
- // If you modify the info array such that an element
- // that requires children may contain a child that requires
- // children, you need to also scroll back and re-check that
- // elements parent node
-
- $length = $j - $i + 1;
-
- // remove entire node
- array_splice($tokens, $i, $length);
-
- // change size
- $size -= $length;
-
- // ensure that we scroll to the next node
- $i--;
-
- } else {
-
- $length = $j - $i - 1;
-
- // replace node with $result
- array_splice($tokens, $i + 1, $length, $result);
-
- // change size
- $size -= $length;
- $size += count($result);
-
- }
-
- // scroll to next node
- $i++;
- while ($i < $size and $tokens[$i]->type != 'start') $i++;
-
- }
-
- // remove implicit divs
- array_shift($tokens);
- array_pop($tokens);
-
- return $tokens;
-
- }
-
- function validateAttributes($tokens) {
- if (empty($this->info)) $this->loadData();
-
- }
-
}
class HTMLPurifier_ElementDef
diff --git a/library/HTMLPurifier/Strategy/FixNesting.php b/library/HTMLPurifier/Strategy/FixNesting.php
index 831a1b57..b0b2ca0e 100644
--- a/library/HTMLPurifier/Strategy/FixNesting.php
+++ b/library/HTMLPurifier/Strategy/FixNesting.php
@@ -8,7 +8,7 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
var $definition;
- function HTMLPurifier_Definition() {
+ function HTMLPurifier_Strategy_FixNesting() {
$this->definition = HTMLPurifier_Definition::instance();
}
diff --git a/library/HTMLPurifier/Strategy/RemoveForeignElements.php b/library/HTMLPurifier/Strategy/RemoveForeignElements.php
index 51d73031..7418dfc6 100644
--- a/library/HTMLPurifier/Strategy/RemoveForeignElements.php
+++ b/library/HTMLPurifier/Strategy/RemoveForeignElements.php
@@ -10,7 +10,7 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
var $generator;
var $definition;
- function HTMLPurifier_Definition() {
+ function HTMLPurifier_Strategy_RemoveForeignElements() {
$this->generator = new HTMLPurifier_Generator();
$this->definition = HTMLPurifier_Definition::instance();
}
diff --git a/tests/HTMLPurifier/DefinitionTest.php b/tests/HTMLPurifier/DefinitionTest.php
deleted file mode 100644
index f1c168e8..00000000
--- a/tests/HTMLPurifier/DefinitionTest.php
+++ /dev/null
@@ -1,143 +0,0 @@
-UnitTestCase();
- $this->def = new HTMLPurifier_Definition();
- $this->def->loadData();
-
- // we can't use the DOM lexer since it does too much stuff
- // automatically, however, we should be able to use it
- // interchangeably if we wanted to...
-
- if (true) {
- $this->lex = new HTMLPurifier_Lexer_DirectLex();
- } else {
- require_once 'HTMLPurifier/Lexer/DOMLex.php';
- $this->lex = new HTMLPurifier_Lexer_DOMLex();
- }
-
- $this->gen = new HTMLPurifier_Generator();
- }
-
- function test_removeForeignElements() {
-
- $inputs = array();
- $expect = array();
-
- $inputs[0] = '';
- $expect[0] = $inputs[0];
-
- $inputs[1] = 'This is bold text.';
- $expect[1] = $inputs[1];
-
- // [INVALID]
- $inputs[2] = '
';
- $expect[5] = '
';
-
- $inputs[6] = '
Paragraph 1
Paragraph 2'; - $expect[7] = '
Paragraph 1
Paragraph 2
'; - - $inputs[8] = 'Paragraphs
In
A
Div
Paragraphs
In
A
Div
Paragraph 1
Paragraph 2'; - $expect[7] = '
Paragraph 1
Paragraph 2
'; - - $inputs[8] = 'Paragraphs
In
A
Div
Paragraphs
In
A
Div