diff --git a/configdoc/usage.xml b/configdoc/usage.xml index 983d0bba..56eeafee 100644 --- a/configdoc/usage.xml +++ b/configdoc/usage.xml @@ -498,7 +498,7 @@ - 64 + 67 diff --git a/library/HTMLPurifier.includes.php b/library/HTMLPurifier.includes.php index d4904da0..28a93de9 100644 --- a/library/HTMLPurifier.includes.php +++ b/library/HTMLPurifier.includes.php @@ -19,6 +19,7 @@ */ require 'HTMLPurifier.php'; +require 'HTMLPurifier/Arborize.php'; require 'HTMLPurifier/AttrCollections.php'; require 'HTMLPurifier/AttrDef.php'; require 'HTMLPurifier/AttrTransform.php'; @@ -54,6 +55,7 @@ require 'HTMLPurifier/Language.php'; require 'HTMLPurifier/LanguageFactory.php'; require 'HTMLPurifier/Length.php'; require 'HTMLPurifier/Lexer.php'; +require 'HTMLPurifier/Node.php'; require 'HTMLPurifier/PercentEncoder.php'; require 'HTMLPurifier/PropertyList.php'; require 'HTMLPurifier/PropertyListIterator.php'; @@ -191,6 +193,9 @@ require 'HTMLPurifier/Injector/RemoveSpansWithoutAttributes.php'; require 'HTMLPurifier/Injector/SafeObject.php'; require 'HTMLPurifier/Lexer/DOMLex.php'; require 'HTMLPurifier/Lexer/DirectLex.php'; +require 'HTMLPurifier/Node/Comment.php'; +require 'HTMLPurifier/Node/Element.php'; +require 'HTMLPurifier/Node/Text.php'; require 'HTMLPurifier/Strategy/Composite.php'; require 'HTMLPurifier/Strategy/Core.php'; require 'HTMLPurifier/Strategy/FixNesting.php'; diff --git a/library/HTMLPurifier.safe-includes.php b/library/HTMLPurifier.safe-includes.php index c87d0411..9dea6d1e 100644 --- a/library/HTMLPurifier.safe-includes.php +++ b/library/HTMLPurifier.safe-includes.php @@ -13,6 +13,7 @@ $__dir = dirname(__FILE__); require_once $__dir . '/HTMLPurifier.php'; +require_once $__dir . '/HTMLPurifier/Arborize.php'; require_once $__dir . '/HTMLPurifier/AttrCollections.php'; require_once $__dir . '/HTMLPurifier/AttrDef.php'; require_once $__dir . '/HTMLPurifier/AttrTransform.php'; @@ -48,6 +49,7 @@ require_once $__dir . '/HTMLPurifier/Language.php'; require_once $__dir . '/HTMLPurifier/LanguageFactory.php'; require_once $__dir . '/HTMLPurifier/Length.php'; require_once $__dir . '/HTMLPurifier/Lexer.php'; +require_once $__dir . '/HTMLPurifier/Node.php'; require_once $__dir . '/HTMLPurifier/PercentEncoder.php'; require_once $__dir . '/HTMLPurifier/PropertyList.php'; require_once $__dir . '/HTMLPurifier/PropertyListIterator.php'; @@ -185,6 +187,9 @@ require_once $__dir . '/HTMLPurifier/Injector/RemoveSpansWithoutAttributes.php'; require_once $__dir . '/HTMLPurifier/Injector/SafeObject.php'; require_once $__dir . '/HTMLPurifier/Lexer/DOMLex.php'; require_once $__dir . '/HTMLPurifier/Lexer/DirectLex.php'; +require_once $__dir . '/HTMLPurifier/Node/Comment.php'; +require_once $__dir . '/HTMLPurifier/Node/Element.php'; +require_once $__dir . '/HTMLPurifier/Node/Text.php'; require_once $__dir . '/HTMLPurifier/Strategy/Composite.php'; require_once $__dir . '/HTMLPurifier/Strategy/Core.php'; require_once $__dir . '/HTMLPurifier/Strategy/FixNesting.php'; diff --git a/library/HTMLPurifier/Arborize.php b/library/HTMLPurifier/Arborize.php new file mode 100644 index 00000000..9e6617be --- /dev/null +++ b/library/HTMLPurifier/Arborize.php @@ -0,0 +1,71 @@ +getHTMLDefinition(); + $parent = new HTMLPurifier_Token_Start($definition->info_parent); + $stack = array($parent->toNode()); + foreach ($tokens as $token) { + $token->skip = null; // [MUT] + $token->carryover = null; // [MUT] + if ($token instanceof HTMLPurifier_Token_End) { + $token->start = null; // [MUT] + $r = array_pop($stack); + assert($r->name === $token->name); + assert(empty($token->attr)); + $r->endCol = $token->col; + $r->endLine = $token->line; + $r->endArmor = $token->armor; + continue; + } + $node = $token->toNode(); + $stack[count($stack)-1]->children[] = $node; + if ($token instanceof HTMLPurifier_Token_Start) { + $stack[] = $node; + } + } + assert(count($stack) == 1); + return $stack[0]; + } + + public static function flatten($node, $config, $context) { + $level = 0; + $nodes = array($level => new HTMLPurifier_Queue(array($node))); + $closingTokens = array(); + $tokens = array(); + do { + while (!$nodes[$level]->isEmpty()) { + $node = $nodes[$level]->shift(); // FIFO + list($start, $end) = $node->toTokenPair(); + if ($level > 0) { + $tokens[] = $start; + } + if ($end !== NULL) { + $closingTokens[$level][] = $end; + } + if ($node instanceof HTMLPurifier_Node_Element) { + $level++; + $nodes[$level] = new HTMLPurifier_Queue(); + foreach ($node->children as $childNode) { + $nodes[$level]->push($childNode); + } + } + } + $level--; + if ($level && isset($closingTokens[$level])) { + while ($token = array_pop($closingTokens[$level])) { + $tokens[] = $token; + } + } + } while ($level > 0); + return $tokens; + } +} diff --git a/library/HTMLPurifier/Node.php b/library/HTMLPurifier/Node.php new file mode 100644 index 00000000..9e239b3c --- /dev/null +++ b/library/HTMLPurifier/Node.php @@ -0,0 +1,40 @@ +data = $data; + $this->line = $line; + $this->col = $col; + } + + public function toTokenPair() { + return array(new HTMLPurifier_Token_Comment($this->data, $this->line, $this->col), null); + } +} diff --git a/library/HTMLPurifier/Node/Element.php b/library/HTMLPurifier/Node/Element.php new file mode 100644 index 00000000..6cbf56da --- /dev/null +++ b/library/HTMLPurifier/Node/Element.php @@ -0,0 +1,59 @@ + form or the form, i.e. + * is it a pair of start/end tokens or an empty token. + * @bool + */ + public $empty = false; + + public $endCol = null, $endLine = null, $endArmor = array(); + + public function __construct($name, $attr = array(), $line = null, $col = null, $armor = array()) { + $this->name = $name; + $this->attr = $attr; + $this->line = $line; + $this->col = $col; + $this->armor = $armor; + } + + public function toTokenPair() { + // XXX inefficiency here, normalization is not necessary + if ($this->empty) { + return array(new HTMLPurifier_Token_Empty($this->name, $this->attr, $this->line, $this->col, $this->armor), null); + } else { + $start = new HTMLPurifier_Token_Start($this->name, $this->attr, $this->line, $this->col, $this->armor); + $end = new HTMLPurifier_Token_End($this->name, array(), $this->endLine, $this->endCol, $this->endArmor); + //$end->start = $start; + return array($start, $end); + } + } +} + diff --git a/library/HTMLPurifier/Node/Text.php b/library/HTMLPurifier/Node/Text.php new file mode 100644 index 00000000..03dc1b20 --- /dev/null +++ b/library/HTMLPurifier/Node/Text.php @@ -0,0 +1,47 @@ +data = $data; + $this->is_whitespace = $is_whitespace; + $this->line = $line; + $this->col = $col; + } + + public function toTokenPair() { + return array(new HTMLPurifier_Token_Text($this->data, $this->line, $this->col), null); + } +} + +// vim: et sw=4 sts=4 diff --git a/library/HTMLPurifier/Strategy/FixNesting.php b/library/HTMLPurifier/Strategy/FixNesting.php index d00c6d04..f78ad086 100644 --- a/library/HTMLPurifier/Strategy/FixNesting.php +++ b/library/HTMLPurifier/Strategy/FixNesting.php @@ -58,6 +58,9 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy //####################################################################// // Pre-processing + //$node = HTMLPurifier_Arborize::arborize($tokens, $config, $context); + //$new_tokens = HTMLPurifier_Arborize::flatten($node, $config, $context); + // get a copy of the HTML definition $definition = $config->getHTMLDefinition(); diff --git a/library/HTMLPurifier/Token.php b/library/HTMLPurifier/Token.php index b1ff93c3..85b85e07 100644 --- a/library/HTMLPurifier/Token.php +++ b/library/HTMLPurifier/Token.php @@ -90,6 +90,11 @@ abstract class HTMLPurifier_Token $this->line = $l; $this->col = $c; } + + /** + * Converts a token into its corresponding node. + */ + abstract public function toNode(); } // vim: et sw=4 sts=4 diff --git a/library/HTMLPurifier/Token/Comment.php b/library/HTMLPurifier/Token/Comment.php index ae90ad21..23453c70 100644 --- a/library/HTMLPurifier/Token/Comment.php +++ b/library/HTMLPurifier/Token/Comment.php @@ -29,6 +29,10 @@ class HTMLPurifier_Token_Comment extends HTMLPurifier_Token $this->line = $line; $this->col = $col; } + + public function toNode() { + return new HTMLPurifier_Node_Comment($this->data, $this->line, $this->col); + } } // vim: et sw=4 sts=4 diff --git a/library/HTMLPurifier/Token/Empty.php b/library/HTMLPurifier/Token/Empty.php index c54fbc82..78a95f55 100644 --- a/library/HTMLPurifier/Token/Empty.php +++ b/library/HTMLPurifier/Token/Empty.php @@ -5,6 +5,11 @@ */ class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag { + public function toNode() { + $n = parent::toNode(); + $n->empty = true; + return $n; + } } // vim: et sw=4 sts=4 diff --git a/library/HTMLPurifier/Token/End.php b/library/HTMLPurifier/Token/End.php index bfc5e3f1..59b38fdc 100644 --- a/library/HTMLPurifier/Token/End.php +++ b/library/HTMLPurifier/Token/End.php @@ -15,6 +15,10 @@ class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag * @type HTMLPurifier_Token */ public $start; + + public function toNode() { + throw new Exception("HTMLPurifier_Token_End->toNode not supported!"); + } } // vim: et sw=4 sts=4 diff --git a/library/HTMLPurifier/Token/Tag.php b/library/HTMLPurifier/Token/Tag.php index f65235a9..d643fa64 100644 --- a/library/HTMLPurifier/Token/Tag.php +++ b/library/HTMLPurifier/Token/Tag.php @@ -59,6 +59,10 @@ abstract class HTMLPurifier_Token_Tag extends HTMLPurifier_Token $this->col = $col; $this->armor = $armor; } + + public function toNode() { + return new HTMLPurifier_Node_Element($this->name, $this->attr, $this->line, $this->col, $this->armor); + } } // vim: et sw=4 sts=4 diff --git a/library/HTMLPurifier/Token/Text.php b/library/HTMLPurifier/Token/Text.php index 75d25e3f..f26a1c21 100644 --- a/library/HTMLPurifier/Token/Text.php +++ b/library/HTMLPurifier/Token/Text.php @@ -44,6 +44,10 @@ class HTMLPurifier_Token_Text extends HTMLPurifier_Token $this->line = $line; $this->col = $col; } + + public function toNode() { + return new HTMLPurifier_Node_Text($this->data, $this->is_whitespace, $this->line, $this->col); + } } // vim: et sw=4 sts=4