mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2024-11-09 15:28:40 +00:00
Add conversion functions for our own tree format.
Signed-off-by: Edward Z. Yang <ezyang@mit.edu>
This commit is contained in:
parent
be5769804a
commit
b3640e1af6
@ -498,7 +498,7 @@
|
||||
</directive>
|
||||
<directive id="Core.DisableExcludes">
|
||||
<file name="HTMLPurifier/Strategy/FixNesting.php">
|
||||
<line>64</line>
|
||||
<line>67</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Core.EscapeInvalidTags">
|
||||
|
@ -19,6 +19,7 @@
|
||||
*/
|
||||
|
||||
require 'HTMLPurifier.php';
|
||||
require 'HTMLPurifier/Arborize.php';
|
||||
require 'HTMLPurifier/AttrCollections.php';
|
||||
require 'HTMLPurifier/AttrDef.php';
|
||||
require 'HTMLPurifier/AttrTransform.php';
|
||||
@ -54,6 +55,7 @@ require 'HTMLPurifier/Language.php';
|
||||
require 'HTMLPurifier/LanguageFactory.php';
|
||||
require 'HTMLPurifier/Length.php';
|
||||
require 'HTMLPurifier/Lexer.php';
|
||||
require 'HTMLPurifier/Node.php';
|
||||
require 'HTMLPurifier/PercentEncoder.php';
|
||||
require 'HTMLPurifier/PropertyList.php';
|
||||
require 'HTMLPurifier/PropertyListIterator.php';
|
||||
@ -191,6 +193,9 @@ require 'HTMLPurifier/Injector/RemoveSpansWithoutAttributes.php';
|
||||
require 'HTMLPurifier/Injector/SafeObject.php';
|
||||
require 'HTMLPurifier/Lexer/DOMLex.php';
|
||||
require 'HTMLPurifier/Lexer/DirectLex.php';
|
||||
require 'HTMLPurifier/Node/Comment.php';
|
||||
require 'HTMLPurifier/Node/Element.php';
|
||||
require 'HTMLPurifier/Node/Text.php';
|
||||
require 'HTMLPurifier/Strategy/Composite.php';
|
||||
require 'HTMLPurifier/Strategy/Core.php';
|
||||
require 'HTMLPurifier/Strategy/FixNesting.php';
|
||||
|
@ -13,6 +13,7 @@
|
||||
$__dir = dirname(__FILE__);
|
||||
|
||||
require_once $__dir . '/HTMLPurifier.php';
|
||||
require_once $__dir . '/HTMLPurifier/Arborize.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrCollections.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrTransform.php';
|
||||
@ -48,6 +49,7 @@ require_once $__dir . '/HTMLPurifier/Language.php';
|
||||
require_once $__dir . '/HTMLPurifier/LanguageFactory.php';
|
||||
require_once $__dir . '/HTMLPurifier/Length.php';
|
||||
require_once $__dir . '/HTMLPurifier/Lexer.php';
|
||||
require_once $__dir . '/HTMLPurifier/Node.php';
|
||||
require_once $__dir . '/HTMLPurifier/PercentEncoder.php';
|
||||
require_once $__dir . '/HTMLPurifier/PropertyList.php';
|
||||
require_once $__dir . '/HTMLPurifier/PropertyListIterator.php';
|
||||
@ -185,6 +187,9 @@ require_once $__dir . '/HTMLPurifier/Injector/RemoveSpansWithoutAttributes.php';
|
||||
require_once $__dir . '/HTMLPurifier/Injector/SafeObject.php';
|
||||
require_once $__dir . '/HTMLPurifier/Lexer/DOMLex.php';
|
||||
require_once $__dir . '/HTMLPurifier/Lexer/DirectLex.php';
|
||||
require_once $__dir . '/HTMLPurifier/Node/Comment.php';
|
||||
require_once $__dir . '/HTMLPurifier/Node/Element.php';
|
||||
require_once $__dir . '/HTMLPurifier/Node/Text.php';
|
||||
require_once $__dir . '/HTMLPurifier/Strategy/Composite.php';
|
||||
require_once $__dir . '/HTMLPurifier/Strategy/Core.php';
|
||||
require_once $__dir . '/HTMLPurifier/Strategy/FixNesting.php';
|
||||
|
71
library/HTMLPurifier/Arborize.php
Normal file
71
library/HTMLPurifier/Arborize.php
Normal file
@ -0,0 +1,71 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Converts a stream of HTMLPurifier_Token into an HTMLPurifier_Node,
|
||||
* and back again.
|
||||
*
|
||||
* @note This transformation is not an equivalence. We mutate the input
|
||||
* token stream to make it so; see all [MUT] markers in code.
|
||||
*/
|
||||
class HTMLPurifier_Arborize
{
    /**
     * Builds a node tree out of a flat token stream.
     *
     * A synthetic root element is created from the HTML definition's
     * info_parent; every token is attached somewhere beneath it. Start
     * tokens open a new level, end tokens close the innermost open level
     * and record their position/armor on the element they close.
     *
     * @note The tokens passed in are modified: skip and carryover are
     *       cleared on every token, and start is cleared on end tokens.
     *       See the [MUT] markers.
     * @note Well-formedness (matching end tags, a fully closed stream) is
     *       only checked through assert().
     *
     * @param array $tokens List of HTMLPurifier_Token
     * @param HTMLPurifier_Config $config Supplies the HTML definition
     * @param HTMLPurifier_Context $context Not used by this method
     * @return HTMLPurifier_Node The synthetic root node
     */
    public static function arborize($tokens, $config, $context)
    {
        $definition = $config->getHTMLDefinition();
        $rootToken = new HTMLPurifier_Token_Start($definition->info_parent);
        // Elements that have been opened but not yet closed; root is at index 0.
        $open = array($rootToken->toNode());
        foreach ($tokens as $token) {
            $token->skip = null; // [MUT]
            $token->carryover = null; // [MUT]
            if (!($token instanceof HTMLPurifier_Token_End)) {
                // Start, empty, text and comment tokens all become a child
                // of whichever element is currently innermost.
                $child = $token->toNode();
                $open[count($open) - 1]->children[] = $child;
                if ($token instanceof HTMLPurifier_Token_Start) {
                    $open[] = $child;
                }
                continue;
            }
            // End token: close the innermost element and remember where
            // (and with what armor) its closing tag appeared.
            $token->start = null; // [MUT]
            $closed = array_pop($open);
            assert($closed->name === $token->name);
            assert(empty($token->attr));
            $closed->endCol = $token->col;
            $closed->endLine = $token->line;
            $closed->endArmor = $token->armor;
        }
        assert(count($open) == 1);
        return $open[0];
    }

    /**
     * Turns a node tree back into a flat list of tokens.
     *
     * The tree is walked depth-first using one queue per nesting level.
     * The tokens of the node passed in are not emitted; only its
     * descendants are.
     *
     * @param HTMLPurifier_Node $node Root of the tree to flatten
     * @param HTMLPurifier_Config $config Not used by this method
     * @param HTMLPurifier_Context $context Not used by this method
     * @return array List of HTMLPurifier_Token
     */
    public static function flatten($node, $config, $context)
    {
        $depth = 0;
        // One queue of not-yet-visited nodes per nesting level.
        $pending = array($depth => new HTMLPurifier_Queue(array($node)));
        // End tokens waiting to be emitted, grouped by nesting level.
        $closers = array();
        $result = array();
        do {
            while (!$pending[$depth]->isEmpty()) {
                $current = $pending[$depth]->shift(); // FIFO
                list($startToken, $endToken) = $current->toTokenPair();
                if ($depth > 0) {
                    // Level 0 is the root, whose start token is suppressed.
                    $result[] = $startToken;
                }
                if ($endToken !== null) {
                    $closers[$depth][] = $endToken;
                }
                if ($current instanceof HTMLPurifier_Node_Element) {
                    // Descend: the children are handled before the
                    // remaining siblings on the current level.
                    $depth++;
                    $pending[$depth] = new HTMLPurifier_Queue();
                    foreach ($current->children as $child) {
                        $pending[$depth]->push($child);
                    }
                }
            }
            // This level is exhausted; go back up and close the element
            // whose children were just emitted (never the root's).
            $depth--;
            if ($depth != 0 && isset($closers[$depth])) {
                while ($closer = array_pop($closers[$depth])) {
                    $result[] = $closer;
                }
            }
        } while ($depth > 0);
        return $result;
    }
}
|
40
library/HTMLPurifier/Node.php
Normal file
40
library/HTMLPurifier/Node.php
Normal file
@ -0,0 +1,40 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Abstract base node class that all others inherit from.
|
||||
*
|
||||
* Why do we not use the DOM extension? (1) It is not always available,
|
||||
* (2) it has funny constraints on the data it can represent,
|
||||
* whereas we want a maximally flexible representation, and (3) its
|
||||
* interface is a bit cumbersome.
|
||||
*/
|
||||
abstract class HTMLPurifier_Node
{
    /**
     * Line number of the start token in the source document.
     * @type int
     */
    public $line;

    /**
     * Column number of the start token in the source document.
     * Null if unknown.
     * @type int
     */
    public $col;

    /**
     * Lookup array of processing that this node is exempt from.
     * Currently, valid values are "ValidateAttributes".
     * @type array
     */
    public $armor = array();

    /**
     * Converts this node, without its children, back into tokens.
     *
     * @return array Two-element array of (start token, end token); the
     *               end token is null when no closing token is needed.
     */
    abstract public function toTokenPair();
}
|
||||
|
||||
// vim: et sw=4 sts=4
|
36
library/HTMLPurifier/Node/Comment.php
Normal file
36
library/HTMLPurifier/Node/Comment.php
Normal file
@ -0,0 +1,36 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Concrete comment node class.
|
||||
*/
|
||||
class HTMLPurifier_Node_Comment extends HTMLPurifier_Node
{
    /**
     * Character data within comment.
     * @type string
     */
    public $data;

    /**
     * Whitespace flag; defaults to true for comment nodes.
     * NOTE(review): presumably lets consumers treat comments like
     * ignorable whitespace -- confirm against the code that reads it.
     * @type bool
     */
    public $is_whitespace = true;

    /**
     * Transparent constructor: stores its arguments unchanged.
     *
     * @param string $data String comment data.
     * @param int $line Line number in the source document, if known.
     * @param int $col Column number in the source document, if known.
     */
    public function __construct($data, $line = null, $col = null)
    {
        $this->line = $line;
        $this->col = $col;
        $this->data = $data;
    }

    /**
     * Converts this node back into a comment token.
     *
     * @return array (HTMLPurifier_Token_Comment, null); comments have no
     *               end token.
     */
    public function toTokenPair()
    {
        $token = new HTMLPurifier_Token_Comment($this->data, $this->line, $this->col);
        return array($token, null);
    }
}
|
59
library/HTMLPurifier/Node/Element.php
Normal file
59
library/HTMLPurifier/Node/Element.php
Normal file
@ -0,0 +1,59 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Concrete element node class.
|
||||
*/
|
||||
class HTMLPurifier_Node_Element extends HTMLPurifier_Node
{
    /**
     * The lower-case name of the tag, like 'a', 'b' or 'blockquote'.
     *
     * @note Strictly speaking, XML tags are case sensitive, so we shouldn't
     * be lower-casing them, but these nodes cater to HTML tags, which are
     * insensitive.
     * @type string
     */
    public $name;

    /**
     * Associative array of the node's attributes.
     * @type array
     */
    public $attr = array();

    /**
     * List of child nodes (elements, text and comments alike).
     * @type array
     */
    public $children = array();

    /**
     * Does this use the <a></a> form or the <a /> form, i.e.
     * is it a pair of start/end tokens or an empty token.
     * @type bool
     */
    public $empty = false;

    /**
     * Column number of the end token, or null if not recorded.
     * @type int
     */
    public $endCol = null;

    /**
     * Line number of the end token, or null if not recorded.
     * @type int
     */
    public $endLine = null;

    /**
     * Armor lookup array carried by the end token.
     * @type array
     */
    public $endArmor = array();

    /**
     * Transparent constructor: stores its arguments unchanged.
     *
     * @param string $name Tag name.
     * @param array $attr Associative array of attributes.
     * @param int $line Line number of the start token, if known.
     * @param int $col Column number of the start token, if known.
     * @param array $armor Lookup array of exempted processing.
     */
    public function __construct($name, $attr = array(), $line = null, $col = null, $armor = array())
    {
        $this->name = $name;
        $this->attr = $attr;
        $this->line = $line;
        $this->col = $col;
        $this->armor = $armor;
    }

    /**
     * Converts this element, without its children, back into tokens.
     *
     * @return array (start token, end token) for a regular element, or
     *               (empty token, null) when $empty is set.
     */
    public function toTokenPair()
    {
        // XXX inefficiency here, normalization is not necessary
        if (!$this->empty) {
            $start = new HTMLPurifier_Token_Start(
                $this->name,
                $this->attr,
                $this->line,
                $this->col,
                $this->armor
            );
            // The end token's ->start back-reference is deliberately left unset.
            $end = new HTMLPurifier_Token_End(
                $this->name,
                array(),
                $this->endLine,
                $this->endCol,
                $this->endArmor
            );
            return array($start, $end);
        }
        $token = new HTMLPurifier_Token_Empty(
            $this->name,
            $this->attr,
            $this->line,
            $this->col,
            $this->armor
        );
        return array($token, null);
    }
}
|
||||
|
47
library/HTMLPurifier/Node/Text.php
Normal file
47
library/HTMLPurifier/Node/Text.php
Normal file
@ -0,0 +1,47 @@
|
||||
<?php
|
||||
|
||||
/**
 * Concrete text node class.
 *
 * Text nodes comprise of regular parsed character data (PCDATA) and raw
 * character data (from the CDATA sections). Internally, their
 * data is parsed with all entities expanded. Surprisingly, the text node
 * does have a "tag name" called #PCDATA, which is how the DTD represents it
 * in permissible child nodes.
 */
class HTMLPurifier_Node_Text extends HTMLPurifier_Node
{
    /**
     * Parsed character data of text.
     * @type string
     */
    public $data;

    /**
     * Bool indicating if node is whitespace.
     * @type bool
     */
    public $is_whitespace;

    /**
     * Transparent constructor: stores its arguments unchanged. The
     * whitespace flag is taken from the caller, not computed here.
     *
     * @param string $data String parsed character data.
     * @param bool $is_whitespace Whether the data is whitespace.
     * @param int $line Line number in the source document, if known.
     * @param int $col Column number in the source document, if known.
     */
    public function __construct($data, $is_whitespace, $line = null, $col = null)
    {
        $this->line = $line;
        $this->col = $col;
        $this->data = $data;
        $this->is_whitespace = $is_whitespace;
    }

    /**
     * Converts this node back into a text token. The stored whitespace
     * flag is not passed on to the token.
     *
     * @return array (HTMLPurifier_Token_Text, null); text has no end token.
     */
    public function toTokenPair()
    {
        $token = new HTMLPurifier_Token_Text($this->data, $this->line, $this->col);
        return array($token, null);
    }
}
|
||||
|
||||
// vim: et sw=4 sts=4
|
@ -58,6 +58,9 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
|
||||
//####################################################################//
|
||||
// Pre-processing
|
||||
|
||||
//$node = HTMLPurifier_Arborize::arborize($tokens, $config, $context);
|
||||
//$new_tokens = HTMLPurifier_Arborize::flatten($node, $config, $context);
|
||||
|
||||
// get a copy of the HTML definition
|
||||
$definition = $config->getHTMLDefinition();
|
||||
|
||||
|
@ -90,6 +90,11 @@ abstract class HTMLPurifier_Token
|
||||
$this->line = $l;
|
||||
$this->col = $c;
|
||||
}
|
||||
|
||||
/**
 * Converts a token into its corresponding node.
 *
 * Implemented by each concrete token class.
 *
 * @return HTMLPurifier_Node
 */
abstract public function toNode();
|
||||
}
|
||||
|
||||
// vim: et sw=4 sts=4
|
||||
|
@ -29,6 +29,10 @@ class HTMLPurifier_Token_Comment extends HTMLPurifier_Token
|
||||
$this->line = $line;
|
||||
$this->col = $col;
|
||||
}
|
||||
|
||||
/**
 * Converts this comment token into a comment node carrying the same
 * data, line and column.
 *
 * @return HTMLPurifier_Node_Comment
 */
public function toNode()
{
    $node = new HTMLPurifier_Node_Comment($this->data, $this->line, $this->col);
    return $node;
}
|
||||
}
|
||||
|
||||
// vim: et sw=4 sts=4
|
||||
|
@ -5,6 +5,11 @@
|
||||
*/
|
||||
class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
{
    /**
     * Converts this token into a node via the parent implementation and
     * flags the result as empty, so it converts back to an empty token
     * rather than a start/end pair.
     *
     * @return HTMLPurifier_Node_Element
     */
    public function toNode()
    {
        $node = parent::toNode();
        $node->empty = true;
        return $node;
    }
}
|
||||
|
||||
// vim: et sw=4 sts=4
|
||||
|
@ -15,6 +15,10 @@ class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
|
||||
* @type HTMLPurifier_Token
|
||||
*/
|
||||
public $start;
|
||||
|
||||
/**
 * End tokens have no node of their own, so conversion is refused.
 *
 * @throws Exception Always.
 */
public function toNode()
{
    $message = "HTMLPurifier_Token_End->toNode not supported!";
    throw new Exception($message);
}
|
||||
}
|
||||
|
||||
// vim: et sw=4 sts=4
|
||||
|
@ -59,6 +59,10 @@ abstract class HTMLPurifier_Token_Tag extends HTMLPurifier_Token
|
||||
$this->col = $col;
|
||||
$this->armor = $armor;
|
||||
}
|
||||
|
||||
/**
 * Converts this tag token into an element node carrying the same name,
 * attributes, line, column and armor.
 *
 * @return HTMLPurifier_Node_Element
 */
public function toNode()
{
    $node = new HTMLPurifier_Node_Element(
        $this->name,
        $this->attr,
        $this->line,
        $this->col,
        $this->armor
    );
    return $node;
}
|
||||
}
|
||||
|
||||
// vim: et sw=4 sts=4
|
||||
|
@ -44,6 +44,10 @@ class HTMLPurifier_Token_Text extends HTMLPurifier_Token
|
||||
$this->line = $line;
|
||||
$this->col = $col;
|
||||
}
|
||||
|
||||
/**
 * Converts this text token into a text node carrying the same data,
 * whitespace flag, line and column.
 *
 * @return HTMLPurifier_Node_Text
 */
public function toNode()
{
    $node = new HTMLPurifier_Node_Text(
        $this->data,
        $this->is_whitespace,
        $this->line,
        $this->col
    );
    return $node;
}
|
||||
}
|
||||
|
||||
// vim: et sw=4 sts=4
|
||||
|
Loading…
Reference in New Issue
Block a user