Add conversion functions for our own tree format.

Signed-off-by: Edward Z. Yang <ezyang@mit.edu>
2025-04-13 06:44:36 +00:00 · 2013-10-20 15:05:11 -07:00 · 2013-10-20 15:05:11 -07:00 · b3640e1af6
commit b3640e1af6
parent be5769804a
15 changed files with 293 additions and 1 deletions
--- a/configdoc/usage.xml
+++ b/configdoc/usage.xml
@ -498,7 +498,7 @@
 </directive>
 <directive id="Core.DisableExcludes">
  <file name="HTMLPurifier/Strategy/FixNesting.php">
-   <line>64</line>
+   <line>67</line>
  </file>
 </directive>
 <directive id="Core.EscapeInvalidTags">
--- a/library/HTMLPurifier.includes.php
+++ b/library/HTMLPurifier.includes.php
@ -19,6 +19,7 @@
 */

 require 'HTMLPurifier.php';
+require 'HTMLPurifier/Arborize.php';
 require 'HTMLPurifier/AttrCollections.php';
 require 'HTMLPurifier/AttrDef.php';
 require 'HTMLPurifier/AttrTransform.php';
@ -54,6 +55,7 @@ require 'HTMLPurifier/Language.php';
 require 'HTMLPurifier/LanguageFactory.php';
 require 'HTMLPurifier/Length.php';
 require 'HTMLPurifier/Lexer.php';
+require 'HTMLPurifier/Node.php';
 require 'HTMLPurifier/PercentEncoder.php';
 require 'HTMLPurifier/PropertyList.php';
 require 'HTMLPurifier/PropertyListIterator.php';
@ -191,6 +193,9 @@ require 'HTMLPurifier/Injector/RemoveSpansWithoutAttributes.php';
 require 'HTMLPurifier/Injector/SafeObject.php';
 require 'HTMLPurifier/Lexer/DOMLex.php';
 require 'HTMLPurifier/Lexer/DirectLex.php';
+require 'HTMLPurifier/Node/Comment.php';
+require 'HTMLPurifier/Node/Element.php';
+require 'HTMLPurifier/Node/Text.php';
 require 'HTMLPurifier/Strategy/Composite.php';
 require 'HTMLPurifier/Strategy/Core.php';
 require 'HTMLPurifier/Strategy/FixNesting.php';
--- a/library/HTMLPurifier.safe-includes.php
+++ b/library/HTMLPurifier.safe-includes.php
@ -13,6 +13,7 @@
 $__dir = dirname(__FILE__);

 require_once $__dir . '/HTMLPurifier.php';
+require_once $__dir . '/HTMLPurifier/Arborize.php';
 require_once $__dir . '/HTMLPurifier/AttrCollections.php';
 require_once $__dir . '/HTMLPurifier/AttrDef.php';
 require_once $__dir . '/HTMLPurifier/AttrTransform.php';
@ -48,6 +49,7 @@ require_once $__dir . '/HTMLPurifier/Language.php';
 require_once $__dir . '/HTMLPurifier/LanguageFactory.php';
 require_once $__dir . '/HTMLPurifier/Length.php';
 require_once $__dir . '/HTMLPurifier/Lexer.php';
+require_once $__dir . '/HTMLPurifier/Node.php';
 require_once $__dir . '/HTMLPurifier/PercentEncoder.php';
 require_once $__dir . '/HTMLPurifier/PropertyList.php';
 require_once $__dir . '/HTMLPurifier/PropertyListIterator.php';
@ -185,6 +187,9 @@ require_once $__dir . '/HTMLPurifier/Injector/RemoveSpansWithoutAttributes.php';
 require_once $__dir . '/HTMLPurifier/Injector/SafeObject.php';
 require_once $__dir . '/HTMLPurifier/Lexer/DOMLex.php';
 require_once $__dir . '/HTMLPurifier/Lexer/DirectLex.php';
+require_once $__dir . '/HTMLPurifier/Node/Comment.php';
+require_once $__dir . '/HTMLPurifier/Node/Element.php';
+require_once $__dir . '/HTMLPurifier/Node/Text.php';
 require_once $__dir . '/HTMLPurifier/Strategy/Composite.php';
 require_once $__dir . '/HTMLPurifier/Strategy/Core.php';
 require_once $__dir . '/HTMLPurifier/Strategy/FixNesting.php';
--- a/library/HTMLPurifier/Arborize.php
+++ b/library/HTMLPurifier/Arborize.php
@ -0,0 +1,71 @@
+<?php
+
+/**
+ * Two-way converter between a flat list of HTMLPurifier_Token objects
+ * and a tree of HTMLPurifier_Node objects.
+ *
+ * @note The conversion is not an equivalence on arbitrary input.  The
+ * incoming tokens are mutated so that it becomes one; each mutation is
+ * tagged with a [MUT] marker in the code.
+ */
+class HTMLPurifier_Arborize
+{
+    /**
+     * Builds a node tree from a token stream.
+     *
+     * The returned root is a synthetic element created from the HTML
+     * definition's info_parent; it does not correspond to any entry in
+     * $tokens.  Well-formedness of the stream (matching end tags, every
+     * element closed) is only checked through assert().
+     *
+     * @param array $tokens List of HTMLPurifier_Token objects; their
+     *                      properties are modified (see [MUT]).
+     * @param object $config Must provide getHTMLDefinition().
+     * @param mixed $context Not used by this method.
+     * @return HTMLPurifier_Node Root of the resulting tree.
+     */
+    public static function arborize($tokens, $config, $context)
+    {
+        $definition = $config->getHTMLDefinition();
+        $rootToken = new HTMLPurifier_Token_Start($definition->info_parent);
+        // Elements that are currently open, outermost first.
+        $open = array($rootToken->toNode());
+        foreach ($tokens as $token) {
+            $token->skip = null; // [MUT]
+            $token->carryover = null; // [MUT]
+            if (!($token instanceof HTMLPurifier_Token_End)) {
+                // Attach to the innermost open element; only start
+                // tokens open a new nesting level.
+                $node = $token->toNode();
+                $open[count($open) - 1]->children[] = $node;
+                if ($token instanceof HTMLPurifier_Token_Start) {
+                    $open[] = $node;
+                }
+                continue;
+            }
+            // End token: close the innermost element and record the
+            // position and armor of its end tag on the element.
+            $token->start = null; // [MUT]
+            $closed = array_pop($open);
+            assert($closed->name === $token->name);
+            assert(empty($token->attr));
+            $closed->endCol = $token->col;
+            $closed->endLine = $token->line;
+            $closed->endArmor = $token->armor;
+        }
+        assert(count($open) == 1);
+        return $open[0];
+    }
+
+    /**
+     * Serialises a node tree back into a flat list of tokens.
+     *
+     * The node passed in acts purely as a container: neither its start
+     * nor its end token is emitted, only the tokens of its descendants,
+     * in document order.
+     *
+     * @param HTMLPurifier_Node $node Root of the tree to flatten.
+     * @param mixed $config Not used by this method.
+     * @param mixed $context Not used by this method.
+     * @return array List of HTMLPurifier_Token objects.
+     */
+    public static function flatten($node, $config, $context)
+    {
+        $depth = 0;
+        // One queue of not-yet-visited siblings per nesting depth.
+        $queues = array($depth => new HTMLPurifier_Queue(array($node)));
+        // End tokens waiting to be emitted, grouped by nesting depth.
+        $pendingEnds = array();
+        $output = array();
+        do {
+            while (!$queues[$depth]->isEmpty()) {
+                $current = $queues[$depth]->shift(); // FIFO
+                $pair = $current->toTokenPair();
+                $startToken = $pair[0];
+                $endToken = $pair[1];
+                if ($depth > 0) {
+                    $output[] = $startToken;
+                }
+                if ($endToken !== null) {
+                    $pendingEnds[$depth][] = $endToken;
+                }
+                if (!($current instanceof HTMLPurifier_Node_Element)) {
+                    continue;
+                }
+                // Descend: the children are visited before the
+                // remaining siblings of the current element.
+                $depth++;
+                $queues[$depth] = new HTMLPurifier_Queue();
+                foreach ($current->children as $child) {
+                    $queues[$depth]->push($child);
+                }
+            }
+            // All nodes at this depth are done; go back up and close
+            // the element whose children were just emitted.  Depth 0
+            // is skipped so the root's end token is never output.
+            $depth--;
+            if ($depth && isset($pendingEnds[$depth])) {
+                while ($pending = array_pop($pendingEnds[$depth])) {
+                    $output[] = $pending;
+                }
+            }
+        } while ($depth > 0);
+        return $output;
+    }
+}
--- a/library/HTMLPurifier/Node.php
+++ b/library/HTMLPurifier/Node.php
@ -0,0 +1,40 @@
+<?php
+
+/**
+ * Abstract base node class that all others inherit from.
+ *
+ * Concrete subclasses must implement toTokenPair(), which converts a
+ * single node (without its children) back into tokens.
+ *
+ * Why do we not use the DOM extension?  (1) It is not always available,
+ * (2) it has funny constraints on the data it can represent,
+ * whereas we want a maximally flexible representation, and (3) its
+ * interface is a bit cumbersome.
+ */
+abstract class HTMLPurifier_Node
+{
+    /**
+     * Line number of the start token in the source document.
+     * No default is declared here, so it is null until a subclass
+     * constructor assigns it.
+     * @type int
+     */
+    public $line;
+
+    /**
+     * Column number of the start token in the source document. Null if unknown.
+     * @type int
+     */
+    public $col;
+
+    /**
+     * Lookup array of processing that this node is exempt from.
+     * Currently, valid values are "ValidateAttributes".
+     * @type array
+     */
+    public $armor = array();
+
+    /**
+     * Returns a pair of start and end tokens, where the end token
+     * is null if it is not necessary. Does not include children.
+     * @return array Two-element list: the start token, then the end
+     *               token or null.
+     */
+    abstract public function toTokenPair();
+}
+
+// vim: et sw=4 sts=4
--- a/library/HTMLPurifier/Node/Comment.php
+++ b/library/HTMLPurifier/Node/Comment.php
@ -0,0 +1,36 @@
+<?php
+
+/**
+ * Node representing a comment.
+ */
+class HTMLPurifier_Node_Comment extends HTMLPurifier_Node
+{
+    /**
+     * Text contained in the comment.
+     * @type string
+     */
+    public $data;
+
+    /**
+     * Initialised to true for every comment node.
+     * @type bool
+     */
+    public $is_whitespace = true;
+
+    /**
+     * Stores the arguments on the node without any processing.
+     *
+     * @param string $data Comment text.
+     * @param int $line Line of the comment in the source, if known.
+     * @param int $col Column of the comment in the source, if known.
+     */
+    public function __construct($data, $line = null, $col = null)
+    {
+        $this->line = $line;
+        $this->col = $col;
+        $this->data = $data;
+    }
+
+    /**
+     * Converts this node back into a comment token.
+     *
+     * @return array The comment token followed by null, since a comment
+     *               has no end token.
+     */
+    public function toTokenPair()
+    {
+        $comment = new HTMLPurifier_Token_Comment($this->data, $this->line, $this->col);
+        return array($comment, null);
+    }
+}
--- a/library/HTMLPurifier/Node/Element.php
+++ b/library/HTMLPurifier/Node/Element.php
@ -0,0 +1,59 @@
+<?php
+
+/**
+ * Node representing an element: a tag name, its attributes and its
+ * child nodes.
+ */
+class HTMLPurifier_Node_Element extends HTMLPurifier_Node
+{
+    /**
+     * The lower-case name of the tag, like 'a', 'b' or 'blockquote'.
+     *
+     * @note Strictly speaking, XML tags are case sensitive, so we shouldn't
+     * be lower-casing them, but these nodes cater to HTML tags, which are
+     * insensitive.
+     * @type string
+     */
+    public $name;
+
+    /**
+     * Associative array of the node's attributes.
+     * @type array
+     */
+    public $attr = array();
+
+    /**
+     * List of child nodes.
+     * @type array
+     */
+    public $children = array();
+
+    /**
+     * Whether this element is written as a single empty tag (<a />)
+     * instead of a start/end pair (<a></a>).  Decides which tokens
+     * toTokenPair() produces.
+     * @type bool
+     */
+    public $empty = false;
+
+    /**
+     * Column of the end tag; HTMLPurifier_Arborize::arborize() copies it
+     * from the matching end token.
+     * @type int
+     */
+    public $endCol = null;
+
+    /**
+     * Line of the end tag; HTMLPurifier_Arborize::arborize() copies it
+     * from the matching end token.
+     * @type int
+     */
+    public $endLine = null;
+
+    /**
+     * Armor of the end tag; HTMLPurifier_Arborize::arborize() copies it
+     * from the matching end token.
+     * @type array
+     */
+    public $endArmor = array();
+
+    /**
+     * Stores the arguments on the node without any processing.
+     *
+     * @param string $name Tag name.
+     * @param array $attr Associative array of attributes.
+     * @param int $line Line of the start tag in the source, if known.
+     * @param int $col Column of the start tag in the source, if known.
+     * @param array $armor Lookup array of processing to skip.
+     */
+    public function __construct($name, $attr = array(), $line = null, $col = null, $armor = array())
+    {
+        $this->name = $name;
+        $this->attr = $attr;
+        $this->armor = $armor;
+        $this->line = $line;
+        $this->col = $col;
+    }
+
+    /**
+     * Converts this element, without its children, back into tokens.
+     *
+     * @return array For an empty element: the empty token followed by
+     *               null.  Otherwise: the start token followed by the
+     *               end token.
+     */
+    public function toTokenPair()
+    {
+        // XXX inefficiency here, normalization is not necessary
+        if ($this->empty) {
+            $single = new HTMLPurifier_Token_Empty($this->name, $this->attr, $this->line, $this->col, $this->armor);
+            return array($single, null);
+        }
+        $open = new HTMLPurifier_Token_Start($this->name, $this->attr, $this->line, $this->col, $this->armor);
+        $close = new HTMLPurifier_Token_End($this->name, array(), $this->endLine, $this->endCol, $this->endArmor);
+        // The end token's link back to its start token is left unset:
+        //$close->start = $open;
+        return array($open, $close);
+    }
+}
+
--- a/library/HTMLPurifier/Node/Text.php
+++ b/library/HTMLPurifier/Node/Text.php
@ -0,0 +1,47 @@
+<?php
+
+/**
+ * Node holding character data.
+ *
+ * NOTE(review): the data is presumably already parsed, with entities
+ * expanded, as it is taken over from a text token -- confirm against
+ * HTMLPurifier_Token_Text.
+ */
+class HTMLPurifier_Node_Text extends HTMLPurifier_Node
+{
+
+    /**
+     * Character data of the text.
+     * @type string
+     */
+    public $data;
+
+    /**
+     * Whether the node is whitespace, as stated by whoever created it;
+     * this class does not compute the flag itself.
+     * @type bool
+     */
+    public $is_whitespace;
+
+    /**
+     * Stores the arguments on the node without any processing.
+     *
+     * @param string $data Character data.
+     * @param bool $is_whitespace Whether $data is whitespace.
+     * @param int $line Line of the text in the source, if known.
+     * @param int $col Column of the text in the source, if known.
+     */
+    public function __construct($data, $is_whitespace, $line = null, $col = null)
+    {
+        $this->line = $line;
+        $this->col = $col;
+        $this->data = $data;
+        $this->is_whitespace = $is_whitespace;
+    }
+
+    /**
+     * Converts this node back into a text token.
+     *
+     * Only data, line and col are handed to the token; the stored
+     * is_whitespace flag is not passed along.
+     *
+     * @return array The text token followed by null, since text has no
+     *               end token.
+     */
+    public function toTokenPair()
+    {
+        $text = new HTMLPurifier_Token_Text($this->data, $this->line, $this->col);
+        return array($text, null);
+    }
+}
+
+// vim: et sw=4 sts=4
--- a/library/HTMLPurifier/Strategy/FixNesting.php
+++ b/library/HTMLPurifier/Strategy/FixNesting.php
@ -58,6 +58,9 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
        //####################################################################//
        // Pre-processing

+        //$node = HTMLPurifier_Arborize::arborize($tokens, $config, $context);
+        //$new_tokens = HTMLPurifier_Arborize::flatten($node, $config, $context);
+
        // get a copy of the HTML definition
        $definition = $config->getHTMLDefinition();

--- a/library/HTMLPurifier/Token.php
+++ b/library/HTMLPurifier/Token.php
@ -90,6 +90,11 @@ abstract class HTMLPurifier_Token
        $this->line = $l;
        $this->col = $c;
    }
+
+    /**
+     * Converts a token into its corresponding node.
+     *
+     * @return HTMLPurifier_Node Node built from this token's data.
+     * @throws Exception From token types that have no node of their own
+     *                   (HTMLPurifier_Token_End).
+     */
+    abstract public function toNode();
 }

 // vim: et sw=4 sts=4
--- a/library/HTMLPurifier/Token/Comment.php
+++ b/library/HTMLPurifier/Token/Comment.php
@ -29,6 +29,10 @@ class HTMLPurifier_Token_Comment extends HTMLPurifier_Token
        $this->line = $line;
        $this->col = $col;
    }
+
+    /**
+     * Creates the comment node carrying this token's data and position.
+     *
+     * @return HTMLPurifier_Node_Comment
+     */
+    public function toNode()
+    {
+        $node = new HTMLPurifier_Node_Comment($this->data, $this->line, $this->col);
+        return $node;
+    }
 }

 // vim: et sw=4 sts=4
--- a/library/HTMLPurifier/Token/Empty.php
+++ b/library/HTMLPurifier/Token/Empty.php
@ -5,6 +5,11 @@
 */
 class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
 {
+    /**
+     * Creates the node through the parent implementation and marks it
+     * as an empty element.
+     *
+     * @return HTMLPurifier_Node_Element Node with its empty flag set.
+     */
+    public function toNode()
+    {
+        $element = parent::toNode();
+        $element->empty = true;
+        return $element;
+    }
 }

 // vim: et sw=4 sts=4
--- a/library/HTMLPurifier/Token/End.php
+++ b/library/HTMLPurifier/Token/End.php
@ -15,6 +15,10 @@ class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
     * @type HTMLPurifier_Token
     */
    public $start;
+
+    /**
+     * End tokens have no node of their own, so this always fails.
+     * HTMLPurifier_Arborize::arborize() handles them without calling
+     * this method, by closing the element that is currently open.
+     *
+     * @throws Exception Always.
+     */
+    public function toNode()
+    {
+        $message = "HTMLPurifier_Token_End->toNode not supported!";
+        throw new Exception($message);
+    }
 }

 // vim: et sw=4 sts=4
--- a/library/HTMLPurifier/Token/Tag.php
+++ b/library/HTMLPurifier/Token/Tag.php
@ -59,6 +59,10 @@ abstract class HTMLPurifier_Token_Tag extends HTMLPurifier_Token
        $this->col = $col;
        $this->armor = $armor;
    }
+
+    /**
+     * Creates the element node carrying this tag's name, attributes,
+     * position and armor.
+     *
+     * @return HTMLPurifier_Node_Element
+     */
+    public function toNode()
+    {
+        $element = new HTMLPurifier_Node_Element($this->name, $this->attr, $this->line, $this->col, $this->armor);
+        return $element;
+    }
 }

 // vim: et sw=4 sts=4
--- a/library/HTMLPurifier/Token/Text.php
+++ b/library/HTMLPurifier/Token/Text.php
@ -44,6 +44,10 @@ class HTMLPurifier_Token_Text extends HTMLPurifier_Token
        $this->line = $line;
        $this->col = $col;
    }
+
+    /**
+     * Creates the text node carrying this token's data, whitespace flag
+     * and position.
+     *
+     * @return HTMLPurifier_Node_Text
+     */
+    public function toNode()
+    {
+        $node = new HTMLPurifier_Node_Text($this->data, $this->is_whitespace, $this->line, $this->col);
+        return $node;
+    }
 }

 // vim: et sw=4 sts=4