mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-01-18 11:41:52 +00:00
0767bbc12d
This mega-patch rips out the FixNesting implementation and the related ChildDef components. The primary algorithmic change is to convert from use of tokens to tree nodes, which are far more amenable to the style of processing that FixNesting uses. Additionally, FixNesting has been changed to go bottom-up rather than top-down, in order to avoid needing to implement backtracking. This patch simplifies a good deal of the relevant logic, since we no longer need to continually recalculate the nesting structure when processing things. However, the conversion to the alternate format incurs some overhead, so for small inputs these changes are not a win. One possibility to greatly reduce the constant factors here is to switch to entirely using libxml's representation, and never serializing tokens; this would require one to rewrite injectors, however. The iterative post-order traversal in FixNesting is a bit subtle, but we have essentially reified the stack and continuations. We've removed support for %Core.EscapeInvalidChildren. Signed-off-by: Edward Z. Yang <ezyang@mit.edu>
182 lines
7.4 KiB
PHP
182 lines
7.4 KiB
PHP
<?php
|
|
|
|
/**
|
|
* Takes a well formed list of tokens and fixes their nesting.
|
|
*
|
|
* HTML elements dictate which elements are allowed to be their children,
|
|
* for example, you can't have a p tag in a span tag. Other elements have
|
|
* much more rigorous definitions: tables, for instance, require a specific
|
|
* order for their elements. There are also constraints not expressible by
|
|
* document type definitions, such as the chameleon nature of ins/del
|
|
* tags and global child exclusions.
|
|
*
|
|
* The first major objective of this strategy is to iterate through all
|
|
* the nodes and determine whether or not their children conform to the
|
|
* element's definition. If they do not, the child definition may
|
|
* optionally supply an amended list of elements that is valid or
|
|
* require that the entire node be deleted (and the previous node
|
|
* rescanned).
|
|
*
|
|
* The second objective is to ensure that explicitly excluded elements of
|
|
* an element do not appear in its children. Code that accomplishes this
|
|
* task is pervasive through the strategy, though the two are distinct tasks
|
|
* and could, theoretically, be separated (although it's not recommended).
|
|
*
|
|
* @note Whether or not unrecognized children are silently dropped or
|
|
* translated into text depends on the child definitions.
|
|
*
|
|
* @todo Enable nodes to be bubbled out of the structure. This is
|
|
* easier with our new algorithm.
|
|
*/
|
|
|
|
class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
{

    /**
     * Checks every element's children against its child definition and the
     * inherited exclusion set, working bottom-up, and returns the result of
     * flattening the corrected tree.
     *
     * Nodes that are excluded, or whose child definition rejects them
     * outright, are flagged as dead and dropped from their parent's child
     * list when the parent is validated.
     *
     * @param HTMLPurifier_Token[] $tokens
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array|HTMLPurifier_Token[]
     */
    public function execute($tokens, $config, $context)
    {

        //####################################################################//
        // Pre-processing

        // O(n) pass to convert to a tree, so that we can efficiently
        // refer to substrings
        $top_node = HTMLPurifier_Arborize::arborize($tokens, $config, $context);

        // get a copy of the HTML definition
        $definition = $config->getHTMLDefinition();
        $parent_def = $definition->info_parent_def;

        $excludes_enabled = !$config->get('Core.DisableExcludes');

        // setup the context variable 'IsInline', for chameleon processing.
        // The local is reassigned for every stack frame popped below.
        // NOTE(review): presumably register() binds by reference so the
        // context sees those reassignments -- confirm against
        // HTMLPurifier_Context.
        $is_inline = $parent_def->descendants_are_inline;
        $context->register('IsInline', $is_inline);

        // setup error collector
        $e =& $context->get('ErrorCollector', true);

        //####################################################################//
        // Loop initialization

        // $node / $token track the node currently being processed (and its
        // start token). This enables error reporting to do its job.
        $node = $top_node;
        // dummy token for the root; only the start token is needed
        list($token) = $node->toTokenPair();
        $context->register('CurrentNode', $node);
        $context->register('CurrentToken', $token);

        //####################################################################//
        // Loop

        // We need to implement a post-order traversal iteratively, to
        // avoid running into stack space limits.  This is pretty tricky
        // to reason about, so we just manually stack-ify the recursive
        // variant:
        //
        //  function f($node) {
        //      foreach ($node->children as $child) {
        //          f($child);
        //      }
        //      validate($node);
        //  }
        //
        // A stack frame is array($node, $is_inline, $excludes, $ix) where
        // $excludes is the set of element names $node itself must not be
        // (keyed by name) and $ix is the index of the next child of $node
        // that still has to be visited.
        $stack = array(
            array(
                $top_node,
                $parent_def->descendants_are_inline,
                $parent_def->excludes, // exclusions
                0
            )
        );

        while (!empty($stack)) {
            list($node, $is_inline, $excludes, $ix) = array_pop($stack);

            // The root frame is the only one popped off an otherwise empty
            // stack; it is validated against the parent definition.
            $def = empty($stack) ? $parent_def : $definition->info[$node->name];

            // "recursive call": find the next element child, save our own
            // frame (with the advanced index) and descend into the child.
            $descended = false;
            while (isset($node->children[$ix])) {
                $child = $node->children[$ix++];
                if ($child instanceof HTMLPurifier_Node_Element) {
                    $descended = true;
                    $stack[] = array($node, $is_inline, $excludes, $ix);
                    $stack[] = array(
                        $child,
                        // ToDo: I don't think it matters if it's def or
                        // child_def, but double check this...
                        $is_inline || $def->descendants_are_inline,
                        empty($def->excludes)
                            ? $excludes
                            : array_merge($excludes, $def->excludes),
                        0
                    );
                    break;
                }
            }
            if ($descended) {
                continue;
            }

            // base case: all children have been processed, validate $node
            list($token) = $node->toTokenPair();

            if ($excludes_enabled && isset($excludes[$node->name])) {
                $node->dead = true;
                if ($e) {
                    $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
                }
                continue;
            }

            // XXX I suppose it would be slightly more efficient to
            // avoid the allocation here and have children
            // strategies handle it
            $children = array();
            foreach ($node->children as $child) {
                if (!$child->dead) {
                    $children[] = $child;
                }
            }

            $result = $def->child->validateChildren($children, $config, $context);
            if ($result === true) {
                // children are valid as-is; still store the list so that
                // dead children are dropped
                $node->children = $children;
            } elseif ($result === false) {
                // the node as a whole is rejected; its parent will drop it
                $node->dead = true;
                if ($e) {
                    $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
                }
            } else {
                // the child definition supplied a replacement child list
                $node->children = $result;
                if ($e) {
                    // XXX This will miss mutations of internal nodes. Perhaps defer to the child validators
                    if (empty($result) && !empty($children)) {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
                    } elseif ($result != $children) {
                        $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
                    }
                }
            }
        }

        //####################################################################//
        // Post-processing

        // A dead node is normally discarded by its parent's filtering pass
        // above, but the root has no parent. Without this, a rejected root
        // would be flattened with its original, unvalidated child list.
        // NOTE(review): assumes flatten() does not itself skip dead nodes;
        // if it does, this is a harmless no-op -- confirm.
        if ($top_node->dead) {
            $top_node->children = array();
        }

        // remove context variables
        $context->destroy('IsInline');
        $context->destroy('CurrentNode');
        $context->destroy('CurrentToken');

        //####################################################################//
        // Return

        // Flatten from the root explicitly rather than relying on the loop
        // variable $node ending up equal to the root.
        return HTMLPurifier_Arborize::flatten($top_node, $config, $context);
    }
}
|
|
|
|
// vim: et sw=4 sts=4
|