mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-01-22 13:21:52 +00:00
77d9e05a07
- Fixed buggy chameleon-support for ins and del . Removed context variable ParentType, replaced with IsInline, which is false when you're not inline and an integer of the parent that caused you to become inline when you are (so possibly zero) . Removed ElementDef->type in favor of ElementDef->descendants_are_inline and HTMLDefinition->content_sets . StrictBlockquote now reports what elements its supposed to allow, rather than what it does allow . Removed HTMLDefinition->info_flow_elements in favor of HTMLDefinition->content_sets['Flow'] . Removed redundant "exclusionary" definitions from DTD roster . StrictBlockquote now requires a construction parameter as if it were an Required ChildDef, this is the "real" set of allowed elements git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@710 48356398-32a2-884e-a903-53898d9a118a
293 lines
12 KiB
PHP
293 lines
12 KiB
PHP
<?php
|
|
|
|
require_once 'HTMLPurifier/Strategy.php';
|
|
require_once 'HTMLPurifier/HTMLDefinition.php';
|
|
|
|
/**
|
|
* Takes a well formed list of tokens and fixes their nesting.
|
|
*
|
|
* HTML elements dictate which elements are allowed to be their children,
|
|
* for example, you can't have a p tag in a span tag. Other elements have
|
|
* much more rigorous definitions: tables, for instance, require a specific
|
|
* order for their elements. There are also constraints not expressible by
|
|
* document type definitions, such as the chameleon nature of ins/del
|
|
* tags and global child exclusions.
|
|
*
|
|
* The first major objective of this strategy is to iterate through all the
|
|
* nodes (not tokens) of the list of tokens and determine whether or not
|
|
* their children conform to the element's definition. If they do not, the
|
|
* child definition may optionally supply an amended list of elements that
|
|
* is valid or require that the entire node be deleted (and the previous
|
|
* node rescanned).
|
|
*
|
|
* The second objective is to ensure that explicitly excluded elements of
|
|
* an element do not appear in its children. Code that accomplishes this
|
|
* task is pervasive through the strategy, though the two are distinct tasks
|
|
* and could, theoretically, be separated (although it's not recommended).
|
|
*
|
|
* @note Whether or not unrecognized children are silently dropped or
|
|
* translated into text depends on the child definitions.
|
|
*
|
|
* @todo Enable nodes to be bubbled out of the structure.
|
|
*/
|
|
|
|
class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
{

    /**
     * Walks every node of a token list and validates its children against
     * the element's child definition, also enforcing element exclusions.
     *
     * The list is temporarily wrapped in an implicit parent element (taken
     * from the HTML definition's info_parent) so that top-level tokens are
     * validated too; that wrapper is stripped again before returning.
     *
     * For each node the child definition's validateChildren() result is
     * interpreted as follows:
     *   - true:  the node is left untouched,
     *   - false: the node (start token, children and end token) is removed,
     *   - array: the node's children are replaced by the returned tokens.
     *
     * @param $tokens  Array of tokens. The scan for a node's end token has
     *                 no bounds check, so every start token is expected to
     *                 have a matching end token.
     * @param $config  Configuration object; only getHTMLDefinition() is used
     *                 here, the object is also handed to validateChildren().
     * @param $context Context object. 'IsInline' is registered for the
     *                 duration of the call and destroyed before returning.
     * @return Array of tokens with offending nodes removed or rewritten.
     */
    function execute($tokens, $config, &$context) {
        //####################################################################//
        // Pre-processing

        // get a copy of the HTML definition
        $definition = $config->getHTMLDefinition();

        // insert implicit "parent" node, will be removed at end.
        // ! we might want to move this to configuration
        // DEFINITION CALL
        $parent_name = $definition->info_parent;
        array_unshift($tokens, new HTMLPurifier_Token_Start($parent_name));
        $tokens[] = new HTMLPurifier_Token_End($parent_name);

        // setup the context variables
        // false when not inline, otherwise the stack index of the ancestor
        // that made us inline (so possibly zero)
        $is_inline = false; // reference var that we alter
        $context->register('IsInline', $is_inline);

        //####################################################################//
        // Loop initialization

        // stack that contains the indexes of all parents,
        // $stack[count($stack)-1] being the current parent
        $stack = array();

        // stack that contains all elements that are excluded
        // same structure as $stack, but it is only populated when an element
        // with exclusions is processed, i.e. there won't be empty exclusions.
        $exclude_stack = array();

        //####################################################################//
        // Loop

        // iterate through all start nodes. Determining the start node
        // is complicated so it has been omitted from the loop construct
        for ($i = 0, $size = count($tokens); $i < $size; ) {

            //################################################################//
            // Gather information on children

            // child token accumulator
            $child_tokens = array();

            // scroll to the end of this node, report number, and collect
            // all children
            for ($j = $i, $depth = 0; ; $j++) {
                if ($tokens[$j]->type == 'start') {
                    $depth++;
                    // skip token assignment on first iteration, this is the
                    // token we currently are on
                    if ($depth == 1) continue;
                } elseif ($tokens[$j]->type == 'end') {
                    $depth--;
                    // skip token assignment on last iteration, this is the
                    // end token of the token we're currently on
                    if ($depth == 0) break;
                }
                $child_tokens[] = $tokens[$j];
            }

            // $i is index of start token
            // $j is index of end token

            //################################################################//
            // Gather information on parent

            // calculate parent information
            if ($count = count($stack)) {
                $parent_index = $stack[$count - 1];
                $parent_name  = $tokens[$parent_index]->name;
                if ($parent_index == 0) {
                    // index 0 is always the implicit parent node
                    $parent_def = $definition->info_parent_def;
                } else {
                    $parent_def = $definition->info[$parent_name];
                }
            } else {
                // no parent: we are on the implicit parent node itself
                $parent_index = $parent_name = $parent_def = null;
            }

            // calculate context
            if ($is_inline === false) {
                // check if conditions make it inline
                if (!empty($parent_def) && $parent_def->descendants_are_inline) {
                    $is_inline = $count - 1;
                }
            } else {
                // check if we're out of inline. The ancestor that made us
                // inline lives at stack index $is_inline, so it has been
                // closed as soon as the stack holds that many entries or
                // fewer. A strict equality test misses the case where
                // several end tags in a row take us to a shallower depth.
                if ($count <= $is_inline) {
                    $is_inline = false;
                }
            }

            //################################################################//
            // Determine whether element is explicitly excluded SGML-style

            // determine whether or not element is excluded by checking all
            // parent exclusions. The array should not be very large, two
            // elements at most.
            $excluded = false;
            if (!empty($exclude_stack)) {
                foreach ($exclude_stack as $lookup) {
                    if (isset($lookup[$tokens[$i]->name])) {
                        $excluded = true;
                        // no need to continue processing
                        break;
                    }
                }
            }

            //################################################################//
            // Perform child validation

            if ($excluded) {
                // there is an exclusion, remove the entire node
                $result   = false;
                $excludes = array(); // not used, but good to initialize anyway
            } else {
                // DEFINITION CALL
                if ($i === 0) {
                    // special processing for the first node
                    $def = $definition->info_parent_def;
                } else {
                    $def = $definition->info[$tokens[$i]->name];
                }

                if (!empty($def->child)) {
                    // have DTD child def validate children
                    $result = $def->child->validateChildren(
                        $child_tokens, $config, $context);
                } else {
                    // weird, no child definition, get rid of everything
                    $result = false;
                }

                // determine whether or not this element has any exclusions
                $excludes = $def->excludes;
            }

            // $result is now a bool or array

            //################################################################//
            // Process result by interpreting $result

            if ($result === true) {
                // leave the node as is

                // register start token as a parental node start
                $stack[] = $i;

                // register exclusions if there are any
                if (!empty($excludes)) $exclude_stack[] = $excludes;

                // move cursor to next possible start node
                $i++;

            } elseif ($result === false) {
                // remove entire node: start token, children and end token
                array_splice($tokens, $i, $j - $i + 1);

                // there is no start token to register,
                // current node is now the next possible start node
                // unless it turns out that we need to do a double-check

                // $parent_def is null when the implicit parent node itself
                // was removed; there is nothing to rewind to in that case
                if ($parent_def !== null && !$parent_def->child->allow_empty) {
                    // we need to do a double-check: revalidate the parent
                    $i = $parent_index;
                    array_pop($stack);
                    // the parent is about to be treated as a fresh node, so
                    // the exclusions it registered earlier must go as well;
                    // otherwise it would be tested against its own
                    // exclusions and register them a second time
                    if (!empty($parent_def->excludes)) {
                        array_pop($exclude_stack);
                    }
                }

                // PROJECTED OPTIMIZATION: Process all children elements before
                // reprocessing parent node.

            } else {
                // replace the node's inner tokens with $result
                array_splice($tokens, $i + 1, $j - $i - 1, $result);

                // register start token as a parental node start
                $stack[] = $i;

                // register exclusions if there are any
                if (!empty($excludes)) $exclude_stack[] = $excludes;

                // move cursor to next possible start node
                $i++;

            }

            //################################################################//
            // Scroll to next start node

            // We assume, at this point, that $i is the index of the token
            // that is the first possible new start point for a node.

            // the token list may have shrunk or grown above
            $size = count($tokens);

            // Test if the token indeed is a start tag, if not, move forward
            // and test again.
            while ($i < $size && $tokens[$i]->type != 'start') {
                if ($tokens[$i]->type == 'end') {
                    // pop a token index off the stack if we ended a node
                    array_pop($stack);
                    // pop an exclusion lookup off exclusion stack if
                    // we ended node and that node had exclusions
                    if ($i == 0 || $i == $size - 1) {
                        // use specialized var if it's the super-parent
                        $s_excludes = $definition->info_parent_def->excludes;
                    } else {
                        $s_excludes = $definition->info[$tokens[$i]->name]->excludes;
                    }
                    if ($s_excludes) {
                        array_pop($exclude_stack);
                    }
                }
                $i++;
            }

        }

        //####################################################################//
        // Post-processing

        // remove implicit parent tokens at the beginning and end
        array_shift($tokens);
        array_pop($tokens);

        // remove context variables
        $context->destroy('IsInline');

        //####################################################################//
        // Return

        return $tokens;

    }

}
|
|
|
|
?>
|