mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2024-12-23 17:01:51 +00:00
0767bbc12d
This mega-patch rips out the FixNesting implementation and the related ChildDef components. The primary algorithmic change is to convert from use of tokens to tree nodes, which are far more amenable to the style of processing that FixNesting uses. Additionally, FixNesting has been changed to go bottom-up rather than top-down, in order to avoid needing to implement backtracking. This patch simplifies a good deal of the relevant logic, since we no longer need to continually recalculate the nesting structure when processing things. However, the conversion to the alternate format incurs some overhead, so for small inputs these changes are not a win. One possibility to greatly reduce the constant factors here is to switch to entirely using libxml's representation, and never serializing tokens; this would require one to rewrite injectors, however. The iterative post-order traversal in FixNesting is a bit subtle, but we have essentially reified the stack and continuations. We've removed support for %Core.EscapeInvalidChildren. Signed-off-by: Edward Z. Yang <ezyang@mit.edu>
103 lines
2.7 KiB
PHP
103 lines
2.7 KiB
PHP
<?php
|
|
|
|
/**
 * Custom validation class, accepts DTD child definitions
 *
 * The DTD content model handed to the constructor is compiled once into a
 * PCRE pattern. validateChildren() then joins the names of the child nodes
 * into a comma-separated string and matches it against that pattern.
 *
 * @warning Currently this class is an all or nothing proposition, that is,
 *          it will only give a bool return value.
 */
class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
{
    /**
     * @type string
     */
    public $type = 'custom';

    /**
     * @type bool
     */
    public $allow_empty = false;

    /**
     * Allowed child pattern as defined by the DTD.
     * @type string
     */
    public $dtd_regex;

    /**
     * PCRE regex derived from $dtd_regex.
     * Stored without delimiters or anchors; validateChildren() adds them.
     * @type string
     */
    private $_pcre_regex;

    /**
     * Stores the DTD pattern and immediately compiles it to a PCRE regex.
     *
     * @param string $dtd_regex Allowed child pattern from the DTD
     */
    public function __construct($dtd_regex)
    {
        $this->dtd_regex = $dtd_regex;
        $this->_compileRegex();
    }

    /**
     * Compiles the PCRE regex from a DTD regex ($dtd_regex to $_pcre_regex)
     *
     * Side effects: registers every element name found in the pattern as a
     * key of $this->elements and writes the result to $this->_pcre_regex.
     * NOTE(review): $elements is not declared in this class; presumably it
     * is inherited from HTMLPurifier_ChildDef -- confirm.
     */
    protected function _compileRegex()
    {
        // whitespace carries no meaning in the DTD pattern
        $raw = str_replace(' ', '', $this->dtd_regex);

        // Make sure the whole pattern is wrapped in one group. The empty
        // check avoids reading offset 0 of an empty string; bracket syntax
        // is used because curly-brace offsets ($raw{0}) were removed in
        // PHP 8.0.
        if ($raw === '' || $raw[0] !== '(') {
            $raw = "($raw)";
        }
        $el = '[#a-zA-Z0-9_.-]+';
        $reg = $raw;

        // COMPLICATED! AND MIGHT BE BUGGY! I HAVE NO CLUE WHAT I'M
        // DOING! Seriously: if there's problems, please report them.

        // collect all elements into the $elements array
        preg_match_all("/$el/", $reg, $matches);
        foreach ($matches[0] as $match) {
            $this->elements[$match] = true;
        }

        // setup all elements as parentheticals with leading commas,
        // e.g. "a" becomes "(,a)"
        $reg = preg_replace("/$el/", '(,\\0)', $reg);

        // remove commas when they were not solicited: a comma right after
        // a run of "(" is dropped when that run follows a character other
        // than ",", "(" or "|"
        $reg = preg_replace("/([^,(|]\(+),/", '\\1', $reg);

        // remove all non-parenthetical commas: they are handled by first regex
        $reg = preg_replace("/,\(/", '(', $reg);

        $this->_pcre_regex = $reg;
    }

    /**
     * Checks whether the sequence of child nodes satisfies the DTD pattern.
     *
     * Nodes with a non-empty is_whitespace property are ignored; the names
     * of the remaining nodes are joined with commas and matched against the
     * compiled regex. $config and $context are not used by this method.
     *
     * @param HTMLPurifier_Node[] $children
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return bool True if the children match the pattern, false if they
     *              do not (or if preg_match() reports an error).
     */
    public function validateChildren($children, $config, $context)
    {
        $list_of_children = '';
        foreach ($children as $node) {
            if (!empty($node->is_whitespace)) {
                continue;
            }
            $list_of_children .= $node->name . ',';
        }
        // add leading comma to deal with stray comma declarations
        $list_of_children = ',' . rtrim($list_of_children, ',');
        $okay = preg_match(
            '/^,?' . $this->_pcre_regex . '$/',
            $list_of_children
        );
        return (bool)$okay;
    }
}
|
|
|
|
// vim: et sw=4 sts=4
|