0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-01-02 21:01:52 +00:00

Use a Zipper to process MakeWellFormed, removing quadratic behavior.

Signed-off-by: Edward Z. Yang <ezyang@mit.edu>
This commit is contained in:
Edward Z. Yang 2013-10-13 12:53:51 -07:00
parent 82bcc62058
commit 8f401f769e
10 changed files with 158 additions and 149 deletions

View File

@ -503,7 +503,7 @@
</directive>
<directive id="Core.EscapeInvalidTags">
<file name="HTMLPurifier/Strategy/MakeWellFormed.php">
<line>66</line>
<line>72</line>
</file>
<file name="HTMLPurifier/Strategy/RemoveForeignElements.php">
<line>26</line>

View File

@ -35,19 +35,16 @@ abstract class HTMLPurifier_Injector
protected $currentNesting;
/**
* Reference to InputTokens variable in Context. This is an array
* list of the input tokens that are being processed.
* @type array
* Reference to current token.
* @type HTMLPurifier_Token
*/
protected $inputTokens;
protected $currentToken;
/**
* Reference to InputIndex variable in Context. This is an integer
* array index for $this->inputTokens that indicates what token
* is currently being processed.
* @type int
* Reference to InputZipper variable in Context.
* @type HTMLPurifier_Zipper
*/
protected $inputIndex;
protected $inputZipper;
/**
* Array of elements and attributes this injector creates and therefore
@ -58,33 +55,33 @@ abstract class HTMLPurifier_Injector
public $needed = array();
/**
* Index of inputTokens to rewind to.
* Number of elements to rewind backwards (relative).
* @type bool|int
*/
protected $rewind = false;
protected $rewindOffset = false;
/**
* Rewind to a spot to re-perform processing. This is useful if you
* deleted a node, and now need to see if this change affected any
* earlier nodes. Rewinding does not affect other injectors, and can
* result in infinite loops if not used carefully.
* @param bool|int $index
* @param bool|int $offset
* @warning HTML Purifier will prevent you from fast-forwarding with this
* function.
*/
public function rewind($index)
public function rewindOffset($offset)
{
$this->rewind = $index;
$this->rewindOffset = $offset;
}
/**
* Retrieves rewind, and then unsets it.
* Retrieves rewind offset, and then unsets it.
* @return bool|int
*/
public function getRewind()
public function getRewindOffset()
{
$r = $this->rewind;
$this->rewind = false;
$r = $this->rewindOffset;
$this->rewindOffset = false;
return $r;
}
@ -108,8 +105,8 @@ abstract class HTMLPurifier_Injector
return $result;
}
$this->currentNesting =& $context->get('CurrentNesting');
$this->inputTokens =& $context->get('InputTokens');
$this->inputIndex =& $context->get('InputIndex');
$this->currentToken =& $context->get('CurrentToken');
$this->inputZipper =& $context->get('InputZipper');
return false;
}
@ -183,14 +180,14 @@ abstract class HTMLPurifier_Injector
protected function forward(&$i, &$current)
{
if ($i === null) {
$i = $this->inputIndex + 1;
$i = count($this->inputZipper->back) - 1;
} else {
$i++;
$i--;
}
if (!isset($this->inputTokens[$i])) {
if ($i < 0) {
return false;
}
$current = $this->inputTokens[$i];
$current = $this->inputZipper->back[$i];
return true;
}
@ -237,35 +234,17 @@ abstract class HTMLPurifier_Injector
protected function backward(&$i, &$current)
{
if ($i === null) {
$i = $this->inputIndex - 1;
$i = count($this->inputZipper->front) - 1;
} else {
$i--;
}
if ($i < 0) {
return false;
}
$current = $this->inputTokens[$i];
$current = $this->inputZipper->front[$i];
return true;
}
/**
* Initializes the iterator at the current position. Use in a do {} while;
* loop to force the _forward and _backward functions to start at the
* current location.
* @warning Please prevent previous references from interfering with this
* functions by setting $i = null beforehand!
* @param int $i Current integer index variable for inputTokens
* @param HTMLPurifier_Token $current Current token variable.
* Do NOT use $token, as that variable is also a reference
*/
protected function current(&$i, &$current)
{
if ($i === null) {
$i = $this->inputIndex;
}
$current = $this->inputTokens[$i];
}
/**
* Handler that is called when a text token is processed
*/

View File

@ -307,13 +307,13 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
*/
private function _pLookAhead()
{
$this->current($i, $current);
if ($current instanceof HTMLPurifier_Token_Start) {
if ($this->currentToken instanceof HTMLPurifier_Token_Start) {
$nesting = 1;
} else {
$nesting = 0;
}
$ok = false;
$i = null;
while ($this->forwardUntilEndToken($i, $current, $nesting)) {
$result = $this->_checkNeedsP($current);
if ($result !== null) {

View File

@ -57,8 +57,9 @@ class HTMLPurifier_Injector_RemoveEmpty extends HTMLPurifier_Injector
return;
}
$next = false;
for ($i = $this->inputIndex + 1, $c = count($this->inputTokens); $i < $c; $i++) {
$next = $this->inputTokens[$i];
$deleted = 1; // the current tag
for ($i = count($this->inputZipper->back) - 1; $i >= 0; $i--, $deleted++) {
$next = $this->inputZipper->back[$i];
if ($next instanceof HTMLPurifier_Token_Text) {
if ($next->is_whitespace) {
continue;
@ -82,16 +83,16 @@ class HTMLPurifier_Injector_RemoveEmpty extends HTMLPurifier_Injector
if (isset($token->attr['id']) || isset($token->attr['name'])) {
return;
}
$token = $i - $this->inputIndex + 1;
for ($b = $this->inputIndex - 1; $b > 0; $b--) {
$prev = $this->inputTokens[$b];
$token = $deleted + 1;
for ($b = 0, $c = count($this->inputZipper->front); $b < $c; $b++) {
$prev = $this->inputZipper->front[$b];
if ($prev instanceof HTMLPurifier_Token_Text && $prev->is_whitespace) {
continue;
}
break;
}
// This is safe because we removed the token that triggered this.
$this->rewind($b - 1);
$this->rewindOffset($b+$deleted);
return;
}
}

View File

@ -21,10 +21,16 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
protected $tokens;
/**
* Current index in $tokens.
* @type int
* Current token.
* @type HTMLPurifier_Token
*/
protected $t;
protected $token;
/**
* Zipper managing the true state.
* @type HTMLPurifier_Zipper
*/
protected $zipper;
/**
* Current nesting of elements.
@ -67,23 +73,25 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// used for autoclose early abortion
$global_parent_allowed_elements = $definition->info_parent_def->child->getAllowedElements($config);
$e = $context->get('ErrorCollector', true);
$t = false; // token index
$i = false; // injector index
$token = false; // the current token
list($zipper, $token) = HTMLPurifier_Zipper::fromArray($tokens);
if ($token === NULL) {
return array();
}
$reprocess = false; // whether or not to reprocess the same token
$stack = array();
// member variables
$this->stack =& $stack;
$this->t =& $t;
$this->tokens =& $tokens;
$this->token =& $token;
$this->zipper =& $zipper;
$this->config = $config;
$this->context = $context;
// context variables
$context->register('CurrentNesting', $stack);
$context->register('InputIndex', $t);
$context->register('InputTokens', $tokens);
$context->register('InputZipper', $zipper);
$context->register('CurrentToken', $token);
// -- begin INJECTOR --
@ -142,32 +150,28 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// punt ($reprocess = true; continue;) and it does that for us.
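// there is no loop condition (the old isset($tokens[...]) check is gone):
// termination is handled inside the body, in the document-end case, once
// $token is NULL and the stack is empty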
for ($t = 0;
$t == 0 || isset($tokens[$t - 1]);
for (;;
// only increment if we don't need to reprocess
$reprocess ? $reprocess = false : $t++) {
$reprocess ? $reprocess = false : $token = $zipper->next($token)) {
// check for a rewind
if (is_int($i) && $i >= 0) {
if (is_int($i)) {
// possibility: disable rewinding if the current token has a
// rewind set on it already. This would offer protection from
// infinite loop, but might hinder some advanced rewinding.
$rewind_to = $this->injectors[$i]->getRewind();
if (is_int($rewind_to) && $rewind_to < $t) {
if ($rewind_to < 0) {
$rewind_to = 0;
}
while ($t > $rewind_to) {
$t--;
$prev = $tokens[$t];
$rewind_offset = $this->injectors[$i]->getRewindOffset();
if (is_int($rewind_offset)) {
for ($j = 0; $j < $rewind_offset; $j++) {
if (empty($zipper->front)) break;
$token = $zipper->prev($token);
// indicate that other injectors should not process this token,
// but we need to reprocess it
unset($prev->skip[$i]);
$prev->rewind = $i;
if ($prev instanceof HTMLPurifier_Token_Start) {
unset($token->skip[$i]);
$token->rewind = $i;
if ($token instanceof HTMLPurifier_Token_Start) {
array_pop($this->stack);
} elseif ($prev instanceof HTMLPurifier_Token_End) {
$this->stack[] = $prev->start;
} elseif ($token instanceof HTMLPurifier_Token_End) {
$this->stack[] = $token->start;
}
}
}
@ -175,7 +179,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
}
// handle case of document end
if (!isset($tokens[$t])) {
if ($token === NULL) {
// kill processing if stack is empty
if (empty($this->stack)) {
break;
@ -191,16 +195,14 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
}
// append, don't splice, since this is the end
$tokens[] = new HTMLPurifier_Token_End($top_nesting->name);
$token = new HTMLPurifier_Token_End($top_nesting->name);
// punt!
$reprocess = true;
continue;
}
$token = $tokens[$t];
//echo '<br>'; printTokens($tokens, $t); printTokens($this->stack);
//echo '<br>'; printZipper($zipper, $token);//printTokens($this->stack);
//flush();
// quick-check: if it's not a tag, no need to process
@ -213,8 +215,10 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
if ($token->rewind !== null && $token->rewind !== $i) {
continue;
}
$injector->handleText($token);
$this->processToken($token, $i);
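// XXX: awkward -- the injector is handed a separate variable ($r) because
// $token is bound by reference to $this->token and to the 'CurrentToken'
// context variable; $token is then reassigned only from the value
// returned by processToken()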
$r = $token;
$injector->handleText($r);
$token = $this->processToken($r, $i);
$reprocess = true;
break;
}
@ -243,9 +247,11 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
$ok = true;
} elseif ($type && $type !== 'empty' && $token instanceof HTMLPurifier_Token_Empty) {
// claims to be empty but really is a start tag
$this->swap(new HTMLPurifier_Token_End($token->name));
$this->insertBefore(
new HTMLPurifier_Token_Start($token->name, $token->attr, $token->line, $token->col, $token->armor)
// NB: this assignment is required
$old_token = $token;
$token = new HTMLPurifier_Token_End($token->name);
$token = $this->insertBefore(
new HTMLPurifier_Token_Start($old_token->name, $old_token->attr, $old_token->line, $old_token->col, $old_token->armor)
);
// punt (since we had to modify the input stream in a non-trivial way)
$reprocess = true;
@ -293,7 +299,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
$elements = $wrapdef->child->getAllowedElements($config);
if (isset($elements[$token->name]) && isset($parent_elements[$wrapname])) {
$newtoken = new HTMLPurifier_Token_Start($wrapname);
$this->insertBefore($newtoken);
$token = $this->insertBefore($newtoken);
$reprocess = true;
continue;
}
@ -330,15 +336,6 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// errors need to be updated
$new_token = new HTMLPurifier_Token_End($parent->name);
$new_token->start = $parent;
if ($carryover) {
$element = clone $parent;
// [TagClosedAuto]
$element->armor['MakeWellFormed_TagClosedError'] = true;
$element->carryover = true;
$this->processToken(array($new_token, $token, $element));
} else {
$this->insertBefore($new_token);
}
// [TagClosedSuppress]
if ($e && !isset($parent->armor['MakeWellFormed_TagClosedError'])) {
if (!$carryover) {
@ -347,8 +344,17 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
$e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag carryover', $parent);
}
}
if ($carryover) {
$element = clone $parent;
// [TagClosedAuto]
$element->armor['MakeWellFormed_TagClosedError'] = true;
$element->carryover = true;
$token = $this->processToken(array($new_token, $token, $element));
} else {
$this->remove();
$token = $this->insertBefore($new_token);
}
} else {
$token = $this->remove();
}
$reprocess = true;
continue;
@ -366,14 +372,14 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
if ($token->rewind !== null && $token->rewind !== $i) {
continue;
}
$injector->handleElement($token);
$this->processToken($token, $i);
$r = $token;
$injector->handleElement($r);
$token = $this->processToken($r, $i);
$reprocess = true;
break;
}
if (!$reprocess) {
// ah, nothing interesting happened; do normal processing
$this->swap($token);
if ($token instanceof HTMLPurifier_Token_Start) {
$this->stack[] = $token;
} elseif ($token instanceof HTMLPurifier_Token_End) {
@ -396,16 +402,12 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
if ($e) {
$e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag to text');
}
$this->swap(
new HTMLPurifier_Token_Text(
$generator->generateFromToken($token)
)
);
$token = new HTMLPurifier_Token_Text($generator->generateFromToken($token));
} else {
$this->remove();
if ($e) {
$e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag removed');
}
$token = $this->remove();
}
$reprocess = true;
continue;
@ -425,8 +427,9 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
if ($token->rewind !== null && $token->rewind !== $i) {
continue;
}
$injector->handleEnd($token);
$this->processToken($token, $i);
$r = $token;
$injector->handleEnd($r);
$token = $this->processToken($r, $i);
$this->stack[] = $current_parent;
$reprocess = true;
break;
@ -454,19 +457,15 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// we didn't find the tag, so remove
if ($skipped_tags === false) {
if ($escape_invalid_tags) {
$this->swap(
new HTMLPurifier_Token_Text(
$generator->generateFromToken($token)
)
);
if ($e) {
$e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag to text');
}
$token = new HTMLPurifier_Token_Text($generator->generateFromToken($token));
} else {
$this->remove();
if ($e) {
$e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag removed');
}
$token = $this->remove();
}
$reprocess = true;
continue;
@ -499,18 +498,17 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
$replace[] = $element;
}
}
$this->processToken($replace);
$token = $this->processToken($replace);
$reprocess = true;
continue;
}
$context->destroy('CurrentNesting');
$context->destroy('InputTokens');
$context->destroy('InputIndex');
$context->destroy('CurrentToken');
$context->destroy('CurrentNesting');
$context->destroy('InputZipper');
unset($this->injectors, $this->stack, $this->tokens, $this->t);
return $tokens;
unset($this->injectors, $this->stack, $this->tokens);
return $zipper->toArray($token);
}
/**
@ -560,7 +558,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// array(number nodes to delete, new node 1, new node 2, ...)
$delete = array_shift($token);
$old = array_splice($this->tokens, $this->t, $delete, $token);
list($old, $r) = $this->zipper->splice($this->token, $delete, $token);
if ($injector > -1) {
// determine appropriate skips
@ -571,6 +569,8 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
}
}
return $r;
}
/**
@ -580,7 +580,9 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
*/
private function insertBefore($token)
{
array_splice($this->tokens, $this->t, 0, array($token));
// NB not $this->zipper->insertBefore(), due to positioning
// differences
return $this->zipper->splice($this->token, 0, array($token))[1];
}
/**
@ -589,17 +591,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
*/
private function remove()
{
array_splice($this->tokens, $this->t, 1);
}
/**
* Swap current token with new token. Cursor points to new token (no
* change). You must reprocess after this.
* @param HTMLPurifier_Token $token
*/
private function swap($token)
{
$this->tokens[$this->t] = $token;
return $this->zipper->delete();
}
}

View File

@ -20,7 +20,7 @@
class HTMLPurifier_Zipper
{
private $front, $back;
public $front, $back;
public function __construct($front, $back) {
$this->front = $front;
@ -95,6 +95,14 @@ class HTMLPurifier_Zipper
return empty($this->back) ? NULL : array_pop($this->back);
}
/**
* Returns true if we are at the end of the list, i.e. the $back array
* holds no further elements. $back stores the elements that follow the
* current one (callers walk it from its last index down to 0 to look
* ahead), so an empty $back means nothing is left to advance to.
* This method only inspects state; it does not move the zipper.
* @return bool True when $back is empty, false otherwise.
*/
public function done() {
return empty($this->back);
}
/**
* Insert element before hole.
* @param Element to insert
@ -115,14 +123,16 @@ class HTMLPurifier_Zipper
* Splice in multiple elements at hole. Functional specification
* in terms of array_splice:
*
* $r1 = array_splice($arr, $i, $delete, $replacement);
* $arr1 = $arr;
* $old1 = array_splice($arr1, $i, $delete, $replacement);
*
* list($z, $t) = HTMLPurifier_Zipper::fromArray($arr);
* $t = $z->advance($t, $i);
* $t = $z->splice($t, $delete, $replacement);
* $r2 = $z->toArray($t);
* list($old2, $t) = $z->splice($t, $delete, $replacement);
* $arr2 = $z->toArray($t);
*
* assert($r1 === $r2);
* assert($old1 === $old2);
* assert($arr1 === $arr2);
*
* NB: the absolute index location after this operation is
* *unchanged!*
@ -131,8 +141,10 @@ class HTMLPurifier_Zipper
*/
public function splice($t, $delete, $replacement) {
// delete
$old = array();
$r = $t;
for ($i = $delete; $i > 0; $i--) {
$old[] = $r;
$r = $this->delete();
}
// insert
@ -140,6 +152,6 @@ class HTMLPurifier_Zipper
$this->insertAfter($r);
$r = $replacement[$i];
}
return $r;
return array($old, $r);
}
}

View File

@ -24,7 +24,7 @@ class HTMLPurifier_Strategy_MakeWellFormed_EndRewindInjector extends HTMLPurifie
) {
$token = false;
$prev->_InjectorTest_EndRewindInjector_delete = true;
$this->rewind($i);
$this->rewindOffset(1);
}
}
}

View File

@ -27,7 +27,7 @@ class HTMLPurifier_Strategy_MakeWellFormed_InjectorTest extends HTMLPurifier_Str
$i->start->skip = array(0 => true, 1 => true);
$mock->expectAt(1, 'handleEnd', array($i));
$mock->expectCallCount('handleEnd', 2);
$mock->setReturnValue('getRewind', false);
$mock->setReturnValue('getRewindOffset', false);
$this->config->set('AutoFormat.AutoParagraph', false);
$this->config->set('AutoFormat.Linkify', false);
$this->config->set('AutoFormat.Custom', array($mock));

View File

@ -16,7 +16,8 @@ class HTMLPurifier_ZipperTest extends HTMLPurifier_Harness
$z->insertBefore(4);
$z->insertAfter(5);
$this->assertIdentical($z->toArray($t), array(0,1,4,3,5));
$t = $z->splice($t, 2, array(6,7));
list($old, $t) = $z->splice($t, 2, array(6,7));
$this->assertIdentical($old, array(3,5));
$this->assertIdentical($t, 6);
$this->assertIdentical($z->toArray($t), array(0,1,4,6,7));
}

View File

@ -182,10 +182,34 @@ function printTokens($tokens, $index = null)
$string = '<pre>';
$generator = new HTMLPurifier_Generator(HTMLPurifier_Config::createDefault(), new HTMLPurifier_Context);
foreach ($tokens as $i => $token) {
if ($index === $i) $string .= '[<strong>';
$string .= printToken($generator, $token, $i, $index == $i);
}
$string .= '</pre>';
echo $string;
}
function printToken($generator, $token, $i, $isCursor)
{
$string = "";
if ($isCursor) $string .= '[<strong>';
$string .= "<sup>$i</sup>";
$string .= $generator->escape($generator->generateFromToken($token));
if ($index === $i) $string .= '</strong>]';
if ($isCursor) $string .= '</strong>]';
return $string;
}
function printZipper($zipper, $token)
{
$string = '<pre>';
$generator = new HTMLPurifier_Generator(HTMLPurifier_Config::createDefault(), new HTMLPurifier_Context);
foreach ($zipper->front as $i => $t) {
$string .= printToken($generator, $t, $i, false);
}
if ($token !== NULL) {
$string .= printToken($generator, $token, "", true);
}
for ($i = count($zipper->back)-1; $i >= 0; $i--) {
$string .= printToken($generator, $zipper->back[$i], $i, false);
}
$string .= '</pre>';
echo $string;