0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-01-03 05:11:52 +00:00

More refactoring to MakeWellFormed and Injectors; they work better than ever now!

The major paradigm shift in this commit is abandoning the "skip" integers, which
were extremely buggy and error-prone, and instead simply marking tokens as
processed or not processed by injectors. Other notable changes:

- Removed ad hoc decrements to inputIndex in favor of a $reprocess flag variable
- Moved rewind outside of processToken()
- Made rewind properly ignore all other injectors
- Cleaned up end-of-document code
- Reconfigured injector loops to account for skips and rewinds
- Punted the empty-to-start/end transformation
- Completely rewrote processToken to be array based
- Added skip and rewind member variables to tokens
- Fixed a longstanding bug with remove empty!

Signed-off-by: Edward Z. Yang <edwardzyang@thewritingpot.com>
This commit is contained in:
Edward Z. Yang 2008-10-01 03:14:28 -04:00
parent fa413e96ac
commit cd4500457e
4 changed files with 138 additions and 166 deletions

View File

@ -16,13 +16,6 @@ abstract class HTMLPurifier_Injector
*/
public $name;
/**
* Amount of tokens the injector needs to skip + 1. Because
* the decrement is the first thing that happens, this needs to
* be one greater than the "real" skip count.
*/
public $skip = 1;
/**
* Instance of HTMLPurifier_HTMLDefinition
*/

View File

@ -74,67 +74,89 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
trigger_error("Cannot enable {$injector->name} injector because $error is not allowed", E_USER_WARNING);
}
// warning: most foreach loops follow the convention $i => $injector.
// Don't define these as loop-wide variables, please!
// -- end INJECTOR --
$token = false;
$context->register('CurrentToken', $token);
$reprocess = false;
$i = false; // injector index
// isset is in loop because $tokens size changes during loop exec
for (
$this->inputIndex = 0;
$this->inputIndex == 0 || isset($tokens[$this->inputIndex - 1]);
$this->inputIndex++
// only increment if we don't need to reprocess
$reprocess ? $reprocess = false : $this->inputIndex++
) {
foreach ($this->injectors as $injector) {
if ($injector->skip > 0) $injector->skip--;
// check for a rewind
if (is_int($i) && $i >= 0) {
$rewind_to = $this->injectors[$i]->getRewind();
if (is_int($rewind_to) && $rewind_to < $this->inputIndex) {
if ($rewind_to < 0) $rewind_to = 0;
while ($this->inputIndex > $rewind_to) {
$this->inputIndex--;
$prev = $this->inputTokens[$this->inputIndex];
// indicate that other injectors should not process this token,
// but we need to reprocess it
unset($prev->skip[$i]);
$prev->rewind = $i;
if ($prev instanceof HTMLPurifier_Token_Start) array_pop($this->currentNesting);
elseif ($prev instanceof HTMLPurifier_Token_End) $this->currentNesting[] = $prev->start;
}
}
$i = false;
}
// handle case of document end
if (!isset($tokens[$this->inputIndex])) {
// we're at the end now, fix all still unclosed tags (this is
// duplicated from the end of the loop with some slight modifications)
// not using $skipped_tags since it would invariably be all of them
if (!empty($this->currentNesting)) {
// We're at the end now, fix all still unclosed tags.
// This would logically go at the end of the loop, but because
// of all of the callbacks we need to be able to run the loop
// again.
// kill processing if stack is empty
if (empty($this->currentNesting)) {
break;
}
// peek
$top_nesting = array_pop($this->currentNesting);
// please don't redefine $i!
$this->currentNesting[] = $top_nesting;
// send error
if ($e && !isset($top_nesting->armor['MakeWellFormed_TagClosedError'])) {
$e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', $top_nesting);
}
// instead of splice, since we know this is the end
$new_token = new HTMLPurifier_Token_End($top_nesting->name);
$tokens[] = $new_token;
$this->currentNesting[] = $top_nesting;
--$this->inputIndex;
// punt to the regular code to handle the new token
// append, don't splice, since this is the end
$tokens[] = new HTMLPurifier_Token_End($top_nesting->name);
// punt!
$reprocess = true;
continue;
}
break;
}
// if all goes well, this token will be passed through unharmed
$token = $tokens[$this->inputIndex];
//echo '<hr>';
//printTokens($tokens, $this->inputIndex);
//printTokens($this->inputTokens, $this->inputIndex);
//var_dump($this->currentNesting);
// quick-check: if it's not a tag, no need to process
if (empty( $token->is_tag )) {
if (empty($token->is_tag)) {
if ($token instanceof HTMLPurifier_Token_Text) {
// injector handler code; duplicated for performance reasons
foreach ($this->injectors as $i => $injector) {
if (!$injector->skip) $injector->handleText($token);
if (is_array($token) || is_int($token)) {
$this->currentInjector = $i;
if (isset($token->skip[$i])) continue;
if ($token->rewind !== null && $token->rewind !== $i) continue;
$injector->handleText($token);
$this->processToken($token, $i);
$reprocess = true;
break;
}
}
}
$this->processToken($token, $config, $context);
continue;
}
@ -152,11 +174,11 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
$ok = true;
} elseif ($type && $type !== 'empty' && $token instanceof HTMLPurifier_Token_Empty) {
// claims to be empty but really is a start tag
$token = array(
new HTMLPurifier_Token_Start($token->name, $token->attr),
new HTMLPurifier_Token_End($token->name)
);
$ok = true;
$this->swap(new HTMLPurifier_Token_End($token->name));
$this->insertBefore(new HTMLPurifier_Token_Start($token->name, $token->attr));
// punt
$reprocess = true;
continue;
} elseif ($token instanceof HTMLPurifier_Token_Empty) {
// real empty token
$ok = true;
@ -192,13 +214,22 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// injector handler code; duplicated for performance reasons
if ($ok) {
foreach ($this->injectors as $i => $injector) {
if (!$injector->skip) $injector->handleElement($token);
if (is_array($token) || is_int($token)) {
$this->currentInjector = $i;
if (isset($token->skip[$i])) continue;
if ($token->rewind !== null && $token->rewind !== $i) continue;
$injector->handleElement($token);
$this->processToken($token, $i);
$reprocess = true;
break;
}
if (!$reprocess) {
// ah, nothing interesting happened; do normal processing
$this->swap($token);
if ($token instanceof HTMLPurifier_Token_Start) {
$this->currentNesting[] = $token;
} elseif ($token instanceof HTMLPurifier_Token_End) {
throw new HTMLPurifier_Exception('Improper handling of end tag in start code; possible error in MakeWellFormed');
}
}
$this->processToken($token, $config, $context);
continue;
}
@ -221,8 +252,15 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
}
continue;
}
if (!$this->handleEnd($token)) continue;
foreach ($this->injectors as $i => $injector) {
if (isset($token->skip[$i])) continue;
if ($token->rewind !== null && $token->rewind !== $i) continue;
$injector->handleEnd($token);
$this->processToken($token, $i);
$reprocess = true;
break;
}
if ($reprocess) continue;
// first, check for the simplest case: everything closes neatly
$current_parent = array_pop($this->currentNesting);
@ -267,15 +305,12 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// note that skipped tags contains the element we need closed
$this->remove();
for ($i = count($skipped_tags) - 1; $i >= 0; $i--) {
// please don't redefine $i!
if ($i && $e && !isset($skipped_tags[$i]->armor['MakeWellFormed_TagClosedError'])) {
$e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', $skipped_tags[$i]);
}
$new_token = new HTMLPurifier_Token_End($skipped_tags[$i]->name);
$new_token->start = $skipped_tags[$i];
$this->insertAfter($new_token);
//printTokens($tokens, $this->inputIndex);
//var_dump($this->currentNesting);
}
}
@ -290,14 +325,53 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
}
/**
 * Applies a replacement value to the token stream at the current cursor
 * position ($this->inputIndex).
 *
 * Every accepted form of $token is first normalized to
 *   array(number of tokens to delete, new token 1, new token 2, ...)
 * and then spliced into $this->inputTokens:
 *
 *  - object:  replaces the current token (delete count of 1).
 *  - integer: deletes that many tokens and inserts nothing.
 *  - false:   deletes the current token.
 *  - array:   used as given; if its first element is not an integer,
 *             a delete count of 1 is prepended.
 *
 * @param mixed $token    Replacement value in one of the forms above.
 * @param int   $injector Index of the injector that produced $token, or
 *                        -1 for none. When greater than -1, every inserted
 *                        token inherits the skip marks of the first removed
 *                        token and is also marked as skipped for this
 *                        injector.
 * @throws HTMLPurifier_Exception If $token has an unsupported type, or if
 *                        the delete count is zero.
 */
protected function processToken($token, $injector = -1) {
    // bring every accepted form into array(delete count, tokens...)
    if (is_object($token)) {
        $token = array(1, $token);
    } elseif (is_int($token)) {
        $token = array($token);
    } elseif ($token === false) {
        $token = array(1);
    } elseif (!is_array($token)) {
        throw new HTMLPurifier_Exception('Invalid token type from injector');
    }
    // a bare list of tokens means "replace the current token"
    if (!is_int($token[0])) {
        array_unshift($token, 1);
    }
    if ($token[0] === 0) {
        throw new HTMLPurifier_Exception('Deleting zero tokens is not valid');
    }

    // split off the delete count; what remains are the tokens to insert
    $deleteCount = array_shift($token);
    $removed = array_splice($this->inputTokens, $this->inputIndex, $deleteCount, $token);

    if (!($injector > -1)) {
        return;
    }

    // carry over the skip marks of the first removed token and make sure
    // the responsible injector does not see its own output again
    $inheritedSkip = isset($removed[0]) ? $removed[0]->skip : array();
    foreach ($token as $replacement) {
        $replacement->skip = $inheritedSkip;
        $replacement->skip[$injector] = true;
    }
}
/**
 * Inserts $token into the token stream at the cursor position without
 * removing anything. $this->inputIndex is not modified, so the cursor
 * now refers to the newly inserted token.
 */
protected function insertBefore($token) {
    $inserted = array($token);
    array_splice($this->inputTokens, $this->inputIndex, 0, $inserted);
}
/**
* Inserts a token after the current token. Cursor now points to this token.
* Inserts a token after the current token. Cursor now points to this token
*/
protected function insertAfter($token) {
array_splice($this->inputTokens, ++$this->inputIndex, 0, array($token));
@ -311,112 +385,11 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
}
/**
 * Replaces the token under the cursor with $token. $this->inputIndex is
 * not modified, so the cursor now refers to the new token.
 */
protected function swap($token) {
    $replacement = array($token);
    array_splice($this->inputTokens, $this->inputIndex, 1, $replacement);
}
/**
 * Processes arbitrary token values for complicated substitution patterns.
 * In general:
 *
 * If $token is an array, its elements are substituted for the current
 * token. If $token is an integer, that many tokens are removed starting at
 * the cursor. In both cases the injector skip counters are adjusted (unless
 * a rewind took place) and the cursor is moved back one position.
 *
 * If $token is any other truthy value (a regular token), it is swapped with
 * the current token. With $is_end set, the cursor is moved back one position
 * when the token is not an end token; otherwise the nesting stack is updated
 * for start and end tokens.
 *
 * If $token is falsy, the current token is removed via remove().
 *
 * @param mixed $token   Token, array of tokens, integer count or false.
 * @param mixed $config  Not referenced in this method body.
 * @param mixed $context Not referenced in this method body.
 * @param bool  $is_end  Selects the end-tag handling branch for regular tokens.
 */
protected function processToken($token, $config, $context, $is_end = false) {
if (is_array($token) || is_int($token)) {
// the original token was overloaded by an injector, time
// to do some fancy acrobatics
if (is_array($token)) {
// replace the single current token with the whole list
array_splice($this->inputTokens, $this->inputIndex, 1, $token);
} else {
// integer form: delete $token tokens, insert nothing
array_splice($this->inputTokens, $this->inputIndex, $token, array());
}
if ($this->injectors) {
// checkRewind() adjusts the skips itself when it rewinds
if (!$this->checkRewind()) {
// adjust the injector skips based on the array substitution
$offset = is_array($token) ? count($token) : 0;
// only injectors up to and including the current one are adjusted
for ($i = 0; $i <= $this->currentInjector; $i++) {
// because of the skip back, we need to add one more
// for uninitialized injectors. I'm not exactly
// sure why this is the case, but I think it has to
// do with the fact that we're decrementing skips
// before re-checking text
if (!$this->injectors[$i]->skip) $this->injectors[$i]->skip++;
$this->injectors[$i]->skip += $offset;
}
}
}
// ensure that we reprocess these tokens with the other injectors
--$this->inputIndex;
} elseif ($token) {
if ($is_end) {
$this->swap($token);
// anything other than an end token is sent back for reprocessing
if (!$token instanceof HTMLPurifier_Token_End) {
--$this->inputIndex;
}
} else {
// regular case
$this->swap($token);
if ($token instanceof HTMLPurifier_Token_Start) {
$this->currentNesting[] = $token;
} elseif ($token instanceof HTMLPurifier_Token_End) {
// not actually used
$token->start = array_pop($this->currentNesting);
}
}
} else {
// falsy value: drop the current token
$this->remove();
}
}
/**
 * Checks for a rewind, adjusts the input index and skips accordingly.
 *
 * Asks the current injector ($this->currentInjector) for its rewind target
 * via getRewind(). When the target is an integer, the current injector's
 * skip counter is reset to 0, every other injector's skip counter is
 * increased by the distance being rewound, the nesting stack is unwound for
 * each token stepped over, and the cursor ends up on the target index.
 *
 * @return bool True if a rewind was performed, false if the rewind target
 *              was not an integer.
 */
protected function checkRewind() {
$rewind = $this->injectors[$this->currentInjector]->getRewind();
// negative targets are clamped to the start of the token stream; note
// that this comparison runs before the is_int() check below
if ($rewind < 0) $rewind = 0;
if (is_int($rewind)) {
// number of positions the cursor is about to move back
$offset = $this->inputIndex - $rewind;
if ($this->injectors) {
foreach ($this->injectors as $i => $injector) {
if ($i == $this->currentInjector) {
$injector->skip = 0;
} else {
$injector->skip += $offset;
}
}
}
// walk backwards from the token before the cursor down to the target,
// undoing the effect each token had on the nesting stack
for ($this->inputIndex--; $this->inputIndex >= $rewind; $this->inputIndex--) {
$prev = $this->inputTokens[$this->inputIndex];
if ($prev instanceof HTMLPurifier_Token_Start) array_pop($this->currentNesting);
elseif ($prev instanceof HTMLPurifier_Token_End) $this->currentNesting[] = $prev->start;
}
// the loop exits one position below the target; step back onto it
$this->inputIndex++;
return true;
} else {
return false;
}
}
/**
 * Offers an end token to each injector in turn and then hands the result
 * to processToken() with the end-tag flag set.
 *
 * Injectors whose skip counter is non-zero are not called. Iteration stops
 * at the first point where $token is an array or an integer, and that
 * injector's index is stored in $this->currentInjector.
 * NOTE(review): $token can only change here if the injector's handleEnd()
 * takes it by reference -- confirm against the injector class.
 *
 * @param mixed $token Token to run through the injectors.
 * @return bool True if $token is an end token after processing.
 */
protected function handleEnd($token) {
    foreach ($this->injectors as $index => $handler) {
        if (!$handler->skip) {
            $handler->handleEnd($token);
        }
        $overloaded = is_array($token) || is_int($token);
        if (!$overloaded) {
            continue;
        }
        // remember which injector overloaded the token and stop there
        $this->currentInjector = $index;
        break;
    }
    $this->processToken($token, $this->config, $this->context, true);
    return ($token instanceof HTMLPurifier_Token_End);
}
}

View File

@ -14,6 +14,12 @@ class HTMLPurifier_Token {
*/
public $armor = array();
/**
* Used during MakeWellFormed.
*/
public $skip;
public $rewind;
public function __get($n) {
if ($n === 'type') {
trigger_error('Deprecated type property called; use instanceof', E_USER_NOTICE);

View File

@ -14,10 +14,11 @@ class HTMLPurifier_Strategy_MakeWellFormed_InjectorTest extends HTMLPurifier_Str
function testEndHandler() {
$mock = new HTMLPurifier_InjectorMock();
$mock->skip = false;
$b = new HTMLPurifier_Token_End('b');
$b->skip = array(0 => true);
$mock->expectAt(0, 'handleEnd', array($b));
$i = new HTMLPurifier_Token_End('i');
$i->skip = array(0 => true);
$mock->expectAt(1, 'handleEnd', array($i));
$mock->expectCallCount('handleEnd', 2);
$mock->setReturnValue('getRewind', false);
@ -125,7 +126,6 @@ asdf<b></b></p>
}
function testRewindAndParagraph() {
// perhaps change the behavior of this?
$this->assertResult(
"bar
@ -136,7 +136,7 @@ asdf<b></b></p>
foo",
"<p>bar</p>
<p></p>
<p>foo</p>"
);