mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2024-12-22 16:31:53 +00:00
Iterative traversal of DOM.
With some deeply nested DOMs you can hit the maximum nesting level limit in tokenizeDOM (we've experienced this even with a maximum nesting level of 300). Here is an iterative version of the same function that uses a simple queue/dequeue approach. Signed-off-by: Maxim Krizhanovsky <darhazer@gmail.com>
This commit is contained in:
parent
77982bd61d
commit
a3d71fe606
3
NEWS
3
NEWS
@ -20,6 +20,9 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
|||||||
when %CSS.Trusted is on.
|
when %CSS.Trusted is on.
|
||||||
! Add %Cache.SerializerPermissions option for custom serializer
|
! Add %Cache.SerializerPermissions option for custom serializer
|
||||||
directory/file permissions
|
directory/file permissions
|
||||||
|
- Switch to an iterative traversal of the DOM, which prevents us
|
||||||
|
from running out of stack space for deeply nested documents.
|
||||||
|
Thanks Maxim Krizhanovsky for contributing a patch.
|
||||||
- Make removal of conditional IE comments ungreedy; thanks Bernd
|
- Make removal of conditional IE comments ungreedy; thanks Bernd
|
||||||
for reporting.
|
for reporting.
|
||||||
- Escape CDATA before removing Internet Explorer comments.
|
- Escape CDATA before removing Internet Explorer comments.
|
||||||
|
@ -72,23 +72,57 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Recursive function that tokenizes a node, putting it into an accumulator.
|
* Iterative function that tokenizes a node, putting it into an accumulator.
|
||||||
*
|
* To iterate is human, to recurse divine - L. Peter Deutsch
|
||||||
|
* @param $node DOMNode to be tokenized.
|
||||||
|
* @param $tokens Array-list of already tokenized tokens.
|
||||||
|
* @returns Tokens of node appended to previously passed tokens.
|
||||||
|
*/
|
||||||
|
protected function tokenizeDOM($node, &$tokens) {
    // Depth-first, document-order walk of the tree rooted at $node, done
    // with explicit per-level work lists instead of recursion so that
    // deeply nested documents cannot exhaust the call stack.

    $level = 0;

    // $pending[$level] holds the nodes of that depth which have not been
    // visited yet, stored in REVERSE document order. The next node is
    // therefore taken with array_pop(), which is O(1); array_shift()
    // renumbers the remaining elements on every call and makes wide
    // sibling lists quadratic.
    $pending = array($level => array($node));

    // $closingNodes[$level] holds the nodes of that depth for which
    // createStartNode() reported that an end token is still required.
    $closingNodes = array();

    do {
        while (!empty($pending[$level])) {
            $node = array_pop($pending[$level]);

            // Level 0 is only the node handed in by the caller; its own
            // start/end tags are not collected (see createStartNode()'s
            // $collect parameter).
            $collect = $level > 0;

            $needEndingTag = $this->createStartNode($node, $tokens, $collect);
            if ($needEndingTag) {
                $closingNodes[$level][] = $node;
            }

            if ($node->childNodes && $node->childNodes->length) {
                // Descend: the children become the work list of the next
                // level and are processed before the remaining siblings.
                $level++;
                $children = array();
                foreach ($node->childNodes as $childNode) {
                    $children[] = $childNode;
                }
                $pending[$level] = array_reverse($children);
            }
        }

        // The current level is exhausted; step back up and close the
        // element(s) whose children were just processed. Level 0 is
        // never closed because its start token was never emitted.
        $level--;
        if ($level && isset($closingNodes[$level])) {
            while ($node = array_pop($closingNodes[$level])) {
                $this->createEndNode($node, $tokens);
            }
        }
    } while ($level > 0);
}
|
||||||
|
|
||||||
|
/**
|
||||||
* @param $node DOMNode to be tokenized.
|
* @param $node DOMNode to be tokenized.
|
||||||
* @param $tokens Array-list of already tokenized tokens.
|
* @param $tokens Array-list of already tokenized tokens.
|
||||||
* @param $collect Says whether or not start and close are collected, set to
|
* @param $collect Says whether or not start and close are collected, set to
|
||||||
* false at first recursion because it's the implicit DIV
|
* false at first recursion because it's the implicit DIV
|
||||||
* tag you're dealing with.
|
* tag you're dealing with.
|
||||||
* @returns Tokens of node appended to previously passed tokens.
|
* @returns bool if the token needs an endtoken
|
||||||
*/
|
*/
|
||||||
protected function tokenizeDOM($node, &$tokens, $collect = false) {
|
protected function createStartNode($node, &$tokens, $collect) {
|
||||||
|
|
||||||
// intercept non element nodes. WE MUST catch all of them,
|
// intercept non element nodes. WE MUST catch all of them,
|
||||||
// but we're not getting the character reference nodes because
|
// but we're not getting the character reference nodes because
|
||||||
// those should have been preprocessed
|
// those should have been preprocessed
|
||||||
if ($node->nodeType === XML_TEXT_NODE) {
|
if ($node->nodeType === XML_TEXT_NODE) {
|
||||||
$tokens[] = $this->factory->createText($node->data);
|
$tokens[] = $this->factory->createText($node->data);
|
||||||
return;
|
return false;
|
||||||
} elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
|
} elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
|
||||||
// undo libxml's special treatment of <script> and <style> tags
|
// undo libxml's special treatment of <script> and <style> tags
|
||||||
$last = end($tokens);
|
$last = end($tokens);
|
||||||
@ -106,48 +140,44 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
$tokens[] = $this->factory->createText($this->parseData($data));
|
$tokens[] = $this->factory->createText($this->parseData($data));
|
||||||
return;
|
return false;
|
||||||
} elseif ($node->nodeType === XML_COMMENT_NODE) {
|
} elseif ($node->nodeType === XML_COMMENT_NODE) {
|
||||||
// this code is only invoked for comments in script/style in versions
|
// this code is only invoked for comments in script/style in versions
|
||||||
// of libxml pre-2.6.28 (regular comments, of course, are still
|
// of libxml pre-2.6.28 (regular comments, of course, are still
|
||||||
// handled regularly)
|
// handled regularly)
|
||||||
$tokens[] = $this->factory->createComment($node->data);
|
$tokens[] = $this->factory->createComment($node->data);
|
||||||
return;
|
return false;
|
||||||
} elseif (
|
} elseif (
|
||||||
// not-well tested: there may be other nodes we have to grab
|
// not-well tested: there may be other nodes we have to grab
|
||||||
$node->nodeType !== XML_ELEMENT_NODE
|
$node->nodeType !== XML_ELEMENT_NODE
|
||||||
) {
|
) {
|
||||||
return;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
$attr = $node->hasAttributes() ?
|
$attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();
|
||||||
$this->transformAttrToAssoc($node->attributes) :
|
|
||||||
array();
|
|
||||||
|
|
||||||
// We still have to make sure that the element actually IS empty
|
// We still have to make sure that the element actually IS empty
|
||||||
if (!$node->childNodes->length) {
|
if (!$node->childNodes->length) {
|
||||||
if ($collect) {
|
if ($collect) {
|
||||||
$tokens[] = $this->factory->createEmpty($node->tagName, $attr);
|
$tokens[] = $this->factory->createEmpty($node->tagName, $attr);
|
||||||
}
|
}
|
||||||
|
return false;
|
||||||
} else {
|
} else {
|
||||||
if ($collect) { // don't wrap on first iteration
|
if ($collect) {
|
||||||
$tokens[] = $this->factory->createStart(
|
$tokens[] = $this->factory->createStart(
|
||||||
$tag_name = $node->tagName, // somehow, it get's dropped
|
$tag_name = $node->tagName, // somehow, it get's dropped
|
||||||
$attr
|
$attr
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
foreach ($node->childNodes as $node) {
|
return true;
|
||||||
// remember, it's an accumulator. Otherwise, we'd have
|
|
||||||
// to use array_merge
|
|
||||||
$this->tokenizeDOM($node, $tokens, true);
|
|
||||||
}
|
|
||||||
if ($collect) {
|
|
||||||
$tokens[] = $this->factory->createEnd($tag_name);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Appends the end token for a node to the token accumulator.
 *
 * @param $node   Node whose tagName is used to build the end token.
 * @param $tokens Array-list of already tokenized tokens (passed by
 *                reference; the new end token is appended to it).
 */
protected function createEndNode($node, &$tokens) {
|
||||||
|
$tokens[] = $this->factory->createEnd($node->tagName);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
|
* Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
|
||||||
*
|
*
|
||||||
|
Loading…
Reference in New Issue
Block a user