From a3d71fe606eba154b28874e725e95f102a06d0ff Mon Sep 17 00:00:00 2001 From: Maxim Krizhanovsky Date: Wed, 19 Jan 2011 22:06:40 +0000 Subject: [PATCH] Iterative traversal of DOM. There are some deep DOMs you can hit the maximum nesting level limit in tokenizeDOM (we've experienced this even with maximum nesting level of 300). Here is an iterative version of the same function with simple queue/dequeue approach. Signed-off-by: Maxim Krizhanovsky --- NEWS | 3 ++ library/HTMLPurifier/Lexer/DOMLex.php | 76 +++++++++++++++++++-------- 2 files changed, 56 insertions(+), 23 deletions(-) diff --git a/NEWS b/NEWS index 3ded73e5..3e9bada3 100644 --- a/NEWS +++ b/NEWS @@ -20,6 +20,9 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier when %CSS.Trusted is on. ! Add %Cache.SerializerPermissions option for custom serializer directory/file permissions +- Switch to an iterative traversal of the DOM, which prevents us + from running out of stack space for deeply nested documents. + Thanks Maxim Krizhanovsky for contributing a patch. - Make removal of conditional IE comments ungreedy; thanks Bernd for reporting. - Escape CDATA before removing Internet Explorer comments. diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php index 20dc2ed4..82f37745 100644 --- a/library/HTMLPurifier/Lexer/DOMLex.php +++ b/library/HTMLPurifier/Lexer/DOMLex.php @@ -72,23 +72,57 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer } /** - * Recursive function that tokenizes a node, putting it into an accumulator. - * + * Iterative function that tokenizes a node, putting it into an accumulator. + * To iterate is human, to recurse divine - L. Peter Deutsch * @param $node DOMNode to be tokenized. * @param $tokens Array-list of already tokenized tokens. - * @param $collect Says whether or start and close are collected, set to - * false at first recursion because it's the implicit DIV - * tag you're dealing with. * @returns Tokens of node appended to previously passed tokens. */ - protected function tokenizeDOM($node, &$tokens, $collect = false) { + protected function tokenizeDOM($node, &$tokens) { + $level = 0; + $nodes = array($level => array($node)); + $closingNodes = array(); + do { + while (!empty($nodes[$level])) { + $node = array_shift($nodes[$level]); // FIFO + $collect = $level > 0 ? true : false; + $needEndingTag = $this->createStartNode($node, $tokens, $collect); + if ($needEndingTag) { + $closingNodes[$level][] = $node; + } + if ($node->childNodes && $node->childNodes->length) { + $level++; + $nodes[$level] = array(); + foreach ($node->childNodes as $childNode) { + array_push($nodes[$level], $childNode); + } + } + } + $level--; + if ($level && isset($closingNodes[$level])) { + while($node = array_pop($closingNodes[$level])) { + $this->createEndNode($node, $tokens); + } + } + } while ($level > 0); + } + + /** + * @param $node DOMNode to be tokenized. + * @param $tokens Array-list of already tokenized tokens. + * @param $collect Says whether or start and close are collected, set to + * false at first recursion because it's the implicit DIV + * tag you're dealing with. + * @returns bool if the token needs an endtoken + */ + protected function createStartNode($node, &$tokens, $collect) { // intercept non element nodes. WE MUST catch all of them, // but we're not getting the character reference nodes because // those should have been preprocessed if ($node->nodeType === XML_TEXT_NODE) { $tokens[] = $this->factory->createText($node->data); - return; + return false; } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) { // undo libxml's special treatment of