From acd7ceb9408c5a42d088473e9e650a127edf2e86 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Tue, 15 Aug 2006 20:19:16 +0000 Subject: [PATCH] Major optimization on tokenizeDOM(), reduce execution time from 75% to 20% by passing tokens by reference and using a token factory. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@265 48356398-32a2-884e-a903-53898d9a118a --- library/HTMLPurifier/Lexer/DOMLex.php | 32 ++++++++++------ library/HTMLPurifier/Token.php | 1 + library/HTMLPurifier/TokenFactory.php | 51 +++++++++++++++++++++++++ tests/HTMLPurifier/TokenFactoryTest.php | 19 +++++++++ tests/index.php | 4 ++ 5 files changed, 95 insertions(+), 12 deletions(-) create mode 100644 library/HTMLPurifier/TokenFactory.php create mode 100644 tests/HTMLPurifier/TokenFactoryTest.php diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php index 0df13ae5..07b12d22 100644 --- a/library/HTMLPurifier/Lexer/DOMLex.php +++ b/library/HTMLPurifier/Lexer/DOMLex.php @@ -1,6 +1,7 @@ factory = new HTMLPurifier_TokenFactory(); + } + public function tokenizeHTML($string, $config = null) { if (!$config) $config = HTMLPurifier_Config::createDefault(); @@ -50,10 +58,12 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer @$doc->loadHTML($string); // mute all errors, handle it transparently - return $this->tokenizeDOM( + $tokens = array(); + $this->tokenizeDOM( $doc->childNodes->item(1)-> // html getElementsByTagName('body')->item(0) // body - ); + , $tokens); + return $tokens; } /** @@ -66,33 +76,33 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer * tag you're dealing with. * @returns Tokens of node appended to previously passed tokens. */ - protected function tokenizeDOM($node, $tokens = array(), $collect = false) { + protected function tokenizeDOM($node, &$tokens, $collect = false) { // recursive goodness! // intercept non element nodes if ( !($node instanceof DOMElement) ) { if ($node instanceof DOMComment) { - $tokens[] = new HTMLPurifier_Token_Comment($node->data); + $tokens[] = $this->factory->createComment($node->data); } elseif ($node instanceof DOMText || $node instanceof DOMCharacterData) { - $tokens[] = new HTMLPurifier_Token_Text($node->data); + $tokens[] = $this->factory->createText($node->data); } // quite possibly, the object wasn't handled, that's fine - return $tokens; + return; } // We still have to make sure that the element actually IS empty if (!$node->hasChildNodes()) { if ($collect) { - $tokens[] = new HTMLPurifier_Token_Empty( + $tokens[] = $this->factory->createEmpty( $node->tagName, $this->transformAttrToAssoc($node->attributes) ); } } else { if ($collect) { // don't wrap on first iteration - $tokens[] = new HTMLPurifier_Token_Start( + $tokens[] = $this->factory->createStart( $tag_name = $node->tagName, // somehow, it get's dropped $this->transformAttrToAssoc($node->attributes) ); @@ -100,15 +110,13 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer foreach ($node->childNodes as $node) { // remember, it's an accumulator. Otherwise, we'd have // to use array_merge - $tokens = $this->tokenizeDOM($node, $tokens, true); + $this->tokenizeDOM($node, $tokens, true); } if ($collect) { - $tokens[] = new HTMLPurifier_Token_End($tag_name); + $tokens[] = $this->factory->createEnd($tag_name); } } - return $tokens; - } /** diff --git a/library/HTMLPurifier/Token.php b/library/HTMLPurifier/Token.php index ed46621b..f53743b9 100644 --- a/library/HTMLPurifier/Token.php +++ b/library/HTMLPurifier/Token.php @@ -51,6 +51,7 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract * @param $attributes Associative array of attributes. */ function HTMLPurifier_Token_Tag($name, $attributes = array()) { + //if ($attributes === null) var_dump(debug_backtrace()); $this->name = ctype_lower($name) ? $name : strtolower($name); foreach ($attributes as $key => $value) { // normalization only necessary when key is not lowercase diff --git a/library/HTMLPurifier/TokenFactory.php b/library/HTMLPurifier/TokenFactory.php new file mode 100644 index 00000000..8c761fbb --- /dev/null +++ b/library/HTMLPurifier/TokenFactory.php @@ -0,0 +1,51 @@ +p_start = new HTMLPurifier_Token_Start('', array()); + $this->p_end = new HTMLPurifier_Token_End(''); + $this->p_empty = new HTMLPurifier_Token_Empty('', array()); + $this->p_text = new HTMLPurifier_Token_Text(''); + $this->p_comment= new HTMLPurifier_Token_Comment(''); + } + + public function createStart($name, $attributes = array()) { + $p = clone $this->p_start; + $p->HTMLPurifier_Token_Tag($name, $attributes); + return $p; + } + + public function createEnd($name) { + $p = clone $this->p_end; + $p->HTMLPurifier_Token_Tag($name); + return $p; + } + + public function createEmpty($name, $attributes = array()) { + $p = clone $this->p_empty; + $p->HTMLPurifier_Token_Tag($name, $attributes); + return $p; + } + + public function createText($data) { + $p = clone $this->p_text; + $p->HTMLPurifier_Token_Text($data); + return $p; + } + + public function createComment($data) { + $p = clone $this->p_comment; + $p->HTMLPurifier_Token_Comment($data); + return $p; + } + +} + +?> \ No newline at end of file diff --git a/tests/HTMLPurifier/TokenFactoryTest.php b/tests/HTMLPurifier/TokenFactoryTest.php new file mode 100644 index 00000000..9995ef74 --- /dev/null +++ b/tests/HTMLPurifier/TokenFactoryTest.php @@ -0,0 +1,19 @@ + 'about:blank')); + $generated = $factory->createStart('a', array('href' => 'about:blank')); + + $this->assertEqual($regular, $generated); + + } +} + +?> \ No newline at end of file diff --git a/tests/index.php b/tests/index.php index ae6566ef..8aab3589 100644 --- a/tests/index.php +++ b/tests/index.php @@ -79,6 +79,10 @@ $test_files[] = 'AttrTransform/ImgRequiredTest.php'; $test_files[] = 'URISchemeRegistryTest.php'; $test_files[] = 'URISchemeTest.php'; +if (version_compare(PHP_VERSION, '5', '>=')) { + $test_files[] = 'TokenFactoryTest.php'; +} + $test_file_lookup = array_flip($test_files); function htmlpurifier_path2class($path) {