0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-11-10 07:38:41 +00:00

Major optimization of tokenizeDOM(): reduce its share of execution time from 75% to 20% by passing tokens by reference and using a token factory.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@265 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2006-08-15 20:19:16 +00:00
parent ed79facadf
commit acd7ceb940
5 changed files with 95 additions and 12 deletions

View File

@ -1,6 +1,7 @@
<?php <?php
require_once 'HTMLPurifier/Lexer.php'; require_once 'HTMLPurifier/Lexer.php';
require_once 'HTMLPurifier/TokenFactory.php';
/** /**
* Parser that uses PHP 5's DOM extension (part of the core). * Parser that uses PHP 5's DOM extension (part of the core).
@ -25,6 +26,13 @@ require_once 'HTMLPurifier/Lexer.php';
class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
{ {
private $factory;
/**
 * Creates the lexer and stores a HTMLPurifier_TokenFactory instance on it.
 *
 * The factory is kept in $this->factory so that tokenizeDOM() can ask it
 * for comment, text, empty, start and end tokens.
 */
public function __construct() {
// set up the token factory once; it is reused for every token created
$this->factory = new HTMLPurifier_TokenFactory();
}
public function tokenizeHTML($string, $config = null) { public function tokenizeHTML($string, $config = null) {
if (!$config) $config = HTMLPurifier_Config::createDefault(); if (!$config) $config = HTMLPurifier_Config::createDefault();
@ -50,10 +58,12 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
@$doc->loadHTML($string); // mute all errors, handle it transparently @$doc->loadHTML($string); // mute all errors, handle it transparently
return $this->tokenizeDOM( $tokens = array();
$this->tokenizeDOM(
$doc->childNodes->item(1)-> // html $doc->childNodes->item(1)-> // html
getElementsByTagName('body')->item(0) // body getElementsByTagName('body')->item(0) // body
); , $tokens);
return $tokens;
} }
/** /**
@ -66,33 +76,33 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
* tag you're dealing with. * tag you're dealing with.
* @returns Tokens of node appended to previously passed tokens. * @returns Tokens of node appended to previously passed tokens.
*/ */
protected function tokenizeDOM($node, $tokens = array(), $collect = false) { protected function tokenizeDOM($node, &$tokens, $collect = false) {
// recursive goodness! // recursive goodness!
// intercept non element nodes // intercept non element nodes
if ( !($node instanceof DOMElement) ) { if ( !($node instanceof DOMElement) ) {
if ($node instanceof DOMComment) { if ($node instanceof DOMComment) {
$tokens[] = new HTMLPurifier_Token_Comment($node->data); $tokens[] = $this->factory->createComment($node->data);
} elseif ($node instanceof DOMText || } elseif ($node instanceof DOMText ||
$node instanceof DOMCharacterData) { $node instanceof DOMCharacterData) {
$tokens[] = new HTMLPurifier_Token_Text($node->data); $tokens[] = $this->factory->createText($node->data);
} }
// quite possibly, the object wasn't handled, that's fine // quite possibly, the object wasn't handled, that's fine
return $tokens; return;
} }
// We still have to make sure that the element actually IS empty // We still have to make sure that the element actually IS empty
if (!$node->hasChildNodes()) { if (!$node->hasChildNodes()) {
if ($collect) { if ($collect) {
$tokens[] = new HTMLPurifier_Token_Empty( $tokens[] = $this->factory->createEmpty(
$node->tagName, $node->tagName,
$this->transformAttrToAssoc($node->attributes) $this->transformAttrToAssoc($node->attributes)
); );
} }
} else { } else {
if ($collect) { // don't wrap on first iteration if ($collect) { // don't wrap on first iteration
$tokens[] = new HTMLPurifier_Token_Start( $tokens[] = $this->factory->createStart(
$tag_name = $node->tagName, // somehow, it get's dropped $tag_name = $node->tagName, // somehow, it get's dropped
$this->transformAttrToAssoc($node->attributes) $this->transformAttrToAssoc($node->attributes)
); );
@ -100,15 +110,13 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
foreach ($node->childNodes as $node) { foreach ($node->childNodes as $node) {
// remember, it's an accumulator. Otherwise, we'd have // remember, it's an accumulator. Otherwise, we'd have
// to use array_merge // to use array_merge
$tokens = $this->tokenizeDOM($node, $tokens, true); $this->tokenizeDOM($node, $tokens, true);
} }
if ($collect) { if ($collect) {
$tokens[] = new HTMLPurifier_Token_End($tag_name); $tokens[] = $this->factory->createEnd($tag_name);
} }
} }
return $tokens;
} }
/** /**

View File

@ -51,6 +51,7 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
* @param $attributes Associative array of attributes. * @param $attributes Associative array of attributes.
*/ */
function HTMLPurifier_Token_Tag($name, $attributes = array()) { function HTMLPurifier_Token_Tag($name, $attributes = array()) {
//if ($attributes === null) var_dump(debug_backtrace());
$this->name = ctype_lower($name) ? $name : strtolower($name); $this->name = ctype_lower($name) ? $name : strtolower($name);
foreach ($attributes as $key => $value) { foreach ($attributes as $key => $value) {
// normalization only necessary when key is not lowercase // normalization only necessary when key is not lowercase

View File

@ -0,0 +1,51 @@
<?php
require_once 'HTMLPurifier/Token.php';
/**
 * Factory that produces token objects by cloning prototype instances.
 *
 * One prototype per token type is built when the factory is constructed.
 * Each create*() method clones the matching prototype and then calls the
 * token's class-named initializer method (the PHP 4-style constructor) on
 * the clone with the caller's arguments, so the returned token carries the
 * requested name/attributes or data.
 */
class HTMLPurifier_TokenFactory
{

    /** Prototype start tag; cloned by createStart(). */
    private $p_start;

    /** Prototype end tag; cloned by createEnd(). */
    private $p_end;

    /** Prototype empty tag; cloned by createEmpty(). */
    private $p_empty;

    /** Prototype text token; cloned by createText(). */
    private $p_text;

    /** Prototype comment token; cloned by createComment(). */
    private $p_comment;

    /**
     * Builds one blank prototype for every supported token type.
     */
    public function __construct() {
        $this->p_start   = new HTMLPurifier_Token_Start('', array());
        $this->p_end     = new HTMLPurifier_Token_End('');
        $this->p_empty   = new HTMLPurifier_Token_Empty('', array());
        $this->p_text    = new HTMLPurifier_Token_Text('');
        $this->p_comment = new HTMLPurifier_Token_Comment('');
    }

    /**
     * Creates a start tag token.
     * @param $name Tag name.
     * @param $attributes Associative array of attributes.
     * @return Clone of the start prototype, re-initialized with the arguments.
     */
    public function createStart($name, $attributes = array()) {
        $token = clone $this->p_start;
        $token->HTMLPurifier_Token_Tag($name, $attributes);
        return $token;
    }

    /**
     * Creates an end tag token.
     * @param $name Tag name.
     * @return Clone of the end prototype, re-initialized with the name.
     */
    public function createEnd($name) {
        $token = clone $this->p_end;
        $token->HTMLPurifier_Token_Tag($name);
        return $token;
    }

    /**
     * Creates an empty (self-contained) tag token.
     * @param $name Tag name.
     * @param $attributes Associative array of attributes.
     * @return Clone of the empty prototype, re-initialized with the arguments.
     */
    public function createEmpty($name, $attributes = array()) {
        $token = clone $this->p_empty;
        $token->HTMLPurifier_Token_Tag($name, $attributes);
        return $token;
    }

    /**
     * Creates a text token.
     * @param $data Text content of the token.
     * @return Clone of the text prototype, re-initialized with the data.
     */
    public function createText($data) {
        $token = clone $this->p_text;
        $token->HTMLPurifier_Token_Text($data);
        return $token;
    }

    /**
     * Creates a comment token.
     * @param $data Comment content of the token.
     * @return Clone of the comment prototype, re-initialized with the data.
     */
    public function createComment($data) {
        $token = clone $this->p_comment;
        $token->HTMLPurifier_Token_Comment($data);
        return $token;
    }

}
?>

View File

@ -0,0 +1,19 @@
<?php
require_once 'HTMLPurifier/TokenFactory.php';
/**
 * Verifies that a token obtained from HTMLPurifier_TokenFactory is equal to
 * one built directly with the token class constructor.
 */
class HTMLPurifier_TokenFactoryTest extends UnitTestCase
{

    /**
     * A start tag made by the factory must equal one made with `new`
     * when both receive the same name and attributes.
     */
    public function test() {
        $name       = 'a';
        $attributes = array('href' => 'about:blank');

        // reference token, constructed the ordinary way
        $expected = new HTMLPurifier_Token_Start($name, $attributes);

        // same token, requested from the factory
        $factory = new HTMLPurifier_TokenFactory();
        $actual  = $factory->createStart($name, $attributes);

        $this->assertEqual($expected, $actual);
    }

}
?>

View File

@ -79,6 +79,10 @@ $test_files[] = 'AttrTransform/ImgRequiredTest.php';
$test_files[] = 'URISchemeRegistryTest.php'; $test_files[] = 'URISchemeRegistryTest.php';
$test_files[] = 'URISchemeTest.php'; $test_files[] = 'URISchemeTest.php';
// Register the TokenFactory test only when the interpreter is PHP 5 or newer.
// NOTE(review): presumably because TokenFactory relies on PHP 5-only syntax
// (private members, clone) -- confirm.
if (version_compare(PHP_VERSION, '5', '>=')) {
$test_files[] = 'TokenFactoryTest.php';
}
$test_file_lookup = array_flip($test_files); $test_file_lookup = array_flip($test_files);
function htmlpurifier_path2class($path) { function htmlpurifier_path2class($path) {