0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-09-19 18:55:19 +00:00

Major optimization of tokenizeDOM(), reducing execution time from 75% to 20% by passing tokens by reference and using a token factory.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@265 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2006-08-15 20:19:16 +00:00
parent ed79facadf
commit acd7ceb940
5 changed files with 95 additions and 12 deletions

View File

@ -1,6 +1,7 @@
<?php
require_once 'HTMLPurifier/Lexer.php';
require_once 'HTMLPurifier/TokenFactory.php';
/**
* Parser that uses PHP 5's DOM extension (part of the core).
@ -25,6 +26,13 @@ require_once 'HTMLPurifier/Lexer.php';
class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
{
private $factory;
/**
 * Prepares the lexer by creating the token factory that tokenizeDOM()
 * uses to build its tokens.
 */
public function __construct() {
    // one factory instance is kept for the lifetime of this lexer
    $this->factory = new HTMLPurifier_TokenFactory();
}
public function tokenizeHTML($string, $config = null) {
if (!$config) $config = HTMLPurifier_Config::createDefault();
@ -50,10 +58,12 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
@$doc->loadHTML($string); // mute all errors, handle it transparently
return $this->tokenizeDOM(
$tokens = array();
$this->tokenizeDOM(
$doc->childNodes->item(1)-> // html
getElementsByTagName('body')->item(0) // body
);
, $tokens);
return $tokens;
}
/**
@ -66,33 +76,33 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
* tag you're dealing with.
* @returns Nothing; tokens of the node are appended to the $tokens array, which is passed by reference.
*/
protected function tokenizeDOM($node, $tokens = array(), $collect = false) {
protected function tokenizeDOM($node, &$tokens, $collect = false) {
// recursive goodness!
// intercept non element nodes
if ( !($node instanceof DOMElement) ) {
if ($node instanceof DOMComment) {
$tokens[] = new HTMLPurifier_Token_Comment($node->data);
$tokens[] = $this->factory->createComment($node->data);
} elseif ($node instanceof DOMText ||
$node instanceof DOMCharacterData) {
$tokens[] = new HTMLPurifier_Token_Text($node->data);
$tokens[] = $this->factory->createText($node->data);
}
// quite possibly, the object wasn't handled, that's fine
return $tokens;
return;
}
// We still have to make sure that the element actually IS empty
if (!$node->hasChildNodes()) {
if ($collect) {
$tokens[] = new HTMLPurifier_Token_Empty(
$tokens[] = $this->factory->createEmpty(
$node->tagName,
$this->transformAttrToAssoc($node->attributes)
);
}
} else {
if ($collect) { // don't wrap on first iteration
$tokens[] = new HTMLPurifier_Token_Start(
$tokens[] = $this->factory->createStart(
$tag_name = $node->tagName, // somehow, it get's dropped
$this->transformAttrToAssoc($node->attributes)
);
@ -100,15 +110,13 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
foreach ($node->childNodes as $node) {
// remember, it's an accumulator. Otherwise, we'd have
// to use array_merge
$tokens = $this->tokenizeDOM($node, $tokens, true);
$this->tokenizeDOM($node, $tokens, true);
}
if ($collect) {
$tokens[] = new HTMLPurifier_Token_End($tag_name);
$tokens[] = $this->factory->createEnd($tag_name);
}
}
return $tokens;
}
/**

View File

@ -51,6 +51,7 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
* @param $attributes Associative array of attributes.
*/
function HTMLPurifier_Token_Tag($name, $attributes = array()) {
//if ($attributes === null) var_dump(debug_backtrace());
$this->name = ctype_lower($name) ? $name : strtolower($name);
foreach ($attributes as $key => $value) {
// normalization only necessary when key is not lowercase

View File

@ -0,0 +1,51 @@
<?php
require_once 'HTMLPurifier/Token.php';
/**
 * Factory that produces token objects by cloning prototype instances.
 *
 * One prototype of each token type is built up front. Every create*() call
 * clones the matching prototype and then invokes the class-named
 * initializer method (the PHP 4-style constructor) on the clone to fill in
 * the real data, instead of instantiating a fresh object with `new`.
 */
class HTMLPurifier_TokenFactory
{

    /** Prototype start-tag token that createStart() clones. */
    private $p_start;

    /** Prototype end-tag token that createEnd() clones. */
    private $p_end;

    /** Prototype empty-tag token that createEmpty() clones. */
    private $p_empty;

    /** Prototype text token that createText() clones. */
    private $p_text;

    /** Prototype comment token that createComment() clones. */
    private $p_comment;

    /**
     * Builds the blank prototype tokens (empty names, attributes and data).
     */
    public function __construct() {
        $this->p_start   = new HTMLPurifier_Token_Start('', array());
        $this->p_end     = new HTMLPurifier_Token_End('');
        $this->p_empty   = new HTMLPurifier_Token_Empty('', array());
        $this->p_text    = new HTMLPurifier_Token_Text('');
        $this->p_comment = new HTMLPurifier_Token_Comment('');
    }

    /**
     * Creates a start-tag token.
     * @param $name Tag name.
     * @param $attributes Associative array of attributes.
     * @return Clone of the start prototype, re-initialized with the arguments.
     */
    public function createStart($name, $attributes = array()) {
        $token = clone $this->p_start;
        $token->HTMLPurifier_Token_Tag($name, $attributes);
        return $token;
    }

    /**
     * Creates an end-tag token.
     * @param $name Tag name.
     * @return Clone of the end prototype, re-initialized with the name.
     */
    public function createEnd($name) {
        $token = clone $this->p_end;
        $token->HTMLPurifier_Token_Tag($name);
        return $token;
    }

    /**
     * Creates an empty-tag token.
     * @param $name Tag name.
     * @param $attributes Associative array of attributes.
     * @return Clone of the empty prototype, re-initialized with the arguments.
     */
    public function createEmpty($name, $attributes = array()) {
        $token = clone $this->p_empty;
        $token->HTMLPurifier_Token_Tag($name, $attributes);
        return $token;
    }

    /**
     * Creates a text token.
     * @param $data Data to hand to the text token's initializer.
     * @return Clone of the text prototype, re-initialized with the data.
     */
    public function createText($data) {
        $token = clone $this->p_text;
        $token->HTMLPurifier_Token_Text($data);
        return $token;
    }

    /**
     * Creates a comment token.
     * @param $data Data to hand to the comment token's initializer.
     * @return Clone of the comment prototype, re-initialized with the data.
     */
    public function createComment($data) {
        $token = clone $this->p_comment;
        $token->HTMLPurifier_Token_Comment($data);
        return $token;
    }

}
?>

View File

@ -0,0 +1,19 @@
<?php
require_once 'HTMLPurifier/TokenFactory.php';
/**
 * Checks that tokens produced by HTMLPurifier_TokenFactory match tokens
 * instantiated directly.
 */
class HTMLPurifier_TokenFactoryTest extends UnitTestCase
{

    /**
     * A start token created through the factory must compare equal to one
     * constructed with `new` from the same name and attributes.
     */
    public function test() {

        $tag  = 'a';
        $attr = array('href' => 'about:blank');

        $factory = new HTMLPurifier_TokenFactory();

        $expected = new HTMLPurifier_Token_Start($tag, $attr);
        $actual   = $factory->createStart($tag, $attr);

        $this->assertEqual($expected, $actual);

    }

}
?>

View File

@ -79,6 +79,10 @@ $test_files[] = 'AttrTransform/ImgRequiredTest.php';
$test_files[] = 'URISchemeRegistryTest.php';
$test_files[] = 'URISchemeTest.php';
if (version_compare(PHP_VERSION, '5', '>=')) {
$test_files[] = 'TokenFactoryTest.php';
}
$test_file_lookup = array_flip($test_files);
function htmlpurifier_path2class($path) {