From acd7ceb9408c5a42d088473e9e650a127edf2e86 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <edwardzyang@thewritingpot.com>
Date: Tue, 15 Aug 2006 20:19:16 +0000
Subject: [PATCH] Major optimization of tokenizeDOM(): reduce execution time
 from 75% to 20% by passing tokens by reference and using a token factory.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@265 48356398-32a2-884e-a903-53898d9a118a
---
 library/HTMLPurifier/Lexer/DOMLex.php   | 32 ++++++++++------
 library/HTMLPurifier/Token.php          |  1 +
 library/HTMLPurifier/TokenFactory.php   | 51 +++++++++++++++++++++++++
 tests/HTMLPurifier/TokenFactoryTest.php | 19 +++++++++
 tests/index.php                         |  4 ++
 5 files changed, 95 insertions(+), 12 deletions(-)
 create mode 100644 library/HTMLPurifier/TokenFactory.php
 create mode 100644 tests/HTMLPurifier/TokenFactoryTest.php

diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php
index 0df13ae5..07b12d22 100644
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@@ -1,6 +1,7 @@
 <?php
 
 require_once 'HTMLPurifier/Lexer.php';
+require_once 'HTMLPurifier/TokenFactory.php';
 
 /**
  * Parser that uses PHP 5's DOM extension (part of the core).
@@ -25,6 +26,13 @@ require_once 'HTMLPurifier/Lexer.php';
 class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
 {
     
+    private $factory;
+    
+    public function __construct() {
+        // set up the token factory that tokenizeDOM() uses to create its tokens
+        $this->factory = new HTMLPurifier_TokenFactory();
+    }
+    
     public function tokenizeHTML($string, $config = null) {
         if (!$config) $config = HTMLPurifier_Config::createDefault();
         
@@ -50,10 +58,12 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
         
         @$doc->loadHTML($string); // mute all errors, handle it transparently
         
-        return $this->tokenizeDOM(
+        $tokens = array();
+        $this->tokenizeDOM(
             $doc->childNodes->item(1)-> // html
                   getElementsByTagName('body')->item(0) // body
-            );
+            , $tokens);
+        return $tokens;
     }
     
     /**
@@ -66,33 +76,33 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
      *                  tag you're dealing with.
      * @returns Tokens of node appended to previously passed tokens.
      */
-    protected function tokenizeDOM($node, $tokens = array(), $collect = false) {
+    protected function tokenizeDOM($node, &$tokens, $collect = false) {
         // recursive goodness!
         
         // intercept non element nodes
         
         if ( !($node instanceof DOMElement) ) {
             if ($node instanceof DOMComment) {
-                $tokens[] = new HTMLPurifier_Token_Comment($node->data);
+                $tokens[] = $this->factory->createComment($node->data);
             } elseif ($node instanceof DOMText ||
                       $node instanceof DOMCharacterData) {
-                $tokens[] = new HTMLPurifier_Token_Text($node->data);
+                $tokens[] = $this->factory->createText($node->data);
             }
             // quite possibly, the object wasn't handled, that's fine
-            return $tokens;
+            return;
         }
         
         // We still have to make sure that the element actually IS empty
         if (!$node->hasChildNodes()) {
             if ($collect) {
-                $tokens[] = new HTMLPurifier_Token_Empty(
+                $tokens[] = $this->factory->createEmpty(
                     $node->tagName,
                     $this->transformAttrToAssoc($node->attributes)
                 );
             }
         } else {
             if ($collect) { // don't wrap on first iteration
-                $tokens[] = new HTMLPurifier_Token_Start(
+                $tokens[] = $this->factory->createStart(
                     $tag_name = $node->tagName, // somehow, it get's dropped
                     $this->transformAttrToAssoc($node->attributes)
                 );
@@ -100,15 +110,13 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
             foreach ($node->childNodes as $node) {
                 // remember, it's an accumulator. Otherwise, we'd have
                 // to use array_merge
-                $tokens = $this->tokenizeDOM($node, $tokens, true);
+                $this->tokenizeDOM($node, $tokens, true);
             }
             if ($collect) {
-                $tokens[] = new HTMLPurifier_Token_End($tag_name);
+                $tokens[] = $this->factory->createEnd($tag_name);
             }
         }
         
-        return $tokens;
-        
     }
     
     /**
diff --git a/library/HTMLPurifier/Token.php b/library/HTMLPurifier/Token.php
index ed46621b..f53743b9 100644
--- a/library/HTMLPurifier/Token.php
+++ b/library/HTMLPurifier/Token.php
@@ -51,6 +51,7 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
      * @param $attributes   Associative array of attributes.
      */
     function HTMLPurifier_Token_Tag($name, $attributes = array()) {
+        //if ($attributes === null) var_dump(debug_backtrace());
         $this->name = ctype_lower($name) ? $name : strtolower($name);
         foreach ($attributes as $key => $value) {
             // normalization only necessary when key is not lowercase
diff --git a/library/HTMLPurifier/TokenFactory.php b/library/HTMLPurifier/TokenFactory.php
new file mode 100644
index 00000000..8c761fbb
--- /dev/null
+++ b/library/HTMLPurifier/TokenFactory.php
@@ -0,0 +1,51 @@
+<?php
+
+require_once 'HTMLPurifier/Token.php';
+
+class HTMLPurifier_TokenFactory // creates tokens by cloning prototype objects instead of using "new"
+{
+    
+    // p stands for prototype: one blank token per token class, built once in the constructor
+    private $p_start, $p_end, $p_empty, $p_text, $p_comment;
+    
+    public function __construct() { // builds the prototypes that the create*() methods clone
+        $this->p_start  = new HTMLPurifier_Token_Start('', array());
+        $this->p_end    = new HTMLPurifier_Token_End('');
+        $this->p_empty  = new HTMLPurifier_Token_Empty('', array());
+        $this->p_text   = new HTMLPurifier_Token_Text('');
+        $this->p_comment= new HTMLPurifier_Token_Comment('');
+    }
+    
+    public function createStart($name, $attributes = array()) { // returns a start-tag token for $name
+        $p = clone $this->p_start; // copy the prototype rather than instantiating a new object
+        $p->HTMLPurifier_Token_Tag($name, $attributes); // PHP 4-style constructor, called as a plain method to fill in the copy
+        return $p;
+    }
+    
+    public function createEnd($name) { // returns an end-tag token for $name
+        $p = clone $this->p_end;
+        $p->HTMLPurifier_Token_Tag($name); // no attributes passed, so the constructor's array() default applies
+        return $p;
+    }
+    
+    public function createEmpty($name, $attributes = array()) { // returns an empty-element token for $name
+        $p = clone $this->p_empty;
+        $p->HTMLPurifier_Token_Tag($name, $attributes); // same initialiser as createStart()
+        return $p;
+    }
+    
+    public function createText($data) { // returns a text token carrying $data
+        $p = clone $this->p_text;
+        $p->HTMLPurifier_Token_Text($data); // presumably the PHP 4-style constructor from Token.php (not shown here) - confirm
+        return $p;
+    }
+    
+    public function createComment($data) { // returns a comment token carrying $data
+        $p = clone $this->p_comment;
+        $p->HTMLPurifier_Token_Comment($data); // presumably the PHP 4-style constructor from Token.php (not shown here) - confirm
+        return $p;
+    }
+    
+}
+
+?>
\ No newline at end of file
diff --git a/tests/HTMLPurifier/TokenFactoryTest.php b/tests/HTMLPurifier/TokenFactoryTest.php
new file mode 100644
index 00000000..9995ef74
--- /dev/null
+++ b/tests/HTMLPurifier/TokenFactoryTest.php
@@ -0,0 +1,19 @@
+<?php
+
+require_once 'HTMLPurifier/TokenFactory.php';
+
+class HTMLPurifier_TokenFactoryTest extends UnitTestCase // checks factory output against directly constructed tokens
+{
+    public function test() { // only createStart() is exercised here
+        
+        $factory = new HTMLPurifier_TokenFactory();
+        
+        $regular = new HTMLPurifier_Token_Start('a', array('href' => 'about:blank')); // reference token built with "new"
+        $generated = $factory->createStart('a', array('href' => 'about:blank')); // same name and attributes via the factory
+        
+        $this->assertEqual($regular, $generated); // the cloned token is expected to equal the constructed one
+        
+    }
+}
+
+?>
\ No newline at end of file
diff --git a/tests/index.php b/tests/index.php
index ae6566ef..8aab3589 100644
--- a/tests/index.php
+++ b/tests/index.php
@@ -79,6 +79,10 @@ $test_files[] = 'AttrTransform/ImgRequiredTest.php';
 $test_files[] = 'URISchemeRegistryTest.php';
 $test_files[] = 'URISchemeTest.php';
 
+if (version_compare(PHP_VERSION, '5', '>=')) { // TokenFactory.php uses PHP 5-only syntax (clone, private/public members)
+    $test_files[] = 'TokenFactoryTest.php';
+}
+
 $test_file_lookup = array_flip($test_files);
 
 function htmlpurifier_path2class($path) {