From ac1e62e0432621ff35121df9d6348cec22797ddb Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <edwardzyang@thewritingpot.com>
Date: Sat, 22 Jul 2006 18:55:34 +0000
Subject: [PATCH] Optimize next*() functions in DirectLex, add test for offset.
 Update Lexer documents.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@90 48356398-32a2-884e-a903-53898d9a118a
---
 docs/lexer.txt                           | 37 ++++++++++++++++--------
 library/HTMLPurifier/Lexer/DirectLex.php | 27 +++--------------
 tests/HTMLPurifier/LexerTest.php         | 31 ++++++++++----------
 3 files changed, 45 insertions(+), 50 deletions(-)

diff --git a/docs/lexer.txt b/docs/lexer.txt
index a59557ac..31b55ba7 100644
--- a/docs/lexer.txt
+++ b/docs/lexer.txt
@@ -2,27 +2,40 @@
 Lexer
 
 The lexer parses a string of SGML-style markup and converts them into
-corresponding tokens. It doesn't check for correctness, although it's
+corresponding tokens. It doesn't check for well-formedness, although its
 internal mechanism may make this automatic (such as the case of DOMLex).
 
 We have several implementations of the Lexer:
 
-DirectLex - our in-house implementation
+DirectLex [4,5] - our in-house implementation
     DirectLex has absolutely no dependencies, making it a reasonably good
-    default for PHP4.  Written with efficiency in mind, it is generally
-    faster than the PEAR parser, although the two are very close and usually
-    overlap a bit.  It will support UTF-8 completely eventually.
+    default for PHP4.  Written with efficiency in mind, it is up to two
+    times faster than the PEAR parser.  It will support UTF-8 completely
+    eventually.
 
-PEARSax3 - uses the PEAR package XML_HTMLSax3 to parse
+PEARSax3 [4,5] - uses the PEAR package XML_HTMLSax3 to parse
     PEAR, not suprisingly, also has a SAX parser for HTML.  I don't know
-    very much about implementation, but it's fairly well written.  You need
-    to have PEAR added to your path to use it though.  Not sure whether or
-    not it's UTF-8 aware.
+    very much about implementation, but it's fairly well written.  However, its
+    SAX abstraction comes at a price: performance. You need to have it installed,
+    and if the API changes, it might break our adapter. Not sure whether or not
+    it's UTF-8 aware, but it has some entity parsing trouble.
 
-DOMLex - uses the PHP5 core extension DOM to parse
+DOMLex [5] - uses the PHP5 core extension DOM to parse
     In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
     It gives us a forgiving HTML parser, which we use to transform the HTML
-    into a DOM, and then into the tokens.  It is extremely fast, and is the
+    into a DOM, and then into the tokens.  It is blazingly fast, and is the
     default choice for PHP 5.  However, entity resolution may be troublesome,
-    though it's UTF-8 is excellent.
+    though its UTF-8 support is excellent.  Also, any empty elements will have
+    empty tokens associated with them, even if this is prohibited.
 
+We use tokens because creating a DOM representation would:
+
+1. Require more processing power to create,
+2. Require recursion to iterate,
+3. Need to be compatible with PHP 5's DOM,
+4. Include the entire document structure (html and body not needed), and
+5. Offer an unknown readability improvement.
+
+What the last item means is that the functions for manipulating tokens are
+already fairly compact, and when well-commented, more abstraction may not
+be needed.
\ No newline at end of file
diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php
index e974d4a6..b24416b6 100644
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@@ -62,32 +62,13 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
     }
     
     function nextQuote($string, $offset = 0) {
-        $quotes = array('"', "'");
-        return $this->next($string, $quotes, $offset);
+        $next = strcspn($string, '"\'', $offset) + $offset;
+        return strlen($string) == $next ? false : $next;
     }
     
     function nextWhiteSpace($string, $offset = 0) {
-        $spaces = array(chr(0x20), chr(0x9), chr(0xD), chr(0xA));
-        return $this->next($string, $spaces, $offset);
-    }
-    
-    function next($haystack, $needles, $offset = 0) {
-        if (is_string($needles)) {
-            $string_needles = $needles;
-            $needles = array();
-            $size = strlen($string_needles);
-            for ($i = 0; $i < $size; $i++) {
-                $needles[] = $string_needles{$i};
-            }
-        }
-        $positions = array();
-        foreach ($needles as $needle) {
-            $position = strpos($haystack, $needle, $offset);
-            if ($position !== false) {
-                $positions[] = $position;
-            }
-        }
-        return empty($positions) ? false : min($positions);
+        $next = strcspn($string, "\x20\x09\x0D\x0A", $offset) + $offset;
+        return strlen($string) == $next ? false : $next;
     }
     
     function tokenizeHTML($string) {
diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php
index 421805e9..59685eac 100644
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@@ -30,6 +30,7 @@ class HTMLPurifier_LexerTest extends UnitTestCase
         $this->assertIdentical(1, $HP->nextWhiteSpace("a\tsdf"));
         $this->assertIdentical(4, $HP->nextWhiteSpace("asdf\r"));
         $this->assertIdentical(2, $HP->nextWhiteSpace("as\t\r\nasdf as"));
+        $this->assertIdentical(3, $HP->nextWhiteSpace('a a ', 2));
     }
     
     function test_parseData() {
@@ -210,31 +211,31 @@ class HTMLPurifier_LexerTest extends UnitTestCase
     // internals testing
     function test_tokenizeAttributeString() {
         
-        $input[] = 'href="asdf" boom="assdf"';
-        $expect[] = array('href'=>'asdf', 'boom'=>'assdf');
+        $input[0] = 'href="asdf" boom="assdf"';
+        $expect[0] = array('href'=>'asdf', 'boom'=>'assdf');
         
-        $input[] = "href='r'";
-        $expect[] = array('href'=>'r');
+        $input[1] = "href='r'";
+        $expect[1] = array('href'=>'r');
         
-        $input[] = 'onclick="javascript:alert(\'asdf\');"';
-        $expect[] = array('onclick' => "javascript:alert('asdf');");
+        $input[2] = 'onclick="javascript:alert(\'asdf\');"';
+        $expect[2] = array('onclick' => "javascript:alert('asdf');");
         
-        $input[] = 'selected';
-        $expect[] = array('selected'=>'selected');
+        $input[3] = 'selected';
+        $expect[3] = array('selected'=>'selected');
         
-        $input[] = '="asdf"';
-        $expect[] = array();
+        $input[4] = '="asdf"';
+        $expect[4] = array();
         
-        $input[] = 'missile=launch';
-        $expect[] = array('missile' => 'launch');
+        $input[5] = 'missile=launch';
+        $expect[5] = array('missile' => 'launch');
         
-        $input[] = 'href="foo';
-        $expect[] = array('href' => 'foo');
+        $input[6] = 'href="foo';
+        $expect[6] = array('href' => 'foo');
         
         $size = count($input);
         for($i = 0; $i < $size; $i++) {
             $result = $this->DirectLex->tokenizeAttributeString($input[$i]);
-            $this->assertEqual($expect[$i], $result);
+            $this->assertEqual($expect[$i], $result, 'Test ' . $i . ': %s');
             paintIf($result, $expect[$i] != $result);
         }