Commit various optimizations to the Lexer, and add stub file for profiling the lexer.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@92 48356398-32a2-884e-a903-53898d9a118a
2025-01-03 13:21:51 +00:00 · 2006-07-22 22:48:07 +00:00 · 2006-07-22 22:48:07 +00:00 · ca1aefe271
commit ca1aefe271
parent de5ab5e6a0
5 changed files with 141 additions and 92 deletions
--- a/benchmarks/ProfileDirectLex.php
+++ b/benchmarks/ProfileDirectLex.php
@ -0,0 +1,14 @@
 <?php
 // Profiling stub for HTMLPurifier_Lexer_DirectLex: loads one sample HTML
 // document and tokenizes it repeatedly.
 // NOTE(review): the relative paths ('../library/', 'samples/Lexer/4.html')
 // presumably expect the script to be run from the benchmarks/ directory — confirm.
 set_include_path(get_include_path() . PATH_SEPARATOR . '../library/');
 require_once 'HTMLPurifier/Lexer/DirectLex.php';
 // Read the sample once, outside the loop, so only lexing is repeated.
 $input = file_get_contents('samples/Lexer/4.html');
 $lexer = new HTMLPurifier_Lexer_DirectLex();
 // Lex the same input 10 times; $tokens is overwritten each pass and never read.
 for ($i = 0; $i < 10; $i++) {
    $tokens = $lexer->tokenizeHTML($input);
 }
 ?>
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@ -1,7 +1,7 @@
 <?php
 /**
- * Forgivingly lexes SGML style documents: HTML, XML, XHTML, etc.
+ * Forgivingly lexes HTML (not XML, since this lexer doesn't adhere to the XML spec exactly)
 */
 require_once 'HTMLPurifier/Token.php';
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@ -190,7 +190,14 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                            $segment, $position_first_space
                        )
                    );
-                $attributes = $this->tokenizeAttributeString($attribute_string);
+                if ($attribute_string) {
                    $attributes = $this->tokenizeAttributeString(
                                        $attribute_string
                                  );
                } else {
                    $attributes = array();
                }
                if ($is_self_closing) {
                    $array[] = new HTMLPurifier_Token_Empty($type, $attributes);
                } else {
@ -216,13 +223,47 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
    }
    function tokenizeAttributeString($string) {
-        $string = (string) $string;
+        $string = (string) $string; // quick typecast
-        if ($string == '') return array();
+        
-        $array = array();
+        if ($string == '') return array(); // no attributes
-        $cursor = 0;
+        
-        $in_value = false;
+        // let's see if we can abort as quickly as possible
-        $i = 0;
+        // one equal sign, no spaces => one attribute
-        $size = strlen($string);
+        $num_equal = substr_count($string, '=');
        $has_space = strpos($string, ' ');
        if ($num_equal === 0 && !$has_space) {
            // bool attribute
            return array($string => $string);
        } elseif ($num_equal === 1 && !$has_space) {
            // only one attribute
            list($key, $quoted_value) = explode('=', $string);
            $quoted_value = trim($quoted_value);
            if (!$key) return array();
            if (!$quoted_value) return array($key => '');
            $first_char = @$quoted_value[0];
            $last_char  = @$quoted_value[strlen($quoted_value)-1];
            $same_quote = ($first_char == $last_char);
            $open_quote = ($first_char == '"' || $first_char == "'");
            if ( $same_quote && $open_quote) {
                // well behaved
                $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
            } else {
                // not well behaved
                if ($open_quote) {
                    $value = substr($quoted_value, 1);
                } else {
                    $value = $quoted_value;
                }
            }
            return array($key => $value);
        }
        // setup loop environment
        $array  = array(); // return assoc array of attributes
        $cursor = 0; // current position in string (moves forward)
        $size   = strlen($string); // size of the string (stays the same)
        // if we have unquoted attributes, the parser expects a terminating
        // space, so let's guarantee that there's always a terminating space.
@ -234,88 +275,75 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
        while(true) {
            // infinite loop protection
            // if we've looped 1000 times, abort. Nothing good can come of this.
            if (++$loops > 1000) return array();
            if ($cursor >= $size) {
                break;
            }
            $cursor += ($value = strspn($string, "\x20\x09\x0D\x0A", $cursor));
            $position_next_space = $this->nextWhiteSpace($string, $cursor);
            //scroll to the last whitespace before text
            while ($position_next_space === $cursor) {
                $cursor++;
                $position_next_space = $this->nextWhiteSpace($string, $cursor);
            }
            $position_next_equal = strpos($string, '=', $cursor);
-            if ($position_next_equal !== false &&
+            
                 ($position_next_equal < $position_next_space ||
                  $position_next_space === false)) {
                //attr="asdf"
            // grab the key
                $key = trim(
                    substr(
                        $string, $cursor, $position_next_equal - $cursor
                    )
                );
-                // set cursor right after the equal sign
+            $key_begin = $cursor; //we're currently at the start of the key
                $cursor = $position_next_equal + 1;
-                // consume all spaces after the equal sign
+            // scroll past all characters that are the key (not whitespace or =)
-                $position_next_space = $this->nextWhiteSpace($string, $cursor);
+            $cursor += strcspn($string, "\x20\x09\x0D\x0A=", $cursor);
-                while ($position_next_space === $cursor) {
+            
-                    $cursor++;
+            $key_end = $cursor; // now at the end of the key
-                    $position_next_space=$this->nextWhiteSpace($string,$cursor);
+            
-                }
+            $key = substr($string, $key_begin, $key_end - $key_begin);
            if (!$key) continue; // empty key
            // scroll past all whitespace
            $cursor += strspn($string, "\x20\x09\x0D\x0A", $cursor);
                // if we've hit the end, assign the key an empty value and abort
            if ($cursor >= $size) {
-                    $array[$key] = '';
+                $array[$key] = $key;
                break;
            }
-                // find the next quote
+            // if the next character is an equal sign, we've got a regular
-                $position_next_quote = $this->nextQuote($string, $cursor);
+            // pair, otherwise, it's a bool attribute
            $first_char = @$string[$cursor];
-                // if the quote is not where the cursor is, we're dealing
+            if ($first_char == '=') {
-                // with an unquoted attribute
+                // key="value"
                if ($position_next_quote !== $cursor) {
                    if ($key) {
                        $array[$key] = trim(substr($string, $cursor,
                          $position_next_space - $cursor));
                    }
                    $cursor = $position_next_space + 1;
                    continue;
                }
-                // otherwise, regular attribute
+                $cursor++;
-                $quote = $string{$position_next_quote};
+                $cursor += strspn($string, "\x20\x09\x0D\x0A", $cursor);
                $position_end_quote = strpos(
                    $string, $quote, $position_next_quote + 1
                );
-                // check if the ending quote is missing
+                // we might be in front of a quote right now
                if ($position_end_quote === false) {
                    // it is, assign it to the end of the string
                    $position_end_quote = $size;
                }
-                $value = substr($string, $position_next_quote + 1,
+                $char = @$string[$cursor];
-                  $position_end_quote - $position_next_quote - 1);
+                
-                if ($key) {
+                if ($char == '"' || $char == "'") {
-                    $array[$key] = html_entity_decode($value, ENT_QUOTES);
+                    // it's quoted, end bound is $char
-                }
+                    $cursor++;
-                $cursor = $position_end_quote + 1;
+                    $value_begin = $cursor;
                    $cursor = strpos($string, $char, $cursor);
                    $value_end = $cursor;
                } else {
-                //boolattr
+                    // it's not quoted, end bound is whitespace
-                if ($position_next_space === false) {
+                    $value_begin = $cursor;
-                    $position_next_space = $size;
+                    $cursor += strcspn($string, "\x20\x09\x0D\x0A", $cursor);
                    $value_end = $cursor;
                }
-                $key = substr($string, $cursor, $position_next_space - $cursor);
+                
-                if ($key) {
+                $value = substr($string, $value_begin, $value_end - $value_begin);
                $array[$key] = $value;
                $cursor++;
            } else {
                // boolattr
                if ($key !== '') {
                    $array[$key] = $key;
                }
-                $cursor = $position_next_space + 1;
+                
            }
        }
        return $array;
--- a/library/HTMLPurifier/Token.php
+++ b/library/HTMLPurifier/Token.php
@ -8,36 +8,26 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
 {
    var $is_tag = true;
    var $name;
    function HTMLPurifier_Token_Tag($name) {
        // watch out, actually XML is case-sensitive, while HTML
        // is case insensitive, which means we can't use this for XML
        $this->name = strtolower($name); // for some reason, the SAX parser
                                         // uses uppercase. Investigate?
    }
 }
 // a rich tag has attributes
 class HTMLPurifier_Token_RichTag extends HTMLPurifier_Token_Tag // abstract
 {
    var $attributes = array();
-    function HTMLPurifier_Token_RichTag($name, $attributes = array()) {
+    function HTMLPurifier_Token_Tag($name, $attributes = array()) {
-        $this->HTMLPurifier_Token_Tag($name);
+        $this->name = ctype_lower($name) ? $name : strtolower($name);
        $this->attributes = $attributes;
    }
 }
 // start CONCRETE ones
-class HTMLPurifier_Token_Start extends HTMLPurifier_Token_RichTag
+class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
 {
    var $type = 'start';
 }
-class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_RichTag
+class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
 {
    var $type = 'empty';
 }
 // accepts attributes even though it really can't, for optimization reasons
 class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
 {
    var $type = 'end';
@ -51,7 +41,7 @@ class HTMLPurifier_Token_Text extends HTMLPurifier_Token
    var $is_whitespace = false;
    function HTMLPurifier_Token_Text($data) {
        $this->data = $data;
-        if (trim($data, " \n\r\t") === '') $this->is_whitespace = true;
+        if (ctype_space($data)) $this->is_whitespace = true;
    }
    function append($text) {
        return new HTMLPurifier_Token_Text($this->data . $text->data);
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@ -153,13 +153,18 @@ class HTMLPurifier_LexerTest extends UnitTestCase
        // [SGML-INVALID]
        $input[10] = '<a "=>';
        // We barf on this, aim for no attributes
        $expect[10] = array(
            new HTMLPurifier_Token_Start('a', array('"' => ''))
            );
-        // DOM doesn't register an invalid attribute
+        // DOM correctly has no attributes, but also closes the tag
        $dom_expect[10] = array(
            new HTMLPurifier_Token_Empty('a')
            );
        // SAX barfs on this
        $sax_expect[10] = array(
            new HTMLPurifier_Token_Start('a', array('"' => ''))
            );
        // [INVALID] [RECOVERABLE]
        $input[11] = '"';
@ -232,6 +237,18 @@ class HTMLPurifier_LexerTest extends UnitTestCase
        $input[6] = 'href="foo';
        $expect[6] = array('href' => 'foo');
        $input[7] = '"=';
        $expect[7] = array('"' => '');
        //           0123456789012345678901234567890123
        $input[8] = 'href ="about:blank"rel ="nofollow"';
        $expect[8] = array('href' => 'about:blank', 'rel' => 'nofollow');
        $input[9] = 'foo bar';
        $expect[9] = array('foo' => 'foo', 'bar' => 'bar');
        $input[10] = 'foo="bar" blue';
        $expect[10] = array('foo' => 'bar', 'blue' => 'blue');
        $size = count($input);
        for($i = 0; $i < $size; $i++) {
            $result = $this->DirectLex->tokenizeAttributeString($input[$i]);