diff --git a/docs/lexer.txt b/docs/lexer.txt index a59557ac..31b55ba7 100644 --- a/docs/lexer.txt +++ b/docs/lexer.txt @@ -2,27 +2,40 @@ Lexer The lexer parses a string of SGML-style markup and converts them into -corresponding tokens. It doesn't check for correctness, although it's +corresponding tokens. It doesn't check for well-formedness, although its internal mechanism may make this automatic (such as the case of DOMLex). We have several implementations of the Lexer: -DirectLex - our in-house implementation +DirectLex [4,5] - our in-house implementation DirectLex has absolutely no dependencies, making it a reasonably good - default for PHP4. Written with efficiency in mind, it is generally - faster than the PEAR parser, although the two are very close and usually - overlap a bit. It will support UTF-8 completely eventually. + default for PHP4. Written with efficiency in mind, it is up to two + times faster than the PEAR parser. It will support UTF-8 completely + eventually. -PEARSax3 - uses the PEAR package XML_HTMLSax3 to parse +PEARSax3 [4,5] - uses the PEAR package XML_HTMLSax3 to parse PEAR, not suprisingly, also has a SAX parser for HTML. I don't know - very much about implementation, but it's fairly well written. You need - to have PEAR added to your path to use it though. Not sure whether or - not it's UTF-8 aware. + very much about implementation, but it's fairly well written. However, that + abstraction comes at a price: performance. You need to have it installed, + and if the API changes, it might break our adapter. Not sure whether or not + it's UTF-8 aware, but it has some entity parsing trouble. -DOMLex - uses the PHP5 core extension DOM to parse +DOMLex [5] - uses the PHP5 core extension DOM to parse In PHP 5, the DOM XML extension was revamped into DOM and added to the core. It gives us a forgiving HTML parser, which we use to transform the HTML - into a DOM, and then into the tokens. 
It is extremely fast, and is the + into a DOM, and then into the tokens. It is blazingly fast, and is the default choice for PHP 5. However, entity resolution may be troublesome, - though it's UTF-8 is excellent. + though its UTF-8 is excellent. Also, any empty elements will have empty + tokens associated with them, even if this is prohibited. +We use tokens because creating a DOM representation would: + +1. Require more processing power to create, +2. Require recursion to iterate, +3. Need to be compatible with PHP 5's DOM, +4. Include the entire document structure (html and body not needed), and +5. Have an unknown readability improvement. + +What the last item means is that the functions for manipulating tokens are +already fairly compact, and when well-commented, more abstraction may not +be needed. \ No newline at end of file diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php index e974d4a6..b24416b6 100644 --- a/library/HTMLPurifier/Lexer/DirectLex.php +++ b/library/HTMLPurifier/Lexer/DirectLex.php @@ -62,32 +62,13 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer } function nextQuote($string, $offset = 0) { - $quotes = array('"', "'"); - return $this->next($string, $quotes, $offset); + $next = strcspn($string, '"\'', $offset) + $offset; + return strlen($string) == $next ? false : $next; } function nextWhiteSpace($string, $offset = 0) { - $spaces = array(chr(0x20), chr(0x9), chr(0xD), chr(0xA)); - return $this->next($string, $spaces, $offset); - } - - function next($haystack, $needles, $offset = 0) { - if (is_string($needles)) { - $string_needles = $needles; - $needles = array(); - $size = strlen($string_needles); - for ($i = 0; $i < $size; $i++) { - $needles[] = $string_needles{$i}; - } - } - $positions = array(); - foreach ($needles as $needle) { - $position = strpos($haystack, $needle, $offset); - if ($position !== false) { - $positions[] = $position; - } - } - return empty($positions) ? 
false : min($positions); + $next = strcspn($string, "\x20\x09\x0D\x0A", $offset) + $offset; + return strlen($string) == $next ? false : $next; } function tokenizeHTML($string) { diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php index 421805e9..59685eac 100644 --- a/tests/HTMLPurifier/LexerTest.php +++ b/tests/HTMLPurifier/LexerTest.php @@ -30,6 +30,7 @@ class HTMLPurifier_LexerTest extends UnitTestCase $this->assertIdentical(1, $HP->nextWhiteSpace("a\tsdf")); $this->assertIdentical(4, $HP->nextWhiteSpace("asdf\r")); $this->assertIdentical(2, $HP->nextWhiteSpace("as\t\r\nasdf as")); + $this->assertIdentical(3, $HP->nextWhiteSpace('a a ', 2)); } function test_parseData() { @@ -210,31 +211,31 @@ class HTMLPurifier_LexerTest extends UnitTestCase // internals testing function test_tokenizeAttributeString() { - $input[] = 'href="asdf" boom="assdf"'; - $expect[] = array('href'=>'asdf', 'boom'=>'assdf'); + $input[0] = 'href="asdf" boom="assdf"'; + $expect[0] = array('href'=>'asdf', 'boom'=>'assdf'); - $input[] = "href='r'"; - $expect[] = array('href'=>'r'); + $input[1] = "href='r'"; + $expect[1] = array('href'=>'r'); - $input[] = 'onclick="javascript:alert(\'asdf\');"'; - $expect[] = array('onclick' => "javascript:alert('asdf');"); + $input[2] = 'onclick="javascript:alert(\'asdf\');"'; + $expect[2] = array('onclick' => "javascript:alert('asdf');"); - $input[] = 'selected'; - $expect[] = array('selected'=>'selected'); + $input[3] = 'selected'; + $expect[3] = array('selected'=>'selected'); - $input[] = '="asdf"'; - $expect[] = array(); + $input[4] = '="asdf"'; + $expect[4] = array(); - $input[] = 'missile=launch'; - $expect[] = array('missile' => 'launch'); + $input[5] = 'missile=launch'; + $expect[5] = array('missile' => 'launch'); - $input[] = 'href="foo'; - $expect[] = array('href' => 'foo'); + $input[6] = 'href="foo'; + $expect[6] = array('href' => 'foo'); $size = count($input); for($i = 0; $i < $size; $i++) { $result = 
$this->DirectLex->tokenizeAttributeString($input[$i]); - $this->assertEqual($expect[$i], $result); + $this->assertEqual($expect[$i], $result, 'Test ' . $i . ': %s'); paintIf($result, $expect[$i] != $result); }