0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-01-05 06:01:52 +00:00

Optimize next*() functions in DirectLex, add test for offset. Update Lexer documents.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@90 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2006-07-22 18:55:34 +00:00
parent eac83995e1
commit ac1e62e043
3 changed files with 45 additions and 50 deletions

View File

@ -2,27 +2,40 @@
Lexer Lexer
The lexer parses a string of SGML-style markup and converts them into The lexer parses a string of SGML-style markup and converts them into
corresponding tokens. It doesn't check for correctness, although its corresponding tokens. It doesn't check for well-formedness, although its
internal mechanism may make this automatic (such as the case of DOMLex). internal mechanism may make this automatic (such as the case of DOMLex).
We have several implementations of the Lexer: We have several implementations of the Lexer:
DirectLex - our in-house implementation DirectLex [4,5] - our in-house implementation
DirectLex has absolutely no dependencies, making it a reasonably good DirectLex has absolutely no dependencies, making it a reasonably good
default for PHP4. Written with efficiency in mind, it is generally default for PHP4. Written with efficiency in mind, it is up to two
faster than the PEAR parser, although the two are very close and usually times faster than the PEAR parser. It will support UTF-8 completely
overlap a bit. It will support UTF-8 completely eventually. eventually.
PEARSax3 - uses the PEAR package XML_HTMLSax3 to parse PEARSax3 [4,5] - uses the PEAR package XML_HTMLSax3 to parse
PEAR, not surprisingly, also has a SAX parser for HTML. I don't know PEAR, not surprisingly, also has a SAX parser for HTML. I don't know
very much about implementation, but it's fairly well written. You need very much about implementation, but it's fairly well written. However, that
to have PEAR added to your path to use it though. Not sure whether or abstraction comes at a price: performance. You need to have it installed,
not it's UTF-8 aware. and if the API changes, it might break our adapter. Not sure whether or not
it's UTF-8 aware, but it has some entity parsing trouble.
DOMLex - uses the PHP5 core extension DOM to parse DOMLex [5] - uses the PHP5 core extension DOM to parse
In PHP 5, the DOM XML extension was revamped into DOM and added to the core. In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
It gives us a forgiving HTML parser, which we use to transform the HTML It gives us a forgiving HTML parser, which we use to transform the HTML
into a DOM, and then into the tokens. It is extremely fast, and is the into a DOM, and then into the tokens. It is blazingly fast, and is the
default choice for PHP 5. However, entity resolution may be troublesome, default choice for PHP 5. However, entity resolution may be troublesome,
though it's UTF-8 is excellent. though its UTF-8 is excellent. Also, any empty elements will have empty
tokens associated with them, even if this is prohibited.
We use tokens because creating a DOM representation would:
1. Require more processing power to create,
2. Require recursion to iterate,
3. Need to be compatible with PHP 5's DOM,
4. Carry the entire document structure (html and body not needed), and
5. Offer an unknown readability improvement.
What the last item means is that the functions for manipulating tokens are
already fairly compact, and when well-commented, more abstraction may not
be needed.

View File

@ -62,32 +62,13 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
} }
function nextQuote($string, $offset = 0) { function nextQuote($string, $offset = 0) {
$quotes = array('"', "'"); $next = strcspn($string, '"\'', $offset) + $offset;
return $this->next($string, $quotes, $offset); return strlen($string) == $next ? false : $next;
} }
function nextWhiteSpace($string, $offset = 0) { function nextWhiteSpace($string, $offset = 0) {
$spaces = array(chr(0x20), chr(0x9), chr(0xD), chr(0xA)); $next = strcspn($string, "\x20\x09\x0D\x0A", $offset) + $offset;
return $this->next($string, $spaces, $offset); return strlen($string) == $next ? false : $next;
}
function next($haystack, $needles, $offset = 0) {
if (is_string($needles)) {
$string_needles = $needles;
$needles = array();
$size = strlen($string_needles);
for ($i = 0; $i < $size; $i++) {
$needles[] = $string_needles{$i};
}
}
$positions = array();
foreach ($needles as $needle) {
$position = strpos($haystack, $needle, $offset);
if ($position !== false) {
$positions[] = $position;
}
}
return empty($positions) ? false : min($positions);
} }
function tokenizeHTML($string) { function tokenizeHTML($string) {

View File

@ -30,6 +30,7 @@ class HTMLPurifier_LexerTest extends UnitTestCase
$this->assertIdentical(1, $HP->nextWhiteSpace("a\tsdf")); $this->assertIdentical(1, $HP->nextWhiteSpace("a\tsdf"));
$this->assertIdentical(4, $HP->nextWhiteSpace("asdf\r")); $this->assertIdentical(4, $HP->nextWhiteSpace("asdf\r"));
$this->assertIdentical(2, $HP->nextWhiteSpace("as\t\r\nasdf as")); $this->assertIdentical(2, $HP->nextWhiteSpace("as\t\r\nasdf as"));
$this->assertIdentical(3, $HP->nextWhiteSpace('a a ', 2));
} }
function test_parseData() { function test_parseData() {
@ -210,31 +211,31 @@ class HTMLPurifier_LexerTest extends UnitTestCase
// internals testing // internals testing
function test_tokenizeAttributeString() { function test_tokenizeAttributeString() {
$input[] = 'href="asdf" boom="assdf"'; $input[0] = 'href="asdf" boom="assdf"';
$expect[] = array('href'=>'asdf', 'boom'=>'assdf'); $expect[0] = array('href'=>'asdf', 'boom'=>'assdf');
$input[] = "href='r'"; $input[1] = "href='r'";
$expect[] = array('href'=>'r'); $expect[1] = array('href'=>'r');
$input[] = 'onclick="javascript:alert(\'asdf\');"'; $input[2] = 'onclick="javascript:alert(\'asdf\');"';
$expect[] = array('onclick' => "javascript:alert('asdf');"); $expect[2] = array('onclick' => "javascript:alert('asdf');");
$input[] = 'selected'; $input[3] = 'selected';
$expect[] = array('selected'=>'selected'); $expect[3] = array('selected'=>'selected');
$input[] = '="asdf"'; $input[4] = '="asdf"';
$expect[] = array(); $expect[4] = array();
$input[] = 'missile=launch'; $input[5] = 'missile=launch';
$expect[] = array('missile' => 'launch'); $expect[5] = array('missile' => 'launch');
$input[] = 'href="foo'; $input[6] = 'href="foo';
$expect[] = array('href' => 'foo'); $expect[6] = array('href' => 'foo');
$size = count($input); $size = count($input);
for($i = 0; $i < $size; $i++) { for($i = 0; $i < $size; $i++) {
$result = $this->DirectLex->tokenizeAttributeString($input[$i]); $result = $this->DirectLex->tokenizeAttributeString($input[$i]);
$this->assertEqual($expect[$i], $result); $this->assertEqual($expect[$i], $result, 'Test ' . $i . ': %s');
paintIf($result, $expect[$i] != $result); paintIf($result, $expect[$i] != $result);
} }