mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-01-03 05:11:52 +00:00
Optimize next*() functions in DirectLex, add test for offset. Update Lexer documents.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@90 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
parent
eac83995e1
commit
ac1e62e043
@ -2,27 +2,40 @@
|
|||||||
Lexer
|
Lexer
|
||||||
|
|
||||||
The lexer parses a string of SGML-style markup and converts them into
|
The lexer parses a string of SGML-style markup and converts them into
|
||||||
corresponding tokens. It doesn't check for correctness, although its
|
corresponding tokens. It doesn't check for well-formedness, although its
|
||||||
internal mechanism may make this automatic (such as the case of DOMLex).
|
internal mechanism may make this automatic (such as the case of DOMLex).
|
||||||
|
|
||||||
We have several implementations of the Lexer:
|
We have several implementations of the Lexer:
|
||||||
|
|
||||||
DirectLex - our in-house implementation
|
DirectLex [4,5] - our in-house implementation
|
||||||
DirectLex has absolutely no dependencies, making it a reasonably good
|
DirectLex has absolutely no dependencies, making it a reasonably good
|
||||||
default for PHP4. Written with efficiency in mind, it is generally
|
default for PHP4. Written with efficiency in mind, it is up to two
|
||||||
faster than the PEAR parser, although the two are very close and usually
|
times faster than the PEAR parser. It will support UTF-8 completely
|
||||||
overlap a bit. It will support UTF-8 completely eventually.
|
eventually.
|
||||||
|
|
||||||
PEARSax3 - uses the PEAR package XML_HTMLSax3 to parse
|
PEARSax3 [4,5] - uses the PEAR package XML_HTMLSax3 to parse
|
||||||
PEAR, not surprisingly, also has a SAX parser for HTML. I don't know
|
PEAR, not surprisingly, also has a SAX parser for HTML. I don't know
|
||||||
very much about implementation, but it's fairly well written. You need
|
very much about implementation, but it's fairly well written. However, that
|
||||||
to have PEAR added to your path to use it though. Not sure whether or
|
abstraction comes at a price: performance. You need to have it installed,
|
||||||
not it's UTF-8 aware.
|
and if the API changes, it might break our adapter. Not sure whether or not
|
||||||
|
it's UTF-8 aware, but it has some entity parsing trouble.
|
||||||
|
|
||||||
DOMLex - uses the PHP5 core extension DOM to parse
|
DOMLex [5] - uses the PHP5 core extension DOM to parse
|
||||||
In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
|
In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
|
||||||
It gives us a forgiving HTML parser, which we use to transform the HTML
|
It gives us a forgiving HTML parser, which we use to transform the HTML
|
||||||
into a DOM, and then into the tokens. It is extremely fast, and is the
|
into a DOM, and then into the tokens. It is blazingly fast, and is the
|
||||||
default choice for PHP 5. However, entity resolution may be troublesome,
|
default choice for PHP 5. However, entity resolution may be troublesome,
|
||||||
though it's UTF-8 is excellent.
|
though its UTF-8 is excellent. Also, any empty elements will have empty
|
||||||
|
tokens associated with them, even if this is prohibited.
|
||||||
|
|
||||||
|
We use tokens because creating a DOM representation would:
|
||||||
|
|
||||||
|
1. Require more processing power to create,
|
||||||
|
2. Require recursion to iterate,
|
||||||
|
3. Need to be compatible with PHP 5's DOM,
|
||||||
|
4. Include the entire document structure (html and body not needed), and
|
||||||
|
5. Offer an unknown readability improvement.
|
||||||
|
|
||||||
|
What the last item means is that the functions for manipulating tokens are
|
||||||
|
already fairly compact, and when well-commented, more abstraction may not
|
||||||
|
be needed.
|
@ -62,32 +62,13 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
}
|
}
|
||||||
|
|
||||||
function nextQuote($string, $offset = 0) {
|
function nextQuote($string, $offset = 0) {
|
||||||
$quotes = array('"', "'");
|
$next = strcspn($string, '"\'', $offset) + $offset;
|
||||||
return $this->next($string, $quotes, $offset);
|
return strlen($string) == $next ? false : $next;
|
||||||
}
|
}
|
||||||
|
|
||||||
function nextWhiteSpace($string, $offset = 0) {
|
function nextWhiteSpace($string, $offset = 0) {
|
||||||
$spaces = array(chr(0x20), chr(0x9), chr(0xD), chr(0xA));
|
$next = strcspn($string, "\x20\x09\x0D\x0A", $offset) + $offset;
|
||||||
return $this->next($string, $spaces, $offset);
|
return strlen($string) == $next ? false : $next;
|
||||||
}
|
|
||||||
|
|
||||||
function next($haystack, $needles, $offset = 0) {
|
|
||||||
if (is_string($needles)) {
|
|
||||||
$string_needles = $needles;
|
|
||||||
$needles = array();
|
|
||||||
$size = strlen($string_needles);
|
|
||||||
for ($i = 0; $i < $size; $i++) {
|
|
||||||
$needles[] = $string_needles{$i};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
$positions = array();
|
|
||||||
foreach ($needles as $needle) {
|
|
||||||
$position = strpos($haystack, $needle, $offset);
|
|
||||||
if ($position !== false) {
|
|
||||||
$positions[] = $position;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return empty($positions) ? false : min($positions);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function tokenizeHTML($string) {
|
function tokenizeHTML($string) {
|
||||||
|
@ -30,6 +30,7 @@ class HTMLPurifier_LexerTest extends UnitTestCase
|
|||||||
$this->assertIdentical(1, $HP->nextWhiteSpace("a\tsdf"));
|
$this->assertIdentical(1, $HP->nextWhiteSpace("a\tsdf"));
|
||||||
$this->assertIdentical(4, $HP->nextWhiteSpace("asdf\r"));
|
$this->assertIdentical(4, $HP->nextWhiteSpace("asdf\r"));
|
||||||
$this->assertIdentical(2, $HP->nextWhiteSpace("as\t\r\nasdf as"));
|
$this->assertIdentical(2, $HP->nextWhiteSpace("as\t\r\nasdf as"));
|
||||||
|
$this->assertIdentical(3, $HP->nextWhiteSpace('a a ', 2));
|
||||||
}
|
}
|
||||||
|
|
||||||
function test_parseData() {
|
function test_parseData() {
|
||||||
@ -210,31 +211,31 @@ class HTMLPurifier_LexerTest extends UnitTestCase
|
|||||||
// internals testing
|
// internals testing
|
||||||
function test_tokenizeAttributeString() {
|
function test_tokenizeAttributeString() {
|
||||||
|
|
||||||
$input[] = 'href="asdf" boom="assdf"';
|
$input[0] = 'href="asdf" boom="assdf"';
|
||||||
$expect[] = array('href'=>'asdf', 'boom'=>'assdf');
|
$expect[0] = array('href'=>'asdf', 'boom'=>'assdf');
|
||||||
|
|
||||||
$input[] = "href='r'";
|
$input[1] = "href='r'";
|
||||||
$expect[] = array('href'=>'r');
|
$expect[1] = array('href'=>'r');
|
||||||
|
|
||||||
$input[] = 'onclick="javascript:alert(\'asdf\');"';
|
$input[2] = 'onclick="javascript:alert(\'asdf\');"';
|
||||||
$expect[] = array('onclick' => "javascript:alert('asdf');");
|
$expect[2] = array('onclick' => "javascript:alert('asdf');");
|
||||||
|
|
||||||
$input[] = 'selected';
|
$input[3] = 'selected';
|
||||||
$expect[] = array('selected'=>'selected');
|
$expect[3] = array('selected'=>'selected');
|
||||||
|
|
||||||
$input[] = '="asdf"';
|
$input[4] = '="asdf"';
|
||||||
$expect[] = array();
|
$expect[4] = array();
|
||||||
|
|
||||||
$input[] = 'missile=launch';
|
$input[5] = 'missile=launch';
|
||||||
$expect[] = array('missile' => 'launch');
|
$expect[5] = array('missile' => 'launch');
|
||||||
|
|
||||||
$input[] = 'href="foo';
|
$input[6] = 'href="foo';
|
||||||
$expect[] = array('href' => 'foo');
|
$expect[6] = array('href' => 'foo');
|
||||||
|
|
||||||
$size = count($input);
|
$size = count($input);
|
||||||
for($i = 0; $i < $size; $i++) {
|
for($i = 0; $i < $size; $i++) {
|
||||||
$result = $this->DirectLex->tokenizeAttributeString($input[$i]);
|
$result = $this->DirectLex->tokenizeAttributeString($input[$i]);
|
||||||
$this->assertEqual($expect[$i], $result);
|
$this->assertEqual($expect[$i], $result, 'Test ' . $i . ': %s');
|
||||||
paintIf($result, $expect[$i] != $result);
|
paintIf($result, $expect[$i] != $result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user