mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2024-12-22 08:21:52 +00:00
Commit various optimizations to the Lexer, and add stub file for profiling the lexer.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@92 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
parent
de5ab5e6a0
commit
ca1aefe271
14
benchmarks/ProfileDirectLex.php
Normal file
14
benchmarks/ProfileDirectLex.php
Normal file
@ -0,0 +1,14 @@
|
||||
<?php
|
||||
|
||||
set_include_path(get_include_path() . PATH_SEPARATOR . '../library/');
|
||||
|
||||
require_once 'HTMLPurifier/Lexer/DirectLex.php';
|
||||
|
||||
$input = file_get_contents('samples/Lexer/4.html');
|
||||
$lexer = new HTMLPurifier_Lexer_DirectLex();
|
||||
|
||||
for ($i = 0; $i < 10; $i++) {
|
||||
$tokens = $lexer->tokenizeHTML($input);
|
||||
}
|
||||
|
||||
?>
|
@ -1,7 +1,7 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Forgivingly lexes SGML style documents: HTML, XML, XHTML, etc.
|
||||
* Forgivingly lexes HTML (not XML, since it doesn't adhere to spec exactly)
|
||||
*/
|
||||
|
||||
require_once 'HTMLPurifier/Token.php';
|
||||
|
@ -190,7 +190,14 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
$segment, $position_first_space
|
||||
)
|
||||
);
|
||||
$attributes = $this->tokenizeAttributeString($attribute_string);
|
||||
if ($attribute_string) {
|
||||
$attributes = $this->tokenizeAttributeString(
|
||||
$attribute_string
|
||||
);
|
||||
} else {
|
||||
$attributes = array();
|
||||
}
|
||||
|
||||
if ($is_self_closing) {
|
||||
$array[] = new HTMLPurifier_Token_Empty($type, $attributes);
|
||||
} else {
|
||||
@ -216,13 +223,47 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
}
|
||||
|
||||
function tokenizeAttributeString($string) {
|
||||
$string = (string) $string;
|
||||
if ($string == '') return array();
|
||||
$array = array();
|
||||
$cursor = 0;
|
||||
$in_value = false;
|
||||
$i = 0;
|
||||
$size = strlen($string);
|
||||
$string = (string) $string; // quick typecast
|
||||
|
||||
if ($string == '') return array(); // no attributes
|
||||
|
||||
// let's see if we can abort as quickly as possible
|
||||
// one equal sign, no spaces => one attribute
|
||||
$num_equal = substr_count($string, '=');
|
||||
$has_space = strpos($string, ' ');
|
||||
if ($num_equal === 0 && !$has_space) {
|
||||
// bool attribute
|
||||
return array($string => $string);
|
||||
} elseif ($num_equal === 1 && !$has_space) {
|
||||
// only one attribute
|
||||
list($key, $quoted_value) = explode('=', $string);
|
||||
$quoted_value = trim($quoted_value);
|
||||
if (!$key) return array();
|
||||
if (!$quoted_value) return array($key => '');
|
||||
$first_char = @$quoted_value[0];
|
||||
$last_char = @$quoted_value[strlen($quoted_value)-1];
|
||||
|
||||
$same_quote = ($first_char == $last_char);
|
||||
$open_quote = ($first_char == '"' || $first_char == "'");
|
||||
|
||||
if ( $same_quote && $open_quote) {
|
||||
// well behaved
|
||||
$value = substr($quoted_value, 1, strlen($quoted_value) - 2);
|
||||
} else {
|
||||
// not well behaved
|
||||
if ($open_quote) {
|
||||
$value = substr($quoted_value, 1);
|
||||
} else {
|
||||
$value = $quoted_value;
|
||||
}
|
||||
}
|
||||
return array($key => $value);
|
||||
}
|
||||
|
||||
// setup loop environment
|
||||
$array = array(); // return assoc array of attributes
|
||||
$cursor = 0; // current position in string (moves forward)
|
||||
$size = strlen($string); // size of the string (stays the same)
|
||||
|
||||
// if we have unquoted attributes, the parser expects a terminating
|
||||
// space, so let's guarantee that there's always a terminating space.
|
||||
@ -234,88 +275,75 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
while(true) {
|
||||
|
||||
// infinite loop protection
|
||||
// if we've looped 1000 times, abort. Nothing good can come of this
|
||||
if (++$loops > 1000) return array();
|
||||
|
||||
if ($cursor >= $size) {
|
||||
break;
|
||||
}
|
||||
|
||||
$cursor += ($value = strspn($string, "\x20\x09\x0D\x0A", $cursor));
|
||||
|
||||
$position_next_space = $this->nextWhiteSpace($string, $cursor);
|
||||
//scroll to the last whitespace before text
|
||||
while ($position_next_space === $cursor) {
|
||||
$cursor++;
|
||||
$position_next_space = $this->nextWhiteSpace($string, $cursor);
|
||||
}
|
||||
$position_next_equal = strpos($string, '=', $cursor);
|
||||
if ($position_next_equal !== false &&
|
||||
($position_next_equal < $position_next_space ||
|
||||
$position_next_space === false)) {
|
||||
//attr="asdf"
|
||||
// grab the key
|
||||
$key = trim(
|
||||
substr(
|
||||
$string, $cursor, $position_next_equal - $cursor
|
||||
)
|
||||
);
|
||||
|
||||
// grab the key
|
||||
|
||||
$key_begin = $cursor; //we're currently at the start of the key
|
||||
|
||||
// scroll past all characters that are the key (not whitespace or =)
|
||||
$cursor += strcspn($string, "\x20\x09\x0D\x0A=", $cursor);
|
||||
|
||||
$key_end = $cursor; // now at the end of the key
|
||||
|
||||
$key = substr($string, $key_begin, $key_end - $key_begin);
|
||||
|
||||
if (!$key) continue; // empty key
|
||||
|
||||
// scroll past all whitespace
|
||||
$cursor += strspn($string, "\x20\x09\x0D\x0A", $cursor);
|
||||
|
||||
if ($cursor >= $size) {
|
||||
$array[$key] = $key;
|
||||
break;
|
||||
}
|
||||
|
||||
// if the next character is an equal sign, we've got a regular
|
||||
// pair, otherwise, it's a bool attribute
|
||||
$first_char = @$string[$cursor];
|
||||
|
||||
if ($first_char == '=') {
|
||||
// key="value"
|
||||
|
||||
// set cursor right after the equal sign
|
||||
$cursor = $position_next_equal + 1;
|
||||
$cursor++;
|
||||
$cursor += strspn($string, "\x20\x09\x0D\x0A", $cursor);
|
||||
|
||||
// consume all spaces after the equal sign
|
||||
$position_next_space = $this->nextWhiteSpace($string, $cursor);
|
||||
while ($position_next_space === $cursor) {
|
||||
// we might be in front of a quote right now
|
||||
|
||||
$char = @$string[$cursor];
|
||||
|
||||
if ($char == '"' || $char == "'") {
|
||||
// it's quoted, end bound is $char
|
||||
$cursor++;
|
||||
$position_next_space=$this->nextWhiteSpace($string,$cursor);
|
||||
$value_begin = $cursor;
|
||||
$cursor = strpos($string, $char, $cursor);
|
||||
$value_end = $cursor;
|
||||
} else {
|
||||
// it's not quoted, end bound is whitespace
|
||||
$value_begin = $cursor;
|
||||
$cursor += strcspn($string, "\x20\x09\x0D\x0A", $cursor);
|
||||
$value_end = $cursor;
|
||||
}
|
||||
|
||||
// if we've hit the end, assign the key an empty value and abort
|
||||
if ($cursor >= $size) {
|
||||
$array[$key] = '';
|
||||
break;
|
||||
}
|
||||
$value = substr($string, $value_begin, $value_end - $value_begin);
|
||||
$array[$key] = $value;
|
||||
$cursor++;
|
||||
|
||||
// find the next quote
|
||||
$position_next_quote = $this->nextQuote($string, $cursor);
|
||||
|
||||
// if the quote is not where the cursor is, we're dealing
|
||||
// with an unquoted attribute
|
||||
if ($position_next_quote !== $cursor) {
|
||||
if ($key) {
|
||||
$array[$key] = trim(substr($string, $cursor,
|
||||
$position_next_space - $cursor));
|
||||
}
|
||||
$cursor = $position_next_space + 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
// otherwise, regular attribute
|
||||
$quote = $string{$position_next_quote};
|
||||
$position_end_quote = strpos(
|
||||
$string, $quote, $position_next_quote + 1
|
||||
);
|
||||
|
||||
// check if the ending quote is missing
|
||||
if ($position_end_quote === false) {
|
||||
// it is, assign it to the end of the string
|
||||
$position_end_quote = $size;
|
||||
}
|
||||
|
||||
$value = substr($string, $position_next_quote + 1,
|
||||
$position_end_quote - $position_next_quote - 1);
|
||||
if ($key) {
|
||||
$array[$key] = html_entity_decode($value, ENT_QUOTES);
|
||||
}
|
||||
$cursor = $position_end_quote + 1;
|
||||
} else {
|
||||
//boolattr
|
||||
if ($position_next_space === false) {
|
||||
$position_next_space = $size;
|
||||
}
|
||||
$key = substr($string, $cursor, $position_next_space - $cursor);
|
||||
if ($key) {
|
||||
// boolattr
|
||||
if ($key !== '') {
|
||||
$array[$key] = $key;
|
||||
}
|
||||
$cursor = $position_next_space + 1;
|
||||
|
||||
}
|
||||
}
|
||||
return $array;
|
||||
|
@ -8,36 +8,26 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
|
||||
{
|
||||
var $is_tag = true;
|
||||
var $name;
|
||||
function HTMLPurifier_Token_Tag($name) {
|
||||
// watch out, actually XML is case-sensitive, while HTML
|
||||
// is case insensitive, which means we can't use this for XML
|
||||
$this->name = strtolower($name); // for some reason, the SAX parser
|
||||
// uses uppercase. Investigate?
|
||||
}
|
||||
}
|
||||
|
||||
// a rich tag has attributes
|
||||
class HTMLPurifier_Token_RichTag extends HTMLPurifier_Token_Tag // abstract
|
||||
{
|
||||
var $attributes = array();
|
||||
function HTMLPurifier_Token_RichTag($name, $attributes = array()) {
|
||||
$this->HTMLPurifier_Token_Tag($name);
|
||||
function HTMLPurifier_Token_Tag($name, $attributes = array()) {
|
||||
$this->name = ctype_lower($name) ? $name : strtolower($name);
|
||||
$this->attributes = $attributes;
|
||||
}
|
||||
}
|
||||
|
||||
// start CONCRETE ones
|
||||
|
||||
class HTMLPurifier_Token_Start extends HTMLPurifier_Token_RichTag
|
||||
class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
|
||||
{
|
||||
var $type = 'start';
|
||||
}
|
||||
|
||||
class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_RichTag
|
||||
class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
|
||||
{
|
||||
var $type = 'empty';
|
||||
}
|
||||
|
||||
// accepts attributes even though it really can't, for optimization reasons
|
||||
class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
|
||||
{
|
||||
var $type = 'end';
|
||||
@ -51,7 +41,7 @@ class HTMLPurifier_Token_Text extends HTMLPurifier_Token
|
||||
var $is_whitespace = false;
|
||||
function HTMLPurifier_Token_Text($data) {
|
||||
$this->data = $data;
|
||||
if (trim($data, " \n\r\t") === '') $this->is_whitespace = true;
|
||||
if (ctype_space($data)) $this->is_whitespace = true;
|
||||
}
|
||||
function append($text) {
|
||||
return new HTMLPurifier_Token_Text($this->data . $text->data);
|
||||
|
@ -153,13 +153,18 @@ class HTMLPurifier_LexerTest extends UnitTestCase
|
||||
|
||||
// [SGML-INVALID]
|
||||
$input[10] = '<a "=>';
|
||||
// We barf on this, aim for no attributes
|
||||
$expect[10] = array(
|
||||
new HTMLPurifier_Token_Start('a', array('"' => ''))
|
||||
);
|
||||
// DOM doesn't register an invalid attribute
|
||||
// DOM correctly has no attributes, but also closes the tag
|
||||
$dom_expect[10] = array(
|
||||
new HTMLPurifier_Token_Empty('a')
|
||||
);
|
||||
// SAX barfs on this
|
||||
$sax_expect[10] = array(
|
||||
new HTMLPurifier_Token_Start('a', array('"' => ''))
|
||||
);
|
||||
|
||||
// [INVALID] [RECOVERABLE]
|
||||
$input[11] = '"';
|
||||
@ -232,6 +237,18 @@ class HTMLPurifier_LexerTest extends UnitTestCase
|
||||
$input[6] = 'href="foo';
|
||||
$expect[6] = array('href' => 'foo');
|
||||
|
||||
$input[7] = '"=';
|
||||
$expect[7] = array('"' => '');
|
||||
// 0123456789012345678901234567890123
|
||||
$input[8] = 'href ="about:blank"rel ="nofollow"';
|
||||
$expect[8] = array('href' => 'about:blank', 'rel' => 'nofollow');
|
||||
|
||||
$input[9] = 'foo bar';
|
||||
$expect[9] = array('foo' => 'foo', 'bar' => 'bar');
|
||||
|
||||
$input[10] = 'foo="bar" blue';
|
||||
$expect[10] = array('foo' => 'bar', 'blue' => 'blue');
|
||||
|
||||
$size = count($input);
|
||||
for($i = 0; $i < $size; $i++) {
|
||||
$result = $this->DirectLex->tokenizeAttributeString($input[$i]);
|
||||
|
Loading…
Reference in New Issue
Block a user