mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-01-03 13:21:51 +00:00
Commit various optimizations to the Lexer, and add stub file for profiling the lexer.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@92 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
parent
de5ab5e6a0
commit
ca1aefe271
14
benchmarks/ProfileDirectLex.php
Normal file
14
benchmarks/ProfileDirectLex.php
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
set_include_path(get_include_path() . PATH_SEPARATOR . '../library/');
|
||||||
|
|
||||||
|
require_once 'HTMLPurifier/Lexer/DirectLex.php';
|
||||||
|
|
||||||
|
$input = file_get_contents('samples/Lexer/4.html');
|
||||||
|
$lexer = new HTMLPurifier_Lexer_DirectLex();
|
||||||
|
|
||||||
|
for ($i = 0; $i < 10; $i++) {
|
||||||
|
$tokens = $lexer->tokenizeHTML($input);
|
||||||
|
}
|
||||||
|
|
||||||
|
?>
|
@ -1,7 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Forgivingly lexes SGML style documents: HTML, XML, XHTML, etc.
|
* Forgivingly lexes HTML (not XML, since it doesn't adhere to spec exactly)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
require_once 'HTMLPurifier/Token.php';
|
require_once 'HTMLPurifier/Token.php';
|
||||||
|
@ -190,7 +190,14 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
$segment, $position_first_space
|
$segment, $position_first_space
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
$attributes = $this->tokenizeAttributeString($attribute_string);
|
if ($attribute_string) {
|
||||||
|
$attributes = $this->tokenizeAttributeString(
|
||||||
|
$attribute_string
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
$attributes = array();
|
||||||
|
}
|
||||||
|
|
||||||
if ($is_self_closing) {
|
if ($is_self_closing) {
|
||||||
$array[] = new HTMLPurifier_Token_Empty($type, $attributes);
|
$array[] = new HTMLPurifier_Token_Empty($type, $attributes);
|
||||||
} else {
|
} else {
|
||||||
@ -216,13 +223,47 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
}
|
}
|
||||||
|
|
||||||
function tokenizeAttributeString($string) {
|
function tokenizeAttributeString($string) {
|
||||||
$string = (string) $string;
|
$string = (string) $string; // quick typecast
|
||||||
if ($string == '') return array();
|
|
||||||
$array = array();
|
if ($string == '') return array(); // no attributes
|
||||||
$cursor = 0;
|
|
||||||
$in_value = false;
|
// let's see if we can abort as quickly as possible
|
||||||
$i = 0;
|
// one equal sign, no spaces => one attribute
|
||||||
$size = strlen($string);
|
$num_equal = substr_count($string, '=');
|
||||||
|
$has_space = strpos($string, ' ');
|
||||||
|
if ($num_equal === 0 && !$has_space) {
|
||||||
|
// bool attribute
|
||||||
|
return array($string => $string);
|
||||||
|
} elseif ($num_equal === 1 && !$has_space) {
|
||||||
|
// only one attribute
|
||||||
|
list($key, $quoted_value) = explode('=', $string);
|
||||||
|
$quoted_value = trim($quoted_value);
|
||||||
|
if (!$key) return array();
|
||||||
|
if (!$quoted_value) return array($key => '');
|
||||||
|
$first_char = @$quoted_value[0];
|
||||||
|
$last_char = @$quoted_value[strlen($quoted_value)-1];
|
||||||
|
|
||||||
|
$same_quote = ($first_char == $last_char);
|
||||||
|
$open_quote = ($first_char == '"' || $first_char == "'");
|
||||||
|
|
||||||
|
if ( $same_quote && $open_quote) {
|
||||||
|
// well behaved
|
||||||
|
$value = substr($quoted_value, 1, strlen($quoted_value) - 2);
|
||||||
|
} else {
|
||||||
|
// not well behaved
|
||||||
|
if ($open_quote) {
|
||||||
|
$value = substr($quoted_value, 1);
|
||||||
|
} else {
|
||||||
|
$value = $quoted_value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return array($key => $value);
|
||||||
|
}
|
||||||
|
|
||||||
|
// setup loop environment
|
||||||
|
$array = array(); // return assoc array of attributes
|
||||||
|
$cursor = 0; // current position in string (moves forward)
|
||||||
|
$size = strlen($string); // size of the string (stays the same)
|
||||||
|
|
||||||
// if we have unquoted attributes, the parser expects a terminating
|
// if we have unquoted attributes, the parser expects a terminating
|
||||||
// space, so let's guarantee that there's always a terminating space.
|
// space, so let's guarantee that there's always a terminating space.
|
||||||
@ -234,88 +275,75 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
while(true) {
|
while(true) {
|
||||||
|
|
||||||
// infinite loop protection
|
// infinite loop protection
|
||||||
// if we've looped 1000 times, abort. Nothing good can come of this
|
|
||||||
if (++$loops > 1000) return array();
|
if (++$loops > 1000) return array();
|
||||||
|
|
||||||
if ($cursor >= $size) {
|
if ($cursor >= $size) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$cursor += ($value = strspn($string, "\x20\x09\x0D\x0A", $cursor));
|
||||||
|
|
||||||
$position_next_space = $this->nextWhiteSpace($string, $cursor);
|
$position_next_space = $this->nextWhiteSpace($string, $cursor);
|
||||||
//scroll to the last whitespace before text
|
|
||||||
while ($position_next_space === $cursor) {
|
|
||||||
$cursor++;
|
|
||||||
$position_next_space = $this->nextWhiteSpace($string, $cursor);
|
|
||||||
}
|
|
||||||
$position_next_equal = strpos($string, '=', $cursor);
|
$position_next_equal = strpos($string, '=', $cursor);
|
||||||
if ($position_next_equal !== false &&
|
|
||||||
($position_next_equal < $position_next_space ||
|
|
||||||
$position_next_space === false)) {
|
|
||||||
//attr="asdf"
|
|
||||||
// grab the key
|
// grab the key
|
||||||
$key = trim(
|
|
||||||
substr(
|
|
||||||
$string, $cursor, $position_next_equal - $cursor
|
|
||||||
)
|
|
||||||
);
|
|
||||||
|
|
||||||
// set cursor right after the equal sign
|
$key_begin = $cursor; //we're currently at the start of the key
|
||||||
$cursor = $position_next_equal + 1;
|
|
||||||
|
|
||||||
// consume all spaces after the equal sign
|
// scroll past all characters that are the key (not whitespace or =)
|
||||||
$position_next_space = $this->nextWhiteSpace($string, $cursor);
|
$cursor += strcspn($string, "\x20\x09\x0D\x0A=", $cursor);
|
||||||
while ($position_next_space === $cursor) {
|
|
||||||
$cursor++;
|
$key_end = $cursor; // now at the end of the key
|
||||||
$position_next_space=$this->nextWhiteSpace($string,$cursor);
|
|
||||||
}
|
$key = substr($string, $key_begin, $key_end - $key_begin);
|
||||||
|
|
||||||
|
if (!$key) continue; // empty key
|
||||||
|
|
||||||
|
// scroll past all whitespace
|
||||||
|
$cursor += strspn($string, "\x20\x09\x0D\x0A", $cursor);
|
||||||
|
|
||||||
// if we've hit the end, assign the key an empty value and abort
|
|
||||||
if ($cursor >= $size) {
|
if ($cursor >= $size) {
|
||||||
$array[$key] = '';
|
$array[$key] = $key;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
// find the next quote
|
// if the next character is an equal sign, we've got a regular
|
||||||
$position_next_quote = $this->nextQuote($string, $cursor);
|
// pair, otherwise, it's a bool attribute
|
||||||
|
$first_char = @$string[$cursor];
|
||||||
|
|
||||||
// if the quote is not where the cursor is, we're dealing
|
if ($first_char == '=') {
|
||||||
// with an unquoted attribute
|
// key="value"
|
||||||
if ($position_next_quote !== $cursor) {
|
|
||||||
if ($key) {
|
|
||||||
$array[$key] = trim(substr($string, $cursor,
|
|
||||||
$position_next_space - $cursor));
|
|
||||||
}
|
|
||||||
$cursor = $position_next_space + 1;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// otherwise, regular attribute
|
$cursor++;
|
||||||
$quote = $string{$position_next_quote};
|
$cursor += strspn($string, "\x20\x09\x0D\x0A", $cursor);
|
||||||
$position_end_quote = strpos(
|
|
||||||
$string, $quote, $position_next_quote + 1
|
|
||||||
);
|
|
||||||
|
|
||||||
// check if the ending quote is missing
|
// we might be in front of a quote right now
|
||||||
if ($position_end_quote === false) {
|
|
||||||
// it is, assign it to the end of the string
|
|
||||||
$position_end_quote = $size;
|
|
||||||
}
|
|
||||||
|
|
||||||
$value = substr($string, $position_next_quote + 1,
|
$char = @$string[$cursor];
|
||||||
$position_end_quote - $position_next_quote - 1);
|
|
||||||
if ($key) {
|
if ($char == '"' || $char == "'") {
|
||||||
$array[$key] = html_entity_decode($value, ENT_QUOTES);
|
// it's quoted, end bound is $char
|
||||||
}
|
$cursor++;
|
||||||
$cursor = $position_end_quote + 1;
|
$value_begin = $cursor;
|
||||||
|
$cursor = strpos($string, $char, $cursor);
|
||||||
|
$value_end = $cursor;
|
||||||
} else {
|
} else {
|
||||||
//boolattr
|
// it's not quoted, end bound is whitespace
|
||||||
if ($position_next_space === false) {
|
$value_begin = $cursor;
|
||||||
$position_next_space = $size;
|
$cursor += strcspn($string, "\x20\x09\x0D\x0A", $cursor);
|
||||||
|
$value_end = $cursor;
|
||||||
}
|
}
|
||||||
$key = substr($string, $cursor, $position_next_space - $cursor);
|
|
||||||
if ($key) {
|
$value = substr($string, $value_begin, $value_end - $value_begin);
|
||||||
|
$array[$key] = $value;
|
||||||
|
$cursor++;
|
||||||
|
|
||||||
|
} else {
|
||||||
|
// boolattr
|
||||||
|
if ($key !== '') {
|
||||||
$array[$key] = $key;
|
$array[$key] = $key;
|
||||||
}
|
}
|
||||||
$cursor = $position_next_space + 1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return $array;
|
return $array;
|
||||||
|
@ -8,36 +8,26 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
|
|||||||
{
|
{
|
||||||
var $is_tag = true;
|
var $is_tag = true;
|
||||||
var $name;
|
var $name;
|
||||||
function HTMLPurifier_Token_Tag($name) {
|
|
||||||
// watch out, actually XML is case-sensitive, while HTML
|
|
||||||
// is case insensitive, which means we can't use this for XML
|
|
||||||
$this->name = strtolower($name); // for some reason, the SAX parser
|
|
||||||
// uses uppercase. Investigate?
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// a rich tag has attributes
|
|
||||||
class HTMLPurifier_Token_RichTag extends HTMLPurifier_Token_Tag // abstract
|
|
||||||
{
|
|
||||||
var $attributes = array();
|
var $attributes = array();
|
||||||
function HTMLPurifier_Token_RichTag($name, $attributes = array()) {
|
function HTMLPurifier_Token_Tag($name, $attributes = array()) {
|
||||||
$this->HTMLPurifier_Token_Tag($name);
|
$this->name = ctype_lower($name) ? $name : strtolower($name);
|
||||||
$this->attributes = $attributes;
|
$this->attributes = $attributes;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// start CONCRETE ones
|
// start CONCRETE ones
|
||||||
|
|
||||||
class HTMLPurifier_Token_Start extends HTMLPurifier_Token_RichTag
|
class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
|
||||||
{
|
{
|
||||||
var $type = 'start';
|
var $type = 'start';
|
||||||
}
|
}
|
||||||
|
|
||||||
class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_RichTag
|
class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
|
||||||
{
|
{
|
||||||
var $type = 'empty';
|
var $type = 'empty';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// accepts attributes even though it really can't, for optimization reasons
|
||||||
class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
|
class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
|
||||||
{
|
{
|
||||||
var $type = 'end';
|
var $type = 'end';
|
||||||
@ -51,7 +41,7 @@ class HTMLPurifier_Token_Text extends HTMLPurifier_Token
|
|||||||
var $is_whitespace = false;
|
var $is_whitespace = false;
|
||||||
function HTMLPurifier_Token_Text($data) {
|
function HTMLPurifier_Token_Text($data) {
|
||||||
$this->data = $data;
|
$this->data = $data;
|
||||||
if (trim($data, " \n\r\t") === '') $this->is_whitespace = true;
|
if (ctype_space($data)) $this->is_whitespace = true;
|
||||||
}
|
}
|
||||||
function append($text) {
|
function append($text) {
|
||||||
return new HTMLPurifier_Token_Text($this->data . $text->data);
|
return new HTMLPurifier_Token_Text($this->data . $text->data);
|
||||||
|
@ -153,13 +153,18 @@ class HTMLPurifier_LexerTest extends UnitTestCase
|
|||||||
|
|
||||||
// [SGML-INVALID]
|
// [SGML-INVALID]
|
||||||
$input[10] = '<a "=>';
|
$input[10] = '<a "=>';
|
||||||
|
// We barf on this, aim for no attributes
|
||||||
$expect[10] = array(
|
$expect[10] = array(
|
||||||
new HTMLPurifier_Token_Start('a', array('"' => ''))
|
new HTMLPurifier_Token_Start('a', array('"' => ''))
|
||||||
);
|
);
|
||||||
// DOM doesn't register an invalid attribute
|
// DOM correctly has no attributes, but also closes the tag
|
||||||
$dom_expect[10] = array(
|
$dom_expect[10] = array(
|
||||||
new HTMLPurifier_Token_Empty('a')
|
new HTMLPurifier_Token_Empty('a')
|
||||||
);
|
);
|
||||||
|
// SAX barfs on this
|
||||||
|
$sax_expect[10] = array(
|
||||||
|
new HTMLPurifier_Token_Start('a', array('"' => ''))
|
||||||
|
);
|
||||||
|
|
||||||
// [INVALID] [RECOVERABLE]
|
// [INVALID] [RECOVERABLE]
|
||||||
$input[11] = '"';
|
$input[11] = '"';
|
||||||
@ -232,6 +237,18 @@ class HTMLPurifier_LexerTest extends UnitTestCase
|
|||||||
$input[6] = 'href="foo';
|
$input[6] = 'href="foo';
|
||||||
$expect[6] = array('href' => 'foo');
|
$expect[6] = array('href' => 'foo');
|
||||||
|
|
||||||
|
$input[7] = '"=';
|
||||||
|
$expect[7] = array('"' => '');
|
||||||
|
// 0123456789012345678901234567890123
|
||||||
|
$input[8] = 'href ="about:blank"rel ="nofollow"';
|
||||||
|
$expect[8] = array('href' => 'about:blank', 'rel' => 'nofollow');
|
||||||
|
|
||||||
|
$input[9] = 'foo bar';
|
||||||
|
$expect[9] = array('foo' => 'foo', 'bar' => 'bar');
|
||||||
|
|
||||||
|
$input[10] = 'foo="bar" blue';
|
||||||
|
$expect[10] = array('foo' => 'bar', 'blue' => 'blue');
|
||||||
|
|
||||||
$size = count($input);
|
$size = count($input);
|
||||||
for($i = 0; $i < $size; $i++) {
|
for($i = 0; $i < $size; $i++) {
|
||||||
$result = $this->DirectLex->tokenizeAttributeString($input[$i]);
|
$result = $this->DirectLex->tokenizeAttributeString($input[$i]);
|
||||||
|
Loading…
Reference in New Issue
Block a user