0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-12-22 08:21:52 +00:00

Commit various optimizations to the Lexer, and add stub file for profiling the lexer.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@92 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2006-07-22 22:48:07 +00:00
parent de5ab5e6a0
commit ca1aefe271
5 changed files with 141 additions and 92 deletions

View File

@ -0,0 +1,14 @@
<?php
set_include_path(get_include_path() . PATH_SEPARATOR . '../library/');
require_once 'HTMLPurifier/Lexer/DirectLex.php';
$input = file_get_contents('samples/Lexer/4.html');
$lexer = new HTMLPurifier_Lexer_DirectLex();
for ($i = 0; $i < 10; $i++) {
$tokens = $lexer->tokenizeHTML($input);
}
?>

View File

@ -1,7 +1,7 @@
<?php
/**
* Forgivingly lexes SGML style documents: HTML, XML, XHTML, etc.
* Forgivingly lexes HTML (not XML, since it doesn't adhere to spec exactly)
*/
require_once 'HTMLPurifier/Token.php';

View File

@ -190,7 +190,14 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
$segment, $position_first_space
)
);
$attributes = $this->tokenizeAttributeString($attribute_string);
if ($attribute_string) {
$attributes = $this->tokenizeAttributeString(
$attribute_string
);
} else {
$attributes = array();
}
if ($is_self_closing) {
$array[] = new HTMLPurifier_Token_Empty($type, $attributes);
} else {
@ -216,13 +223,47 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
}
function tokenizeAttributeString($string) {
$string = (string) $string;
if ($string == '') return array();
$array = array();
$cursor = 0;
$in_value = false;
$i = 0;
$size = strlen($string);
$string = (string) $string; // quick typecast
if ($string == '') return array(); // no attributes
// let's see if we can abort as quickly as possible
// one equal sign, no spaces => one attribute
$num_equal = substr_count($string, '=');
$has_space = strpos($string, ' ');
if ($num_equal === 0 && !$has_space) {
// bool attribute
return array($string => $string);
} elseif ($num_equal === 1 && !$has_space) {
// only one attribute
list($key, $quoted_value) = explode('=', $string);
$quoted_value = trim($quoted_value);
if (!$key) return array();
if (!$quoted_value) return array($key => '');
$first_char = @$quoted_value[0];
$last_char = @$quoted_value[strlen($quoted_value)-1];
$same_quote = ($first_char == $last_char);
$open_quote = ($first_char == '"' || $first_char == "'");
if ( $same_quote && $open_quote) {
// well behaved
$value = substr($quoted_value, 1, strlen($quoted_value) - 2);
} else {
// not well behaved
if ($open_quote) {
$value = substr($quoted_value, 1);
} else {
$value = $quoted_value;
}
}
return array($key => $value);
}
// setup loop environment
$array = array(); // return assoc array of attributes
$cursor = 0; // current position in string (moves forward)
$size = strlen($string); // size of the string (stays the same)
// if we have unquoted attributes, the parser expects a terminating
// space, so let's guarantee that there's always a terminating space.
@ -234,88 +275,75 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
while(true) {
// infinite loop protection
// if we've looped 1000 times, abort. Nothing good can come of this
if (++$loops > 1000) return array();
if ($cursor >= $size) {
break;
}
$cursor += ($value = strspn($string, "\x20\x09\x0D\x0A", $cursor));
$position_next_space = $this->nextWhiteSpace($string, $cursor);
//scroll to the last whitespace before text
while ($position_next_space === $cursor) {
$cursor++;
$position_next_space = $this->nextWhiteSpace($string, $cursor);
}
$position_next_equal = strpos($string, '=', $cursor);
if ($position_next_equal !== false &&
($position_next_equal < $position_next_space ||
$position_next_space === false)) {
//attr="asdf"
// grab the key
$key = trim(
substr(
$string, $cursor, $position_next_equal - $cursor
)
);
// grab the key
$key_begin = $cursor; //we're currently at the start of the key
// scroll past all characters that are the key (not whitespace or =)
$cursor += strcspn($string, "\x20\x09\x0D\x0A=", $cursor);
$key_end = $cursor; // now at the end of the key
$key = substr($string, $key_begin, $key_end - $key_begin);
if (!$key) continue; // empty key
// scroll past all whitespace
$cursor += strspn($string, "\x20\x09\x0D\x0A", $cursor);
if ($cursor >= $size) {
$array[$key] = $key;
break;
}
// if the next character is an equal sign, we've got a regular
// pair, otherwise, it's a bool attribute
$first_char = @$string[$cursor];
if ($first_char == '=') {
// key="value"
// set cursor right after the equal sign
$cursor = $position_next_equal + 1;
$cursor++;
$cursor += strspn($string, "\x20\x09\x0D\x0A", $cursor);
// consume all spaces after the equal sign
$position_next_space = $this->nextWhiteSpace($string, $cursor);
while ($position_next_space === $cursor) {
// we might be in front of a quote right now
$char = @$string[$cursor];
if ($char == '"' || $char == "'") {
// it's quoted, end bound is $char
$cursor++;
$position_next_space=$this->nextWhiteSpace($string,$cursor);
$value_begin = $cursor;
$cursor = strpos($string, $char, $cursor);
$value_end = $cursor;
} else {
// it's not quoted, end bound is whitespace
$value_begin = $cursor;
$cursor += strcspn($string, "\x20\x09\x0D\x0A", $cursor);
$value_end = $cursor;
}
// if we've hit the end, assign the key an empty value and abort
if ($cursor >= $size) {
$array[$key] = '';
break;
}
$value = substr($string, $value_begin, $value_end - $value_begin);
$array[$key] = $value;
$cursor++;
// find the next quote
$position_next_quote = $this->nextQuote($string, $cursor);
// if the quote is not where the cursor is, we're dealing
// with an unquoted attribute
if ($position_next_quote !== $cursor) {
if ($key) {
$array[$key] = trim(substr($string, $cursor,
$position_next_space - $cursor));
}
$cursor = $position_next_space + 1;
continue;
}
// otherwise, regular attribute
$quote = $string{$position_next_quote};
$position_end_quote = strpos(
$string, $quote, $position_next_quote + 1
);
// check if the ending quote is missing
if ($position_end_quote === false) {
// it is, assign it to the end of the string
$position_end_quote = $size;
}
$value = substr($string, $position_next_quote + 1,
$position_end_quote - $position_next_quote - 1);
if ($key) {
$array[$key] = html_entity_decode($value, ENT_QUOTES);
}
$cursor = $position_end_quote + 1;
} else {
//boolattr
if ($position_next_space === false) {
$position_next_space = $size;
}
$key = substr($string, $cursor, $position_next_space - $cursor);
if ($key) {
// boolattr
if ($key !== '') {
$array[$key] = $key;
}
$cursor = $position_next_space + 1;
}
}
return $array;

View File

@ -8,36 +8,26 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
{
var $is_tag = true;
var $name;
function HTMLPurifier_Token_Tag($name) {
// watch out, actually XML is case-sensitive, while HTML
// is case insensitive, which means we can't use this for XML
$this->name = strtolower($name); // for some reason, the SAX parser
// uses uppercase. Investigate?
}
}
// a rich tag has attributes
class HTMLPurifier_Token_RichTag extends HTMLPurifier_Token_Tag // abstract
{
var $attributes = array();
function HTMLPurifier_Token_RichTag($name, $attributes = array()) {
$this->HTMLPurifier_Token_Tag($name);
function HTMLPurifier_Token_Tag($name, $attributes = array()) {
$this->name = ctype_lower($name) ? $name : strtolower($name);
$this->attributes = $attributes;
}
}
// start CONCRETE ones
class HTMLPurifier_Token_Start extends HTMLPurifier_Token_RichTag
class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
{
var $type = 'start';
}
class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_RichTag
class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
{
var $type = 'empty';
}
// accepts attributes even though it really can't, for optimization reasons
class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
{
var $type = 'end';
@ -51,7 +41,7 @@ class HTMLPurifier_Token_Text extends HTMLPurifier_Token
var $is_whitespace = false;
function HTMLPurifier_Token_Text($data) {
$this->data = $data;
if (trim($data, " \n\r\t") === '') $this->is_whitespace = true;
if (ctype_space($data)) $this->is_whitespace = true;
}
function append($text) {
return new HTMLPurifier_Token_Text($this->data . $text->data);

View File

@ -153,13 +153,18 @@ class HTMLPurifier_LexerTest extends UnitTestCase
// [SGML-INVALID]
$input[10] = '<a "=>';
// We barf on this, aim for no attributes
$expect[10] = array(
new HTMLPurifier_Token_Start('a', array('"' => ''))
);
// DOM doesn't register an invalid attribute
// DOM correctly has no attributes, but also closes the tag
$dom_expect[10] = array(
new HTMLPurifier_Token_Empty('a')
);
// SAX barfs on this
$sax_expect[10] = array(
new HTMLPurifier_Token_Start('a', array('"' => ''))
);
// [INVALID] [RECOVERABLE]
$input[11] = '"';
@ -232,6 +237,18 @@ class HTMLPurifier_LexerTest extends UnitTestCase
$input[6] = 'href="foo';
$expect[6] = array('href' => 'foo');
$input[7] = '"=';
$expect[7] = array('"' => '');
// 0123456789012345678901234567890123
$input[8] = 'href ="about:blank"rel ="nofollow"';
$expect[8] = array('href' => 'about:blank', 'rel' => 'nofollow');
$input[9] = 'foo bar';
$expect[9] = array('foo' => 'foo', 'bar' => 'bar');
$input[10] = 'foo="bar" blue';
$expect[10] = array('foo' => 'bar', 'blue' => 'blue');
$size = count($input);
for($i = 0; $i < $size; $i++) {
$result = $this->DirectLex->tokenizeAttributeString($input[$i]);