0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-12-31 20:01:52 +00:00

[1.7.0] Implement line number counting in DirectLex, in preparation for error reporting

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1155 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2007-06-18 02:01:01 +00:00
parent 70bcccf54c
commit 4bf15de536
6 changed files with 168 additions and 11 deletions

1
NEWS
View File

@ -53,6 +53,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
. Lexer is now pre-emptively included, with a conditional include for the
PHP5 only version.
. HTMLDefinition and CSSDefinition have a common parent class: Definition.
. DirectLex can now track line-numbers
1.6.1, released 2007-05-05
! Support for more deprecated attributes via transformations:

View File

@ -47,9 +47,24 @@ HTMLPurifier_ConfigSchema::define(
to use it.
</dd>
</dl>
<p>
This directive has been available since 1.7.0.
</p>
'
);
// Registers the boolean directive %Core.MaintainLineNumbers (default: false).
// The description string below is the user-facing documentation for it.
HTMLPurifier_ConfigSchema::define(
'Core', 'MaintainLineNumbers', false, 'bool', '
<p>
If true, HTML Purifier will add line number information to all tokens.
This is useful when error reporting is turned on, but can result in
significant performance degradation and should not be used when
unnecessary. This directive must be used with the DirectLex lexer,
as the DOMLex lexer does not (yet) support this functionality. This directive
has been available since 1.7.0.
</p>
');
/**
* Forgivingly lexes HTML (SGML-style) markup into tokens.
*
@ -135,7 +150,14 @@ class HTMLPurifier_Lexer
}
if (is_null($lexer)) { do {
// auto-detectection algorithm
// auto-detection algorithm
// once PHP DOM implements native line numbers, or we
// hack out something using XSLT, remove this stipulation
if ($config->get('Core', 'MaintainLineNumbers')) {
$lexer = 'DirectLex';
break;
}
if (version_compare(PHP_VERSION, "5", ">=") && // check for PHP5
class_exists('DOMDocument')) { // check for DOM support

View File

@ -2,6 +2,20 @@
require_once 'HTMLPurifier/Lexer.php';
// Registers the integer directive %Core.DirectLexLineNumberSyncInterval
// (default: 0, meaning periodic resynchronization is turned off).
// The description string below is the user-facing documentation for it.
HTMLPurifier_ConfigSchema::define(
'Core', 'DirectLexLineNumberSyncInterval', 0, 'int', '
<p>
Specifies the number of tokens the DirectLex line number tracking
implementations should process before attempting to resynchronize the
current line count by manually counting all previous new-lines. When
at 0, this functionality is disabled. Lower values will decrease
performance, and this is only strictly necessary if the counting
algorithm is buggy (in which case you should report it as a bug).
This has no effect when %Core.MaintainLineNumbers is disabled or DirectLex is
not being used. This directive has been available since 1.7.0.
</p>
');
/**
* Our in-house implementation of a parser.
*
@ -32,9 +46,17 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
$inside_tag = false; // whether or not we're parsing the inside of a tag
$array = array(); // result array
$maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');
$current_line = 1;
$nl = PHP_EOL;
// how often to manually recalculate. This will ALWAYS be right,
// but it's pretty wasteful. Set to 0 to turn off
$synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval');
// infinite loop protection
// has to be pretty big, since html docs can be big
// we allow two hundred thousand tags... more than enough?
// NOTE: this is also used for synchronization, so watch out
$loops = 0;
while(true) {
@ -42,10 +64,21 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// infinite loop protection
if (++$loops > 200000) return array();
// recalculate lines
if (
$maintain_line_numbers && // line number tracking is on
$synchronize_interval && // synchronization is on
$cursor > 0 && // cursor is further than zero
$loops % $synchronize_interval === 0 // time to synchronize!
) {
$current_line = 1 + substr_count($html, $nl, 0, $cursor);
}
$position_next_lt = strpos($html, '<', $cursor);
$position_next_gt = strpos($html, '>', $cursor);
// triggers on "<b>asdf</b>" but not "asdf <b></b>"
// special case to set up context
if ($position_next_lt === $cursor) {
$inside_tag = true;
$cursor++;
@ -53,7 +86,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
if (!$inside_tag && $position_next_lt !== false) {
// We are not inside tag and there still is another tag to parse
$array[] = new
$token = new
HTMLPurifier_Token_Text(
$this->parseData(
substr(
@ -61,6 +94,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
)
)
);
if ($maintain_line_numbers) {
$token->line = $current_line;
$current_line += substr_count($html, $nl, $cursor, $position_next_lt - $cursor);
}
$array[] = $token;
$cursor = $position_next_lt + 1;
$inside_tag = true;
continue;
@ -69,7 +107,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// If we're already at the end, break
if ($cursor === strlen($html)) break;
// Create Text of rest of string
$array[] = new
$token = new
HTMLPurifier_Token_Text(
$this->parseData(
substr(
@ -77,6 +115,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
)
)
);
if ($maintain_line_numbers) $token->line = $current_line;
$array[] = $token;
break;
} elseif ($inside_tag && $position_next_gt !== false) {
// We are in tag and it is well formed
@ -89,12 +129,17 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
substr($segment, 0, 3) == '!--' &&
substr($segment, $strlen_segment-2, 2) == '--'
) {
$array[] = new
$token = new
HTMLPurifier_Token_Comment(
substr(
$segment, 3, $strlen_segment - 5
)
);
if ($maintain_line_numbers) {
$token->line = $current_line;
$current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
}
$array[] = $token;
$inside_tag = false;
$cursor = $position_next_gt + 1;
continue;
@ -104,7 +149,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
$is_end_tag = (strpos($segment,'/') === 0);
if ($is_end_tag) {
$type = substr($segment, 1);
$array[] = new HTMLPurifier_Token_End($type);
$token = new HTMLPurifier_Token_End($type);
if ($maintain_line_numbers) {
$token->line = $current_line;
$current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
}
$array[] = $token;
$inside_tag = false;
$cursor = $position_next_gt + 1;
continue;
@ -114,7 +164,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// have accidentally grabbed an emoticon. Translate into
// text and go our merry way
if (!ctype_alnum($segment[0])) {
$array[] = new
$token = new
HTMLPurifier_Token_Text(
'<' .
$this->parseData(
@ -122,6 +172,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
) .
'>'
);
if ($maintain_line_numbers) {
$token->line = $current_line;
$current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
}
$array[] = $token;
$cursor = $position_next_gt + 1;
$inside_tag = false;
continue;
@ -142,10 +197,15 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
if ($position_first_space >= $strlen_segment) {
if ($is_self_closing) {
$array[] = new HTMLPurifier_Token_Empty($segment);
$token = new HTMLPurifier_Token_Empty($segment);
} else {
$array[] = new HTMLPurifier_Token_Start($segment);
$token = new HTMLPurifier_Token_Start($segment);
}
if ($maintain_line_numbers) {
$token->line = $current_line;
$current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
}
$array[] = $token;
$inside_tag = false;
$cursor = $position_next_gt + 1;
continue;
@ -169,21 +229,29 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
}
if ($is_self_closing) {
$array[] = new HTMLPurifier_Token_Empty($type, $attr);
$token = new HTMLPurifier_Token_Empty($type, $attr);
} else {
$array[] = new HTMLPurifier_Token_Start($type, $attr);
$token = new HTMLPurifier_Token_Start($type, $attr);
}
if ($maintain_line_numbers) {
$token->line = $current_line;
$current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
}
$array[] = $token;
$cursor = $position_next_gt + 1;
$inside_tag = false;
continue;
} else {
$array[] = new
$token = new
HTMLPurifier_Token_Text(
'<' .
$this->parseData(
substr($html, $cursor)
)
);
if ($maintain_line_numbers) $token->line = $current_line;
// no cursor scroll? Hmm...
$array[] = $token;
break;
}
break;

View File

@ -11,6 +11,7 @@
*/
class HTMLPurifier_Token {
var $type; /**< Type of node to bypass <tt>is_a()</tt>. @public */
var $line; /**< Line number node was on in source document. Null if unknown. @public */
/**
* Copies the tag into a new one (clone substitute).

View File

@ -64,6 +64,65 @@ class HTMLPurifier_Lexer_DirectLexTest extends UnitTestCase
}
/**
 * Tokenizes a multi-line fixture with DirectLex twice: once with the default
 * configuration, and once with %Core.MaintainLineNumbers enabled, in which
 * case every expected token additionally carries a ->line value.
 */
function testLineNumbers() {
// Fixture: note that the first <br /> tag is itself split across a newline.
// NOTE(review): as displayed here the fixture has five physical lines, yet
// the final token is expected on line 6 ("line SIX" below) -- confirm that
// a blank line inside this literal (and in token 9) has not been lost.
$html = '<b>Line 1</b>
<i>Line 2</i>
Still Line 2<br
/>Now Line 4
<br />';
// Expected token stream; the "line N" markers give the line on which the
// tokens that follow are expected to start.
$expect = array(
// line 1
0 => new HTMLPurifier_Token_Start('b')
,1 => new HTMLPurifier_Token_Text('Line 1')
,2 => new HTMLPurifier_Token_End('b')
,3 => new HTMLPurifier_Token_Text('
')
// line 2
,4 => new HTMLPurifier_Token_Start('i')
,5 => new HTMLPurifier_Token_Text('Line 2')
,6 => new HTMLPurifier_Token_End('i')
,7 => new HTMLPurifier_Token_Text('
Still Line 2')
// line 3
,8 => new HTMLPurifier_Token_Empty('br')
// line 4
,9 => new HTMLPurifier_Token_Text('Now Line 4
')
// line SIX
,10 => new HTMLPurifier_Token_Empty('br')
);
// First pass: default configuration; $expect has no ->line values assigned yet.
$context = new HTMLPurifier_Context();
$config = HTMLPurifier_Config::createDefault();
$output = $this->DirectLex->tokenizeHTML($html, $config, $context);
$this->assertIdentical($output, $expect);
// Second pass: enable line tracking and annotate the same expected tokens
// with their line numbers before comparing again.
$context = new HTMLPurifier_Context();
$config = HTMLPurifier_Config::create(array(
'Core.MaintainLineNumbers' => true
));
$expect[0]->line = 1;
$expect[1]->line = 1;
$expect[2]->line = 1;
$expect[3]->line = 1;
$expect[4]->line = 2;
$expect[5]->line = 2;
$expect[6]->line = 2;
$expect[7]->line = 2;
$expect[8]->line = 3;
$expect[9]->line = 4;
$expect[10]->line = 6;
$output = $this->DirectLex->tokenizeHTML($html, $config, $context);
$this->assertIdentical($output, $expect);
}
}

View File

@ -35,6 +35,12 @@ class HTMLPurifier_LexerTest extends UnitTestCase
}
/**
 * With %Core.MaintainLineNumbers turned on, the lexer factory is expected
 * to hand back a DirectLex instance.
 */
function test_create() {
$directives = array('Core.MaintainLineNumbers' => true);
$created = HTMLPurifier_Lexer::create(
HTMLPurifier_Config::create($directives)
);
$this->assertIsA($created, 'HTMLPurifier_Lexer_DirectLex');
}
function assertExtractBody($text, $extract = true) {
$result = $this->Lexer->extractBody($text);
if ($extract === true) $extract = $text;