mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2024-12-22 16:31:53 +00:00
[1.7.0] Implement line number counting in DirectLex, in preparation for error reporting
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1155 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
parent
70bcccf54c
commit
4bf15de536
1
NEWS
1
NEWS
@ -53,6 +53,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
||||
. Lexer is now pre-emptively included, with a conditional include for the
|
||||
PHP5 only version.
|
||||
. HTMLDefinition and CSSDefinition have a common parent class: Definition.
|
||||
. DirectLex can now track line-numbers
|
||||
|
||||
1.6.1, released 2007-05-05
|
||||
! Support for more deprecated attributes via transformations:
|
||||
|
@ -47,9 +47,24 @@ HTMLPurifier_ConfigSchema::define(
|
||||
to use it.
|
||||
</dd>
|
||||
</dl>
|
||||
<p>
|
||||
This directive has been available since 1.7.0.
|
||||
</p>
|
||||
'
|
||||
);
|
||||
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'Core', 'MaintainLineNumbers', false, 'bool', '
|
||||
<p>
|
||||
If true, HTML Purifier will add line number information to all tokens.
|
||||
This is useful when error reporting is turned on, but can result in
|
||||
significant performance degradation and should not be used when
|
||||
unnecessary. This directive must be used with the DirectLex lexer,
|
||||
as the DOMLex lexer does not (yet) support this functionality. This directive
|
||||
has been available since 1.7.0.
|
||||
</p>
|
||||
');
|
||||
|
||||
/**
|
||||
* Forgivingly lexes HTML (SGML-style) markup into tokens.
|
||||
*
|
||||
@ -135,7 +150,14 @@ class HTMLPurifier_Lexer
|
||||
}
|
||||
|
||||
if (is_null($lexer)) { do {
|
||||
// auto-detectection algorithm
|
||||
// auto-detection algorithm
|
||||
|
||||
// once PHP DOM implements native line numbers, or we
|
||||
// hack out something using XSLT, remove this stipulation
|
||||
if ($config->get('Core', 'MaintainLineNumbers')) {
|
||||
$lexer = 'DirectLex';
|
||||
break;
|
||||
}
|
||||
|
||||
if (version_compare(PHP_VERSION, "5", ">=") && // check for PHP5
|
||||
class_exists('DOMDocument')) { // check for DOM support
|
||||
|
@ -2,6 +2,20 @@
|
||||
|
||||
require_once 'HTMLPurifier/Lexer.php';
|
||||
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'Core', 'DirectLexLineNumberSyncInterval', 0, 'int', '
|
||||
<p>
|
||||
Specifies the number of tokens the DirectLex line number tracking
|
||||
implementations should process before attempting to resyncronize the
|
||||
current line count by manually counting all previous new-lines. When
|
||||
at 0, this functionality is disabled. Lower values will decrease
|
||||
performance, and this is only strictly necessary if the counting
|
||||
algorithm is buggy (in which case you should report it as a bug).
|
||||
This has no effect when %Core.MaintainLineNumbers is disabled or DirectLex is
|
||||
not being used. This directive has been available since 1.7.0.
|
||||
</p>
|
||||
');
|
||||
|
||||
/**
|
||||
* Our in-house implementation of a parser.
|
||||
*
|
||||
@ -32,9 +46,17 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
$inside_tag = false; // whether or not we're parsing the inside of a tag
|
||||
$array = array(); // result array
|
||||
|
||||
$maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');
|
||||
$current_line = 1;
|
||||
$nl = PHP_EOL;
|
||||
// how often to manually recalculate. This will ALWAYS be right,
|
||||
// but it's pretty wasteful. Set to 0 to turn off
|
||||
$synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval');
|
||||
|
||||
// infinite loop protection
|
||||
// has to be pretty big, since html docs can be big
|
||||
// we allow two hundred thousand tags... more than enough?
|
||||
// NOTE: this is also used for synchronization, so watch out
|
||||
$loops = 0;
|
||||
|
||||
while(true) {
|
||||
@ -42,10 +64,21 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
// infinite loop protection
|
||||
if (++$loops > 200000) return array();
|
||||
|
||||
// recalculate lines
|
||||
if (
|
||||
$maintain_line_numbers && // line number tracking is on
|
||||
$synchronize_interval && // synchronization is on
|
||||
$cursor > 0 && // cursor is further than zero
|
||||
$loops % $synchronize_interval === 0 // time to synchronize!
|
||||
) {
|
||||
$current_line = 1 + substr_count($html, $nl, 0, $cursor);
|
||||
}
|
||||
|
||||
$position_next_lt = strpos($html, '<', $cursor);
|
||||
$position_next_gt = strpos($html, '>', $cursor);
|
||||
|
||||
// triggers on "<b>asdf</b>" but not "asdf <b></b>"
|
||||
// special case to set up context
|
||||
if ($position_next_lt === $cursor) {
|
||||
$inside_tag = true;
|
||||
$cursor++;
|
||||
@ -53,7 +86,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
|
||||
if (!$inside_tag && $position_next_lt !== false) {
|
||||
// We are not inside tag and there still is another tag to parse
|
||||
$array[] = new
|
||||
$token = new
|
||||
HTMLPurifier_Token_Text(
|
||||
$this->parseData(
|
||||
substr(
|
||||
@ -61,6 +94,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
)
|
||||
)
|
||||
);
|
||||
if ($maintain_line_numbers) {
|
||||
$token->line = $current_line;
|
||||
$current_line += substr_count($html, $nl, $cursor, $position_next_lt - $cursor);
|
||||
}
|
||||
$array[] = $token;
|
||||
$cursor = $position_next_lt + 1;
|
||||
$inside_tag = true;
|
||||
continue;
|
||||
@ -69,7 +107,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
// If we're already at the end, break
|
||||
if ($cursor === strlen($html)) break;
|
||||
// Create Text of rest of string
|
||||
$array[] = new
|
||||
$token = new
|
||||
HTMLPurifier_Token_Text(
|
||||
$this->parseData(
|
||||
substr(
|
||||
@ -77,6 +115,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
)
|
||||
)
|
||||
);
|
||||
if ($maintain_line_numbers) $token->line = $current_line;
|
||||
$array[] = $token;
|
||||
break;
|
||||
} elseif ($inside_tag && $position_next_gt !== false) {
|
||||
// We are in tag and it is well formed
|
||||
@ -89,12 +129,17 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
substr($segment, 0, 3) == '!--' &&
|
||||
substr($segment, $strlen_segment-2, 2) == '--'
|
||||
) {
|
||||
$array[] = new
|
||||
$token = new
|
||||
HTMLPurifier_Token_Comment(
|
||||
substr(
|
||||
$segment, 3, $strlen_segment - 5
|
||||
)
|
||||
);
|
||||
if ($maintain_line_numbers) {
|
||||
$token->line = $current_line;
|
||||
$current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
|
||||
}
|
||||
$array[] = $token;
|
||||
$inside_tag = false;
|
||||
$cursor = $position_next_gt + 1;
|
||||
continue;
|
||||
@ -104,7 +149,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
$is_end_tag = (strpos($segment,'/') === 0);
|
||||
if ($is_end_tag) {
|
||||
$type = substr($segment, 1);
|
||||
$array[] = new HTMLPurifier_Token_End($type);
|
||||
$token = new HTMLPurifier_Token_End($type);
|
||||
if ($maintain_line_numbers) {
|
||||
$token->line = $current_line;
|
||||
$current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
|
||||
}
|
||||
$array[] = $token;
|
||||
$inside_tag = false;
|
||||
$cursor = $position_next_gt + 1;
|
||||
continue;
|
||||
@ -114,7 +164,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
// have accidentally grabbed an emoticon. Translate into
|
||||
// text and go our merry way
|
||||
if (!ctype_alnum($segment[0])) {
|
||||
$array[] = new
|
||||
$token = new
|
||||
HTMLPurifier_Token_Text(
|
||||
'<' .
|
||||
$this->parseData(
|
||||
@ -122,6 +172,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
) .
|
||||
'>'
|
||||
);
|
||||
if ($maintain_line_numbers) {
|
||||
$token->line = $current_line;
|
||||
$current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
|
||||
}
|
||||
$array[] = $token;
|
||||
$cursor = $position_next_gt + 1;
|
||||
$inside_tag = false;
|
||||
continue;
|
||||
@ -142,10 +197,15 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
|
||||
if ($position_first_space >= $strlen_segment) {
|
||||
if ($is_self_closing) {
|
||||
$array[] = new HTMLPurifier_Token_Empty($segment);
|
||||
$token = new HTMLPurifier_Token_Empty($segment);
|
||||
} else {
|
||||
$array[] = new HTMLPurifier_Token_Start($segment);
|
||||
$token = new HTMLPurifier_Token_Start($segment);
|
||||
}
|
||||
if ($maintain_line_numbers) {
|
||||
$token->line = $current_line;
|
||||
$current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
|
||||
}
|
||||
$array[] = $token;
|
||||
$inside_tag = false;
|
||||
$cursor = $position_next_gt + 1;
|
||||
continue;
|
||||
@ -169,21 +229,29 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
}
|
||||
|
||||
if ($is_self_closing) {
|
||||
$array[] = new HTMLPurifier_Token_Empty($type, $attr);
|
||||
$token = new HTMLPurifier_Token_Empty($type, $attr);
|
||||
} else {
|
||||
$array[] = new HTMLPurifier_Token_Start($type, $attr);
|
||||
$token = new HTMLPurifier_Token_Start($type, $attr);
|
||||
}
|
||||
if ($maintain_line_numbers) {
|
||||
$token->line = $current_line;
|
||||
$current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
|
||||
}
|
||||
$array[] = $token;
|
||||
$cursor = $position_next_gt + 1;
|
||||
$inside_tag = false;
|
||||
continue;
|
||||
} else {
|
||||
$array[] = new
|
||||
$token = new
|
||||
HTMLPurifier_Token_Text(
|
||||
'<' .
|
||||
$this->parseData(
|
||||
substr($html, $cursor)
|
||||
)
|
||||
);
|
||||
if ($maintain_line_numbers) $token->line = $current_line;
|
||||
// no cursor scroll? Hmm...
|
||||
$array[] = $token;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
@ -11,6 +11,7 @@
|
||||
*/
|
||||
class HTMLPurifier_Token {
|
||||
var $type; /**< Type of node to bypass <tt>is_a()</tt>. @public */
|
||||
var $line; /**< Line number node was on in source document. Null if unknown. @public */
|
||||
|
||||
/**
|
||||
* Copies the tag into a new one (clone substitute).
|
||||
|
@ -64,6 +64,65 @@ class HTMLPurifier_Lexer_DirectLexTest extends UnitTestCase
|
||||
|
||||
}
|
||||
|
||||
function testLineNumbers() {
|
||||
|
||||
$html = '<b>Line 1</b>
|
||||
<i>Line 2</i>
|
||||
Still Line 2<br
|
||||
/>Now Line 4
|
||||
|
||||
<br />';
|
||||
|
||||
$expect = array(
|
||||
// line 1
|
||||
0 => new HTMLPurifier_Token_Start('b')
|
||||
,1 => new HTMLPurifier_Token_Text('Line 1')
|
||||
,2 => new HTMLPurifier_Token_End('b')
|
||||
,3 => new HTMLPurifier_Token_Text('
|
||||
')
|
||||
// line 2
|
||||
,4 => new HTMLPurifier_Token_Start('i')
|
||||
,5 => new HTMLPurifier_Token_Text('Line 2')
|
||||
,6 => new HTMLPurifier_Token_End('i')
|
||||
,7 => new HTMLPurifier_Token_Text('
|
||||
Still Line 2')
|
||||
// line 3
|
||||
,8 => new HTMLPurifier_Token_Empty('br')
|
||||
// line 4
|
||||
,9 => new HTMLPurifier_Token_Text('Now Line 4
|
||||
|
||||
')
|
||||
// line SIX
|
||||
,10 => new HTMLPurifier_Token_Empty('br')
|
||||
);
|
||||
|
||||
$context = new HTMLPurifier_Context();
|
||||
$config = HTMLPurifier_Config::createDefault();
|
||||
$output = $this->DirectLex->tokenizeHTML($html, $config, $context);
|
||||
|
||||
$this->assertIdentical($output, $expect);
|
||||
|
||||
$context = new HTMLPurifier_Context();
|
||||
$config = HTMLPurifier_Config::create(array(
|
||||
'Core.MaintainLineNumbers' => true
|
||||
));
|
||||
$expect[0]->line = 1;
|
||||
$expect[1]->line = 1;
|
||||
$expect[2]->line = 1;
|
||||
$expect[3]->line = 1;
|
||||
$expect[4]->line = 2;
|
||||
$expect[5]->line = 2;
|
||||
$expect[6]->line = 2;
|
||||
$expect[7]->line = 2;
|
||||
$expect[8]->line = 3;
|
||||
$expect[9]->line = 4;
|
||||
$expect[10]->line = 6;
|
||||
|
||||
$output = $this->DirectLex->tokenizeHTML($html, $config, $context);
|
||||
$this->assertIdentical($output, $expect);
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
@ -35,6 +35,12 @@ class HTMLPurifier_LexerTest extends UnitTestCase
|
||||
|
||||
}
|
||||
|
||||
function test_create() {
|
||||
$config = HTMLPurifier_Config::create(array('Core.MaintainLineNumbers' => true));
|
||||
$lexer = HTMLPurifier_Lexer::create($config);
|
||||
$this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
|
||||
}
|
||||
|
||||
function assertExtractBody($text, $extract = true) {
|
||||
$result = $this->Lexer->extractBody($text);
|
||||
if ($extract === true) $extract = $text;
|
||||
|
Loading…
Reference in New Issue
Block a user