mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-01-10 16:01:53 +00:00
[1.7.0] Implement line number counting in DirectLex, in preparation for error reporting
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1155 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
parent
70bcccf54c
commit
4bf15de536
1
NEWS
1
NEWS
@ -53,6 +53,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
|||||||
. Lexer is now pre-emptively included, with a conditional include for the
|
. Lexer is now pre-emptively included, with a conditional include for the
|
||||||
PHP5 only version.
|
PHP5 only version.
|
||||||
. HTMLDefinition and CSSDefinition have a common parent class: Definition.
|
. HTMLDefinition and CSSDefinition have a common parent class: Definition.
|
||||||
|
. DirectLex can now track line-numbers
|
||||||
|
|
||||||
1.6.1, released 2007-05-05
|
1.6.1, released 2007-05-05
|
||||||
! Support for more deprecated attributes via transformations:
|
! Support for more deprecated attributes via transformations:
|
||||||
|
@ -47,9 +47,24 @@ HTMLPurifier_ConfigSchema::define(
|
|||||||
to use it.
|
to use it.
|
||||||
</dd>
|
</dd>
|
||||||
</dl>
|
</dl>
|
||||||
|
<p>
|
||||||
|
This directive has been available since 1.7.0.
|
||||||
|
</p>
|
||||||
'
|
'
|
||||||
);
|
);
|
||||||
|
|
||||||
|
HTMLPurifier_ConfigSchema::define(
|
||||||
|
'Core', 'MaintainLineNumbers', false, 'bool', '
|
||||||
|
<p>
|
||||||
|
If true, HTML Purifier will add line number information to all tokens.
|
||||||
|
This is useful when error reporting is turned on, but can result in
|
||||||
|
significant performance degradation and should not be used when
|
||||||
|
unnecessary. This directive must be used with the DirectLex lexer,
|
||||||
|
as the DOMLex lexer does not (yet) support this functionality. This directive
|
||||||
|
has been available since 1.7.0.
|
||||||
|
</p>
|
||||||
|
');
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Forgivingly lexes HTML (SGML-style) markup into tokens.
|
* Forgivingly lexes HTML (SGML-style) markup into tokens.
|
||||||
*
|
*
|
||||||
@ -135,7 +150,14 @@ class HTMLPurifier_Lexer
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (is_null($lexer)) { do {
|
if (is_null($lexer)) { do {
|
||||||
// auto-detectection algorithm
|
// auto-detection algorithm
|
||||||
|
|
||||||
|
// once PHP DOM implements native line numbers, or we
|
||||||
|
// hack out something using XSLT, remove this stipulation
|
||||||
|
if ($config->get('Core', 'MaintainLineNumbers')) {
|
||||||
|
$lexer = 'DirectLex';
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
if (version_compare(PHP_VERSION, "5", ">=") && // check for PHP5
|
if (version_compare(PHP_VERSION, "5", ">=") && // check for PHP5
|
||||||
class_exists('DOMDocument')) { // check for DOM support
|
class_exists('DOMDocument')) { // check for DOM support
|
||||||
|
@ -2,6 +2,20 @@
|
|||||||
|
|
||||||
require_once 'HTMLPurifier/Lexer.php';
|
require_once 'HTMLPurifier/Lexer.php';
|
||||||
|
|
||||||
|
HTMLPurifier_ConfigSchema::define(
|
||||||
|
'Core', 'DirectLexLineNumberSyncInterval', 0, 'int', '
|
||||||
|
<p>
|
||||||
|
Specifies the number of tokens the DirectLex line number tracking
|
||||||
|
implementation should process before attempting to resynchronize the
|
||||||
|
current line count by manually counting all previous new-lines. When
|
||||||
|
at 0, this functionality is disabled. Lower values will decrease
|
||||||
|
performance, and this is only strictly necessary if the counting
|
||||||
|
algorithm is buggy (in which case you should report it as a bug).
|
||||||
|
This has no effect when %Core.MaintainLineNumbers is disabled or DirectLex is
|
||||||
|
not being used. This directive has been available since 1.7.0.
|
||||||
|
</p>
|
||||||
|
');
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Our in-house implementation of a parser.
|
* Our in-house implementation of a parser.
|
||||||
*
|
*
|
||||||
@ -32,9 +46,17 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
$inside_tag = false; // whether or not we're parsing the inside of a tag
|
$inside_tag = false; // whether or not we're parsing the inside of a tag
|
||||||
$array = array(); // result array
|
$array = array(); // result array
|
||||||
|
|
||||||
|
$maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');
|
||||||
|
$current_line = 1;
|
||||||
|
$nl = PHP_EOL;
|
||||||
|
// how often to manually recalculate. This will ALWAYS be right,
|
||||||
|
// but it's pretty wasteful. Set to 0 to turn off
|
||||||
|
$synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval');
|
||||||
|
|
||||||
// infinite loop protection
|
// infinite loop protection
|
||||||
// has to be pretty big, since html docs can be big
|
// has to be pretty big, since html docs can be big
|
||||||
// we allow two hundred thousand tags... more than enough?
|
// we allow two hundred thousand tags... more than enough?
|
||||||
|
// NOTE: this is also used for synchronization, so watch out
|
||||||
$loops = 0;
|
$loops = 0;
|
||||||
|
|
||||||
while(true) {
|
while(true) {
|
||||||
@ -42,10 +64,21 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
// infinite loop protection
|
// infinite loop protection
|
||||||
if (++$loops > 200000) return array();
|
if (++$loops > 200000) return array();
|
||||||
|
|
||||||
|
// recalculate lines
|
||||||
|
if (
|
||||||
|
$maintain_line_numbers && // line number tracking is on
|
||||||
|
$synchronize_interval && // synchronization is on
|
||||||
|
$cursor > 0 && // cursor is further than zero
|
||||||
|
$loops % $synchronize_interval === 0 // time to synchronize!
|
||||||
|
) {
|
||||||
|
$current_line = 1 + substr_count($html, $nl, 0, $cursor);
|
||||||
|
}
|
||||||
|
|
||||||
$position_next_lt = strpos($html, '<', $cursor);
|
$position_next_lt = strpos($html, '<', $cursor);
|
||||||
$position_next_gt = strpos($html, '>', $cursor);
|
$position_next_gt = strpos($html, '>', $cursor);
|
||||||
|
|
||||||
// triggers on "<b>asdf</b>" but not "asdf <b></b>"
|
// triggers on "<b>asdf</b>" but not "asdf <b></b>"
|
||||||
|
// special case to set up context
|
||||||
if ($position_next_lt === $cursor) {
|
if ($position_next_lt === $cursor) {
|
||||||
$inside_tag = true;
|
$inside_tag = true;
|
||||||
$cursor++;
|
$cursor++;
|
||||||
@ -53,7 +86,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
|
|
||||||
if (!$inside_tag && $position_next_lt !== false) {
|
if (!$inside_tag && $position_next_lt !== false) {
|
||||||
// We are not inside tag and there still is another tag to parse
|
// We are not inside tag and there still is another tag to parse
|
||||||
$array[] = new
|
$token = new
|
||||||
HTMLPurifier_Token_Text(
|
HTMLPurifier_Token_Text(
|
||||||
$this->parseData(
|
$this->parseData(
|
||||||
substr(
|
substr(
|
||||||
@ -61,6 +94,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
if ($maintain_line_numbers) {
|
||||||
|
$token->line = $current_line;
|
||||||
|
$current_line += substr_count($html, $nl, $cursor, $position_next_lt - $cursor);
|
||||||
|
}
|
||||||
|
$array[] = $token;
|
||||||
$cursor = $position_next_lt + 1;
|
$cursor = $position_next_lt + 1;
|
||||||
$inside_tag = true;
|
$inside_tag = true;
|
||||||
continue;
|
continue;
|
||||||
@ -69,7 +107,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
// If we're already at the end, break
|
// If we're already at the end, break
|
||||||
if ($cursor === strlen($html)) break;
|
if ($cursor === strlen($html)) break;
|
||||||
// Create Text of rest of string
|
// Create Text of rest of string
|
||||||
$array[] = new
|
$token = new
|
||||||
HTMLPurifier_Token_Text(
|
HTMLPurifier_Token_Text(
|
||||||
$this->parseData(
|
$this->parseData(
|
||||||
substr(
|
substr(
|
||||||
@ -77,6 +115,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
if ($maintain_line_numbers) $token->line = $current_line;
|
||||||
|
$array[] = $token;
|
||||||
break;
|
break;
|
||||||
} elseif ($inside_tag && $position_next_gt !== false) {
|
} elseif ($inside_tag && $position_next_gt !== false) {
|
||||||
// We are in tag and it is well formed
|
// We are in tag and it is well formed
|
||||||
@ -89,12 +129,17 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
substr($segment, 0, 3) == '!--' &&
|
substr($segment, 0, 3) == '!--' &&
|
||||||
substr($segment, $strlen_segment-2, 2) == '--'
|
substr($segment, $strlen_segment-2, 2) == '--'
|
||||||
) {
|
) {
|
||||||
$array[] = new
|
$token = new
|
||||||
HTMLPurifier_Token_Comment(
|
HTMLPurifier_Token_Comment(
|
||||||
substr(
|
substr(
|
||||||
$segment, 3, $strlen_segment - 5
|
$segment, 3, $strlen_segment - 5
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
if ($maintain_line_numbers) {
|
||||||
|
$token->line = $current_line;
|
||||||
|
$current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
|
||||||
|
}
|
||||||
|
$array[] = $token;
|
||||||
$inside_tag = false;
|
$inside_tag = false;
|
||||||
$cursor = $position_next_gt + 1;
|
$cursor = $position_next_gt + 1;
|
||||||
continue;
|
continue;
|
||||||
@ -104,7 +149,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
$is_end_tag = (strpos($segment,'/') === 0);
|
$is_end_tag = (strpos($segment,'/') === 0);
|
||||||
if ($is_end_tag) {
|
if ($is_end_tag) {
|
||||||
$type = substr($segment, 1);
|
$type = substr($segment, 1);
|
||||||
$array[] = new HTMLPurifier_Token_End($type);
|
$token = new HTMLPurifier_Token_End($type);
|
||||||
|
if ($maintain_line_numbers) {
|
||||||
|
$token->line = $current_line;
|
||||||
|
$current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
|
||||||
|
}
|
||||||
|
$array[] = $token;
|
||||||
$inside_tag = false;
|
$inside_tag = false;
|
||||||
$cursor = $position_next_gt + 1;
|
$cursor = $position_next_gt + 1;
|
||||||
continue;
|
continue;
|
||||||
@ -114,7 +164,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
// have accidentally grabbed an emoticon. Translate into
|
// have accidentally grabbed an emoticon. Translate into
|
||||||
// text and go our merry way
|
// text and go our merry way
|
||||||
if (!ctype_alnum($segment[0])) {
|
if (!ctype_alnum($segment[0])) {
|
||||||
$array[] = new
|
$token = new
|
||||||
HTMLPurifier_Token_Text(
|
HTMLPurifier_Token_Text(
|
||||||
'<' .
|
'<' .
|
||||||
$this->parseData(
|
$this->parseData(
|
||||||
@ -122,6 +172,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
) .
|
) .
|
||||||
'>'
|
'>'
|
||||||
);
|
);
|
||||||
|
if ($maintain_line_numbers) {
|
||||||
|
$token->line = $current_line;
|
||||||
|
$current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
|
||||||
|
}
|
||||||
|
$array[] = $token;
|
||||||
$cursor = $position_next_gt + 1;
|
$cursor = $position_next_gt + 1;
|
||||||
$inside_tag = false;
|
$inside_tag = false;
|
||||||
continue;
|
continue;
|
||||||
@ -142,10 +197,15 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
|
|
||||||
if ($position_first_space >= $strlen_segment) {
|
if ($position_first_space >= $strlen_segment) {
|
||||||
if ($is_self_closing) {
|
if ($is_self_closing) {
|
||||||
$array[] = new HTMLPurifier_Token_Empty($segment);
|
$token = new HTMLPurifier_Token_Empty($segment);
|
||||||
} else {
|
} else {
|
||||||
$array[] = new HTMLPurifier_Token_Start($segment);
|
$token = new HTMLPurifier_Token_Start($segment);
|
||||||
}
|
}
|
||||||
|
if ($maintain_line_numbers) {
|
||||||
|
$token->line = $current_line;
|
||||||
|
$current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
|
||||||
|
}
|
||||||
|
$array[] = $token;
|
||||||
$inside_tag = false;
|
$inside_tag = false;
|
||||||
$cursor = $position_next_gt + 1;
|
$cursor = $position_next_gt + 1;
|
||||||
continue;
|
continue;
|
||||||
@ -169,21 +229,29 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
}
|
}
|
||||||
|
|
||||||
if ($is_self_closing) {
|
if ($is_self_closing) {
|
||||||
$array[] = new HTMLPurifier_Token_Empty($type, $attr);
|
$token = new HTMLPurifier_Token_Empty($type, $attr);
|
||||||
} else {
|
} else {
|
||||||
$array[] = new HTMLPurifier_Token_Start($type, $attr);
|
$token = new HTMLPurifier_Token_Start($type, $attr);
|
||||||
}
|
}
|
||||||
|
if ($maintain_line_numbers) {
|
||||||
|
$token->line = $current_line;
|
||||||
|
$current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
|
||||||
|
}
|
||||||
|
$array[] = $token;
|
||||||
$cursor = $position_next_gt + 1;
|
$cursor = $position_next_gt + 1;
|
||||||
$inside_tag = false;
|
$inside_tag = false;
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
$array[] = new
|
$token = new
|
||||||
HTMLPurifier_Token_Text(
|
HTMLPurifier_Token_Text(
|
||||||
'<' .
|
'<' .
|
||||||
$this->parseData(
|
$this->parseData(
|
||||||
substr($html, $cursor)
|
substr($html, $cursor)
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
if ($maintain_line_numbers) $token->line = $current_line;
|
||||||
|
// no cursor scroll? Hmm...
|
||||||
|
$array[] = $token;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -11,6 +11,7 @@
|
|||||||
*/
|
*/
|
||||||
class HTMLPurifier_Token {
|
class HTMLPurifier_Token {
|
||||||
var $type; /**< Type of node to bypass <tt>is_a()</tt>. @public */
|
var $type; /**< Type of node to bypass <tt>is_a()</tt>. @public */
|
||||||
|
var $line; /**< Line number node was on in source document. Null if unknown. @public */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Copies the tag into a new one (clone substitute).
|
* Copies the tag into a new one (clone substitute).
|
||||||
|
@ -64,6 +64,65 @@ class HTMLPurifier_Lexer_DirectLexTest extends UnitTestCase
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function testLineNumbers() {
|
||||||
|
|
||||||
|
$html = '<b>Line 1</b>
|
||||||
|
<i>Line 2</i>
|
||||||
|
Still Line 2<br
|
||||||
|
/>Now Line 4
|
||||||
|
|
||||||
|
<br />';
|
||||||
|
|
||||||
|
$expect = array(
|
||||||
|
// line 1
|
||||||
|
0 => new HTMLPurifier_Token_Start('b')
|
||||||
|
,1 => new HTMLPurifier_Token_Text('Line 1')
|
||||||
|
,2 => new HTMLPurifier_Token_End('b')
|
||||||
|
,3 => new HTMLPurifier_Token_Text('
|
||||||
|
')
|
||||||
|
// line 2
|
||||||
|
,4 => new HTMLPurifier_Token_Start('i')
|
||||||
|
,5 => new HTMLPurifier_Token_Text('Line 2')
|
||||||
|
,6 => new HTMLPurifier_Token_End('i')
|
||||||
|
,7 => new HTMLPurifier_Token_Text('
|
||||||
|
Still Line 2')
|
||||||
|
// line 3
|
||||||
|
,8 => new HTMLPurifier_Token_Empty('br')
|
||||||
|
// line 4
|
||||||
|
,9 => new HTMLPurifier_Token_Text('Now Line 4
|
||||||
|
|
||||||
|
')
|
||||||
|
// line SIX
|
||||||
|
,10 => new HTMLPurifier_Token_Empty('br')
|
||||||
|
);
|
||||||
|
|
||||||
|
$context = new HTMLPurifier_Context();
|
||||||
|
$config = HTMLPurifier_Config::createDefault();
|
||||||
|
$output = $this->DirectLex->tokenizeHTML($html, $config, $context);
|
||||||
|
|
||||||
|
$this->assertIdentical($output, $expect);
|
||||||
|
|
||||||
|
$context = new HTMLPurifier_Context();
|
||||||
|
$config = HTMLPurifier_Config::create(array(
|
||||||
|
'Core.MaintainLineNumbers' => true
|
||||||
|
));
|
||||||
|
$expect[0]->line = 1;
|
||||||
|
$expect[1]->line = 1;
|
||||||
|
$expect[2]->line = 1;
|
||||||
|
$expect[3]->line = 1;
|
||||||
|
$expect[4]->line = 2;
|
||||||
|
$expect[5]->line = 2;
|
||||||
|
$expect[6]->line = 2;
|
||||||
|
$expect[7]->line = 2;
|
||||||
|
$expect[8]->line = 3;
|
||||||
|
$expect[9]->line = 4;
|
||||||
|
$expect[10]->line = 6;
|
||||||
|
|
||||||
|
$output = $this->DirectLex->tokenizeHTML($html, $config, $context);
|
||||||
|
$this->assertIdentical($output, $expect);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -35,6 +35,12 @@ class HTMLPurifier_LexerTest extends UnitTestCase
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function test_create() {
|
||||||
|
$config = HTMLPurifier_Config::create(array('Core.MaintainLineNumbers' => true));
|
||||||
|
$lexer = HTMLPurifier_Lexer::create($config);
|
||||||
|
$this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
|
||||||
|
}
|
||||||
|
|
||||||
function assertExtractBody($text, $extract = true) {
|
function assertExtractBody($text, $extract = true) {
|
||||||
$result = $this->Lexer->extractBody($text);
|
$result = $this->Lexer->extractBody($text);
|
||||||
if ($extract === true) $extract = $text;
|
if ($extract === true) $extract = $text;
|
||||||
|
Loading…
Reference in New Issue
Block a user