mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2024-11-09 15:28:40 +00:00
Track column numbers in addition to line numbers.
Signed-off-by: Edward Z. Yang <edwardzyang@thewritingpot.com>
This commit is contained in:
parent
9977350143
commit
c6914dce51
1
NEWS
1
NEWS
@ -31,6 +31,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
|||||||
useless, but as a bonus, the test suite and handling of edge cases is also
|
useless, but as a bonus, the test suite and handling of edge cases is also
|
||||||
improved.
|
improved.
|
||||||
! Experimental implementation of forms for %HTML.Trusted
|
! Experimental implementation of forms for %HTML.Trusted
|
||||||
|
! Track column numbers when %Core.MaintainLineNumbers is on
|
||||||
- Fix two bugs in %URI.MakeAbsolute; one involving empty paths in base URLs,
|
- Fix two bugs in %URI.MakeAbsolute; one involving empty paths in base URLs,
|
||||||
the other involving an undefined $is_folder error.
|
the other involving an undefined $is_folder error.
|
||||||
- Throw error when %Core.Encoding is set to a spurious value. Previously,
|
- Throw error when %Core.Encoding is set to a spurious value. Previously,
|
||||||
|
@ -42,6 +42,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
$inside_tag = false; // whether or not we're parsing the inside of a tag
|
$inside_tag = false; // whether or not we're parsing the inside of a tag
|
||||||
$array = array(); // result array
|
$array = array(); // result array
|
||||||
|
|
||||||
|
// This setting also controls whether *column* numbers are maintained
|
||||||
$maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');
|
$maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');
|
||||||
|
|
||||||
if ($maintain_line_numbers === null) {
|
if ($maintain_line_numbers === null) {
|
||||||
@ -50,8 +51,15 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
$maintain_line_numbers = $config->get('Core', 'CollectErrors');
|
$maintain_line_numbers = $config->get('Core', 'CollectErrors');
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($maintain_line_numbers) $current_line = 1;
|
if ($maintain_line_numbers) {
|
||||||
else $current_line = false;
|
$current_line = 1;
|
||||||
|
$current_col = 0;
|
||||||
|
$length = strlen($html);
|
||||||
|
} else {
|
||||||
|
$current_line = false;
|
||||||
|
$current_col = false;
|
||||||
|
$length = false;
|
||||||
|
}
|
||||||
$context->register('CurrentLine', $current_line);
|
$context->register('CurrentLine', $current_line);
|
||||||
$nl = "\n";
|
$nl = "\n";
|
||||||
// how often to manually recalculate. This will ALWAYS be right,
|
// how often to manually recalculate. This will ALWAYS be right,
|
||||||
@ -68,14 +76,31 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
|
|
||||||
while(++$loops) {
|
while(++$loops) {
|
||||||
|
|
||||||
// recalculate lines
|
// $cursor is either at the start of a token, or inside of
|
||||||
if (
|
// a tag (i.e. there was a < immediately before it), as indicated
|
||||||
$maintain_line_numbers && // line number tracking is on
|
// by $inside_tag
|
||||||
$synchronize_interval && // synchronization is on
|
|
||||||
$cursor > 0 && // cursor is further than zero
|
if ($maintain_line_numbers) {
|
||||||
$loops % $synchronize_interval === 0 // time to synchronize!
|
|
||||||
) {
|
// $rcursor, however, is always at the start of a token.
|
||||||
$current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
|
$rcursor = $cursor - (int) $inside_tag;
|
||||||
|
|
||||||
|
// Column number is cheap, so we calculate it every round.
|
||||||
|
// We're interested in the *end* of the newline string, so
|
||||||
|
// we need to add strlen($nl) == 1 to $nl_pos before subtracting it
|
||||||
|
// from our "rcursor" position.
|
||||||
|
$nl_pos = strrpos($html, $nl, $rcursor - $length);
|
||||||
|
$current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);
|
||||||
|
|
||||||
|
// recalculate lines
|
||||||
|
if (
|
||||||
|
$synchronize_interval && // synchronization is on
|
||||||
|
$cursor > 0 && // cursor is further than zero
|
||||||
|
$loops % $synchronize_interval === 0 // time to synchronize!
|
||||||
|
) {
|
||||||
|
$current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$position_next_lt = strpos($html, '<', $cursor);
|
$position_next_lt = strpos($html, '<', $cursor);
|
||||||
@ -99,7 +124,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
)
|
)
|
||||||
);
|
);
|
||||||
if ($maintain_line_numbers) {
|
if ($maintain_line_numbers) {
|
||||||
$token->line = $current_line;
|
$token->rawPosition($current_line, $current_col);
|
||||||
$current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
|
$current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
|
||||||
}
|
}
|
||||||
$array[] = $token;
|
$array[] = $token;
|
||||||
@ -119,7 +144,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
if ($maintain_line_numbers) $token->line = $current_line;
|
if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
|
||||||
$array[] = $token;
|
$array[] = $token;
|
||||||
break;
|
break;
|
||||||
} elseif ($inside_tag && $position_next_gt !== false) {
|
} elseif ($inside_tag && $position_next_gt !== false) {
|
||||||
@ -167,7 +192,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
)
|
)
|
||||||
);
|
);
|
||||||
if ($maintain_line_numbers) {
|
if ($maintain_line_numbers) {
|
||||||
$token->line = $current_line;
|
$token->rawPosition($current_line, $current_col);
|
||||||
$current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
|
$current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
|
||||||
}
|
}
|
||||||
$array[] = $token;
|
$array[] = $token;
|
||||||
@ -182,7 +207,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
$type = substr($segment, 1);
|
$type = substr($segment, 1);
|
||||||
$token = new HTMLPurifier_Token_End($type);
|
$token = new HTMLPurifier_Token_End($type);
|
||||||
if ($maintain_line_numbers) {
|
if ($maintain_line_numbers) {
|
||||||
$token->line = $current_line;
|
$token->rawPosition($current_line, $current_col);
|
||||||
$current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
|
$current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
|
||||||
}
|
}
|
||||||
$array[] = $token;
|
$array[] = $token;
|
||||||
@ -199,7 +224,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
|
if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
|
||||||
$token = new HTMLPurifier_Token_Text('<');
|
$token = new HTMLPurifier_Token_Text('<');
|
||||||
if ($maintain_line_numbers) {
|
if ($maintain_line_numbers) {
|
||||||
$token->line = $current_line;
|
$token->rawPosition($current_line, $current_col);
|
||||||
$current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
|
$current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
|
||||||
}
|
}
|
||||||
$array[] = $token;
|
$array[] = $token;
|
||||||
@ -227,7 +252,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
$token = new HTMLPurifier_Token_Start($segment);
|
$token = new HTMLPurifier_Token_Start($segment);
|
||||||
}
|
}
|
||||||
if ($maintain_line_numbers) {
|
if ($maintain_line_numbers) {
|
||||||
$token->line = $current_line;
|
$token->rawPosition($current_line, $current_col);
|
||||||
$current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
|
$current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
|
||||||
}
|
}
|
||||||
$array[] = $token;
|
$array[] = $token;
|
||||||
@ -259,7 +284,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
$token = new HTMLPurifier_Token_Start($type, $attr);
|
$token = new HTMLPurifier_Token_Start($type, $attr);
|
||||||
}
|
}
|
||||||
if ($maintain_line_numbers) {
|
if ($maintain_line_numbers) {
|
||||||
$token->line = $current_line;
|
$token->rawPosition($current_line, $current_col);
|
||||||
$current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
|
$current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
|
||||||
}
|
}
|
||||||
$array[] = $token;
|
$array[] = $token;
|
||||||
@ -276,7 +301,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
substr($html, $cursor)
|
substr($html, $cursor)
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
if ($maintain_line_numbers) $token->line = $current_line;
|
if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
|
||||||
// no cursor scroll? Hmm...
|
// no cursor scroll? Hmm...
|
||||||
$array[] = $token;
|
$array[] = $token;
|
||||||
break;
|
break;
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
*/
|
*/
|
||||||
class HTMLPurifier_Token {
|
class HTMLPurifier_Token {
|
||||||
public $line; /**< Line number node was on in source document. Null if unknown. */
|
public $line; /**< Line number node was on in source document. Null if unknown. */
|
||||||
|
public $col; /**< Column of line node was on in source document. Null if unknown. */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Lookup array of processing that this token is exempt from.
|
* Lookup array of processing that this token is exempt from.
|
||||||
@ -17,13 +18,31 @@ class HTMLPurifier_Token {
|
|||||||
if ($n === 'type') {
|
if ($n === 'type') {
|
||||||
trigger_error('Deprecated type property called; use instanceof', E_USER_NOTICE);
|
trigger_error('Deprecated type property called; use instanceof', E_USER_NOTICE);
|
||||||
switch (get_class($this)) {
|
switch (get_class($this)) {
|
||||||
case 'HTMLPurifier_Token_Start': return 'start';
|
case 'HTMLPurifier_Token_Start': return 'start';
|
||||||
case 'HTMLPurifier_Token_Empty': return 'empty';
|
case 'HTMLPurifier_Token_Empty': return 'empty';
|
||||||
case 'HTMLPurifier_Token_End': return 'end';
|
case 'HTMLPurifier_Token_End': return 'end';
|
||||||
case 'HTMLPurifier_Token_Text': return 'text';
|
case 'HTMLPurifier_Token_Text': return 'text';
|
||||||
case 'HTMLPurifier_Token_Comment': return 'comment';
|
case 'HTMLPurifier_Token_Comment': return 'comment';
|
||||||
default: return null;
|
default: return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the position of the token in the source document.
|
||||||
|
*/
|
||||||
|
public function position($l = null, $c = null) {
|
||||||
|
$this->line = $l;
|
||||||
|
$this->col = $c;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convenience function for DirectLex setting line/col position.
|
||||||
|
*/
|
||||||
|
public function rawPosition($l, $c) {
|
||||||
|
if ($c === -1) $l++;
|
||||||
|
$this->line = $l;
|
||||||
|
$this->col = $c;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -11,9 +11,10 @@ class HTMLPurifier_Token_Comment extends HTMLPurifier_Token
|
|||||||
*
|
*
|
||||||
* @param $data String comment data.
|
* @param $data String comment data.
|
||||||
*/
|
*/
|
||||||
public function __construct($data, $line = null) {
|
public function __construct($data, $line = null, $col = null) {
|
||||||
$this->data = $data;
|
$this->data = $data;
|
||||||
$this->line = $line;
|
$this->line = $line;
|
||||||
|
$this->col = $col;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -33,7 +33,7 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token
|
|||||||
* @param $name String name.
|
* @param $name String name.
|
||||||
* @param $attr Associative array of attributes.
|
* @param $attr Associative array of attributes.
|
||||||
*/
|
*/
|
||||||
public function __construct($name, $attr = array(), $line = null) {
|
public function __construct($name, $attr = array(), $line = null, $col = null) {
|
||||||
$this->name = ctype_lower($name) ? $name : strtolower($name);
|
$this->name = ctype_lower($name) ? $name : strtolower($name);
|
||||||
foreach ($attr as $key => $value) {
|
foreach ($attr as $key => $value) {
|
||||||
// normalization only necessary when key is not lowercase
|
// normalization only necessary when key is not lowercase
|
||||||
@ -49,5 +49,6 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token
|
|||||||
}
|
}
|
||||||
$this->attr = $attr;
|
$this->attr = $attr;
|
||||||
$this->line = $line;
|
$this->line = $line;
|
||||||
|
$this->col = $col;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -21,10 +21,11 @@ class HTMLPurifier_Token_Text extends HTMLPurifier_Token
|
|||||||
*
|
*
|
||||||
* @param $data String parsed character data.
|
* @param $data String parsed character data.
|
||||||
*/
|
*/
|
||||||
public function __construct($data, $line = null) {
|
public function __construct($data, $line = null, $col = null) {
|
||||||
$this->data = $data;
|
$this->data = $data;
|
||||||
$this->is_whitespace = ctype_space($data);
|
$this->is_whitespace = ctype_space($data);
|
||||||
$this->line = $line;
|
$this->line = $line;
|
||||||
|
$this->col = $col;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -75,6 +75,8 @@ class HTMLPurifier_Lexer_DirectLexTest extends HTMLPurifier_Harness
|
|||||||
|
|
||||||
function testLineNumbers() {
|
function testLineNumbers() {
|
||||||
|
|
||||||
|
// . . . . . . . . . .
|
||||||
|
// 01234567890123 01234567890123 0123456789012345 0123456789012 012345
|
||||||
$html = "<b>Line 1</b>\n<i>Line 2</i>\nStill Line 2<br\n/>Now Line 4\n\n<br />";
|
$html = "<b>Line 1</b>\n<i>Line 2</i>\nStill Line 2<br\n/>Now Line 4\n\n<br />";
|
||||||
|
|
||||||
$expect = array(
|
$expect = array(
|
||||||
@ -106,17 +108,17 @@ class HTMLPurifier_Lexer_DirectLexTest extends HTMLPurifier_Harness
|
|||||||
$config = HTMLPurifier_Config::create(array(
|
$config = HTMLPurifier_Config::create(array(
|
||||||
'Core.MaintainLineNumbers' => true
|
'Core.MaintainLineNumbers' => true
|
||||||
));
|
));
|
||||||
$expect[0]->line = 1;
|
$expect[0]->position(1, 0);
|
||||||
$expect[1]->line = 1;
|
$expect[1]->position(1, 3);
|
||||||
$expect[2]->line = 1;
|
$expect[2]->position(1, 9);
|
||||||
$expect[3]->line = 1;
|
$expect[3]->position(2, -1);
|
||||||
$expect[4]->line = 2;
|
$expect[4]->position(2, 0);
|
||||||
$expect[5]->line = 2;
|
$expect[5]->position(2, 3);
|
||||||
$expect[6]->line = 2;
|
$expect[6]->position(2, 9);
|
||||||
$expect[7]->line = 2;
|
$expect[7]->position(3, -1);
|
||||||
$expect[8]->line = 3;
|
$expect[8]->position(3, 12);
|
||||||
$expect[9]->line = 4;
|
$expect[9]->position(4, 2);
|
||||||
$expect[10]->line = 6;
|
$expect[10]->position(6, 0);
|
||||||
|
|
||||||
$output = $this->DirectLex->tokenizeHTML($html, $config, $context);
|
$output = $this->DirectLex->tokenizeHTML($html, $config, $context);
|
||||||
$this->assertIdentical($output, $expect);
|
$this->assertIdentical($output, $expect);
|
||||||
|
@ -9,43 +9,44 @@ class HTMLPurifier_Strategy_MakeWellFormed_ErrorsTest extends HTMLPurifier_Strat
|
|||||||
|
|
||||||
function testUnnecessaryEndTagRemoved() {
|
function testUnnecessaryEndTagRemoved() {
|
||||||
$this->expectErrorCollection(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag removed');
|
$this->expectErrorCollection(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag removed');
|
||||||
$this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1));
|
$this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1, 0));
|
||||||
$this->invoke('</b>');
|
$this->invoke('</b>');
|
||||||
}
|
}
|
||||||
|
|
||||||
function testUnnecessaryEndTagToText() {
|
function testUnnecessaryEndTagToText() {
|
||||||
$this->config->set('Core', 'EscapeInvalidTags', true);
|
$this->config->set('Core', 'EscapeInvalidTags', true);
|
||||||
$this->expectErrorCollection(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag to text');
|
$this->expectErrorCollection(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag to text');
|
||||||
$this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1));
|
$this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1, 0));
|
||||||
$this->invoke('</b>');
|
$this->invoke('</b>');
|
||||||
}
|
}
|
||||||
|
|
||||||
function testTagAutoClosed() {
|
function testTagAutoClosed() {
|
||||||
$this->expectErrorCollection(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', new HTMLPurifier_Token_Start('b', array(), 1));
|
$this->expectErrorCollection(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', new HTMLPurifier_Token_Start('b', array(), 1, 0));
|
||||||
$this->expectContext('CurrentToken', new HTMLPurifier_Token_Start('div', array(), 1));
|
$this->expectContext('CurrentToken', new HTMLPurifier_Token_Start('div', array(), 1, 6));
|
||||||
$this->invoke('<b>Foo<div>Bar</div>');
|
$this->invoke('<b>Foo<div>Bar</div>');
|
||||||
}
|
}
|
||||||
|
|
||||||
function testStrayEndTagRemoved() {
|
function testStrayEndTagRemoved() {
|
||||||
$this->expectErrorCollection(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag removed');
|
$this->expectErrorCollection(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag removed');
|
||||||
$this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1));
|
$this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1, 3));
|
||||||
$this->invoke('<i></b></i>');
|
$this->invoke('<i></b></i>');
|
||||||
}
|
}
|
||||||
|
|
||||||
function testStrayEndTagToText() {
|
function testStrayEndTagToText() {
|
||||||
$this->config->set('Core', 'EscapeInvalidTags', true);
|
$this->config->set('Core', 'EscapeInvalidTags', true);
|
||||||
$this->expectErrorCollection(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag to text');
|
$this->expectErrorCollection(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag to text');
|
||||||
$this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1));
|
$this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1, 3));
|
||||||
$this->invoke('<i></b></i>');
|
$this->invoke('<i></b></i>');
|
||||||
}
|
}
|
||||||
|
|
||||||
function testTagClosedByElementEnd() {
|
function testTagClosedByElementEnd() {
|
||||||
$this->expectErrorCollection(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', new HTMLPurifier_Token_Start('b', array(), 1));
|
$this->expectErrorCollection(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', new HTMLPurifier_Token_Start('b', array(), 1, 3));
|
||||||
|
$this->expectContext('CurrentToken', new HTMLPurifier_Token_End('i', array(), 1, 12));
|
||||||
$this->invoke('<i><b>Foobar</i>');
|
$this->invoke('<i><b>Foobar</i>');
|
||||||
}
|
}
|
||||||
|
|
||||||
function testTagClosedByDocumentEnd() {
|
function testTagClosedByDocumentEnd() {
|
||||||
$this->expectErrorCollection(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', new HTMLPurifier_Token_Start('b', array(), 1));
|
$this->expectErrorCollection(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', new HTMLPurifier_Token_Start('b', array(), 1, 0));
|
||||||
$this->invoke('<b>Foobar');
|
$this->invoke('<b>Foobar');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user