From c6914dce51ee1a5b6c935c8b4a1a5c816d941bd9 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Mon, 1 Sep 2008 14:10:10 -0400 Subject: [PATCH] Track column numbers in addition to line numbers. Signed-off-by: Edward Z. Yang --- NEWS | 1 + library/HTMLPurifier/Lexer/DirectLex.php | 61 +++++++++++++------ library/HTMLPurifier/Token.php | 29 +++++++-- library/HTMLPurifier/Token/Comment.php | 3 +- library/HTMLPurifier/Token/Tag.php | 3 +- library/HTMLPurifier/Token/Text.php | 3 +- tests/HTMLPurifier/Lexer/DirectLexTest.php | 24 ++++---- .../Strategy/MakeWellFormed_ErrorsTest.php | 17 +++--- 8 files changed, 96 insertions(+), 45 deletions(-) diff --git a/NEWS b/NEWS index 95b61773..91891baf 100644 --- a/NEWS +++ b/NEWS @@ -31,6 +31,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier useless, but as a bonus, the test suite and handling of edge cases is also improved. ! Experimental implementation of forms for %HTML.Trusted +! Track column numbers when maintain line numbers is on - Fix two bugs in %URI.MakeAbsolute; one involving empty paths in base URLs, the other involving an undefined $is_folder error. - Throw error when %Core.Encoding is set to a spurious value. Previously, diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php index 3942c442..913457bd 100644 --- a/library/HTMLPurifier/Lexer/DirectLex.php +++ b/library/HTMLPurifier/Lexer/DirectLex.php @@ -42,6 +42,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $inside_tag = false; // whether or not we're parsing the inside of a tag $array = array(); // result array + // This is also treated to mean maintain *column* numbers too $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers'); if ($maintain_line_numbers === null) { @@ -50,8 +51,15 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $maintain_line_numbers = $config->get('Core', 'CollectErrors'); } - if ($maintain_line_numbers) $current_line = 1; - else $current_line = false; + if ($maintain_line_numbers) { + $current_line = 1; + $current_col = 0; + $length = strlen($html); + } else { + $current_line = false; + $current_col = false; + $length = false; + } $context->register('CurrentLine', $current_line); $nl = "\n"; // how often to manually recalculate. This will ALWAYS be right, @@ -68,14 +76,31 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer while(++$loops) { - // recalculate lines - if ( - $maintain_line_numbers && // line number tracking is on - $synchronize_interval && // synchronization is on - $cursor > 0 && // cursor is further than zero - $loops % $synchronize_interval === 0 // time to synchronize! - ) { - $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor); + // $cursor is either at the start of a token, or inside of + // a tag (i.e. there was a < immediately before it), as indicated + // by $inside_tag + + if ($maintain_line_numbers) { + + // $rcursor, however, is always at the start of a token. + $rcursor = $cursor - (int) $inside_tag; + + // Column number is cheap, so we calculate it every round. + // We're interested at the *end* of the newline string, so + // we need to add strlen($nl) == 1 to $nl_pos before subtracting it + // from our "rcursor" position. + $nl_pos = strrpos($html, $nl, $rcursor - $length); + $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1); + + // recalculate lines + if ( + $synchronize_interval && // synchronization is on + $cursor > 0 && // cursor is further than zero + $loops % $synchronize_interval === 0 // time to synchronize! + ) { + $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor); + } + } $position_next_lt = strpos($html, '<', $cursor); @@ -99,7 +124,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer ) ); if ($maintain_line_numbers) { - $token->line = $current_line; + $token->rawPosition($current_line, $current_col); $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor); } $array[] = $token; @@ -119,7 +144,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer ) ) ); - if ($maintain_line_numbers) $token->line = $current_line; + if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col); $array[] = $token; break; } elseif ($inside_tag && $position_next_gt !== false) { @@ -167,7 +192,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer ) ); if ($maintain_line_numbers) { - $token->line = $current_line; + $token->rawPosition($current_line, $current_col); $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment); } $array[] = $token; @@ -182,7 +207,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $type = substr($segment, 1); $token = new HTMLPurifier_Token_End($type); if ($maintain_line_numbers) { - $token->line = $current_line; + $token->rawPosition($current_line, $current_col); $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor); } $array[] = $token; @@ -199,7 +224,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt'); $token = new HTMLPurifier_Token_Text('<'); if ($maintain_line_numbers) { - $token->line = $current_line; + $token->rawPosition($current_line, $current_col); $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor); } $array[] = $token; @@ -227,7 +252,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $token = new HTMLPurifier_Token_Start($segment); } if ($maintain_line_numbers) { - $token->line = $current_line; + $token->rawPosition($current_line, $current_col); $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor); } $array[] = $token; @@ -259,7 +284,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $token = new HTMLPurifier_Token_Start($type, $attr); } if ($maintain_line_numbers) { - $token->line = $current_line; + $token->rawPosition($current_line, $current_col); $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor); } $array[] = $token; @@ -276,7 +301,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer substr($html, $cursor) ) ); - if ($maintain_line_numbers) $token->line = $current_line; + if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col); // no cursor scroll? Hmm... $array[] = $token; break; diff --git a/library/HTMLPurifier/Token.php b/library/HTMLPurifier/Token.php index 8803307b..12481026 100644 --- a/library/HTMLPurifier/Token.php +++ b/library/HTMLPurifier/Token.php @@ -5,6 +5,7 @@ */ class HTMLPurifier_Token { public $line; /**< Line number node was on in source document. Null if unknown. */ + public $col; /**< Column of line node was on in source document. Null if unknown. */ /** * Lookup array of processing that this token is exempt from. @@ -17,13 +18,31 @@ class HTMLPurifier_Token { if ($n === 'type') { trigger_error('Deprecated type property called; use instanceof', E_USER_NOTICE); switch (get_class($this)) { - case 'HTMLPurifier_Token_Start': return 'start'; - case 'HTMLPurifier_Token_Empty': return 'empty'; - case 'HTMLPurifier_Token_End': return 'end'; - case 'HTMLPurifier_Token_Text': return 'text'; - case 'HTMLPurifier_Token_Comment': return 'comment'; + case 'HTMLPurifier_Token_Start': return 'start'; + case 'HTMLPurifier_Token_Empty': return 'empty'; + case 'HTMLPurifier_Token_End': return 'end'; + case 'HTMLPurifier_Token_Text': return 'text'; + case 'HTMLPurifier_Token_Comment': return 'comment'; default: return null; } } } + + /** + * Sets the position of the token in the source document. + */ + public function position($l = null, $c = null) { + $this->line = $l; + $this->col = $c; + } + + /** + * Convenience function for DirectLex settings line/col position. + */ + public function rawPosition($l, $c) { + if ($c === -1) $l++; + $this->line = $l; + $this->col = $c; + } + } diff --git a/library/HTMLPurifier/Token/Comment.php b/library/HTMLPurifier/Token/Comment.php index 1571a40d..67174780 100644 --- a/library/HTMLPurifier/Token/Comment.php +++ b/library/HTMLPurifier/Token/Comment.php @@ -11,9 +11,10 @@ class HTMLPurifier_Token_Comment extends HTMLPurifier_Token * * @param $data String comment data. */ - public function __construct($data, $line = null) { + public function __construct($data, $line = null, $col = null) { $this->data = $data; $this->line = $line; + $this->col = $col; } } diff --git a/library/HTMLPurifier/Token/Tag.php b/library/HTMLPurifier/Token/Tag.php index 43748f70..795c40f6 100644 --- a/library/HTMLPurifier/Token/Tag.php +++ b/library/HTMLPurifier/Token/Tag.php @@ -33,7 +33,7 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token * @param $name String name. * @param $attr Associative array of attributes. */ - public function __construct($name, $attr = array(), $line = null) { + public function __construct($name, $attr = array(), $line = null, $col = null) { $this->name = ctype_lower($name) ? $name : strtolower($name); foreach ($attr as $key => $value) { // normalization only necessary when key is not lowercase @@ -49,5 +49,6 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token } $this->attr = $attr; $this->line = $line; + $this->col = $col; } } diff --git a/library/HTMLPurifier/Token/Text.php b/library/HTMLPurifier/Token/Text.php index 3942f8a0..02b53e5c 100644 --- a/library/HTMLPurifier/Token/Text.php +++ b/library/HTMLPurifier/Token/Text.php @@ -21,10 +21,11 @@ class HTMLPurifier_Token_Text extends HTMLPurifier_Token * * @param $data String parsed character data. */ - public function __construct($data, $line = null) { + public function __construct($data, $line = null, $col = null) { $this->data = $data; $this->is_whitespace = ctype_space($data); $this->line = $line; + $this->col = $col; } } diff --git a/tests/HTMLPurifier/Lexer/DirectLexTest.php b/tests/HTMLPurifier/Lexer/DirectLexTest.php index d39438d8..a9fd49e5 100644 --- a/tests/HTMLPurifier/Lexer/DirectLexTest.php +++ b/tests/HTMLPurifier/Lexer/DirectLexTest.php @@ -75,6 +75,8 @@ class HTMLPurifier_Lexer_DirectLexTest extends HTMLPurifier_Harness function testLineNumbers() { + // . . . . . . . . . . + // 01234567890123 01234567890123 0123456789012345 0123456789012 012345 $html = "Line 1\nLine 2\nStill Line 2Now Line 4\n\n
"; $expect = array( @@ -106,17 +108,17 @@ class HTMLPurifier_Lexer_DirectLexTest extends HTMLPurifier_Harness $config = HTMLPurifier_Config::create(array( 'Core.MaintainLineNumbers' => true )); - $expect[0]->line = 1; - $expect[1]->line = 1; - $expect[2]->line = 1; - $expect[3]->line = 1; - $expect[4]->line = 2; - $expect[5]->line = 2; - $expect[6]->line = 2; - $expect[7]->line = 2; - $expect[8]->line = 3; - $expect[9]->line = 4; - $expect[10]->line = 6; + $expect[0]->position(1, 0); + $expect[1]->position(1, 3); + $expect[2]->position(1, 9); + $expect[3]->position(2, -1); + $expect[4]->position(2, 0); + $expect[5]->position(2, 3); + $expect[6]->position(2, 9); + $expect[7]->position(3, -1); + $expect[8]->position(3, 12); + $expect[9]->position(4, 2); + $expect[10]->position(6, 0); $output = $this->DirectLex->tokenizeHTML($html, $config, $context); $this->assertIdentical($output, $expect); diff --git a/tests/HTMLPurifier/Strategy/MakeWellFormed_ErrorsTest.php b/tests/HTMLPurifier/Strategy/MakeWellFormed_ErrorsTest.php index 08919dda..2f38c996 100644 --- a/tests/HTMLPurifier/Strategy/MakeWellFormed_ErrorsTest.php +++ b/tests/HTMLPurifier/Strategy/MakeWellFormed_ErrorsTest.php @@ -9,43 +9,44 @@ class HTMLPurifier_Strategy_MakeWellFormed_ErrorsTest extends HTMLPurifier_Strat function testUnnecessaryEndTagRemoved() { $this->expectErrorCollection(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag removed'); - $this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1)); + $this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1, 0)); $this->invoke(''); } function testUnnecessaryEndTagToText() { $this->config->set('Core', 'EscapeInvalidTags', true); $this->expectErrorCollection(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag to text'); - $this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1)); + $this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1, 0)); $this->invoke(''); } function testTagAutoClosed() { - $this->expectErrorCollection(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', new HTMLPurifier_Token_Start('b', array(), 1)); - $this->expectContext('CurrentToken', new HTMLPurifier_Token_Start('div', array(), 1)); + $this->expectErrorCollection(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', new HTMLPurifier_Token_Start('b', array(), 1, 0)); + $this->expectContext('CurrentToken', new HTMLPurifier_Token_Start('div', array(), 1, 6)); $this->invoke('Foo
Bar
'); } function testStrayEndTagRemoved() { $this->expectErrorCollection(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag removed'); - $this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1)); + $this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1, 3)); $this->invoke('
'); } function testStrayEndTagToText() { $this->config->set('Core', 'EscapeInvalidTags', true); $this->expectErrorCollection(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag to text'); - $this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1)); + $this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1, 3)); $this->invoke(''); } function testTagClosedByElementEnd() { - $this->expectErrorCollection(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', new HTMLPurifier_Token_Start('b', array(), 1)); + $this->expectErrorCollection(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', new HTMLPurifier_Token_Start('b', array(), 1, 3)); + $this->expectContext('CurrentToken', new HTMLPurifier_Token_End('i', array(), 1, 12)); $this->invoke('Foobar'); } function testTagClosedByDocumentEnd() { - $this->expectErrorCollection(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', new HTMLPurifier_Token_Start('b', array(), 1)); + $this->expectErrorCollection(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', new HTMLPurifier_Token_Start('b', array(), 1, 0)); $this->invoke('Foobar'); }