From c6914dce51ee1a5b6c935c8b4a1a5c816d941bd9 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <edwardzyang@thewritingpot.com>
Date: Mon, 1 Sep 2008 14:10:10 -0400
Subject: [PATCH] Track column numbers in addition to line numbers.

Signed-off-by: Edward Z. Yang <edwardzyang@thewritingpot.com>
---
 NEWS                                          |  1 +
 library/HTMLPurifier/Lexer/DirectLex.php      | 61 +++++++++++++------
 library/HTMLPurifier/Token.php                | 29 +++++++--
 library/HTMLPurifier/Token/Comment.php        |  3 +-
 library/HTMLPurifier/Token/Tag.php            |  3 +-
 library/HTMLPurifier/Token/Text.php           |  3 +-
 tests/HTMLPurifier/Lexer/DirectLexTest.php    | 24 ++++----
 .../Strategy/MakeWellFormed_ErrorsTest.php    | 17 +++---
 8 files changed, 96 insertions(+), 45 deletions(-)

diff --git a/NEWS b/NEWS
index 95b61773..91891baf 100644
--- a/NEWS
+++ b/NEWS
@@ -31,6 +31,7 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
   useless, but as a bonus, the test suite and handling of edge cases is also
   improved.
 ! Experimental implementation of forms for %HTML.Trusted
+! Track column numbers when %Core.MaintainLineNumbers is on
 - Fix two bugs in %URI.MakeAbsolute; one involving empty paths in base URLs,
   the other involving an undefined $is_folder error.
 - Throw error when %Core.Encoding is set to a spurious value. Previously,
diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php
index 3942c442..913457bd 100644
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@@ -42,6 +42,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
         $inside_tag = false; // whether or not we're parsing the inside of a tag
         $array = array(); // result array
         
+        // This setting also enables tracking of *column* numbers
         $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');
         
         if ($maintain_line_numbers === null) {
@@ -50,8 +51,15 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
             $maintain_line_numbers = $config->get('Core', 'CollectErrors');
         }
         
-        if ($maintain_line_numbers) $current_line = 1;
-        else $current_line = false;
+        if ($maintain_line_numbers) {
+            $current_line = 1;
+            $current_col  = 0; 
+            $length = strlen($html);
+        } else {
+            $current_line = false;
+            $current_col  = false;
+            $length = false;
+        }
         $context->register('CurrentLine', $current_line);
         $nl = "\n";
         // how often to manually recalculate. This will ALWAYS be right,
@@ -68,14 +76,31 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
         
         while(++$loops) {
             
-            // recalculate lines
-            if (
-                $maintain_line_numbers && // line number tracking is on
-                $synchronize_interval &&  // synchronization is on
-                $cursor > 0 &&            // cursor is further than zero
-                $loops % $synchronize_interval === 0 // time to synchronize!
-            ) {
-                $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
+            // $cursor is either at the start of a token, or inside of
+            // a tag (i.e. there was a < immediately before it), as indicated
+            // by $inside_tag
+            
+            if ($maintain_line_numbers) {
+                
+                // $rcursor, however, is always at the start of a token.
+                $rcursor = $cursor - (int) $inside_tag;
+                
+                // Column number is cheap, so we calculate it every round.
+                // We're interested in the *end* of the newline string, so
+                // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
+                // from our "rcursor" position.
+                $nl_pos = strrpos($html, $nl, $rcursor - $length);
+                $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);
+                
+                // recalculate lines
+                if (
+                    $synchronize_interval &&  // synchronization is on
+                    $cursor > 0 &&            // cursor is further than zero
+                    $loops % $synchronize_interval === 0 // time to synchronize!
+                ) {
+                    $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
+                }
+                
             }
             
             $position_next_lt = strpos($html, '<', $cursor);
@@ -99,7 +124,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                         )
                     );
                 if ($maintain_line_numbers) {
-                    $token->line = $current_line;
+                    $token->rawPosition($current_line, $current_col);
                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
                 }
                 $array[] = $token;
@@ -119,7 +144,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                             )
                         )
                     );
-                if ($maintain_line_numbers) $token->line = $current_line;
+                if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
                 $array[] = $token;
                 break;
             } elseif ($inside_tag && $position_next_gt !== false) {
@@ -167,7 +192,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                             )
                         );
                     if ($maintain_line_numbers) {
-                        $token->line = $current_line;
+                        $token->rawPosition($current_line, $current_col);
                         $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                     }
                     $array[] = $token;
@@ -182,7 +207,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                     $type = substr($segment, 1);
                     $token = new HTMLPurifier_Token_End($type);
                     if ($maintain_line_numbers) {
-                        $token->line = $current_line;
+                        $token->rawPosition($current_line, $current_col);
                         $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                     }
                     $array[] = $token;
@@ -199,7 +224,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                     if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                     $token = new HTMLPurifier_Token_Text('<');
                     if ($maintain_line_numbers) {
-                        $token->line = $current_line;
+                        $token->rawPosition($current_line, $current_col);
                         $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                     }
                     $array[] = $token;
@@ -227,7 +252,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                         $token = new HTMLPurifier_Token_Start($segment);
                     }
                     if ($maintain_line_numbers) {
-                        $token->line = $current_line;
+                        $token->rawPosition($current_line, $current_col);
                         $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                     }
                     $array[] = $token;
@@ -259,7 +284,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                     $token = new HTMLPurifier_Token_Start($type, $attr);
                 }
                 if ($maintain_line_numbers) {
-                    $token->line = $current_line;
+                    $token->rawPosition($current_line, $current_col);
                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                 }
                 $array[] = $token;
@@ -276,7 +301,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                             substr($html, $cursor)
                         )
                     );
-                if ($maintain_line_numbers) $token->line = $current_line;
+                if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
                 // no cursor scroll? Hmm...
                 $array[] = $token;
                 break;
diff --git a/library/HTMLPurifier/Token.php b/library/HTMLPurifier/Token.php
index 8803307b..12481026 100644
--- a/library/HTMLPurifier/Token.php
+++ b/library/HTMLPurifier/Token.php
@@ -5,6 +5,7 @@
  */
 class HTMLPurifier_Token {
     public $line; /**< Line number node was on in source document. Null if unknown. */
+    public $col;  /**< Column of line node was on in source document. Null if unknown. */
     
     /**
      * Lookup array of processing that this token is exempt from.
@@ -17,13 +18,31 @@ class HTMLPurifier_Token {
       if ($n === 'type') {
         trigger_error('Deprecated type property called; use instanceof', E_USER_NOTICE);
         switch (get_class($this)) {
-          case 'HTMLPurifier_Token_Start': return 'start';
-          case 'HTMLPurifier_Token_Empty': return 'empty';
-          case 'HTMLPurifier_Token_End': return 'end';
-          case 'HTMLPurifier_Token_Text': return 'text';
-          case 'HTMLPurifier_Token_Comment': return 'comment';
+          case 'HTMLPurifier_Token_Start':      return 'start';
+          case 'HTMLPurifier_Token_Empty':      return 'empty';
+          case 'HTMLPurifier_Token_End':        return 'end';
+          case 'HTMLPurifier_Token_Text':       return 'text';
+          case 'HTMLPurifier_Token_Comment':    return 'comment';
           default: return null;
         }
       }
     }
+    
+    /**
+     * Sets the position of the token in the source document.
+     */
+    public function position($l = null, $c = null) {
+        $this->line = $l;
+        $this->col  = $c;
+    }
+    
+    /**
+     * Convenience function for DirectLex to set the line/col position.
+     */
+    public function rawPosition($l, $c) {
+        if ($c === -1) $l++;
+        $this->line = $l;
+        $this->col  = $c;
+    }
+    
 }
diff --git a/library/HTMLPurifier/Token/Comment.php b/library/HTMLPurifier/Token/Comment.php
index 1571a40d..67174780 100644
--- a/library/HTMLPurifier/Token/Comment.php
+++ b/library/HTMLPurifier/Token/Comment.php
@@ -11,9 +11,10 @@ class HTMLPurifier_Token_Comment extends HTMLPurifier_Token
      * 
      * @param $data String comment data.
      */
-    public function __construct($data, $line = null) {
+    public function __construct($data, $line = null, $col = null) {
         $this->data = $data;
         $this->line = $line;
+        $this->col  = $col;
     }
 }
 
diff --git a/library/HTMLPurifier/Token/Tag.php b/library/HTMLPurifier/Token/Tag.php
index 43748f70..795c40f6 100644
--- a/library/HTMLPurifier/Token/Tag.php
+++ b/library/HTMLPurifier/Token/Tag.php
@@ -33,7 +33,7 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token
      * @param $name String name.
      * @param $attr Associative array of attributes.
      */
-    public function __construct($name, $attr = array(), $line = null) {
+    public function __construct($name, $attr = array(), $line = null, $col = null) {
         $this->name = ctype_lower($name) ? $name : strtolower($name);
         foreach ($attr as $key => $value) {
             // normalization only necessary when key is not lowercase
@@ -49,5 +49,6 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token
         }
         $this->attr = $attr;
         $this->line = $line;
+        $this->col  = $col;
     }
 }
diff --git a/library/HTMLPurifier/Token/Text.php b/library/HTMLPurifier/Token/Text.php
index 3942f8a0..02b53e5c 100644
--- a/library/HTMLPurifier/Token/Text.php
+++ b/library/HTMLPurifier/Token/Text.php
@@ -21,10 +21,11 @@ class HTMLPurifier_Token_Text extends HTMLPurifier_Token
      * 
      * @param $data String parsed character data.
      */
-    public function __construct($data, $line = null) {
+    public function __construct($data, $line = null, $col = null) {
         $this->data = $data;
         $this->is_whitespace = ctype_space($data);
         $this->line = $line;
+        $this->col  = $col;
     }
     
 }
diff --git a/tests/HTMLPurifier/Lexer/DirectLexTest.php b/tests/HTMLPurifier/Lexer/DirectLexTest.php
index d39438d8..a9fd49e5 100644
--- a/tests/HTMLPurifier/Lexer/DirectLexTest.php
+++ b/tests/HTMLPurifier/Lexer/DirectLexTest.php
@@ -75,6 +75,8 @@ class HTMLPurifier_Lexer_DirectLexTest extends HTMLPurifier_Harness
     
     function testLineNumbers() {
         
+        //       .  .     .     .  .     .     .           .      .             .
+        //       01234567890123 01234567890123 0123456789012345 0123456789012   012345
         $html = "<b>Line 1</b>\n<i>Line 2</i>\nStill Line 2<br\n/>Now Line 4\n\n<br />";
         
         $expect = array(
@@ -106,17 +108,17 @@ class HTMLPurifier_Lexer_DirectLexTest extends HTMLPurifier_Harness
         $config  = HTMLPurifier_Config::create(array(
             'Core.MaintainLineNumbers' => true
         ));
-        $expect[0]->line = 1;
-        $expect[1]->line = 1;
-        $expect[2]->line = 1;
-        $expect[3]->line = 1;
-        $expect[4]->line = 2;
-        $expect[5]->line = 2;
-        $expect[6]->line = 2;
-        $expect[7]->line = 2;
-        $expect[8]->line = 3;
-        $expect[9]->line = 4;
-        $expect[10]->line = 6;
+        $expect[0]->position(1, 0);
+        $expect[1]->position(1, 3);
+        $expect[2]->position(1, 9);
+        $expect[3]->position(2, -1);
+        $expect[4]->position(2, 0);
+        $expect[5]->position(2, 3);
+        $expect[6]->position(2, 9);
+        $expect[7]->position(3, -1);
+        $expect[8]->position(3, 12);
+        $expect[9]->position(4, 2);
+        $expect[10]->position(6, 0);
         
         $output = $this->DirectLex->tokenizeHTML($html, $config, $context);
         $this->assertIdentical($output, $expect);
diff --git a/tests/HTMLPurifier/Strategy/MakeWellFormed_ErrorsTest.php b/tests/HTMLPurifier/Strategy/MakeWellFormed_ErrorsTest.php
index 08919dda..2f38c996 100644
--- a/tests/HTMLPurifier/Strategy/MakeWellFormed_ErrorsTest.php
+++ b/tests/HTMLPurifier/Strategy/MakeWellFormed_ErrorsTest.php
@@ -9,43 +9,44 @@ class HTMLPurifier_Strategy_MakeWellFormed_ErrorsTest extends HTMLPurifier_Strat
     
     function testUnnecessaryEndTagRemoved() {
         $this->expectErrorCollection(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag removed');
-        $this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1));
+        $this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1, 0));
         $this->invoke('</b>');
     }
     
     function testUnnecessaryEndTagToText() {
         $this->config->set('Core', 'EscapeInvalidTags', true);
         $this->expectErrorCollection(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag to text');
-        $this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1));
+        $this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1, 0));
         $this->invoke('</b>');
     }
     
     function testTagAutoClosed() {
-        $this->expectErrorCollection(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', new HTMLPurifier_Token_Start('b', array(), 1));
-        $this->expectContext('CurrentToken', new HTMLPurifier_Token_Start('div', array(), 1));
+        $this->expectErrorCollection(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', new HTMLPurifier_Token_Start('b', array(), 1, 0));
+        $this->expectContext('CurrentToken', new HTMLPurifier_Token_Start('div', array(), 1, 6));
         $this->invoke('<b>Foo<div>Bar</div>');
     }
     
     function testStrayEndTagRemoved() {
         $this->expectErrorCollection(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag removed');
-        $this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1));
+        $this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1, 3));
         $this->invoke('<i></b></i>');
     }
     
     function testStrayEndTagToText() {
         $this->config->set('Core', 'EscapeInvalidTags', true);
         $this->expectErrorCollection(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag to text');
-        $this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1));
+        $this->expectContext('CurrentToken', new HTMLPurifier_Token_End('b', array(), 1, 3));
         $this->invoke('<i></b></i>');
     }
     
     function testTagClosedByElementEnd() {
-        $this->expectErrorCollection(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', new HTMLPurifier_Token_Start('b', array(), 1));
+        $this->expectErrorCollection(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', new HTMLPurifier_Token_Start('b', array(), 1, 3));
+        $this->expectContext('CurrentToken', new HTMLPurifier_Token_End('i', array(), 1, 12));
         $this->invoke('<i><b>Foobar</i>');
     }
     
     function testTagClosedByDocumentEnd() {
-        $this->expectErrorCollection(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', new HTMLPurifier_Token_Start('b', array(), 1));
+        $this->expectErrorCollection(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', new HTMLPurifier_Token_Start('b', array(), 1, 0));
         $this->invoke('<b>Foobar');
     }