[1.7.0] Implement line number counting in DirectLex, in preparation for error reporting

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1155 48356398-32a2-884e-a903-53898d9a118a
2024-12-22 16:31:53 +00:00 · 2007-06-18 02:01:01 +00:00 · 2007-06-18 02:01:01 +00:00 · 4bf15de536
commit 4bf15de536
parent 70bcccf54c
6 changed files with 168 additions and 11 deletions
--- a/1
+++ b/1
@ -53,6 +53,7 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
 . Lexer is now pre-emptively included, with a conditional include for the
  PHP5 only version.
 . HTMLDefinition and CSSDefinition have a common parent class: Definition.
+. DirectLex can now track line numbers (enable with %Core.MaintainLineNumbers)

 1.6.1, released 2007-05-05
 ! Support for more deprecated attributes via transformations:
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@ -47,9 +47,24 @@ HTMLPurifier_ConfigSchema::define(
    to use it.
  </dd>
 </dl>
+<p>
+  This directive has been available since 1.7.0.
+</p>
 '
 );

+HTMLPurifier_ConfigSchema::define(
+    'Core', 'MaintainLineNumbers', false, 'bool', '
+<p>
+  If true, HTML Purifier will add line number information to all tokens.
+  This is useful when error reporting is turned on, but can result in
+  significant performance degradation and should not be used when
+  unnecessary. This directive must be used with the DirectLex lexer,
+  as the DOMLex lexer does not (yet) support this functionality. This directive
+  has been available since 1.7.0.
+</p>
+');
+
 /**
 * Forgivingly lexes HTML (SGML-style) markup into tokens.
 * 
@ -135,7 +150,14 @@ class HTMLPurifier_Lexer
        }
        
        if (is_null($lexer)) { do {
-            // auto-detectection algorithm
+            // auto-detection algorithm
+            
+            // once PHP DOM implements native line numbers, or we
+            // hack out something using XSLT, remove this stipulation
+            if ($config->get('Core', 'MaintainLineNumbers')) {
+                $lexer = 'DirectLex';
+                break;
+            }
            
            if (version_compare(PHP_VERSION, "5", ">=") && // check for PHP5
                class_exists('DOMDocument')) { // check for DOM support
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@ -2,6 +2,20 @@

 require_once 'HTMLPurifier/Lexer.php';

+HTMLPurifier_ConfigSchema::define(
+    'Core', 'DirectLexLineNumberSyncInterval', 0, 'int', '
+<p>
+  Specifies the number of tokens the DirectLex line number tracking
+  implementation should process before attempting to resynchronize the
+  current line count by manually counting all previous new-lines. When
+  at 0, this functionality is disabled. Lower values will decrease
+  performance, and this is only strictly necessary if the counting
+  algorithm is buggy (in which case you should report it as a bug).
+  This has no effect when %Core.MaintainLineNumbers is disabled or DirectLex is
+  not being used. This directive has been available since 1.7.0.
+</p>
+');
+
 /**
 * Our in-house implementation of a parser.
 * 
@ -32,9 +46,17 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
        $inside_tag = false; // whether or not we're parsing the inside of a tag
        $array = array(); // result array
        
+        $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');
+        $current_line = 1;
+        $nl = PHP_EOL;
+        // how often (in loop iterations) to recount the current line from the
+        // start of the document. A full recount is ALWAYS right, but it's pretty wasteful. Set to 0 to turn off
+        $synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval'); 
+        
        // infinite loop protection
        // has to be pretty big, since html docs can be big
        // we allow two hundred thousand tags... more than enough?
+        // NOTE: $loops also times the line number resynchronization below, so don't reset or repurpose it
        $loops = 0;
        
        while(true) {
@ -42,10 +64,21 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
            // infinite loop protection
            if (++$loops > 200000) return array();
            
+            // recalculate lines
+            if (
+                $maintain_line_numbers && // line number tracking is on
+                $synchronize_interval &&  // synchronization is on
+                $cursor > 0 &&            // cursor is further than zero
+                $loops % $synchronize_interval === 0 // time to synchronize!
+            ) {
+                $current_line = 1 + substr_count($html, $nl, 0, $cursor);
+            }
+            
            $position_next_lt = strpos($html, '<', $cursor);
            $position_next_gt = strpos($html, '>', $cursor);
            
            // triggers on "<b>asdf</b>" but not "asdf <b></b>"
+            // special case to set up context
            if ($position_next_lt === $cursor) {
                $inside_tag = true;
                $cursor++;
@ -53,7 +86,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
            
            if (!$inside_tag && $position_next_lt !== false) {
                // We are not inside tag and there still is another tag to parse
-                $array[] = new
+                $token = new
                    HTMLPurifier_Token_Text(
                        $this->parseData(
                            substr(
@ -61,6 +94,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                            )
                        )
                    );
+                if ($maintain_line_numbers) {
+                    $token->line = $current_line;
+                    $current_line += substr_count($html, $nl, $cursor, $position_next_lt - $cursor);
+                }
+                $array[] = $token;
                $cursor  = $position_next_lt + 1;
                $inside_tag = true;
                continue;
@ -69,7 +107,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                // If we're already at the end, break
                if ($cursor === strlen($html)) break;
                // Create Text of rest of string
-                $array[] = new
+                $token = new
                    HTMLPurifier_Token_Text(
                        $this->parseData(
                            substr(
@ -77,6 +115,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                            )
                        )
                    );
+                if ($maintain_line_numbers) $token->line = $current_line;
+                $array[] = $token;
                break;
            } elseif ($inside_tag && $position_next_gt !== false) {
                // We are in tag and it is well formed
@ -89,12 +129,17 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                    substr($segment, 0, 3) == '!--' &&
                    substr($segment, $strlen_segment-2, 2) == '--'
                ) {
-                    $array[] = new
+                    $token = new
                        HTMLPurifier_Token_Comment(
                            substr(
                                $segment, 3, $strlen_segment - 5
                            )
                        );
+                    if ($maintain_line_numbers) {
+                        $token->line = $current_line;
+                        $current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
+                    }
+                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
@ -104,7 +149,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                $is_end_tag = (strpos($segment,'/') === 0);
                if ($is_end_tag) {
                    $type = substr($segment, 1);
-                    $array[] = new HTMLPurifier_Token_End($type);
+                    $token = new HTMLPurifier_Token_End($type);
+                    if ($maintain_line_numbers) {
+                        $token->line = $current_line;
+                        $current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
+                    }
+                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
@ -114,7 +164,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                // have accidentally grabbed an emoticon. Translate into
                // text and go our merry way
                if (!ctype_alnum($segment[0])) {
-                    $array[] = new
+                    $token = new
                        HTMLPurifier_Token_Text(
                            '<' .
                            $this->parseData(
@ -122,6 +172,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                            ) . 
                            '>'
                        );
+                    if ($maintain_line_numbers) {
+                        $token->line = $current_line;
+                        $current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
+                    }
+                    $array[] = $token;
                    $cursor = $position_next_gt + 1;
                    $inside_tag = false;
                    continue;
@ -142,10 +197,15 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                
                if ($position_first_space >= $strlen_segment) {
                    if ($is_self_closing) {
-                        $array[] = new HTMLPurifier_Token_Empty($segment);
+                        $token = new HTMLPurifier_Token_Empty($segment);
                    } else {
-                        $array[] = new HTMLPurifier_Token_Start($segment);
+                        $token = new HTMLPurifier_Token_Start($segment);
                    }
+                    if ($maintain_line_numbers) {
+                        $token->line = $current_line;
+                        $current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
+                    }
+                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
@ -169,21 +229,29 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                }
                
                if ($is_self_closing) {
-                    $array[] = new HTMLPurifier_Token_Empty($type, $attr);
+                    $token = new HTMLPurifier_Token_Empty($type, $attr);
                } else {
-                    $array[] = new HTMLPurifier_Token_Start($type, $attr);
+                    $token = new HTMLPurifier_Token_Start($type, $attr);
                }
+                if ($maintain_line_numbers) {
+                    $token->line = $current_line;
+                    $current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
+                }
+                $array[] = $token;
                $cursor = $position_next_gt + 1;
                $inside_tag = false;
                continue;
            } else {
-                $array[] = new
+                $token = new
                    HTMLPurifier_Token_Text(
                        '<' .
                        $this->parseData(
                            substr($html, $cursor)
                        )
                    );
+                if ($maintain_line_numbers) $token->line = $current_line;
+                // cursor is not advanced here; the break below ends the loop
+                $array[] = $token;
                break;
            }
            break;
--- a/library/HTMLPurifier/Token.php
+++ b/library/HTMLPurifier/Token.php
@ -11,6 +11,7 @@
 */
 class HTMLPurifier_Token {
    var $type; /**< Type of node to bypass <tt>is_a()</tt>. @public */
+    var $line; /**< Line number node was on in source document. Null if unknown. @public */
    
    /**
     * Copies the tag into a new one (clone substitute).
--- a/tests/HTMLPurifier/Lexer/DirectLexTest.php
+++ b/tests/HTMLPurifier/Lexer/DirectLexTest.php
@ -64,6 +64,65 @@ class HTMLPurifier_Lexer_DirectLexTest extends UnitTestCase
        
    }
    
+    function testLineNumbers() { // checks token output with line tracking off, then on
+        
+        $html = '<b>Line 1</b>
+                 <i>Line 2</i>
+                 Still Line 2<br
+                 />Now Line 4
+                 
+                 <br />';
+        
+        $expect = array(
+            // line 1: <b>, its text, </b>, and the whitespace text that begins right after </b>
+            0 => new HTMLPurifier_Token_Start('b')
+           ,1 => new HTMLPurifier_Token_Text('Line 1')
+           ,2 => new HTMLPurifier_Token_End('b')
+           ,3 => new HTMLPurifier_Token_Text('
+                 ')
+            // line 2: <i>, its text, </i>, and a text token that begins on line 2 and runs onto line 3
+           ,4 => new HTMLPurifier_Token_Start('i')
+           ,5 => new HTMLPurifier_Token_Text('Line 2')
+           ,6 => new HTMLPurifier_Token_End('i')
+           ,7 => new HTMLPurifier_Token_Text('
+                 Still Line 2')
+            // line 3: the <br tag opens here even though its closing /> sits on line 4
+           ,8 => new HTMLPurifier_Token_Empty('br')
+            // line 4: text begins right after /> and spans the blank line 5
+           ,9 => new HTMLPurifier_Token_Text('Now Line 4
+                 
+                 ')
+            // line 6 (no token begins on line 5, which is whitespace only)
+           ,10 => new HTMLPurifier_Token_Empty('br')
+        );
+        
+        $context = new HTMLPurifier_Context();
+        $config  = HTMLPurifier_Config::createDefault();
+        $output = $this->DirectLex->tokenizeHTML($html, $config, $context);
+        
+        $this->assertIdentical($output, $expect); // default config: expected tokens have no line set
+        
+        $context = new HTMLPurifier_Context();
+        $config  = HTMLPurifier_Config::create(array(
+            'Core.MaintainLineNumbers' => true
+        ));
+        $expect[0]->line = 1;
+        $expect[1]->line = 1;
+        $expect[2]->line = 1;
+        $expect[3]->line = 1;
+        $expect[4]->line = 2;
+        $expect[5]->line = 2;
+        $expect[6]->line = 2;
+        $expect[7]->line = 2;
+        $expect[8]->line = 3;
+        $expect[9]->line = 4;
+        $expect[10]->line = 6;
+        
+        $output = $this->DirectLex->tokenizeHTML($html, $config, $context);
+        $this->assertIdentical($output, $expect); // tracking on: each token carries the line it begins on
+        
+    }
+    
    
 }

--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@ -35,6 +35,12 @@ class HTMLPurifier_LexerTest extends UnitTestCase
        
    }
    
+    function test_create() { // lexer auto-detection must honour %Core.MaintainLineNumbers
+        $config = HTMLPurifier_Config::create(array('Core.MaintainLineNumbers' => true)); // request line tracking
+        $lexer = HTMLPurifier_Lexer::create($config);
+        $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex'); // create() selects DirectLex whenever line numbers are requested
+    }
+    
    function assertExtractBody($text, $extract = true) {
        $result = $this->Lexer->extractBody($text);
        if ($extract === true) $extract = $text;