[1.7.0] Implement line number counting in DirectLex, in preparation for error reporting

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1155 48356398-32a2-884e-a903-53898d9a118a
2024-12-23 00:41:52 +00:00 · 2007-06-18 02:01:01 +00:00 · 2007-06-18 02:01:01 +00:00 · 4bf15de536
commit 4bf15de536
parent 70bcccf54c
6 changed files with 168 additions and 11 deletions
--- a/1
+++ b/1
@ -53,6 +53,7 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
 . Lexer is now pre-emptively included, with a conditional include for the
  PHP5 only version.
 . HTMLDefinition and CSSDefinition have a common parent class: Definition.
 . DirectLex can now track line-numbers
 1.6.1, released 2007-05-05
 ! Support for more deprecated attributes via transformations:
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@ -47,9 +47,24 @@ HTMLPurifier_ConfigSchema::define(
    to use it.
  </dd>
 </dl>
 <p>
  This directive has been available since 1.7.0.
 </p>
 '
 );
 // Registers the Core.MaintainLineNumbers directive, declared here with the
 // value false and the type 'bool' (presumably default and type, in that
 // order -- confirm against HTMLPurifier_ConfigSchema::define). The trailing
 // string is the directive's HTML documentation.
 HTMLPurifier_ConfigSchema::define(
    'Core', 'MaintainLineNumbers', false, 'bool', '
 <p>
  If true, HTML Purifier will add line number information to all tokens.
  This is useful when error reporting is turned on, but can result in
  significant performance degradation and should not be used when
  unnecessary. This directive must be used with the DirectLex lexer,
  as the DOMLex lexer does not (yet) support this functionality. This directive
  has been available since 1.7.0.
 </p>
 ');
 /**
 * Forgivingly lexes HTML (SGML-style) markup into tokens.
 * 
@ -135,7 +150,14 @@ class HTMLPurifier_Lexer
        }
        if (is_null($lexer)) { do {
-            // auto-detectection algorithm
+            // auto-detection algorithm
            // once PHP DOM implements native line numbers, or we
            // hack out something using XSLT, remove this stipulation
            if ($config->get('Core', 'MaintainLineNumbers')) {
                $lexer = 'DirectLex';
                break;
            }
            if (version_compare(PHP_VERSION, "5", ">=") && // check for PHP5
                class_exists('DOMDocument')) { // check for DOM support
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@ -2,6 +2,20 @@
 require_once 'HTMLPurifier/Lexer.php';
 // Registers the Core.DirectLexLineNumberSyncInterval directive, declared
 // with the value 0 and the type 'int'. The DirectLex tokenizer loop reads
 // it and, when it is non-zero and line tracking is on, recounts the
 // newlines before the cursor every that-many loop iterations. The trailing
 // string is the directive's HTML documentation.
 HTMLPurifier_ConfigSchema::define(
    'Core', 'DirectLexLineNumberSyncInterval', 0, 'int', '
 <p>
  Specifies the number of tokens the DirectLex line number tracking
  implementations should process before attempting to resynchronize the
  current line count by manually counting all previous new-lines. When
  at 0, this functionality is disabled. Lower values will decrease
  performance, and this is only strictly necessary if the counting
  algorithm is buggy (in which case you should report it as a bug).
  This has no effect when %Core.MaintainLineNumbers is disabled or DirectLex is
  not being used. This directive has been available since 1.7.0.
 </p>
 ');
 /**
 * Our in-house implementation of a parser.
 * 
@ -32,9 +46,17 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
        $inside_tag = false; // whether or not we're parsing the inside of a tag
        $array = array(); // result array
        $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');
        $current_line = 1;
        $nl = PHP_EOL;
        // how often to manually recalculate. This will ALWAYS be right,
        // but it's pretty wasteful. Set to 0 to turn off
        $synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval'); 
        // infinite loop protection
        // has to be pretty big, since html docs can be big
        // we allow two hundred thousand tags... more than enough?
        // NOTE: this is also used for synchronization, so watch out
        $loops = 0;
        while(true) {
@ -42,10 +64,21 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
            // infinite loop protection
            if (++$loops > 200000) return array();
            // recalculate lines
            if (
                $maintain_line_numbers && // line number tracking is on
                $synchronize_interval &&  // synchronization is on
                $cursor > 0 &&            // cursor is further than zero
                $loops % $synchronize_interval === 0 // time to synchronize!
            ) {
                $current_line = 1 + substr_count($html, $nl, 0, $cursor);
            }
            $position_next_lt = strpos($html, '<', $cursor);
            $position_next_gt = strpos($html, '>', $cursor);
            // triggers on "<b>asdf</b>" but not "asdf <b></b>"
            // special case to set up context
            if ($position_next_lt === $cursor) {
                $inside_tag = true;
                $cursor++;
@ -53,7 +86,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
            if (!$inside_tag && $position_next_lt !== false) {
                // We are not inside tag and there still is another tag to parse
-                $array[] = new
+                $token = new
                    HTMLPurifier_Token_Text(
                        $this->parseData(
                            substr(
@ -61,6 +94,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                            )
                        )
                    );
                if ($maintain_line_numbers) {
                    $token->line = $current_line;
                    $current_line += substr_count($html, $nl, $cursor, $position_next_lt - $cursor);
                }
                $array[] = $token;
                $cursor  = $position_next_lt + 1;
                $inside_tag = true;
                continue;
@ -69,7 +107,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                // If we're already at the end, break
                if ($cursor === strlen($html)) break;
                // Create Text of rest of string
-                $array[] = new
+                $token = new
                    HTMLPurifier_Token_Text(
                        $this->parseData(
                            substr(
@ -77,6 +115,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                            )
                        )
                    );
                if ($maintain_line_numbers) $token->line = $current_line;
                $array[] = $token;
                break;
            } elseif ($inside_tag && $position_next_gt !== false) {
                // We are in tag and it is well formed
@ -89,12 +129,17 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                    substr($segment, 0, 3) == '!--' &&
                    substr($segment, $strlen_segment-2, 2) == '--'
                ) {
-                    $array[] = new
+                    $token = new
                        HTMLPurifier_Token_Comment(
                            substr(
                                $segment, 3, $strlen_segment - 5
                            )
                        );
                    if ($maintain_line_numbers) {
                        $token->line = $current_line;
                        $current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
@ -104,7 +149,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                $is_end_tag = (strpos($segment,'/') === 0);
                if ($is_end_tag) {
                    $type = substr($segment, 1);
-                    $array[] = new HTMLPurifier_Token_End($type);
+                    $token = new HTMLPurifier_Token_End($type);
                    if ($maintain_line_numbers) {
                        $token->line = $current_line;
                        $current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
@ -114,7 +164,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                // have accidentally grabbed an emoticon. Translate into
                // text and go our merry way
                if (!ctype_alnum($segment[0])) {
-                    $array[] = new
+                    $token = new
                        HTMLPurifier_Token_Text(
                            '<' .
                            $this->parseData(
@ -122,6 +172,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                            ) . 
                            '>'
                        );
                    if ($maintain_line_numbers) {
                        $token->line = $current_line;
                        $current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $cursor = $position_next_gt + 1;
                    $inside_tag = false;
                    continue;
@ -142,10 +197,15 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                if ($position_first_space >= $strlen_segment) {
                    if ($is_self_closing) {
-                        $array[] = new HTMLPurifier_Token_Empty($segment);
+                        $token = new HTMLPurifier_Token_Empty($segment);
                    } else {
-                        $array[] = new HTMLPurifier_Token_Start($segment);
+                        $token = new HTMLPurifier_Token_Start($segment);
                    }
                    if ($maintain_line_numbers) {
                        $token->line = $current_line;
                        $current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
@ -169,21 +229,29 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                }
                if ($is_self_closing) {
-                    $array[] = new HTMLPurifier_Token_Empty($type, $attr);
+                    $token = new HTMLPurifier_Token_Empty($type, $attr);
                } else {
-                    $array[] = new HTMLPurifier_Token_Start($type, $attr);
+                    $token = new HTMLPurifier_Token_Start($type, $attr);
                }
                if ($maintain_line_numbers) {
                    $token->line = $current_line;
                    $current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $cursor = $position_next_gt + 1;
                $inside_tag = false;
                continue;
            } else {
-                $array[] = new
+                $token = new
                    HTMLPurifier_Token_Text(
                        '<' .
                        $this->parseData(
                            substr($html, $cursor)
                        )
                    );
                if ($maintain_line_numbers) $token->line = $current_line;
                // no cursor scroll? Hmm...
                $array[] = $token;
                break;
            }
            break;
--- a/library/HTMLPurifier/Token.php
+++ b/library/HTMLPurifier/Token.php
@ -11,6 +11,7 @@
 */
 class HTMLPurifier_Token {
    var $type; /**< Type of node to bypass <tt>is_a()</tt>. @public */
    var $line; /**< Line number node was on in source document. Null if unknown. @public */
    /**
     * Copies the tag into a new one (clone substitute).
--- a/tests/HTMLPurifier/Lexer/DirectLexTest.php
+++ b/tests/HTMLPurifier/Lexer/DirectLexTest.php
@ -64,6 +64,65 @@ class HTMLPurifier_Lexer_DirectLexTest extends UnitTestCase
    }
    /**
     * Tokenizes the same multi-line fixture twice: once with the default
     * configuration, where the tokens must match $expect before any line
     * values are assigned, and once with Core.MaintainLineNumbers enabled,
     * where each token must also carry the expected line number.
     */
    function testLineNumbers() {
        // Fixture spread over several lines; note the <br ... /> tag whose
        // closing "/>" sits on the line after its opening "<br".
        $html = '<b>Line 1</b>
                 <i>Line 2</i>
                 Still Line 2<br
                 />Now Line 4
                 <br />';
        $expect = array(
            // line 1
            0 => new HTMLPurifier_Token_Start('b')
           ,1 => new HTMLPurifier_Token_Text('Line 1')
           ,2 => new HTMLPurifier_Token_End('b')
           ,3 => new HTMLPurifier_Token_Text('
                 ')
            // line 2
           ,4 => new HTMLPurifier_Token_Start('i')
           ,5 => new HTMLPurifier_Token_Text('Line 2')
           ,6 => new HTMLPurifier_Token_End('i')
           ,7 => new HTMLPurifier_Token_Text('
                 Still Line 2')
            // line 3
           ,8 => new HTMLPurifier_Token_Empty('br')
            // line 4
           ,9 => new HTMLPurifier_Token_Text('Now Line 4
                 ')
            // line SIX
           ,10 => new HTMLPurifier_Token_Empty('br')
        );
        // First pass: default configuration, compared against $expect as
        // built above (no line values assigned yet).
        $context = new HTMLPurifier_Context();
        $config  = HTMLPurifier_Config::createDefault();
        $output = $this->DirectLex->tokenizeHTML($html, $config, $context);
        $this->assertIdentical($output, $expect);
        // Second pass: line tracking switched on; the expected tokens are
        // annotated with line numbers before the comparison.
        $context = new HTMLPurifier_Context();
        $config  = HTMLPurifier_Config::create(array(
            'Core.MaintainLineNumbers' => true
        ));
        $expect[0]->line = 1;
        $expect[1]->line = 1;
        $expect[2]->line = 1;
        $expect[3]->line = 1;
        $expect[4]->line = 2;
        $expect[5]->line = 2;
        $expect[6]->line = 2;
        $expect[7]->line = 2;
        $expect[8]->line = 3;
        $expect[9]->line = 4;
        // NOTE(review): line 6 implies an empty line between "Now Line 4" and
        // the final <br /> in the fixture -- confirm the literal retains it.
        $expect[10]->line = 6;
        $output = $this->DirectLex->tokenizeHTML($html, $config, $context);
        $this->assertIdentical($output, $expect);
    }
 }
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@ -35,6 +35,12 @@ class HTMLPurifier_LexerTest extends UnitTestCase
    }
    /**
     * HTMLPurifier_Lexer::create() must hand back a DirectLex instance when
     * the configuration enables Core.MaintainLineNumbers.
     */
    function test_create() {
        $created = HTMLPurifier_Lexer::create(
            HTMLPurifier_Config::create(
                array('Core.MaintainLineNumbers' => true)
            )
        );
        $this->assertIsA($created, 'HTMLPurifier_Lexer_DirectLex');
    }
    function assertExtractBody($text, $extract = true) {
        $result = $this->Lexer->extractBody($text);
        if ($extract === true) $extract = $text;