From ed7983b559692d032f99cd3ca24c22ccf7d2c48c Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Fri, 5 Sep 2008 14:04:23 -0400 Subject: [PATCH] Refactor lexer instantiation logic with exceptions and forced line tracking. Signed-off-by: Edward Z. Yang --- NEWS | 5 ++ library/HTMLPurifier/Lexer.php | 87 +++++++++++++++--------- library/HTMLPurifier/Lexer/DirectLex.php | 2 + tests/HTMLPurifier/LexerTest.php | 19 ++++++ 4 files changed, 79 insertions(+), 34 deletions(-) diff --git a/NEWS b/NEWS index 91891baf..c4d63a26 100644 --- a/NEWS +++ b/NEWS @@ -10,6 +10,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier ========================== 3.1.2, unknown release date +# Using %Core.CollectErrors forces line number/column tracking on, whereas + previously you could theoretically turn it off. ! %Output.AttrSort for when you need your attributes in alphabetical order to deal with a bug in FCKEditor. Requested by frank farmer. ! Enable HTML comments when %HTML.Trusted is on. Requested by Waldo Jaquith. @@ -49,6 +51,9 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier not collapsed by URIFilter_MakeAbsolute. - Fix bug with anonymous modules operating on SafeEmbed or SafeObject elements by reordering their addition. +- Will now throw exception on many error conditions during lexer creation; also + throw an exception when MaintainLineNumbers or CollectErrors is on, but a lexer + that does not set tracksLineNumbers is being used. . Strategy_MakeWellFormed now operates in-place, saving memory and allowing for more interesting filter-backtracking . New HTMLPurifier_Injector->rewind() functionality, allows injectors to rewind diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php index 03fe032f..f6e5a90b 100644 --- a/library/HTMLPurifier/Lexer.php +++ b/library/HTMLPurifier/Lexer.php @@ -42,6 +42,12 @@ class HTMLPurifier_Lexer { + /** + * Whether or not this lexer implements line-number/column-number tracking. + * If it does, set to true. 
+ */ + public $tracksLineNumbers = false; + // -- STATIC ---------------------------------------------------------- /** @@ -70,46 +76,59 @@ class HTMLPurifier_Lexer $lexer = $config->get('Core', 'LexerImpl'); } + $needs_tracking = + $config->get('Core', 'MaintainLineNumbers') || + $config->get('Core', 'CollectErrors'); + + $inst = null; if (is_object($lexer)) { - return $lexer; + $inst = $lexer; + } else { + + if (is_null($lexer)) { do { + // auto-detection algorithm + + if ($needs_tracking) { + $lexer = 'DirectLex'; + break; + } + + if (class_exists('DOMDocument') && method_exists('DOMDocument', 'loadHTML')) { + // check for DOM support, because, surprisingly enough, + // it's *not* part of the core! + $lexer = 'DOMLex'; + } else { + $lexer = 'DirectLex'; + } + + } while(0); } // do..while so we can break + + // instantiate recognized string names + switch ($lexer) { + case 'DOMLex': + $inst = new HTMLPurifier_Lexer_DOMLex(); + break; + case 'DirectLex': + $inst = new HTMLPurifier_Lexer_DirectLex(); + break; + case 'PH5P': + $inst = new HTMLPurifier_Lexer_PH5P(); + break; + default: + throw new HTMLPurifier_Exception("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer)); + } } - if (is_null($lexer)) { do { - // auto-detection algorithm - - // once PHP DOM implements native line numbers, or we - // hack out something using XSLT, remove this stipulation - $line_numbers = $config->get('Core', 'MaintainLineNumbers'); - if ( - $line_numbers === true || - ($line_numbers === null && $config->get('Core', 'CollectErrors')) - ) { - $lexer = 'DirectLex'; - break; - } - - if (class_exists('DOMDocument') && method_exists('DOMDocument', 'loadHTML')) { - // check for DOM support, because, surprisingly enough, - // it's *not* part of the core! 
- $lexer = 'DOMLex'; - } else { - $lexer = 'DirectLex'; - } - - } while(0); } // do..while so we can break + if (!$inst) throw new HTMLPurifier_Exception('No lexer was instantiated'); - // instantiate recognized string names - switch ($lexer) { - case 'DOMLex': - return new HTMLPurifier_Lexer_DOMLex(); - case 'DirectLex': - return new HTMLPurifier_Lexer_DirectLex(); - case 'PH5P': - return new HTMLPurifier_Lexer_PH5P(); - default: - trigger_error("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer), E_USER_ERROR); + // once PHP DOM implements native line numbers, or we + // hack out something using XSLT, remove this stipulation + if ($needs_tracking && !$inst->tracksLineNumbers) { + throw new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'); } + return $inst; + } // -- CONVENIENCE MEMBERS --------------------------------------------- diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php index 913457bd..1f92b0cb 100644 --- a/library/HTMLPurifier/Lexer/DirectLex.php +++ b/library/HTMLPurifier/Lexer/DirectLex.php @@ -13,6 +13,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer { + public $tracksLineNumbers = true; + /** * Whitespace characters for str(c)spn. 
*/ diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php index 349e5b24..07715d3d 100644 --- a/tests/HTMLPurifier/LexerTest.php +++ b/tests/HTMLPurifier/LexerTest.php @@ -29,6 +29,25 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex'); } + function test_create_objectLexerImpl() { + $this->config->set('Core', 'LexerImpl', new HTMLPurifier_Lexer_DirectLex()); + $lexer = HTMLPurifier_Lexer::create($this->config); + $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex'); + } + + function test_create_unknownLexer() { + $this->config->set('Core', 'LexerImpl', 'AsdfAsdf'); + $this->expectException(new HTMLPurifier_Exception('Cannot instantiate unrecognized Lexer type AsdfAsdf')); + HTMLPurifier_Lexer::create($this->config); + } + + function test_create_incompatibleLexer() { + $this->config->set('Core', 'LexerImpl', 'DOMLex'); + $this->config->set('Core', 'MaintainLineNumbers', true); + $this->expectException(new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)')); + HTMLPurifier_Lexer::create($this->config); + } + // HTMLPurifier_Lexer->parseData() ----------------------------------------- function assertParseData($input, $expect = true) {