diff --git a/library/HTMLPurifier/Lexer/PH5P.php b/library/HTMLPurifier/Lexer/PH5P.php index 9fa92448..81659e50 100644 --- a/library/HTMLPurifier/Lexer/PH5P.php +++ b/library/HTMLPurifier/Lexer/PH5P.php @@ -2,17 +2,28 @@ /** * Experimental HTML5-based parser using Jeroen van der Meer's PH5P library. - * Requires PHP5, and occupies space in the HTML5 pseudo-namespace (may - * cause conflicts, sorry). + * Occupies space in the HTML5 pseudo-namespace, which may cause conflicts. + * + * @note + * Recent changes to PHP's DOM extension have resulted in some fatal + * error conditions with the original version of PH5P. Pending changes, + * this lexer will punt to DirectLex if DOM throws an exception. */ class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex { public function tokenizeHTML($html, $config, $context) { - $html = $this->normalize($html, $config, $context); - $html = $this->wrapHTML( $html, $config, $context); - $parser = new HTML5($html); - $doc = $parser->save(); + $new_html = $this->normalize($html, $config, $context); + $new_html = $this->wrapHTML($new_html, $config, $context); + try { + $parser = new HTML5($new_html); + $doc = $parser->save(); + } catch (DOMException $e) { + // Uh oh, it failed. Punt to DirectLex. 
+ $lexer = new HTMLPurifier_Lexer_DirectLex(); + $context->register('PH5PError', $e); // save the error, so we can detect it + return $lexer->tokenizeHTML($html, $config, $context); // use original HTML + } $tokens = array(); $this->tokenizeDOM( $doc->getElementsByTagName('html')->item(0)-> // diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php index 56e94e6d..a720f74a 100644 --- a/tests/HTMLPurifier/LexerTest.php +++ b/tests/HTMLPurifier/LexerTest.php @@ -138,7 +138,7 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness $lexers = array(); $lexers['DirectLex'] = new HTMLPurifier_Lexer_DirectLex(); if ($this->_has_pear) $lexers['PEARSax3'] = new HTMLPurifier_Lexer_PEARSax3(); - if (version_compare(PHP_VERSION, "5", ">=") && class_exists('DOMDocument')) { + if (class_exists('DOMDocument')) { $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex(); $lexers['PH5P'] = new HTMLPurifier_Lexer_PH5P(); } @@ -310,13 +310,26 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness new HTMLPurifier_Token_Start('a', array('"' => '')) ), 'PEARSax3' => $tokens, - 'PH5P' => array( - new HTMLPurifier_Token_Empty('a', array('"' => '')) - ), + 'PH5P' => false, // behavior varies; handle this personally ) ); } + function test_tokenizeHTML_earlyQuote_PH5P() { + if (!class_exists('DOMDocument')) return; + $lexer = new HTMLPurifier_Lexer_PH5P(); + $result = $lexer->tokenizeHTML('', $this->config, $this->context); + if ($this->context->get('PH5PError', true)) { + $this->assertIdentical(array( + new HTMLPurifier_Token_Start('a', array('"' => '')) + ), $result); + } else { + $this->assertIdentical(array( + new HTMLPurifier_Token_Empty('a', array('"' => '')) + ), $result); + } + } + function test_tokenizeHTML_unescapedQuote() { $this->assertTokenization( '"',