diff --git a/library/HTMLPurifier/Lexer/PH5P.php b/library/HTMLPurifier/Lexer/PH5P.php
index 9fa92448..81659e50 100644
--- a/library/HTMLPurifier/Lexer/PH5P.php
+++ b/library/HTMLPurifier/Lexer/PH5P.php
@@ -2,17 +2,28 @@
/**
* Experimental HTML5-based parser using Jeroen van der Meer's PH5P library.
- * Requires PHP5, and occupies space in the HTML5 pseudo-namespace (may
- * cause conflicts, sorry).
+ * Occupies space in the HTML5 pseudo-namespace, which may cause conflicts.
+ *
+ * @note
+ * Recent changes to PHP's DOM extension have resulted in some fatal
+ * error conditions with the original version of PH5P. Pending changes,
+ * this lexer will punt to DirectLex if DOM throws an exception.
*/
class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex {
public function tokenizeHTML($html, $config, $context) {
- $html = $this->normalize($html, $config, $context);
- $html = $this->wrapHTML( $html, $config, $context);
- $parser = new HTML5($html);
- $doc = $parser->save();
+ $new_html = $this->normalize($html, $config, $context);
+ $new_html = $this->wrapHTML($new_html, $config, $context);
+ try {
+ $parser = new HTML5($new_html);
+ $doc = $parser->save();
+ } catch (DOMException $e) {
+ // Uh oh, it failed. Punt to DirectLex.
+ $lexer = new HTMLPurifier_Lexer_DirectLex();
+ $context->register('PH5PError', $e); // save the error, so we can detect it
+ return $lexer->tokenizeHTML($html, $config, $context); // use original HTML
+ }
$tokens = array();
$this->tokenizeDOM(
$doc->getElementsByTagName('html')->item(0)-> //
diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php
index 56e94e6d..a720f74a 100644
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@@ -138,7 +138,7 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
$lexers = array();
$lexers['DirectLex'] = new HTMLPurifier_Lexer_DirectLex();
if ($this->_has_pear) $lexers['PEARSax3'] = new HTMLPurifier_Lexer_PEARSax3();
- if (version_compare(PHP_VERSION, "5", ">=") && class_exists('DOMDocument')) {
+ if (class_exists('DOMDocument')) {
$lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
$lexers['PH5P'] = new HTMLPurifier_Lexer_PH5P();
}
@@ -310,13 +310,26 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
new HTMLPurifier_Token_Start('a', array('"' => ''))
),
'PEARSax3' => $tokens,
- 'PH5P' => array(
- new HTMLPurifier_Token_Empty('a', array('"' => ''))
- ),
+ 'PH5P' => false, // behavior varies; handle this personally
)
);
}
+ function test_tokenizeHTML_earlyQuote_PH5P() {
+ if (!class_exists('DOMDocument')) return;
+ $lexer = new HTMLPurifier_Lexer_PH5P();
+ $result = $lexer->tokenizeHTML('', $this->config, $this->context);
+ if ($this->context->get('PH5PError', true)) {
+ $this->assertIdentical(array(
+ new HTMLPurifier_Token_Start('a', array('"' => ''))
+ ), $result);
+ } else {
+ $this->assertIdentical(array(
+ new HTMLPurifier_Token_Empty('a', array('"' => ''))
+ ), $result);
+ }
+ }
+
function test_tokenizeHTML_unescapedQuote() {
$this->assertTokenization(
'"',