[2.1.2] Implement experimental HTML5 parsing using PH5P

- Fix debugger so that tokens can be printed without an index - Fix some broken PEAR unit tests git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1383 48356398-32a2-884e-a903-53898d9a118a
2025-03-24 22:57:03 +00:00 · 2007-08-19 18:49:35 +00:00 · 2007-08-19 18:49:35 +00:00 · cb92a57e4e
commit cb92a57e4e
parent 423afedbf4
7 changed files with 4014 additions and 26 deletions
--- a/8
+++ b/8
@ -11,6 +11,12 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier

 2.1.2, unknown release date
 ! Implemented Object module for trusted users
+! Implemented experimental HTML5 parsing mode using PH5P. To use, add
+  this to your code:
+        require_once 'HTMLPurifier/Lexer/PH5P.php';
+        $config->set('Core', 'LexerImpl', 'PH5P');
+  Note that this Lexer introduces some classes not in the HTMLPurifier
+  namespace.
 - Fix non-visible parsing error in DirectLex with empty tags that have
  slashes inside attribute values.
 - Fix typo in CSS definition: border-collapse:seperate; was incorrectly
@ -21,6 +27,8 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
 . Unit test refactoring for one logical test per test function
 . Config and context parameters in ComplexHarness deprecated: instead, edit
  the $config and $context member variables
+. HTML wrapper in DOMLex now takes DTD identifiers into account; doesn't
+  really make a difference, but is good for completeness' sake

 2.1.1, released 2007-08-04
 - Fix show-stopper bug in %URI.MakeAbsolute functionality
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@ -189,6 +189,9 @@ class HTMLPurifier_Lexer
                return new HTMLPurifier_Lexer_DOMLex();
            case 'DirectLex':
                return new HTMLPurifier_Lexer_DirectLex();
+            case 'PH5P':
+                // experimental Lexer that must be manually included
+                return new HTMLPurifier_Lexer_PH5P();
            default:
                trigger_error("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer), E_USER_ERROR);
        }
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@ -53,14 +53,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
        }
        
        // preprocess html, essential for UTF-8
-        $html =
-            '<!DOCTYPE html '.
-                'PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"'.
-                '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'.
-            '<html><head>'.
-            '<meta http-equiv="Content-Type" content="text/html;'.
-                ' charset=utf-8" />'.
-            '</head><body><div>'.$html.'</div></body></html>';
+        $html = $this->wrapHTML($html, $config, $context);
        
        $doc = new DOMDocument();
        $doc->encoding = 'UTF-8'; // theoretically, the above has this covered
@ -177,5 +170,25 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
        return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
    }
    
+    /**
+     * Wraps an HTML fragment in the necessary HTML
+     */
+    function wrapHTML($html, $config, &$context) {
+        $def = $config->getDefinition('HTML');
+        $ret = '';
+        
+        if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
+            $ret .= '<!DOCTYPE html ';
+            if (!empty($def->doctype->dtdPublic)) $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
+            if (!empty($def->doctype->dtdSystem)) $ret .= '"' . $def->doctype->dtdSystem . '" ';
+            $ret .= '>';
+        }
+        
+        $ret .= '<html><head>';
+        $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
+        $ret .= '</head><body><div>'.$html.'</div></body></html>';
+        return $ret;
+    }
+    
 }

--- a/library/HTMLPurifier/Lexer/PH5P.php
+++ b/library/HTMLPurifier/Lexer/PH5P.php
--- a/maintenance/PH5P.patch
+++ b/maintenance/PH5P.patch
@ -0,0 +1,45 @@
+--- old.php	2007-08-19 14:42:33.640625000 -0400
+++ new.php	2007-08-19 14:41:51.609375000 -0400
+@@ -211,7 +211,10 @@
+         // If nothing is returned, emit a U+0026 AMPERSAND character token.
+         // Otherwise, emit the character token that was returned.
+         $char = (!$entity) ? '&' : $entity;
+-        $this->emitToken($char);
+        $this->emitToken(array(
+            'type' => self::CHARACTR,
+            'data' => $char
+        ));
+ 
+         // Finally, switch to the data state.
+         $this->state = 'data';
+@@ -708,7 +711,7 @@
+         } elseif($char === '&') {
+             /* U+0026 AMPERSAND (&)
+             Switch to the entity in attribute value state. */
+-            $this->entityInAttributeValueState('non');
+            $this->entityInAttributeValueState();
+ 
+         } elseif($char === '>') {
+             /* U+003E GREATER-THAN SIGN (>)
+@@ -738,7 +741,8 @@
+             ? '&'
+             : $entity;
+ 
+-        $this->emitToken($char);
+        $last = count($this->token['attr']) - 1;
+        $this->token['attr'][$last]['value'] .= $char;
+     }
+ 
+     private function bogusCommentState() {
+@@ -1066,6 +1070,11 @@
+                     $this->char++;
+ 
+                     if(in_array($id, $this->entities)) {
+                        if ($e_name[$c-1] !== ';') {
+                            if ($c < $len && $e_name[$c] == ';') {
+                                $this->char++; // consume extra semicolon
+                            }
+                        }
+                         $entity = $id;
+                         break;
+                     }
--- a/tests/Debugger.php
+++ b/tests/Debugger.php
@ -54,14 +54,14 @@ function isInScopes($array = array()) {
 }
 /**#@-*/

-function printTokens($tokens, $index) {
+function printTokens($tokens, $index = null) {
    $string = '<pre>';
    $generator = new HTMLPurifier_Generator();
    foreach ($tokens as $i => $token) {
-        if ($index == $i) $string .= '[<strong>';
+        if ($index === $i) $string .= '[<strong>';
        $string .= "<sup>$i</sup>";
        $string .= $generator->escape($generator->generateFromToken($token));
-        if ($index == $i) $string .= '</strong>]';
+        if ($index === $i) $string .= '</strong>]';
    }
    $string .= '</pre>';
    echo $string;
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@ -18,6 +18,9 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
            require_once 'HTMLPurifier/Lexer/PEARSax3.php';
            $this->_has_pear = true;
        }
+        if ($GLOBALS['HTMLPurifierTest']['PH5P']) {
+            require_once 'HTMLPurifier/Lexer/PH5P.php';
+        }
    }
    
    // HTMLPurifier_Lexer::create() --------------------------------------------
@ -139,14 +142,21 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
        if ($this->_has_pear) $lexers['PEARSax3']   = new HTMLPurifier_Lexer_PEARSax3();
        if (version_compare(PHP_VERSION, "5", ">=") && class_exists('DOMDocument')) {
            $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
+            $lexers['PH5P']   = new HTMLPurifier_Lexer_PH5P();
        }
        foreach ($lexers as $name => $lexer) {
            $result = $lexer->tokenizeHTML($input, $this->config, $this->context);
            if (isset($alt_expect[$name])) {
                if ($alt_expect[$name] === false) continue;
-                $this->assertIdentical($result, $alt_expect[$name]);
+                $t_expect = $alt_expect[$name];
+                $this->assertIdentical($result, $alt_expect[$name], "$name: %s");
            } else {
-                $this->assertIdentical($result, $expect);
+                $t_expect = $expect;
+                $this->assertIdentical($result, $expect, "$name: %s");
+            }
+            if ($t_expect != $result) {
+                printTokens($result);
+                //var_dump($result);
            }
        }
    }
@ -206,8 +216,7 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
                new HTMLPurifier_Token_End('ASDF'),
            ),
            array(
-                // DOMLex automatically closes invalid tags
-                'DOMLex' => array(
+                'DOMLex' => $alt = array(
                    new HTMLPurifier_Token_Empty('asdf'),
                    new HTMLPurifier_Token_Empty('d'),
                    new HTMLPurifier_Token_Start('pooloka'),
@ -216,6 +225,7 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
                    new HTMLPurifier_Token_End('poolasdf'),
                    new HTMLPurifier_Token_End('pooloka'),
                ),
+                'PH5P' => $alt,
            )
        );
    }
@ -244,7 +254,10 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
    function test_tokenizeHTML_comment() {
        $this->assertTokenization(
            '<!-- Comment -->',
-            array( new HTMLPurifier_Token_Comment(' Comment ') )
+            array( new HTMLPurifier_Token_Comment(' Comment ') ),
+            array(
+                'PEARSax3' => array( new HTMLPurifier_Token_Comment('-- Comment --') ),
+            )
        );
    }
    
@ -253,7 +266,7 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
            '<!-- not so well formed --->',
            array( new HTMLPurifier_Token_Comment(' not so well formed -') ),
            array(
-                'PEARSax3' => false, // behavior is undefined
+                'PEARSax3' => array( new HTMLPurifier_Token_Comment('-- not so well formed ---') ),
            )
        );
    }
@ -266,6 +279,7 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
                // I like our behavior better, but it's non-standard
                'DOMLex'   => array( new HTMLPurifier_Token_Empty('a', array('href'=>'')) ),
                'PEARSax3' => array( new HTMLPurifier_Token_Start('a', array('href'=>'')) ),
+                'PH5P' => false, // total barfing, grabs scaffolding too
            )
        );
    }
@ -277,13 +291,13 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
                new HTMLPurifier_Token_Text('<b>')
            ),
            array(
-                // it is possible to configure PEARSax3 to clump nodes together,
-                // I just don't know how
-                'PEARSax3' => array(
+                // some parsers will separate entities out
+                'PEARSax3' => $split = array(
                    new HTMLPurifier_Token_Text('<'),
                    new HTMLPurifier_Token_Text('b'),
                    new HTMLPurifier_Token_Text('>'),
-                )
+                ),
+                'PH5P' => $split,
            )
        );
    }
@ -298,6 +312,9 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
                    new HTMLPurifier_Token_Start('a', array('"' => ''))
                ),
                'PEARSax3' => $tokens,
+                'PH5P' => array(
+                    new HTMLPurifier_Token_Empty('a', array('"' => ''))
+                ),
            )
        );
    }
@ -312,7 +329,10 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
    function test_tokenizeHTML_escapedQuote() {
        $this->assertTokenization(
            '&quot;',
-            array( new HTMLPurifier_Token_Text('"') )
+            array( new HTMLPurifier_Token_Text('"') ),
+            array(
+                'PEARSax3' => false, // PEAR barfs on this
+            )
        );
    }
    
@ -322,7 +342,7 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
            array( new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!') ),
            array(
                // PEAR splits up all of the CDATA
-                'PEARSax3' => array(
+                'PEARSax3' => $split = array(
                    new HTMLPurifier_Token_Text('You '),
                    new HTMLPurifier_Token_Text('<'),
                    new HTMLPurifier_Token_Text('b'),
@ -335,6 +355,7 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
                    new HTMLPurifier_Token_Text('>'),
                    new HTMLPurifier_Token_Text(' get me!'),
                ),
+                'PH5P' => $split,
            )
        );
    }
@ -351,10 +372,11 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
            '<![CDATA[&rarr;]]>',
            array( new HTMLPurifier_Token_Text("&rarr;") ),
            array(
-                'PEARSax3' => array(
+                'PEARSax3' => $split = array(
                    new HTMLPurifier_Token_Text('&'),
                    new HTMLPurifier_Token_Text('rarr;'),
                ),
+                'PH5P' => $split,
            )
        );
    }
@ -403,6 +425,13 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
                    new HTMLPurifier_Token_End('b'),
                ),
                'PEARSax3' => false, // totally mangled
+                'PH5P' => array( // interesting grouping
+                    new HTMLPurifier_Token_Start('b'),
+                    new HTMLPurifier_Token_Text('Whoa! '),
+                    new HTMLPurifier_Token_Text('<'),
+                    new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
+                    new HTMLPurifier_Token_End('b'),
+                ),
            )
        );
    }
@ -426,7 +455,8 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
            array( new HTMLPurifier_Token_Comment(' This >< comment') ),
            array(
                'DOMLex'   => false,
-                'PEARSax3' => false
+                'PEARSax3' => false,
+                'PH5P'     => false,
            )
        );
    }
@ -434,14 +464,17 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
    function test_tokenizeHTML_scriptCDATAContents() {
        $this->config->set('HTML', 'Trusted', true);
        $this->assertTokenization(
-            '<script>alert("<foo>");</script>',
+            'Foo: <script>alert("<foo>");</script>',
            array(
+                new HTMLPurifier_Token_Text('Foo: '),
                new HTMLPurifier_Token_Start('script'),
                new HTMLPurifier_Token_Text('alert("<foo>");'),
                new HTMLPurifier_Token_End('script'),
            ),
            array(
                'PEARSax3' => false,
+                // PH5P, for some reason, bubbles the script to <head>
+                'PH5P' => false,
            )
        );
    }