[3.1.0] Fixed fatal error in PH5P lexer with invalid tag names

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1650 48356398-32a2-884e-a903-53898d9a118a
2024-12-22 08:21:52 +00:00 · 2008-04-05 04:28:37 +00:00 · 2008-04-05 04:28:37 +00:00 · 9f1e678b48
commit 9f1e678b48
parent c216968087
6 changed files with 93 additions and 24 deletions
--- a/1
+++ b/1
@ -55,6 +55,7 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
 - Fix bug with rgb(0, 1, 2) color syntax with spaces inside shorthand syntax
 - HTMLPurifier_HTMLDefinition->addAttribute can now be called multiple times
  on the same element without emitting errors.
+- Fixed fatal error in PH5P lexer with invalid tag names
 . Plugins now get their own changelogs according to project conventions.
 . Convert tokens to use instanceof, reducing memory footprint and
  improving comparison speed.
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@ -63,16 +63,10 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
            $e =& $context->get('ErrorCollector');
        }
        
-        // infinite loop protection
-        // has to be pretty big, since html docs can be big
-        // we're allow two hundred thousand tags... more than enough?
-        // NOTE: this is also used for synchronization, so watch out
+        // for testing synchronization
        $loops = 0;
        
-        while(true) {
-            
-            // infinite loop protection
-            if (++$loops > 200000) return array();
+        while(++$loops) {
            
            // recalculate lines
            if (
@ -381,16 +375,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
        // space, so let's guarantee that there's always a terminating space.
        $string .= ' ';
        
-        // infinite loop protection
-        $loops = 0;
        while(true) {
            
-            // infinite loop protection
-            if (++$loops > 1000) {
-                trigger_error('Infinite loop detected in attribute parsing', E_USER_WARNING);
-                return array();
-            }
-            
            if ($cursor >= $size) {
                break;
            }
--- a/library/HTMLPurifier/Lexer/PH5P.php
+++ b/library/HTMLPurifier/Lexer/PH5P.php
@ -115,7 +115,7 @@ class HTML5 {

    public function __construct($data) {
        $data = str_replace("\r\n", "\n", $data);
-        $date = str_replace("\r", null, $data);
+        $data = str_replace("\r", null, $data);

        $this->data = $data;
        $this->char = -1;
@ -2143,7 +2143,7 @@ class HTML5TreeConstructer {
                    /* Reconstruct the active formatting elements, if any. */
                    $this->reconstructActiveFormattingElements();

-                    $this->insertElement($token);
+                    $this->insertElement($token, true, true);
                break;
            }
            break;
@ -3524,7 +3524,18 @@ class HTML5TreeConstructer {
        }
    }

-    private function insertElement($token, $append = true) {
+    private function insertElement($token, $append = true, $check = false) {
+        // Proprietary workaround for libxml2's limitations with tag names
+        if ($check) {
+            // Slightly modified HTML5 tag-name modification,
+            // removing anything that's not an ASCII letter, digit, or hyphen
+            $token['name'] = preg_replace('/[^a-z0-9-]/i', '', $token['name']);
+            // Remove leading hyphens and numbers
+            $token['name'] = ltrim($token['name'], '-0..9');
+            // In theory, this should never be needed, but just in case
+            if ($token['name'] === '') $token['name'] = 'span'; // arbitrary generic choice
+        }
+        
        $el = $this->dom->createElement($token['name']);

        foreach($token['attr'] as $attr) {
--- a/maintenance/PH5P.patch
+++ b/maintenance/PH5P.patch
@ -1,5 +1,14 @@
--- C:\Users\Edward\Webs\htmlpurifier\maintenance\PH5P.php	2007-11-04 23:41:49.074543700 -0500
-+++ C:\Users\Edward\Webs\htmlpurifier\maintenance/PH5P.new.php	2007-11-05 00:23:52.839543700 -0500
+--- C:\Users\Edward\Webs\htmlpurifier\maintenance\PH5P.php	2007-11-05 00:01:51.643585000 -0500
+++ C:\Users\Edward\Webs\htmlpurifier\maintenance/PH5P.new.php	2008-04-05 00:26:39.343160000 -0400
+@@ -65,7 +65,7 @@
+ 
+     public function __construct($data) {
+         $data = str_replace("\r\n", "\n", $data);
+-        $date = str_replace("\r", null, $data);
+        $data = str_replace("\r", null, $data);
+ 
+         $this->data = $data;
+         $this->char = -1;
@@ -211,7 +211,10 @@
         // If nothing is returned, emit a U+0026 AMPERSAND character token.
         // Otherwise, emit the character token that was returned.
@ -43,7 +52,36 @@
                         $entity = $id;
                         break;
                     }
-@@ -3659,7 +3668,7 @@
+@@ -2084,7 +2093,7 @@
+                     /* Reconstruct the active formatting elements, if any. */
+                     $this->reconstructActiveFormattingElements();
+ 
+-                    $this->insertElement($token);
+                    $this->insertElement($token, true, true);
+                 break;
+             }
+             break;
+@@ -3465,7 +3474,18 @@
+         }
+     }
+ 
+-    private function insertElement($token, $append = true) {
+    private function insertElement($token, $append = true, $check = false) {
+        // Proprietary workaround for libxml2's limitations with tag names
+        if ($check) {
+            // Slightly modified HTML5 tag-name modification,
+            // removing anything that's not an ASCII letter, digit, or hyphen
+            $token['name'] = preg_replace('/[^a-z0-9-]/i', '', $token['name']);
+            // Remove leading hyphens and numbers
+            $token['name'] = ltrim($token['name'], '-0..9');
+            // In theory, this should never be needed, but just in case
+            if ($token['name'] === '') $token['name'] = 'span'; // arbitrary generic choice
+        }
+        
+         $el = $this->dom->createElement($token['name']);
+ 
+         foreach($token['attr'] as $attr) {
+@@ -3659,7 +3679,7 @@
         }
     }
 
@ -52,7 +90,7 @@
         /* When the steps below require the UA to generate implied end tags,
         then, if the current node is a dd element, a dt element, an li element,
         a p element, a td element, a th  element, or a tr element, the UA must
-@@ -3673,7 +3682,8 @@
+@@ -3673,7 +3693,8 @@
         }
     }
 
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@ -509,7 +509,7 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
        );
    }
    
-    function test_tokenizeHTML_() {
+    function test_tokenizeHTML_style() {
        $extra = array(
                // PH5P doesn't seem to like style tags
                'PH5P' => false,
@ -543,6 +543,26 @@ div {}
        );
    }
    
+    function test_tokenizeHTML_() {
+        $this->assertTokenization(
+            '<a@>>',
+            array(
+                new HTMLPurifier_Token_Start('a'),
+                new HTMLPurifier_Token_Text('>'),
+                new HTMLPurifier_Token_End('a'),
+            ),
+            array(
+                'DirectLex' => array(
+                    // Technically this is invalid, but it won't be a
+                    // problem with invalid element removal; also, this
+                    // mimics Mozilla's parsing of the tag.
+                    new HTMLPurifier_Token_Start('a@'),
+                    new HTMLPurifier_Token_Text('>'),
+                ),
+            )
+        );
+    }
+    
    /*
    
    function test_tokenizeHTML_() {
--- a/tests/common.php
+++ b/tests/common.php
@ -209,3 +209,16 @@ function htmlpurifier_flush($php, $reporter) {
        exit(1);
    }
 }
+
+/**
+ * Dumps error queue, useful if there has been a fatal error.
+ */
+function htmlpurifier_dump_error_queue() {
+    $context = &SimpleTest::getContext();
+    $queue = &$context->get('SimpleErrorQueue');
+    if ($queue && !empty($queue->_queue)) {
+        // replace this with something prettier
+        var_dump($queue->_queue);
+    }
+}
+register_shutdown_function('htmlpurifier_dump_error_queue');