Speed up cleanUTF8 with iconv. Also factored out the code-point-to-UTF-8-character conversion into HTMLPurifier_Encoder::unichr().

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@351 48356398-32a2-884e-a903-53898d9a118a
2024-12-22 08:21:52 +00:00 · 2006-08-31 22:25:48 +00:00 · 2006-08-31 22:25:48 +00:00 · b621602ac1
commit b621602ac1
parent 14aeafcf22
3 changed files with 72 additions and 50 deletions
--- a/library/HTMLPurifier/Encoder.php
+++ b/library/HTMLPurifier/Encoder.php
@ -20,20 +20,7 @@ class HTMLPurifier_Encoder
     *       respectively. 128 and above the code points map to multibyte
     *       UTF-8 representations.
     * 
-     * @note The functionality provided by the original function could be
-     *       implemented with iconv using 'UTF-8//IGNORE', mbstring, or
-     *       even the PCRE modifier 'u', these do not allow us to strip
-     *       control characters or disallowed code points, and the latter
-     *       does not allow invalid UTF-8 characters to be ignored.  Once
-     *       PHP 6 appears all our problems magically disappear.
-     * 
-     * @note Decomposing the string into Unicode code points is necessary
-     *       because SGML disallows the use of specific code points, not
-     *       necessarily bytes.  A naive implementation that simply strtr
-     *       disallowed code points as bytes will break other Unicode
-     *       characters in which using such bytes is valid.
-     * 
-     * @note Code adapted from utf8ToUnicode by Henri Sivonen and
+     * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
     *       hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
     *       LGPL license.  Notes on what changed are inside, but in general,
     *       the original code transformed UTF-8 text into an array of integer
@ -46,7 +33,33 @@ class HTMLPurifier_Encoder
     *       would need that, and I'm probably not going to implement them.
     *       Once again, PHP 6 should solve all our problems.
     */
-    function cleanUTF8($str) {
+    function cleanUTF8($str, $force_php = false) {
+        
+        static $non_sgml_chars = array();
+        static $iconv = null;
+        
+        if (empty($non_sgml_chars)) {
+            for ($i = 0; $i <= 31; $i++) {
+                // non-SGML ASCII chars
+                // save \r, \t and \n
+                if ($i == 9 || $i == 13 || $i == 10) continue;
+                $non_sgml_chars[chr($i)] = '';
+            }
+            for ($i = 127; $i <= 159; $i++) {
+                $non_sgml_chars[HTMLPurifier_Encoder::unichr($i)] = '';
+            }
+        }
+        
+        if ($iconv === null) {
+            $iconv = function_exists('iconv');
+        }
+        
+        if ($iconv && !$force_php) {
+            // do the shortcut way
+            $str = iconv('UTF-8', 'UTF-8//IGNORE', $str);
+            return strtr($str, $non_sgml_chars);;
+        }
+        
        $mState = 0; // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0; // cached Unicode character
@ -179,6 +192,46 @@ class HTMLPurifier_Encoder
        return $out;
    }
    
+    /**
+     * Translates a Unicode codepoint into its corresponding UTF-8 character.
+     *
+     * @param $code Integer Unicode code point to encode.
+     * @return String of one to four bytes holding the UTF-8 encoding of
+     *         $code, or an empty string when $code is above 1114111
+     *         (U+10FFFF), negative, or inside the surrogate range
+     *         55296-57343 (U+D800-U+DFFF).
+     */
+    function unichr($code) {
+        if($code > 1114111 or $code < 0 or
+          ($code >= 55296 and $code <= 57343) ) {
+            // reject code points above U+10FFFF, below zero, or in the
+            // surrogate range: outside the "valid" range of UNICODE 4.1.0
+            return '';
+        }
+        
+        $x = $y = $z = $w = 0; 
+        if ($code < 128) {
+            // regular ASCII character: emitted as a single byte
+            $x = $code;
+        } else {
+            // set up bits for UTF-8: $x is the last byte, $y/$z/$w precede it
+            $x = ($code & 63) | 128;
+            if ($code < 2048) {
+                $y = (($code & 2047) >> 6) | 192;
+            } else {
+                $y = (($code & 4032) >> 6) | 128;
+                if($code < 65536) {
+                    $z = (($code >> 12) & 15) | 224;
+                } else {
+                    $z = (($code >> 12) & 63) | 128;
+                    $w = (($code >> 18) & 7)  | 240;
+                }
+            } 
+        }
+        // set up the actual character; $w, $z, $y stay 0 when unused and are skipped
+        $ret = '';
+        if($w) $ret .= chr($w);
+        if($z) $ret .= chr($z);
+        if($y) $ret .= chr($y);
+        $ret .= chr($x); 
+        
+        return $ret;
+    }
+    
    
 }

--- a/library/HTMLPurifier/EntityParser.php
+++ b/library/HTMLPurifier/EntityParser.php
@ -1,6 +1,7 @@
 <?php

 require_once 'HTMLPurifier/EntityLookup.php';
+require_once 'HTMLPurifier/Encoder.php';

 /**
 * Handles referencing and derefencing character entities
@ -9,7 +10,7 @@ class HTMLPurifier_EntityParser
 {
    
    /**
-     * Reference to entity lookup talbe.
+     * Reference to entity lookup table.
     * @protected
     */
    var $_entity_lookup;
@ -114,40 +115,7 @@ class HTMLPurifier_EntityParser
            // abort for special characters
            if (isset($this->_special_dec2str[$code]))  return $entity;
            
-            if($code > 1114111 or $code < 0 or
-              ($code >= 55296 and $code <= 57343) ) {
-                // bits are set outside the "valid" range as defined
-                // by UNICODE 4.1.0 
-                return '';
-            }
-            
-            $x = $y = $z = $w = 0; 
-            if ($code < 128) {
-                // regular ASCII character
-                $x = $code;
-            } else {
-                // set up bits for UTF-8
-                $x = ($code & 63) | 128;
-                if ($code < 2048) {
-                    $y = (($code & 2047) >> 6) | 192;
-                } else {
-                    $y = (($code & 4032) >> 6) | 128;
-                    if($code < 65536) {
-                        $z = (($code >> 12) & 15) | 224;
-                    } else {
-                        $z = (($code >> 12) & 63) | 128;
-                        $w = (($code >> 18) & 7)  | 240;
-                    }
-                } 
-            }
-            // set up the actual character
-            $ret = '';
-            if($w) $ret .= chr($w);
-            if($z) $ret .= chr($z);
-            if($y) $ret .= chr($y);
-            $ret .= chr($x); 
-            
-            return $ret;
+            return HTMLPurifier_Encoder::unichr($code);
        } else {
            if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
            if (!$this->_entity_lookup) {
--- a/tests/HTMLPurifier/EncoderTest.php
+++ b/tests/HTMLPurifier/EncoderTest.php
@ -15,6 +15,7 @@ class HTMLPurifier_EncoderTest extends UnitTestCase
    /**
     * Asserts that cleanUTF8() turns $string into $expect, checking both the
     * default call and the call with its second argument set to true.
     *
     * @param $string Input passed to cleanUTF8().
     * @param $expect Expected output; when null, $string is expected to come
     *                back unchanged.
     */
    function assertCleanUTF8($string, $expect = null) {
        if ($expect === null) $expect = $string;
        $this->assertIdentical($this->Encoder->cleanUTF8($string), $expect);
+        $this->assertIdentical($this->Encoder->cleanUTF8($string, true), $expect);
    }
    
    function test_cleanUTF8() {