From b621602ac1255273c987de95750a16f8a117db48 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <edwardzyang@thewritingpot.com>
Date: Thu, 31 Aug 2006 22:25:48 +0000
Subject: [PATCH] Speed up cleanUTF8 with iconv. Also factored out code point
 to character code.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@351 48356398-32a2-884e-a903-53898d9a118a
---
 library/HTMLPurifier/Encoder.php      | 83 ++++++++++++++++++++++-----
 library/HTMLPurifier/EntityParser.php | 38 +-----------
 tests/HTMLPurifier/EncoderTest.php    |  1 +
 3 files changed, 72 insertions(+), 50 deletions(-)

diff --git a/library/HTMLPurifier/Encoder.php b/library/HTMLPurifier/Encoder.php
index eaef8380..3315b67c 100644
--- a/library/HTMLPurifier/Encoder.php
+++ b/library/HTMLPurifier/Encoder.php
@@ -20,20 +20,7 @@ class HTMLPurifier_Encoder
      *       respectively. 128 and above the code points map to multibyte
      *       UTF-8 representations.
      * 
-     * @note The functionality provided by the original function could be
-     *       implemented with iconv using 'UTF-8//IGNORE', mbstring, or
-     *       even the PCRE modifier 'u', these do not allow us to strip
-     *       control characters or disallowed code points, and the latter
-     *       does not allow invalid UTF-8 characters to be ignored.  Once
-     *       PHP 6 appears all our problems magically disappear.
-     * 
-     * @note Decomposing the string into Unicode code points is necessary
-     *       because SGML disallows the use of specific code points, not
-     *       necessarily bytes.  A naive implementation that simply strtr
-     *       disallowed code points as bytes will break other Unicode
-     *       characters in which using such bytes is valid.
-     * 
-     * @note Code adapted from utf8ToUnicode by Henri Sivonen and
+     * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
      *       hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
      *       LGPL license.  Notes on what changed are inside, but in general,
      *       the original code transformed UTF-8 text into an array of integer
@@ -46,7 +33,33 @@ class HTMLPurifier_Encoder
      *       would need that, and I'm probably not going to implement them.
      *       Once again, PHP 6 should solve all our problems.
      */
-    function cleanUTF8($str) {
+    function cleanUTF8($str, $force_php = false) {
+        
+        static $non_sgml_chars = array();
+        static $iconv = null;
+        
+        if (empty($non_sgml_chars)) {
+            for ($i = 0; $i <= 31; $i++) {
+                // non-SGML ASCII chars
+                // save \r, \t and \n
+                if ($i == 9 || $i == 13 || $i == 10) continue;
+                $non_sgml_chars[chr($i)] = '';
+            }
+            for ($i = 127; $i <= 159; $i++) {
+                $non_sgml_chars[HTMLPurifier_Encoder::unichr($i)] = '';
+            }
+        }
+        
+        if ($iconv === null) {
+            $iconv = function_exists('iconv');
+        }
+        
+        if ($iconv && !$force_php) {
+            // shortcut: iconv drops invalid sequences; if it fails (false), fall through to the PHP code
+            $clean = iconv('UTF-8', 'UTF-8//IGNORE', $str);
+            if ($clean !== false) return strtr($clean, $non_sgml_chars);
+        }
+        
         $mState = 0; // cached expected number of octets after the current octet
                      // until the beginning of the next UTF8 character sequence
         $mUcs4  = 0; // cached Unicode character
@@ -179,6 +192,46 @@ class HTMLPurifier_Encoder
         return $out;
     }
     
+    /**
+     * Translates a Unicode code point (integer) into its UTF-8 byte string; returns '' for an invalid code point.
+     */
+    function unichr($code) {
+        if($code > 1114111 or $code < 0 or
+          ($code >= 55296 and $code <= 57343) ) {
+            // above 0x10FFFF (1114111), negative, or in 0xD800-0xDFFF (55296-57343):
+            // outside the "valid" range as defined by UNICODE 4.1.0, so emit nothing
+            return '';
+        }
+        
+        $x = $y = $z = $w = 0; 
+        if ($code < 128) {
+            // regular ASCII character: the code point itself is the single output byte
+            $x = $code;
+        } else {
+            // multibyte UTF-8: $x is the last byte, $y/$z/$w are filled towards the lead byte
+            $x = ($code & 63) | 128; // low 6 bits, 10xxxxxx
+            if ($code < 2048) {
+                $y = (($code & 2047) >> 6) | 192; // two-byte lead, 110xxxxx
+            } else {
+                $y = (($code & 4032) >> 6) | 128; // bits 6-11, 10xxxxxx
+                if($code < 65536) {
+                    $z = (($code >> 12) & 15) | 224; // three-byte lead, 1110xxxx
+                } else {
+                    $z = (($code >> 12) & 63) | 128; // bits 12-17, 10xxxxxx
+                    $w = (($code >> 18) & 7)  | 240; // four-byte lead, 11110xxx
+                }
+            } 
+        }
+        // assemble from lead byte to last byte; slots still 0 were never set and are skipped
+        $ret = '';
+        if($w) $ret .= chr($w);
+        if($z) $ret .= chr($z);
+        if($y) $ret .= chr($y);
+        $ret .= chr($x); 
+        
+        return $ret;
+    }
+    
     
 }
 
diff --git a/library/HTMLPurifier/EntityParser.php b/library/HTMLPurifier/EntityParser.php
index 61b1c260..83593f7a 100644
--- a/library/HTMLPurifier/EntityParser.php
+++ b/library/HTMLPurifier/EntityParser.php
@@ -1,6 +1,7 @@
 <?php
 
 require_once 'HTMLPurifier/EntityLookup.php';
+require_once 'HTMLPurifier/Encoder.php';
 
 /**
  * Handles referencing and derefencing character entities
@@ -9,7 +10,7 @@ class HTMLPurifier_EntityParser
 {
     
     /**
-     * Reference to entity lookup talbe.
+     * Reference to entity lookup table.
      * @protected
      */
     var $_entity_lookup;
@@ -114,40 +115,7 @@ class HTMLPurifier_EntityParser
             // abort for special characters
             if (isset($this->_special_dec2str[$code]))  return $entity;
             
-            if($code > 1114111 or $code < 0 or
-              ($code >= 55296 and $code <= 57343) ) {
-                // bits are set outside the "valid" range as defined
-                // by UNICODE 4.1.0 
-                return '';
-            }
-            
-            $x = $y = $z = $w = 0; 
-            if ($code < 128) {
-                // regular ASCII character
-                $x = $code;
-            } else {
-                // set up bits for UTF-8
-                $x = ($code & 63) | 128;
-                if ($code < 2048) {
-                    $y = (($code & 2047) >> 6) | 192;
-                } else {
-                    $y = (($code & 4032) >> 6) | 128;
-                    if($code < 65536) {
-                        $z = (($code >> 12) & 15) | 224;
-                    } else {
-                        $z = (($code >> 12) & 63) | 128;
-                        $w = (($code >> 18) & 7)  | 240;
-                    }
-                } 
-            }
-            // set up the actual character
-            $ret = '';
-            if($w) $ret .= chr($w);
-            if($z) $ret .= chr($z);
-            if($y) $ret .= chr($y);
-            $ret .= chr($x); 
-            
-            return $ret;
+            return HTMLPurifier_Encoder::unichr($code);
         } else {
             if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
             if (!$this->_entity_lookup) {
diff --git a/tests/HTMLPurifier/EncoderTest.php b/tests/HTMLPurifier/EncoderTest.php
index df0f100b..1c45d1ef 100644
--- a/tests/HTMLPurifier/EncoderTest.php
+++ b/tests/HTMLPurifier/EncoderTest.php
@@ -15,6 +15,7 @@ class HTMLPurifier_EncoderTest extends UnitTestCase
     function assertCleanUTF8($string, $expect = null) {
         if ($expect === null) $expect = $string;
         $this->assertIdentical($this->Encoder->cleanUTF8($string), $expect);
+        $this->assertIdentical($this->Encoder->cleanUTF8($string, true), $expect); // $force_php: skip the iconv shortcut
     }
     
     function test_cleanUTF8() {