Speed up cleanUTF8 with iconv. Also factored out the code-point-to-character conversion code.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@351 48356398-32a2-884e-a903-53898d9a118a
2025-01-03 13:21:51 +00:00 · 2006-08-31 22:25:48 +00:00 · 2006-08-31 22:25:48 +00:00 · b621602ac1
commit b621602ac1
parent 14aeafcf22
3 changed files with 72 additions and 50 deletions
--- a/library/HTMLPurifier/Encoder.php
+++ b/library/HTMLPurifier/Encoder.php
@ -20,20 +20,7 @@ class HTMLPurifier_Encoder
     *       respectively. 128 and above the code points map to multibyte
     *       UTF-8 representations.
     * 
-     * @note The functionality provided by the original function could be
+     * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
     *       implemented with iconv using 'UTF-8//IGNORE', mbstring, or
     *       even the PCRE modifier 'u', these do not allow us to strip
     *       control characters or disallowed code points, and the latter
     *       does not allow invalid UTF-8 characters to be ignored.  Once
     *       PHP 6 appears all our problems magically disappear.
     * 
     * @note Decomposing the string into Unicode code points is necessary
     *       because SGML disallows the use of specific code points, not
     *       necessarily bytes.  A naive implementation that simply strtr
     *       disallowed code points as bytes will break other Unicode
     *       characters in which using such bytes is valid.
     * 
     * @note Code adapted from utf8ToUnicode by Henri Sivonen and
     *       hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
     *       LGPL license.  Notes on what changed are inside, but in general,
     *       the original code transformed UTF-8 text into an array of integer
@ -46,7 +33,33 @@ class HTMLPurifier_Encoder
     *       would need that, and I'm probably not going to implement them.
     *       Once again, PHP 6 should solve all our problems.
     */
-    function cleanUTF8($str) {
+    function cleanUTF8($str, $force_php = false) {
        static $non_sgml_chars = array();
        static $iconv = null;
        if (empty($non_sgml_chars)) {
            for ($i = 0; $i <= 31; $i++) {
                // non-SGML ASCII chars
                // save \r, \t and \n
                if ($i == 9 || $i == 13 || $i == 10) continue;
                $non_sgml_chars[chr($i)] = '';
            }
            for ($i = 127; $i <= 159; $i++) {
                $non_sgml_chars[HTMLPurifier_Encoder::unichr($i)] = '';
            }
        }
        if ($iconv === null) {
            $iconv = function_exists('iconv');
        }
        if ($iconv && !$force_php) {
            // do the shortcut way
            $str = iconv('UTF-8', 'UTF-8//IGNORE', $str);
            return strtr($str, $non_sgml_chars);;
        }
        $mState = 0; // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0; // cached Unicode character
@ -179,6 +192,46 @@ class HTMLPurifier_Encoder
        return $out;
    }
    /**
     * Translates a Unicode codepoint into its corresponding UTF-8 character.
     */
    function unichr($code) {
        // Reject values beyond U+10FFFF, negative values, and the
        // surrogate block U+D800..U+DFFF: an empty string is returned.
        $is_surrogate = ($code >= 0xD800 && $code <= 0xDFFF);
        if ($code > 0x10FFFF || $code < 0 || $is_surrogate) {
            return '';
        }

        // One byte: the code point is the byte value itself.
        if ($code < 0x80) {
            return chr($code);
        }

        // Every multibyte form ends with a continuation byte that
        // carries the lowest six bits of the code point.
        $last = chr(($code & 0x3F) | 0x80);

        // Two bytes: lead byte 110xxxxx followed by the trailing byte.
        if ($code < 0x800) {
            return chr((($code & 0x7FF) >> 6) | 0xC0) . $last;
        }

        // Three- and four-byte forms share a continuation byte built
        // from bits 6..11.
        $middle = chr((($code & 0xFC0) >> 6) | 0x80);

        // Three bytes: lead byte 1110xxxx.
        if ($code < 0x10000) {
            return chr((($code >> 12) & 0x0F) | 0xE0) . $middle . $last;
        }

        // Four bytes: lead byte 11110xxx, then a continuation byte
        // holding bits 12..17.
        $lead   = chr((($code >> 18) & 0x07) | 0xF0);
        $second = chr((($code >> 12) & 0x3F) | 0x80);
        return $lead . $second . $middle . $last;
    }
 }
--- a/library/HTMLPurifier/EntityParser.php
+++ b/library/HTMLPurifier/EntityParser.php
@ -1,6 +1,7 @@
 <?php
 require_once 'HTMLPurifier/EntityLookup.php';
 require_once 'HTMLPurifier/Encoder.php';
 /**
 * Handles referencing and dereferencing character entities
@ -9,7 +10,7 @@ class HTMLPurifier_EntityParser
 {
    /**
-     * Reference to entity lookup talbe.
+     * Reference to entity lookup table.
     * @protected
     */
    var $_entity_lookup;
@ -114,40 +115,7 @@ class HTMLPurifier_EntityParser
            // abort for special characters
            if (isset($this->_special_dec2str[$code]))  return $entity;
-            if($code > 1114111 or $code < 0 or
+            return HTMLPurifier_Encoder::unichr($code);
              ($code >= 55296 and $code <= 57343) ) {
                // bits are set outside the "valid" range as defined
                // by UNICODE 4.1.0 
                return '';
            }
            $x = $y = $z = $w = 0; 
            if ($code < 128) {
                // regular ASCII character
                $x = $code;
            } else {
                // set up bits for UTF-8
                $x = ($code & 63) | 128;
                if ($code < 2048) {
                    $y = (($code & 2047) >> 6) | 192;
                } else {
                    $y = (($code & 4032) >> 6) | 128;
                    if($code < 65536) {
                        $z = (($code >> 12) & 15) | 224;
                    } else {
                        $z = (($code >> 12) & 63) | 128;
                        $w = (($code >> 18) & 7)  | 240;
                    }
                } 
            }
            // set up the actual character
            $ret = '';
            if($w) $ret .= chr($w);
            if($z) $ret .= chr($z);
            if($y) $ret .= chr($y);
            $ret .= chr($x); 
            return $ret;
        } else {
            if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
            if (!$this->_entity_lookup) {
--- a/tests/HTMLPurifier/EncoderTest.php
+++ b/tests/HTMLPurifier/EncoderTest.php
@ -15,6 +15,7 @@ class HTMLPurifier_EncoderTest extends UnitTestCase
    function assertCleanUTF8($string, $expect = null) {
        // When no expectation is supplied, the input is expected to
        // come back unchanged.
        $expected = ($expect === null) ? $string : $expect;

        // First call uses cleanUTF8's default second argument.
        $default_result = $this->Encoder->cleanUTF8($string);
        $this->assertIdentical($default_result, $expected);

        // Second call passes true as the second argument; both calls
        // must produce the same expected string.
        $forced_result = $this->Encoder->cleanUTF8($string, true);
        $this->assertIdentical($forced_result, $expected);
    }
    function test_cleanUTF8() {