From b621602ac1255273c987de95750a16f8a117db48 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Thu, 31 Aug 2006 22:25:48 +0000 Subject: [PATCH] Speed up cleanUTF8 with iconv. Also factored out code point to character code. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@351 48356398-32a2-884e-a903-53898d9a118a --- library/HTMLPurifier/Encoder.php | 83 ++++++++++++++++++++++----- library/HTMLPurifier/EntityParser.php | 38 +----------- tests/HTMLPurifier/EncoderTest.php | 1 + 3 files changed, 72 insertions(+), 50 deletions(-) diff --git a/library/HTMLPurifier/Encoder.php b/library/HTMLPurifier/Encoder.php index eaef8380..3315b67c 100644 --- a/library/HTMLPurifier/Encoder.php +++ b/library/HTMLPurifier/Encoder.php @@ -20,20 +20,7 @@ class HTMLPurifier_Encoder * respectively. 128 and above the code points map to multibyte * UTF-8 representations. * - * @note The functionality provided by the original function could be - * implemented with iconv using 'UTF-8//IGNORE', mbstring, or - * even the PCRE modifier 'u', these do not allow us to strip - * control characters or disallowed code points, and the latter - * does not allow invalid UTF-8 characters to be ignored. Once - * PHP 6 appears all our problems magically disappear. - * - * @note Decomposing the string into Unicode code points is necessary - * because SGML disallows the use of specific code points, not - * necessarily bytes. A naive implementation that simply strtr - * disallowed code points as bytes will break other Unicode - * characters in which using such bytes is valid. - * - * @note Code adapted from utf8ToUnicode by Henri Sivonen and + * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and * hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the * LGPL license. Notes on what changed are inside, but in general, * the original code transformed UTF-8 text into an array of integer @@ -46,7 +33,33 @@ class HTMLPurifier_Encoder * would need that, and I'm probably not going to implement them. 
* Once again, PHP 6 should solve all our problems. */ - function cleanUTF8($str) { + function cleanUTF8($str, $force_php = false) { // $force_php = true skips iconv and always runs the PHP decoder below + + static $non_sgml_chars = array(); // strtr() map, built once: disallowed character => '' + static $iconv = null; // cached result of function_exists('iconv') + + if (empty($non_sgml_chars)) { + for ($i = 0; $i <= 31; $i++) { + // non-SGML ASCII chars + // save \r, \t and \n + if ($i == 9 || $i == 13 || $i == 10) continue; + $non_sgml_chars[chr($i)] = ''; + } + for ($i = 127; $i <= 159; $i++) { // DEL and the C1 controls, keyed by their UTF-8 byte sequence + $non_sgml_chars[HTMLPurifier_Encoder::unichr($i)] = ''; + } + } + + if ($iconv === null) { + $iconv = function_exists('iconv'); + } + + if ($iconv && !$force_php) { + // do the shortcut way; iconv() returns false on failure, so only trust a string result + $converted = iconv('UTF-8', 'UTF-8//IGNORE', $str); + if ($converted !== false) return strtr($converted, $non_sgml_chars); // else fall through with $str untouched + } + $mState = 0; // cached expected number of octets after the current octet // until the beginning of the next UTF8 character sequence $mUcs4 = 0; // cached Unicode character @@ -179,6 +192,46 @@ class HTMLPurifier_Encoder return $out; } + /** + * Translates a Unicode codepoint into its corresponding UTF-8 character. 
+ */ + function unichr($code) { + if($code > 1114111 or $code < 0 or + ($code >= 55296 and $code <= 57343) ) { + // outside the "valid" range as defined by UNICODE 4.1.0: negative, above + // U+10FFFF, or a surrogate (U+D800..U+DFFF); nothing to encode + return ''; + } + + $x = $y = $z = $w = 0; + if ($code < 128) { + // regular ASCII character: a single byte equal to the code point + $x = $code; + } else { + // set up bits for UTF-8; $x is the trailing byte (10xxxxxx, low 6 bits) + $x = ($code & 63) | 128; + if ($code < 2048) { + $y = (($code & 2047) >> 6) | 192; // 2-byte lead 110xxxxx + } else { + $y = (($code & 4032) >> 6) | 128; // continuation 10xxxxxx (bits 6-11) + if($code < 65536) { + $z = (($code >> 12) & 15) | 224; // 3-byte lead 1110xxxx + } else { + $z = (($code >> 12) & 63) | 128; // continuation 10xxxxxx (bits 12-17) + $w = (($code >> 18) & 7) | 240; // 4-byte lead 11110xxx + } + } + } + // set up the actual character: lead byte first, bytes still at 0 are skipped + $ret = ''; + if($w) $ret .= chr($w); + if($z) $ret .= chr($z); + if($y) $ret .= chr($y); + $ret .= chr($x); + + return $ret; + } + } diff --git a/library/HTMLPurifier/EntityParser.php b/library/HTMLPurifier/EntityParser.php index 61b1c260..83593f7a 100644 --- a/library/HTMLPurifier/EntityParser.php +++ b/library/HTMLPurifier/EntityParser.php @@ -1,6 +1,7 @@ _special_dec2str[$code])) return $entity; - if($code > 1114111 or $code < 0 or - ($code >= 55296 and $code <= 57343) ) { - // bits are set outside the "valid" range as defined - // by UNICODE 4.1.0 - return ''; - } - - $x = $y = $z = $w = 0; - if ($code < 128) { - // regular ASCII character - $x = $code; - } else { - // set up bits for UTF-8 - $x = ($code & 63) | 128; - if ($code < 2048) { - $y = (($code & 2047) >> 6) | 192; - } else { - $y = (($code & 4032) >> 6) | 128; - if($code < 65536) { - $z = (($code >> 12) & 15) | 224; - } else { - $z = (($code >> 12) & 63) | 128; - $w = (($code >> 18) & 7) | 240; - } - } - } - // set up the actual character - $ret = ''; - if($w) $ret .= chr($w); - if($z) $ret .= chr($z); - if($y) $ret .= chr($y); - $ret .= chr($x); - - return $ret; + return HTMLPurifier_Encoder::unichr($code); } else { if (isset($this->_special_ent2dec[$matches[3]])) return $entity; if (!$this->_entity_lookup) { diff --git a/tests/HTMLPurifier/EncoderTest.php 
b/tests/HTMLPurifier/EncoderTest.php index df0f100b..1c45d1ef 100644 --- a/tests/HTMLPurifier/EncoderTest.php +++ b/tests/HTMLPurifier/EncoderTest.php @@ -15,6 +15,7 @@ class HTMLPurifier_EncoderTest extends UnitTestCase function assertCleanUTF8($string, $expect = null) { if ($expect === null) $expect = $string; $this->assertIdentical($this->Encoder->cleanUTF8($string), $expect); + $this->assertIdentical($this->Encoder->cleanUTF8($string, true), $expect); // same expectation with $force_php = true (PHP decoder path) } function test_cleanUTF8() {