0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-01-08 15:11:51 +00:00

Speed up cleanUTF8 with iconv. Also factored out code point to character code.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@351 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2006-08-31 22:25:48 +00:00
parent 14aeafcf22
commit b621602ac1
3 changed files with 72 additions and 50 deletions

View File

@ -20,20 +20,7 @@ class HTMLPurifier_Encoder
* respectively. 128 and above the code points map to multibyte * respectively. 128 and above the code points map to multibyte
* UTF-8 representations. * UTF-8 representations.
* *
* @note The functionality provided by the original function could be * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
* implemented with iconv using 'UTF-8//IGNORE', mbstring, or
* even the PCRE modifier 'u', these do not allow us to strip
* control characters or disallowed code points, and the latter
* does not allow invalid UTF-8 characters to be ignored. Once
* PHP 6 appears all our problems magically disappear.
*
* @note Decomposing the string into Unicode code points is necessary
* because SGML disallows the use of specific code points, not
* necessarily bytes. A naive implementation that simply strtr
* disallowed code points as bytes will break other Unicode
* characters in which using such bytes is valid.
*
* @note Code adapted from utf8ToUnicode by Henri Sivonen and
* hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the * hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
* LGPL license. Notes on what changed are inside, but in general, * LGPL license. Notes on what changed are inside, but in general,
* the original code transformed UTF-8 text into an array of integer * the original code transformed UTF-8 text into an array of integer
@ -46,7 +33,33 @@ class HTMLPurifier_Encoder
* would need that, and I'm probably not going to implement them. * would need that, and I'm probably not going to implement them.
* Once again, PHP 6 should solve all our problems. * Once again, PHP 6 should solve all our problems.
*/ */
function cleanUTF8($str) { function cleanUTF8($str, $force_php = false) {
static $non_sgml_chars = array();
static $iconv = null;
if (empty($non_sgml_chars)) {
for ($i = 0; $i <= 31; $i++) {
// non-SGML ASCII chars
// save \r, \t and \n
if ($i == 9 || $i == 13 || $i == 10) continue;
$non_sgml_chars[chr($i)] = '';
}
for ($i = 127; $i <= 159; $i++) {
$non_sgml_chars[HTMLPurifier_Encoder::unichr($i)] = '';
}
}
if ($iconv === null) {
$iconv = function_exists('iconv');
}
if ($iconv && !$force_php) {
// do the shortcut way
$str = iconv('UTF-8', 'UTF-8//IGNORE', $str);
return strtr($str, $non_sgml_chars);;
}
$mState = 0; // cached expected number of octets after the current octet $mState = 0; // cached expected number of octets after the current octet
// until the beginning of the next UTF8 character sequence // until the beginning of the next UTF8 character sequence
$mUcs4 = 0; // cached Unicode character $mUcs4 = 0; // cached Unicode character
@ -179,6 +192,46 @@ class HTMLPurifier_Encoder
return $out; return $out;
} }
/**
 * Encodes a single Unicode code point as its UTF-8 byte sequence.
 *
 * @param $code Integer code point to encode.
 * @return One to four bytes of UTF-8, or an empty string when $code is
 *         negative, greater than 0x10FFFF, or a surrogate (0xD800
 *         through 0xDFFF).
 */
function unichr($code) {
    // reject values outside the Unicode range, then the surrogate block
    if ($code < 0 || $code > 0x10FFFF) return '';
    if ($code >= 0xD800 && $code <= 0xDFFF) return '';

    // single byte: plain ASCII passes through unchanged
    if ($code < 0x80) return chr($code);

    // every multibyte form ends with a continuation byte (10xxxxxx)
    // carrying the low six bits of the code point
    $tail = chr(0x80 | ($code & 0x3F));

    // two bytes: 110xxxxx 10xxxxxx
    if ($code < 0x800) {
        return chr(0xC0 | ($code >> 6)) . $tail;
    }

    // bits 6 through 11 form the next continuation byte
    $mid = chr(0x80 | (($code >> 6) & 0x3F));

    // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    if ($code < 0x10000) {
        return chr(0xE0 | ($code >> 12)) . $mid . $tail;
    }

    // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    $lead = chr(0xF0 | ($code >> 18));
    $high = chr(0x80 | (($code >> 12) & 0x3F));
    return $lead . $high . $mid . $tail;
}
} }

View File

@ -1,6 +1,7 @@
<?php <?php
require_once 'HTMLPurifier/EntityLookup.php'; require_once 'HTMLPurifier/EntityLookup.php';
require_once 'HTMLPurifier/Encoder.php';
/** /**
* Handles referencing and dereferencing character entities * Handles referencing and dereferencing character entities
@ -9,7 +10,7 @@ class HTMLPurifier_EntityParser
{ {
/** /**
* Reference to entity lookup talbe. * Reference to entity lookup table.
* @protected * @protected
*/ */
var $_entity_lookup; var $_entity_lookup;
@ -114,40 +115,7 @@ class HTMLPurifier_EntityParser
// abort for special characters // abort for special characters
if (isset($this->_special_dec2str[$code])) return $entity; if (isset($this->_special_dec2str[$code])) return $entity;
if($code > 1114111 or $code < 0 or return HTMLPurifier_Encoder::unichr($code);
($code >= 55296 and $code <= 57343) ) {
// bits are set outside the "valid" range as defined
// by UNICODE 4.1.0
return '';
}
$x = $y = $z = $w = 0;
if ($code < 128) {
// regular ASCII character
$x = $code;
} else {
// set up bits for UTF-8
$x = ($code & 63) | 128;
if ($code < 2048) {
$y = (($code & 2047) >> 6) | 192;
} else {
$y = (($code & 4032) >> 6) | 128;
if($code < 65536) {
$z = (($code >> 12) & 15) | 224;
} else {
$z = (($code >> 12) & 63) | 128;
$w = (($code >> 18) & 7) | 240;
}
}
}
// set up the actual character
$ret = '';
if($w) $ret .= chr($w);
if($z) $ret .= chr($z);
if($y) $ret .= chr($y);
$ret .= chr($x);
return $ret;
} else { } else {
if (isset($this->_special_ent2dec[$matches[3]])) return $entity; if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
if (!$this->_entity_lookup) { if (!$this->_entity_lookup) {

View File

@ -15,6 +15,7 @@ class HTMLPurifier_EncoderTest extends UnitTestCase
function assertCleanUTF8($string, $expect = null) { function assertCleanUTF8($string, $expect = null) {
if ($expect === null) $expect = $string; if ($expect === null) $expect = $string;
$this->assertIdentical($this->Encoder->cleanUTF8($string), $expect); $this->assertIdentical($this->Encoder->cleanUTF8($string), $expect);
$this->assertIdentical($this->Encoder->cleanUTF8($string, true), $expect);
} }
function test_cleanUTF8() { function test_cleanUTF8() {