mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2024-12-22 16:31:53 +00:00
Speed up cleanUTF8 with iconv. Also factored out code point to character code.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@351 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
parent
14aeafcf22
commit
b621602ac1
@ -20,20 +20,7 @@ class HTMLPurifier_Encoder
|
||||
* respectively. 128 and above the code points map to multibyte
|
||||
* UTF-8 representations.
|
||||
*
|
||||
* @note The functionality provided by the original function could be
|
||||
* implemented with iconv using 'UTF-8//IGNORE', mbstring, or
|
||||
* even the PCRE modifier 'u', these do not allow us to strip
|
||||
* control characters or disallowed code points, and the latter
|
||||
* does not allow invalid UTF-8 characters to be ignored. Once
|
||||
* PHP 6 appears all our problems magically disappear.
|
||||
*
|
||||
* @note Decomposing the string into Unicode code points is necessary
|
||||
* because SGML disallows the use of specific code points, not
|
||||
* necessarily bytes. A naive implementation that simply strtr
|
||||
* disallowed code points as bytes will break other Unicode
|
||||
* characters in which using such bytes is valid.
|
||||
*
|
||||
* @note Code adapted from utf8ToUnicode by Henri Sivonen and
|
||||
* @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
|
||||
* hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
|
||||
* LGPL license. Notes on what changed are inside, but in general,
|
||||
* the original code transformed UTF-8 text into an array of integer
|
||||
@ -46,7 +33,33 @@ class HTMLPurifier_Encoder
|
||||
* would need that, and I'm probably not going to implement them.
|
||||
* Once again, PHP 6 should solve all our problems.
|
||||
*/
|
||||
function cleanUTF8($str) {
|
||||
function cleanUTF8($str, $force_php = false) {
|
||||
|
||||
static $non_sgml_chars = array();
|
||||
static $iconv = null;
|
||||
|
||||
if (empty($non_sgml_chars)) {
|
||||
for ($i = 0; $i <= 31; $i++) {
|
||||
// non-SGML ASCII chars
|
||||
// save \r, \t and \n
|
||||
if ($i == 9 || $i == 13 || $i == 10) continue;
|
||||
$non_sgml_chars[chr($i)] = '';
|
||||
}
|
||||
for ($i = 127; $i <= 159; $i++) {
|
||||
$non_sgml_chars[HTMLPurifier_Encoder::unichr($i)] = '';
|
||||
}
|
||||
}
|
||||
|
||||
if ($iconv === null) {
|
||||
$iconv = function_exists('iconv');
|
||||
}
|
||||
|
||||
if ($iconv && !$force_php) {
|
||||
// do the shortcut way
|
||||
$str = iconv('UTF-8', 'UTF-8//IGNORE', $str);
|
||||
return strtr($str, $non_sgml_chars);;
|
||||
}
|
||||
|
||||
$mState = 0; // cached expected number of octets after the current octet
|
||||
// until the beginning of the next UTF8 character sequence
|
||||
$mUcs4 = 0; // cached Unicode character
|
||||
@ -179,6 +192,46 @@ class HTMLPurifier_Encoder
|
||||
return $out;
|
||||
}
|
||||
|
||||
/**
|
||||
* Translates a Unicode codepoint into its corresponding UTF-8 character.
|
||||
*/
|
||||
function unichr($code) {
|
||||
if($code > 1114111 or $code < 0 or
|
||||
($code >= 55296 and $code <= 57343) ) {
|
||||
// bits are set outside the "valid" range as defined
|
||||
// by UNICODE 4.1.0
|
||||
return '';
|
||||
}
|
||||
|
||||
$x = $y = $z = $w = 0;
|
||||
if ($code < 128) {
|
||||
// regular ASCII character
|
||||
$x = $code;
|
||||
} else {
|
||||
// set up bits for UTF-8
|
||||
$x = ($code & 63) | 128;
|
||||
if ($code < 2048) {
|
||||
$y = (($code & 2047) >> 6) | 192;
|
||||
} else {
|
||||
$y = (($code & 4032) >> 6) | 128;
|
||||
if($code < 65536) {
|
||||
$z = (($code >> 12) & 15) | 224;
|
||||
} else {
|
||||
$z = (($code >> 12) & 63) | 128;
|
||||
$w = (($code >> 18) & 7) | 240;
|
||||
}
|
||||
}
|
||||
}
|
||||
// set up the actual character
|
||||
$ret = '';
|
||||
if($w) $ret .= chr($w);
|
||||
if($z) $ret .= chr($z);
|
||||
if($y) $ret .= chr($y);
|
||||
$ret .= chr($x);
|
||||
|
||||
return $ret;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/EntityLookup.php';
|
||||
require_once 'HTMLPurifier/Encoder.php';
|
||||
|
||||
/**
|
||||
* Handles referencing and derefencing character entities
|
||||
@ -9,7 +10,7 @@ class HTMLPurifier_EntityParser
|
||||
{
|
||||
|
||||
/**
|
||||
* Reference to entity lookup talbe.
|
||||
* Reference to entity lookup table.
|
||||
* @protected
|
||||
*/
|
||||
var $_entity_lookup;
|
||||
@ -114,40 +115,7 @@ class HTMLPurifier_EntityParser
|
||||
// abort for special characters
|
||||
if (isset($this->_special_dec2str[$code])) return $entity;
|
||||
|
||||
if($code > 1114111 or $code < 0 or
|
||||
($code >= 55296 and $code <= 57343) ) {
|
||||
// bits are set outside the "valid" range as defined
|
||||
// by UNICODE 4.1.0
|
||||
return '';
|
||||
}
|
||||
|
||||
$x = $y = $z = $w = 0;
|
||||
if ($code < 128) {
|
||||
// regular ASCII character
|
||||
$x = $code;
|
||||
} else {
|
||||
// set up bits for UTF-8
|
||||
$x = ($code & 63) | 128;
|
||||
if ($code < 2048) {
|
||||
$y = (($code & 2047) >> 6) | 192;
|
||||
} else {
|
||||
$y = (($code & 4032) >> 6) | 128;
|
||||
if($code < 65536) {
|
||||
$z = (($code >> 12) & 15) | 224;
|
||||
} else {
|
||||
$z = (($code >> 12) & 63) | 128;
|
||||
$w = (($code >> 18) & 7) | 240;
|
||||
}
|
||||
}
|
||||
}
|
||||
// set up the actual character
|
||||
$ret = '';
|
||||
if($w) $ret .= chr($w);
|
||||
if($z) $ret .= chr($z);
|
||||
if($y) $ret .= chr($y);
|
||||
$ret .= chr($x);
|
||||
|
||||
return $ret;
|
||||
return HTMLPurifier_Encoder::unichr($code);
|
||||
} else {
|
||||
if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
|
||||
if (!$this->_entity_lookup) {
|
||||
|
@ -15,6 +15,7 @@ class HTMLPurifier_EncoderTest extends UnitTestCase
|
||||
function assertCleanUTF8($string, $expect = null) {
|
||||
if ($expect === null) $expect = $string;
|
||||
$this->assertIdentical($this->Encoder->cleanUTF8($string), $expect);
|
||||
$this->assertIdentical($this->Encoder->cleanUTF8($string, true), $expect);
|
||||
}
|
||||
|
||||
function test_cleanUTF8() {
|
||||
|
Loading…
Reference in New Issue
Block a user