0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-01-05 06:01:52 +00:00

[3.1.1] Improved adherence to Unicode by checking for non-character codepoints. Thanks Geoffrey Sneddon for reporting.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1773 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2008-05-26 21:27:52 +00:00
parent 322288e6c0
commit 8d1f1e8e73
4 changed files with 25 additions and 39 deletions

3
NEWS
View File

@ -30,6 +30,9 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
a strict superset of printable ASCII. a strict superset of printable ASCII.
- Fix missing configuration parameter in Generator calls. Thanks vs for the - Fix missing configuration parameter in Generator calls. Thanks vs for the
partial patch. partial patch.
- Improved adherence to Unicode by checking for non-character codepoints.
Thanks Geoffrey Sneddon for reporting. This may result in degraded
performance for extremely large inputs.
. Added HTMLPurifier_UnitConverter and HTMLPurifier_Length for convenient . Added HTMLPurifier_UnitConverter and HTMLPurifier_Length for convenient
handling of CSS-style lengths. HTMLPurifier_AttrDef_CSS_Length now uses handling of CSS-style lengths. HTMLPurifier_AttrDef_CSS_Length now uses
this class. this class.

View File

@ -19,7 +19,6 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
// assume that no font names contain commas in them // assume that no font names contain commas in them
$fonts = explode(',', $string); $fonts = explode(',', $string);
$final = ''; $final = '';
$non_sgml = HTMLPurifier_Encoder::getNonSgmlCharacters();
foreach($fonts as $font) { foreach($fonts as $font) {
$font = trim($font); $font = trim($font);
if ($font === '') continue; if ($font === '') continue;
@ -50,8 +49,11 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
if (!ctype_xdigit($font[$i])) break; if (!ctype_xdigit($font[$i])) break;
$code .= $font[$i]; $code .= $font[$i];
} }
// We have to be extremely careful when adding
// new characters, to make sure we're not breaking
// the encoding.
$char = HTMLPurifier_Encoder::unichr(hexdec($code)); $char = HTMLPurifier_Encoder::unichr(hexdec($code));
if (isset($non_sgml[$char])) continue; if (HTMLPurifier_Encoder::cleanUTF8($char) === '') continue;
$new_font .= $char; $new_font .= $char;
if ($i < $c && trim($font[$i]) !== '') $i--; if ($i < $c && trim($font[$i]) !== '') $i--;
continue; continue;

View File

@ -7,8 +7,6 @@
class HTMLPurifier_Encoder class HTMLPurifier_Encoder
{ {
private static $nonSgmlCharacters;
/** /**
* Constructor throws fatal error if you attempt to instantiate class * Constructor throws fatal error if you attempt to instantiate class
*/ */
@ -21,24 +19,6 @@ class HTMLPurifier_Encoder
*/ */
private static function muteErrorHandler() {} private static function muteErrorHandler() {}
/**
* Returns a lookup of UTF-8 character byte sequences that are non-SGML.
*/
public static function getNonSgmlCharacters() {
if (empty(HTMLPurifier_Encoder::$nonSgmlCharacters)) {
for ($i = 0; $i <= 31; $i++) {
// non-SGML ASCII chars
// save \r, \t and \n
if ($i == 9 || $i == 13 || $i == 10) continue;
HTMLPurifier_Encoder::$nonSgmlCharacters[chr($i)] = '';
}
for ($i = 127; $i <= 159; $i++) {
HTMLPurifier_Encoder::$nonSgmlCharacters[HTMLPurifier_Encoder::unichr($i)] = '';
}
}
return HTMLPurifier_Encoder::$nonSgmlCharacters;
}
/** /**
* Cleans a UTF-8 string for well-formedness and SGML validity * Cleans a UTF-8 string for well-formedness and SGML validity
* *
@ -66,24 +46,13 @@ class HTMLPurifier_Encoder
*/ */
public static function cleanUTF8($str, $force_php = false) { public static function cleanUTF8($str, $force_php = false) {
$non_sgml = HTMLPurifier_Encoder::getNonSgmlCharacters();
static $iconv = null;
if ($iconv === null) $iconv = function_exists('iconv');
// UTF-8 validity is checked since PHP 4.3.5 // UTF-8 validity is checked since PHP 4.3.5
// This is an optimization: if the string is already valid UTF-8, no // This is an optimization: if the string is already valid UTF-8, no
// need to do iconv/php stuff. 99% of the time, this will be the case. // need to do PHP stuff. 99% of the time, this will be the case.
if (preg_match('/^.{1}/us', $str)) { // The regexp matches the XML char production, as well as excluding
return strtr($str, $non_sgml); // non-SGML codepoints U+007F to U+009F
} if (preg_match('/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du', $str)) {
return $str;
if ($iconv && !$force_php) {
// do the shortcut way
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
$str = iconv('UTF-8', 'UTF-8//IGNORE', $str);
restore_error_handler();
return strtr($str, $non_sgml);
} }
$mState = 0; // cached expected number of octets after the current octet $mState = 0; // cached expected number of octets after the current octet
@ -194,7 +163,17 @@ class HTMLPurifier_Encoder
) { ) {
} elseif (0xFEFF != $mUcs4 && // omit BOM } elseif (0xFEFF != $mUcs4 && // omit BOM
!($mUcs4 >= 128 && $mUcs4 <= 159) // omit non-SGML // check for valid Char unicode codepoints
(
0x9 == $mUcs4 ||
0xA == $mUcs4 ||
0xD == $mUcs4 ||
(0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
// 7F-9F is not strictly prohibited by XML,
// but it is non-SGML, and thus we don't allow it
(0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
(0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
)
) { ) {
$out .= $char; $out .= $char;
} }

View File

@ -25,6 +25,8 @@ class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
$this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML $this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML
$this->assertCleanUTF8("\xF3\xBF\xBF\xBF"); // valid four byte $this->assertCleanUTF8("\xF3\xBF\xBF\xBF"); // valid four byte
$this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8 $this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
// invalid codepoints
$this->assertCleanUTF8("\xED\xB0\x80", '');
} }
function test_convertToUTF8_noConvert() { function test_convertToUTF8_noConvert() {