mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-03-23 14:27:02 +00:00
[3.1.1] Fix Shift_JIS encoding wonkiness with yen symbols and whatnot
- Improve parseCDATA algorithm to take into account newline normalization - Fix regression in FontFamily validator. We now have a legit parser in place, albeit somewhat limited in use. Will be superseded by parser for entire grammar - Convert EncoderTest to new format git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1769 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
parent
10530d7f81
commit
bb16d8eae5
8
NEWS
8
NEWS
@ -17,7 +17,12 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
|||||||
manifest in token until end of operations. This prevents naughty internal
|
manifest in token until end of operations. This prevents naughty internal
|
||||||
code from directly modifying CurrentToken when they're not supposed to.
|
code from directly modifying CurrentToken when they're not supposed to.
|
||||||
- Percent encoding checks enabled for URI query and fragment
|
- Percent encoding checks enabled for URI query and fragment
|
||||||
- Fix stray backslashes in font-family
|
- Fix stray backslashes in font-family; CSS Unicode character escapes are
|
||||||
|
now properly resolved (although *only* in font-family).
|
||||||
|
- Improve parseCDATA algorithm to take into account newline normalization
|
||||||
|
- Account for browser confusion between Yen character and backslash in
|
||||||
|
Shift_JIS encoding. This fix generalizes to any other encoding which is not
|
||||||
|
a strict superset of printable ASCII.
|
||||||
. Added HTMLPurifier_UnitConverter and HTMLPurifier_Length for convenient
|
. Added HTMLPurifier_UnitConverter and HTMLPurifier_Length for convenient
|
||||||
handling of CSS-style lengths. HTMLPurifier_AttrDef_CSS_Length now uses
|
handling of CSS-style lengths. HTMLPurifier_AttrDef_CSS_Length now uses
|
||||||
this class.
|
this class.
|
||||||
@ -38,7 +43,6 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
|||||||
. Variable parsing types now are magic integers instead of strings
|
. Variable parsing types now are magic integers instead of strings
|
||||||
. Added benchmark for ConfigSchema
|
. Added benchmark for ConfigSchema
|
||||||
|
|
||||||
|
|
||||||
3.1.0, released 2008-05-18
|
3.1.0, released 2008-05-18
|
||||||
# Unnecessary references to objects (vestiges of PHP4) removed from method
|
# Unnecessary references to objects (vestiges of PHP4) removed from method
|
||||||
signatures. The following methods do not need references when assigning from
|
signatures. The following methods do not need references when assigning from
|
||||||
|
@ -51,16 +51,13 @@ abstract class HTMLPurifier_AttrDef
|
|||||||
*
|
*
|
||||||
* @warning This processing is inconsistent with XML's whitespace handling
|
* @warning This processing is inconsistent with XML's whitespace handling
|
||||||
* as specified by section 3.3.3 and referenced XHTML 1.0 section
|
* as specified by section 3.3.3 and referenced XHTML 1.0 section
|
||||||
* 4.7. Compliant processing requires all line breaks normalized
|
* 4.7. However, note that we are NOT necessarily
|
||||||
* to "\n", so the fix is not as simple as fixing it in this
|
* parsing XML, thus, this behavior may still be correct. We
|
||||||
* function. Trim and whitespace collapsing are supposed to only
|
* assume that newlines have been normalized.
|
||||||
* occur in NMTOKENs. However, note that we are NOT necessarily
|
|
||||||
* parsing XML, thus, this behavior may still be correct.
|
|
||||||
*/
|
*/
|
||||||
public function parseCDATA($string) {
|
public function parseCDATA($string) {
|
||||||
$string = trim($string);
|
$string = trim($string);
|
||||||
$string = str_replace("\n", '', $string);
|
$string = str_replace(array("\n", "\t", "\r"), ' ', $string);
|
||||||
$string = str_replace(array("\r", "\t"), ' ', $string);
|
|
||||||
return $string;
|
return $string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -16,10 +16,10 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
|
|||||||
'cursive' => true
|
'cursive' => true
|
||||||
);
|
);
|
||||||
|
|
||||||
$string = $this->parseCDATA($string);
|
|
||||||
// assume that no font names contain commas in them
|
// assume that no font names contain commas in them
|
||||||
$fonts = explode(',', $string);
|
$fonts = explode(',', $string);
|
||||||
$final = '';
|
$final = '';
|
||||||
|
$non_sgml = HTMLPurifier_Encoder::getNonSgmlCharacters();
|
||||||
foreach($fonts as $font) {
|
foreach($fonts as $font) {
|
||||||
$font = trim($font);
|
$font = trim($font);
|
||||||
if ($font === '') continue;
|
if ($font === '') continue;
|
||||||
@ -35,11 +35,33 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
|
|||||||
$quote = $font[0];
|
$quote = $font[0];
|
||||||
if ($font[$length - 1] !== $quote) continue;
|
if ($font[$length - 1] !== $quote) continue;
|
||||||
$font = substr($font, 1, $length - 2);
|
$font = substr($font, 1, $length - 2);
|
||||||
// double-backslash processing is buggy. Namely, it doesn't allow
|
|
||||||
// fonts that contain an adjacent quote, backslash, or comma
|
$new_font = '';
|
||||||
$font = str_replace("\\$quote", $quote, $font); // de-escape quote
|
for ($i = 0, $c = strlen($font); $i < $c; $i++) {
|
||||||
$font = str_replace("\\\n", '', $font); // de-escape newlines
|
if ($font[$i] === '\\') {
|
||||||
$font = str_replace("\\\\", "\\", $font); // de-escape double backslashes
|
$i++;
|
||||||
|
if ($i >= $c) {
|
||||||
|
$new_font .= '\\';
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (ctype_xdigit($font[$i])) {
|
||||||
|
$code = $font[$i];
|
||||||
|
for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
|
||||||
|
if (!ctype_xdigit($font[$i])) break;
|
||||||
|
$code .= $font[$i];
|
||||||
|
}
|
||||||
|
$char = HTMLPurifier_Encoder::unichr(hexdec($code));
|
||||||
|
if (isset($non_sgml[$char])) continue;
|
||||||
|
$new_font .= $char;
|
||||||
|
if ($i < $c && trim($font[$i]) !== '') $i--;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if ($font[$i] === "\n") continue;
|
||||||
|
}
|
||||||
|
$new_font .= $font[$i];
|
||||||
|
}
|
||||||
|
|
||||||
|
$font = $new_font;
|
||||||
}
|
}
|
||||||
// $font is a pure representation of the font name
|
// $font is a pure representation of the font name
|
||||||
|
|
||||||
|
@ -7,6 +7,8 @@
|
|||||||
class HTMLPurifier_Encoder
|
class HTMLPurifier_Encoder
|
||||||
{
|
{
|
||||||
|
|
||||||
|
private static $nonSgmlCharacters;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Constructor throws fatal error if you attempt to instantiate class
|
* Constructor throws fatal error if you attempt to instantiate class
|
||||||
*/
|
*/
|
||||||
@ -19,6 +21,24 @@ class HTMLPurifier_Encoder
|
|||||||
*/
|
*/
|
||||||
private static function muteErrorHandler() {}
|
private static function muteErrorHandler() {}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a lookup of UTF-8 character byte sequences that are non-SGML.
|
||||||
|
*/
|
||||||
|
public static function getNonSgmlCharacters() {
|
||||||
|
if (empty(HTMLPurifier_Encoder::$nonSgmlCharacters)) {
|
||||||
|
for ($i = 0; $i <= 31; $i++) {
|
||||||
|
// non-SGML ASCII chars
|
||||||
|
// save \r, \t and \n
|
||||||
|
if ($i == 9 || $i == 13 || $i == 10) continue;
|
||||||
|
HTMLPurifier_Encoder::$nonSgmlCharacters[chr($i)] = '';
|
||||||
|
}
|
||||||
|
for ($i = 127; $i <= 159; $i++) {
|
||||||
|
HTMLPurifier_Encoder::$nonSgmlCharacters[HTMLPurifier_Encoder::unichr($i)] = '';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return HTMLPurifier_Encoder::$nonSgmlCharacters;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Cleans a UTF-8 string for well-formedness and SGML validity
|
* Cleans a UTF-8 string for well-formedness and SGML validity
|
||||||
*
|
*
|
||||||
@ -46,18 +66,7 @@ class HTMLPurifier_Encoder
|
|||||||
*/
|
*/
|
||||||
public static function cleanUTF8($str, $force_php = false) {
|
public static function cleanUTF8($str, $force_php = false) {
|
||||||
|
|
||||||
static $non_sgml_chars = array();
|
$non_sgml = HTMLPurifier_Encoder::getNonSgmlCharacters();
|
||||||
if (empty($non_sgml_chars)) {
|
|
||||||
for ($i = 0; $i <= 31; $i++) {
|
|
||||||
// non-SGML ASCII chars
|
|
||||||
// save \r, \t and \n
|
|
||||||
if ($i == 9 || $i == 13 || $i == 10) continue;
|
|
||||||
$non_sgml_chars[chr($i)] = '';
|
|
||||||
}
|
|
||||||
for ($i = 127; $i <= 159; $i++) {
|
|
||||||
$non_sgml_chars[HTMLPurifier_Encoder::unichr($i)] = '';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static $iconv = null;
|
static $iconv = null;
|
||||||
if ($iconv === null) $iconv = function_exists('iconv');
|
if ($iconv === null) $iconv = function_exists('iconv');
|
||||||
@ -66,7 +75,7 @@ class HTMLPurifier_Encoder
|
|||||||
// This is an optimization: if the string is already valid UTF-8, no
|
// This is an optimization: if the string is already valid UTF-8, no
|
||||||
// need to do iconv/php stuff. 99% of the time, this will be the case.
|
// need to do iconv/php stuff. 99% of the time, this will be the case.
|
||||||
if (preg_match('/^.{1}/us', $str)) {
|
if (preg_match('/^.{1}/us', $str)) {
|
||||||
return strtr($str, $non_sgml_chars);
|
return strtr($str, $non_sgml);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($iconv && !$force_php) {
|
if ($iconv && !$force_php) {
|
||||||
@ -74,7 +83,7 @@ class HTMLPurifier_Encoder
|
|||||||
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
|
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
|
||||||
$str = iconv('UTF-8', 'UTF-8//IGNORE', $str);
|
$str = iconv('UTF-8', 'UTF-8//IGNORE', $str);
|
||||||
restore_error_handler();
|
restore_error_handler();
|
||||||
return strtr($str, $non_sgml_chars);
|
return strtr($str, $non_sgml);
|
||||||
}
|
}
|
||||||
|
|
||||||
$mState = 0; // cached expected number of octets after the current octet
|
$mState = 0; // cached expected number of octets after the current octet
|
||||||
@ -276,17 +285,20 @@ class HTMLPurifier_Encoder
|
|||||||
* Converts a string to UTF-8 based on configuration.
|
* Converts a string to UTF-8 based on configuration.
|
||||||
*/
|
*/
|
||||||
public static function convertToUTF8($str, $config, $context) {
|
public static function convertToUTF8($str, $config, $context) {
|
||||||
static $iconv = null;
|
|
||||||
if ($iconv === null) $iconv = function_exists('iconv');
|
|
||||||
$encoding = $config->get('Core', 'Encoding');
|
$encoding = $config->get('Core', 'Encoding');
|
||||||
if ($encoding === 'utf-8') return $str;
|
if ($encoding === 'utf-8') return $str;
|
||||||
|
static $iconv = null;
|
||||||
|
if ($iconv === null) $iconv = function_exists('iconv');
|
||||||
|
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
|
||||||
if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
|
if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
|
||||||
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
|
|
||||||
$str = iconv($encoding, 'utf-8//IGNORE', $str);
|
$str = iconv($encoding, 'utf-8//IGNORE', $str);
|
||||||
|
// If the string is bjorked by Shift_JIS or a similar encoding
|
||||||
|
// that doesn't support all of ASCII, convert the naughty
|
||||||
|
// characters to their true byte-wise ASCII/UTF-8 equivalents.
|
||||||
|
$str = strtr($str, HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding));
|
||||||
restore_error_handler();
|
restore_error_handler();
|
||||||
return $str;
|
return $str;
|
||||||
} elseif ($encoding === 'iso-8859-1') {
|
} elseif ($encoding === 'iso-8859-1') {
|
||||||
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
|
|
||||||
$str = utf8_encode($str);
|
$str = utf8_encode($str);
|
||||||
restore_error_handler();
|
restore_error_handler();
|
||||||
return $str;
|
return $str;
|
||||||
@ -300,20 +312,28 @@ class HTMLPurifier_Encoder
|
|||||||
* characters being omitted.
|
* characters being omitted.
|
||||||
*/
|
*/
|
||||||
public static function convertFromUTF8($str, $config, $context) {
|
public static function convertFromUTF8($str, $config, $context) {
|
||||||
static $iconv = null;
|
|
||||||
if ($iconv === null) $iconv = function_exists('iconv');
|
|
||||||
$encoding = $config->get('Core', 'Encoding');
|
$encoding = $config->get('Core', 'Encoding');
|
||||||
if ($encoding === 'utf-8') return $str;
|
if ($encoding === 'utf-8') return $str;
|
||||||
if ($config->get('Core', 'EscapeNonASCIICharacters')) {
|
static $iconv = null;
|
||||||
|
if ($iconv === null) $iconv = function_exists('iconv');
|
||||||
|
if ($escape = $config->get('Core', 'EscapeNonASCIICharacters')) {
|
||||||
$str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
|
$str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
|
||||||
}
|
}
|
||||||
|
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
|
||||||
if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
|
if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
|
||||||
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
|
// Undo our previous fix in convertToUTF8, otherwise iconv will barf
|
||||||
|
$ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
|
||||||
|
if (!$escape && !empty($ascii_fix)) {
|
||||||
|
$clear_fix = array();
|
||||||
|
foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = '';
|
||||||
|
$str = strtr($str, $clear_fix);
|
||||||
|
}
|
||||||
|
$str = strtr($str, array_flip($ascii_fix));
|
||||||
|
// Normal stuff
|
||||||
$str = iconv('utf-8', $encoding . '//IGNORE', $str);
|
$str = iconv('utf-8', $encoding . '//IGNORE', $str);
|
||||||
restore_error_handler();
|
restore_error_handler();
|
||||||
return $str;
|
return $str;
|
||||||
} elseif ($encoding === 'iso-8859-1') {
|
} elseif ($encoding === 'iso-8859-1') {
|
||||||
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
|
|
||||||
$str = utf8_decode($str);
|
$str = utf8_decode($str);
|
||||||
restore_error_handler();
|
restore_error_handler();
|
||||||
return $str;
|
return $str;
|
||||||
@ -368,6 +388,47 @@ class HTMLPurifier_Encoder
|
|||||||
return $result;
|
return $result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This expensive function tests whether or not a given character
|
||||||
|
* encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
|
||||||
|
* fail this test, and require special processing. Variable width
|
||||||
|
* encodings shouldn't ever fail.
|
||||||
|
*
|
||||||
|
* @param string $encoding Encoding name to test, as per iconv format
|
||||||
|
* @param bool $bypass Whether or not to bypass the precompiled arrays.
|
||||||
|
* @return Array of UTF-8 characters to their corresponding ASCII,
|
||||||
|
* which can be used to "undo" any overzealous iconv action.
|
||||||
|
*/
|
||||||
|
public static function testEncodingSupportsASCII($encoding, $bypass = false) {
|
||||||
|
static $encodings = array();
|
||||||
|
if (!$bypass) {
|
||||||
|
if (isset($encodings[$encoding])) return $encodings[$encoding];
|
||||||
|
$lenc = strtolower($encoding);
|
||||||
|
switch ($lenc) {
|
||||||
|
case 'shift_jis':
|
||||||
|
return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
|
||||||
|
case 'johab':
|
||||||
|
return array("\xE2\x82\xA9" => '\\');
|
||||||
|
}
|
||||||
|
if (strpos($lenc, 'iso-8859-') === 0) return array();
|
||||||
|
}
|
||||||
|
$ret = array();
|
||||||
|
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
|
||||||
|
if (iconv('UTF-8', $encoding, 'a') === false) return false;
|
||||||
|
for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars
|
||||||
|
$c = chr($i);
|
||||||
|
if (iconv('UTF-8', "$encoding//IGNORE", $c) === '') {
|
||||||
|
// Reverse engineer: what's the UTF-8 equiv of this byte
|
||||||
|
// sequence? This assumes that there's no variable width
|
||||||
|
// encoding that doesn't support ASCII.
|
||||||
|
$ret[iconv($encoding, 'UTF-8//IGNORE', $c)] = $c;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
restore_error_handler();
|
||||||
|
$encodings[$encoding] = $ret;
|
||||||
|
return $ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -18,7 +18,20 @@ class HTMLPurifier_AttrDef_CSS_FontFamilyTest extends HTMLPurifier_AttrDefHarnes
|
|||||||
$this->assertDef($d = "'\xE5\xAE\x8B\xE4\xBD\x93'");
|
$this->assertDef($d = "'\xE5\xAE\x8B\xE4\xBD\x93'");
|
||||||
$this->assertDef("\xE5\xAE\x8B\xE4\xBD\x93", $d);
|
$this->assertDef("\xE5\xAE\x8B\xE4\xBD\x93", $d);
|
||||||
$this->assertDef("'\\','f'", "'\\\\', f");
|
$this->assertDef("'\\','f'", "'\\\\', f");
|
||||||
|
$this->assertDef("'\\01'", "''");
|
||||||
|
$this->assertDef("'\\20'", "' '");
|
||||||
|
$this->assertDef("\\0020", "'\\\\0020'");
|
||||||
|
$this->assertDef("'\\000045'", "E");
|
||||||
|
$this->assertDef("','", false);
|
||||||
|
$this->assertDef("',' foobar','", "' foobar'");
|
||||||
|
$this->assertDef("'\\27'", "'\''");
|
||||||
|
$this->assertDef('"\\22"', "'\"'");
|
||||||
|
$this->assertDef('"\\""', "'\"'");
|
||||||
|
$this->assertDef('"\'"', "'\\''");
|
||||||
|
$this->assertDef("'\\000045a'", "Ea");
|
||||||
|
$this->assertDef("'\\00045 a'", "Ea");
|
||||||
|
$this->assertDef("'\\00045 a'", "'E a'");
|
||||||
|
$this->assertDef("'\\\nf'", "f");
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -8,7 +8,7 @@ class HTMLPurifier_AttrDef_TextTest extends HTMLPurifier_AttrDefHarness
|
|||||||
$this->def = new HTMLPurifier_AttrDef_Text();
|
$this->def = new HTMLPurifier_AttrDef_Text();
|
||||||
|
|
||||||
$this->assertDef('This is spiffy text!');
|
$this->assertDef('This is spiffy text!');
|
||||||
$this->assertDef(" Casual\tCDATA parse\ncheck. ", 'Casual CDATA parsecheck.');
|
$this->assertDef(" Casual\tCDATA parse\ncheck. ", 'Casual CDATA parse check.');
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -15,8 +15,7 @@ class HTMLPurifier_AttrDefTest extends HTMLPurifier_Harness
|
|||||||
$this->assertIdentical('', $def->parseCDATA(''));
|
$this->assertIdentical('', $def->parseCDATA(''));
|
||||||
$this->assertIdentical('', $def->parseCDATA("\t\n\r \t\t"));
|
$this->assertIdentical('', $def->parseCDATA("\t\n\r \t\t"));
|
||||||
$this->assertIdentical('foo', $def->parseCDATA("\t\n\r foo\t\t"));
|
$this->assertIdentical('foo', $def->parseCDATA("\t\n\r foo\t\t"));
|
||||||
$this->assertIdentical('ignorelinefeeds', $def->parseCDATA("ignore\nline\nfeeds"));
|
$this->assertIdentical('translate to space', $def->parseCDATA("translate\nto\tspace"));
|
||||||
$this->assertIdentical('translate to space', $def->parseCDATA("translate\rto\tspace"));
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7,6 +7,7 @@ class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
|
|||||||
|
|
||||||
function setUp() {
|
function setUp() {
|
||||||
$this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
|
$this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
|
||||||
|
parent::setUp();
|
||||||
}
|
}
|
||||||
|
|
||||||
function assertCleanUTF8($string, $expect = null) {
|
function assertCleanUTF8($string, $expect = null) {
|
||||||
@ -26,93 +27,89 @@ class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
|
|||||||
$this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
|
$this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
|
||||||
}
|
}
|
||||||
|
|
||||||
function test_convertToUTF8() {
|
function test_convertToUTF8_noConvert() {
|
||||||
$config = HTMLPurifier_Config::createDefault();
|
|
||||||
$context = new HTMLPurifier_Context();
|
|
||||||
|
|
||||||
// UTF-8 means that we don't touch it
|
// UTF-8 means that we don't touch it
|
||||||
$this->assertIdentical(
|
$this->assertIdentical(
|
||||||
HTMLPurifier_Encoder::convertToUTF8("\xF6", $config, $context),
|
HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
|
||||||
"\xF6", // this is invalid
|
"\xF6", // this is invalid
|
||||||
'Expected identical [Binary: F6]'
|
'Expected identical [Binary: F6]'
|
||||||
);
|
);
|
||||||
|
}
|
||||||
$config = HTMLPurifier_Config::create(array(
|
|
||||||
'Core.Encoding' => 'ISO-8859-1'
|
function test_convertToUTF8_iso8859_1() {
|
||||||
));
|
$this->config->set('Core', 'Encoding', 'ISO-8859-1');
|
||||||
|
|
||||||
// Now it gets converted
|
|
||||||
$this->assertIdentical(
|
$this->assertIdentical(
|
||||||
HTMLPurifier_Encoder::convertToUTF8("\xF6", $config, $context),
|
HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
|
||||||
"\xC3\xB6"
|
"\xC3\xB6"
|
||||||
);
|
);
|
||||||
|
}
|
||||||
$config = HTMLPurifier_Config::create(array(
|
|
||||||
'Core.Encoding' => 'ISO-8859-1',
|
function test_convertToUTF8_withoutIconv() {
|
||||||
'Test.ForceNoIconv' => true
|
$this->config->set('Core', 'Encoding', 'ISO-8859-1');
|
||||||
));
|
$this->config->set('Test', 'ForceNoIconv', true);
|
||||||
$this->assertIdentical(
|
$this->assertIdentical(
|
||||||
HTMLPurifier_Encoder::convertToUTF8("\xF6", $config, $context),
|
HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
|
||||||
"\xC3\xB6"
|
"\xC3\xB6"
|
||||||
);
|
);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function test_convertFromUTF8() {
|
function getZhongWen() {
|
||||||
$config = HTMLPurifier_Config::createDefault();
|
return "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";
|
||||||
$context = new HTMLPurifier_Context();
|
}
|
||||||
|
|
||||||
// zhong-wen
|
function test_convertFromUTF8_utf8() {
|
||||||
$chinese = "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";
|
|
||||||
|
|
||||||
// UTF-8 means that we don't touch it
|
// UTF-8 means that we don't touch it
|
||||||
$this->assertIdentical(
|
$this->assertIdentical(
|
||||||
HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context),
|
HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
|
||||||
"\xC3\xB6"
|
"\xC3\xB6"
|
||||||
);
|
);
|
||||||
|
}
|
||||||
$config = HTMLPurifier_Config::create(array(
|
|
||||||
'Core.Encoding' => 'ISO-8859-1'
|
function test_convertFromUTF8_iso8859_1() {
|
||||||
));
|
$this->config->set('Core', 'Encoding', 'ISO-8859-1');
|
||||||
|
|
||||||
// Now it gets converted
|
|
||||||
$this->assertIdentical(
|
$this->assertIdentical(
|
||||||
HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context),
|
HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
|
||||||
"\xF6",
|
"\xF6",
|
||||||
'Expected identical [Binary: F6]'
|
'Expected identical [Binary: F6]'
|
||||||
);
|
);
|
||||||
|
}
|
||||||
if (function_exists('iconv')) {
|
|
||||||
// iconv has it's own way
|
function test_convertFromUTF8_iconvNoChars() {
|
||||||
$this->assertIdentical(
|
if (!function_exists('iconv')) return;
|
||||||
HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),
|
$this->config->set('Core', 'Encoding', 'ISO-8859-1');
|
||||||
" (Chinese)"
|
$this->assertIdentical(
|
||||||
);
|
HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
|
||||||
}
|
" (Chinese)"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
function test_convertFromUTF8_phpNormal() {
|
||||||
// Plain PHP implementation has slightly different behavior
|
// Plain PHP implementation has slightly different behavior
|
||||||
$config = HTMLPurifier_Config::create(array(
|
$this->config->set('Core', 'Encoding', 'ISO-8859-1');
|
||||||
'Core.Encoding' => 'ISO-8859-1',
|
$this->config->set('Test', 'ForceNoIconv', true);
|
||||||
'Test.ForceNoIconv' => true
|
|
||||||
));
|
|
||||||
$this->assertIdentical(
|
$this->assertIdentical(
|
||||||
HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context),
|
HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
|
||||||
"\xF6",
|
"\xF6",
|
||||||
'Expected identical [Binary: F6]'
|
'Expected identical [Binary: F6]'
|
||||||
);
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
function test_convertFromUTF8_phpNoChars() {
|
||||||
|
$this->config->set('Core', 'Encoding', 'ISO-8859-1');
|
||||||
|
$this->config->set('Test', 'ForceNoIconv', true);
|
||||||
$this->assertIdentical(
|
$this->assertIdentical(
|
||||||
HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),
|
HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
|
||||||
"?? (Chinese)"
|
"?? (Chinese)"
|
||||||
);
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
function test_convertFromUTF8_withProtection() {
|
||||||
// Preserve the characters!
|
// Preserve the characters!
|
||||||
$config = HTMLPurifier_Config::create(array(
|
$this->config->set('Core', 'Encoding', 'ISO-8859-1');
|
||||||
'Core.Encoding' => 'ISO-8859-1',
|
$this->config->set('Core', 'EscapeNonASCIICharacters', true);
|
||||||
'Core.EscapeNonASCIICharacters' => true
|
|
||||||
));
|
|
||||||
$this->assertIdentical(
|
$this->assertIdentical(
|
||||||
HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),
|
HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
|
||||||
"中文 (Chinese)"
|
"中文 (Chinese)"
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -139,5 +136,39 @@ class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function assertASCIISupportCheck($enc, $ret) {
|
||||||
|
$test = HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true);
|
||||||
|
if ($test === false) return;
|
||||||
|
$this->assertIdentical(
|
||||||
|
HTMLPurifier_Encoder::testEncodingSupportsASCII($enc),
|
||||||
|
$ret
|
||||||
|
);
|
||||||
|
$this->assertIdentical(
|
||||||
|
HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true),
|
||||||
|
$ret
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
function test_testEncodingSupportsASCII() {
|
||||||
|
$this->assertASCIISupportCheck('Shift_JIS', array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~'));
|
||||||
|
$this->assertASCIISupportCheck('JOHAB', array("\xE2\x82\xA9" => '\\'));
|
||||||
|
$this->assertASCIISupportCheck('ISO-8859-1', array());
|
||||||
|
$this->assertASCIISupportCheck('dontexist', array()); // canary
|
||||||
|
}
|
||||||
|
|
||||||
|
function testShiftJIS() {
|
||||||
|
if (!function_exists('iconv')) return;
|
||||||
|
$this->config->set('Core', 'Encoding', 'Shift_JIS');
|
||||||
|
// This actually looks like a Yen, but we're going to treat it differently
|
||||||
|
$this->assertIdentical(
|
||||||
|
HTMLPurifier_Encoder::convertFromUTF8('\\~', $this->config, $this->context),
|
||||||
|
'\\~'
|
||||||
|
);
|
||||||
|
$this->assertIdentical(
|
||||||
|
HTMLPurifier_Encoder::convertToUTF8('\\~', $this->config, $this->context),
|
||||||
|
'\\~'
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -167,5 +167,23 @@ alert("<This is compatible with XHTML>");
|
|||||||
$this->purifier->purify('foo');
|
$this->purifier->purify('foo');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function test_shiftJis() {
|
||||||
|
if (!function_exists('iconv')) return;
|
||||||
|
$this->config->set('Core', 'Encoding', 'Shift_JIS');
|
||||||
|
$this->config->set('Core', 'EscapeNonASCIICharacters', true);
|
||||||
|
$this->assertPurification(
|
||||||
|
"<b style=\"font-family:'¥';\">111</b>"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
function test_shiftJisWorstCase() {
|
||||||
|
if (!function_exists('iconv')) return;
|
||||||
|
$this->config->set('Core', 'Encoding', 'Shift_JIS');
|
||||||
|
$this->assertPurification( // Notice how Yen disappears
|
||||||
|
"<b style=\"font-family:'¥';\">111</b>",
|
||||||
|
"<b style=\"font-family:'';\">111</b>"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user