0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-03-23 14:27:02 +00:00

[3.1.1] Fix Shift_JIS encoding wonkiness with yen symbols and whatnot

- Improve parseCDATA algorithm to take into account newline normalization
- Fix regression in FontFamily validator. We now have a legit parser in place, albeit somewhat limited in use. It will be superseded by a parser for the entire grammar.
- Convert EncoderTest to new format

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1769 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2008-05-25 05:40:20 +00:00
parent 10530d7f81
commit bb16d8eae5
9 changed files with 242 additions and 97 deletions

8
NEWS
View File

@ -17,7 +17,12 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
manifest in token until end of operations. This prevents naughty internal manifest in token until end of operations. This prevents naughty internal
code from directly modifying CurrentToken when they're not supposed to. code from directly modifying CurrentToken when they're not supposed to.
- Percent encoding checks enabled for URI query and fragment - Percent encoding checks enabled for URI query and fragment
- Fix stray backslashes in font-family - Fix stray backslashes in font-family; CSS Unicode character escapes are
now properly resolved (although *only* in font-family).
- Improve parseCDATA algorithm to take into account newline normalization
- Account for browser confusion between Yen character and backslash in
Shift_JIS encoding. This fix generalizes to any other encoding which is not
a strict superset of printable ASCII.
. Added HTMLPurifier_UnitConverter and HTMLPurifier_Length for convenient . Added HTMLPurifier_UnitConverter and HTMLPurifier_Length for convenient
handling of CSS-style lengths. HTMLPurifier_AttrDef_CSS_Length now uses handling of CSS-style lengths. HTMLPurifier_AttrDef_CSS_Length now uses
this class. this class.
@ -38,7 +43,6 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
. Variable parsing types now are magic integers instead of strings . Variable parsing types now are magic integers instead of strings
. Added benchmark for ConfigSchema . Added benchmark for ConfigSchema
3.1.0, released 2008-05-18 3.1.0, released 2008-05-18
# Unnecessary references to objects (vestiges of PHP4) removed from method # Unnecessary references to objects (vestiges of PHP4) removed from method
signatures. The following methods do not need references when assigning from signatures. The following methods do not need references when assigning from

View File

@ -51,16 +51,13 @@ abstract class HTMLPurifier_AttrDef
* *
* @warning This processing is inconsistent with XML's whitespace handling * @warning This processing is inconsistent with XML's whitespace handling
* as specified by section 3.3.3 and referenced XHTML 1.0 section * as specified by section 3.3.3 and referenced XHTML 1.0 section
* 4.7. Compliant processing requires all line breaks normalized * 4.7. However, note that we are NOT necessarily
* to "\n", so the fix is not as simple as fixing it in this * parsing XML, thus, this behavior may still be correct. We
* function. Trim and whitespace collapsing are supposed to only * assume that newlines have been normalized.
* occur in NMTOKENs. However, note that we are NOT necessarily
* parsing XML, thus, this behavior may still be correct.
*/ */
public function parseCDATA($string) { public function parseCDATA($string) {
$string = trim($string); $string = trim($string);
$string = str_replace("\n", '', $string); $string = str_replace(array("\n", "\t", "\r"), ' ', $string);
$string = str_replace(array("\r", "\t"), ' ', $string);
return $string; return $string;
} }

View File

@ -16,10 +16,10 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
'cursive' => true 'cursive' => true
); );
$string = $this->parseCDATA($string);
// assume that no font names contain commas in them // assume that no font names contain commas in them
$fonts = explode(',', $string); $fonts = explode(',', $string);
$final = ''; $final = '';
$non_sgml = HTMLPurifier_Encoder::getNonSgmlCharacters();
foreach($fonts as $font) { foreach($fonts as $font) {
$font = trim($font); $font = trim($font);
if ($font === '') continue; if ($font === '') continue;
@ -35,11 +35,33 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
$quote = $font[0]; $quote = $font[0];
if ($font[$length - 1] !== $quote) continue; if ($font[$length - 1] !== $quote) continue;
$font = substr($font, 1, $length - 2); $font = substr($font, 1, $length - 2);
// double-backslash processing is buggy. Namely, it doesn't allow
// fonts that contain an adjacent quote, backslash, or comma $new_font = '';
$font = str_replace("\\$quote", $quote, $font); // de-escape quote for ($i = 0, $c = strlen($font); $i < $c; $i++) {
$font = str_replace("\\\n", '', $font); // de-escape newlines if ($font[$i] === '\\') {
$font = str_replace("\\\\", "\\", $font); // de-escape double backslashes $i++;
if ($i >= $c) {
$new_font .= '\\';
break;
}
if (ctype_xdigit($font[$i])) {
$code = $font[$i];
for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
if (!ctype_xdigit($font[$i])) break;
$code .= $font[$i];
}
$char = HTMLPurifier_Encoder::unichr(hexdec($code));
if (isset($non_sgml[$char])) continue;
$new_font .= $char;
if ($i < $c && trim($font[$i]) !== '') $i--;
continue;
}
if ($font[$i] === "\n") continue;
}
$new_font .= $font[$i];
}
$font = $new_font;
} }
// $font is a pure representation of the font name // $font is a pure representation of the font name

View File

@ -7,6 +7,8 @@
class HTMLPurifier_Encoder class HTMLPurifier_Encoder
{ {
private static $nonSgmlCharacters;
/** /**
* Constructor throws fatal error if you attempt to instantiate class * Constructor throws fatal error if you attempt to instantiate class
*/ */
@ -19,6 +21,24 @@ class HTMLPurifier_Encoder
*/ */
private static function muteErrorHandler() {} private static function muteErrorHandler() {}
/**
* Returns a lookup of UTF-8 character byte sequences that are non-SGML.
*/
public static function getNonSgmlCharacters() {
if (empty(HTMLPurifier_Encoder::$nonSgmlCharacters)) {
for ($i = 0; $i <= 31; $i++) {
// non-SGML ASCII chars
// save \r, \t and \n
if ($i == 9 || $i == 13 || $i == 10) continue;
HTMLPurifier_Encoder::$nonSgmlCharacters[chr($i)] = '';
}
for ($i = 127; $i <= 159; $i++) {
HTMLPurifier_Encoder::$nonSgmlCharacters[HTMLPurifier_Encoder::unichr($i)] = '';
}
}
return HTMLPurifier_Encoder::$nonSgmlCharacters;
}
/** /**
* Cleans a UTF-8 string for well-formedness and SGML validity * Cleans a UTF-8 string for well-formedness and SGML validity
* *
@ -46,18 +66,7 @@ class HTMLPurifier_Encoder
*/ */
public static function cleanUTF8($str, $force_php = false) { public static function cleanUTF8($str, $force_php = false) {
static $non_sgml_chars = array(); $non_sgml = HTMLPurifier_Encoder::getNonSgmlCharacters();
if (empty($non_sgml_chars)) {
for ($i = 0; $i <= 31; $i++) {
// non-SGML ASCII chars
// save \r, \t and \n
if ($i == 9 || $i == 13 || $i == 10) continue;
$non_sgml_chars[chr($i)] = '';
}
for ($i = 127; $i <= 159; $i++) {
$non_sgml_chars[HTMLPurifier_Encoder::unichr($i)] = '';
}
}
static $iconv = null; static $iconv = null;
if ($iconv === null) $iconv = function_exists('iconv'); if ($iconv === null) $iconv = function_exists('iconv');
@ -66,7 +75,7 @@ class HTMLPurifier_Encoder
// This is an optimization: if the string is already valid UTF-8, no // This is an optimization: if the string is already valid UTF-8, no
// need to do iconv/php stuff. 99% of the time, this will be the case. // need to do iconv/php stuff. 99% of the time, this will be the case.
if (preg_match('/^.{1}/us', $str)) { if (preg_match('/^.{1}/us', $str)) {
return strtr($str, $non_sgml_chars); return strtr($str, $non_sgml);
} }
if ($iconv && !$force_php) { if ($iconv && !$force_php) {
@ -74,7 +83,7 @@ class HTMLPurifier_Encoder
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler')); set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
$str = iconv('UTF-8', 'UTF-8//IGNORE', $str); $str = iconv('UTF-8', 'UTF-8//IGNORE', $str);
restore_error_handler(); restore_error_handler();
return strtr($str, $non_sgml_chars); return strtr($str, $non_sgml);
} }
$mState = 0; // cached expected number of octets after the current octet $mState = 0; // cached expected number of octets after the current octet
@ -276,17 +285,20 @@ class HTMLPurifier_Encoder
* Converts a string to UTF-8 based on configuration. * Converts a string to UTF-8 based on configuration.
*/ */
public static function convertToUTF8($str, $config, $context) { public static function convertToUTF8($str, $config, $context) {
static $iconv = null;
if ($iconv === null) $iconv = function_exists('iconv');
$encoding = $config->get('Core', 'Encoding'); $encoding = $config->get('Core', 'Encoding');
if ($encoding === 'utf-8') return $str; if ($encoding === 'utf-8') return $str;
if ($iconv && !$config->get('Test', 'ForceNoIconv')) { static $iconv = null;
if ($iconv === null) $iconv = function_exists('iconv');
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler')); set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
$str = iconv($encoding, 'utf-8//IGNORE', $str); $str = iconv($encoding, 'utf-8//IGNORE', $str);
// If the string is bjorked by Shift_JIS or a similar encoding
// that doesn't support all of ASCII, convert the naughty
// characters to their true byte-wise ASCII/UTF-8 equivalents.
$str = strtr($str, HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding));
restore_error_handler(); restore_error_handler();
return $str; return $str;
} elseif ($encoding === 'iso-8859-1') { } elseif ($encoding === 'iso-8859-1') {
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
$str = utf8_encode($str); $str = utf8_encode($str);
restore_error_handler(); restore_error_handler();
return $str; return $str;
@ -300,20 +312,28 @@ class HTMLPurifier_Encoder
* characters being omitted. * characters being omitted.
*/ */
public static function convertFromUTF8($str, $config, $context) { public static function convertFromUTF8($str, $config, $context) {
static $iconv = null;
if ($iconv === null) $iconv = function_exists('iconv');
$encoding = $config->get('Core', 'Encoding'); $encoding = $config->get('Core', 'Encoding');
if ($encoding === 'utf-8') return $str; if ($encoding === 'utf-8') return $str;
if ($config->get('Core', 'EscapeNonASCIICharacters')) { static $iconv = null;
if ($iconv === null) $iconv = function_exists('iconv');
if ($escape = $config->get('Core', 'EscapeNonASCIICharacters')) {
$str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str); $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
} }
if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler')); set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
// Undo our previous fix in convertToUTF8, otherwise iconv will barf
$ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
if (!$escape && !empty($ascii_fix)) {
$clear_fix = array();
foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = '';
$str = strtr($str, $clear_fix);
}
$str = strtr($str, array_flip($ascii_fix));
// Normal stuff
$str = iconv('utf-8', $encoding . '//IGNORE', $str); $str = iconv('utf-8', $encoding . '//IGNORE', $str);
restore_error_handler(); restore_error_handler();
return $str; return $str;
} elseif ($encoding === 'iso-8859-1') { } elseif ($encoding === 'iso-8859-1') {
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
$str = utf8_decode($str); $str = utf8_decode($str);
restore_error_handler(); restore_error_handler();
return $str; return $str;
@ -368,6 +388,47 @@ class HTMLPurifier_Encoder
return $result; return $result;
} }
/**
* This expensive function tests whether or not a given character
* encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
* fail this test, and require special processing. Variable width
* encodings shouldn't ever fail.
*
* @param string $encoding Encoding name to test, as per iconv format
* @param bool $bypass Whether or not to bypass the precompiled arrays.
* @return Array of UTF-8 characters to their corresponding ASCII,
* which can be used to "undo" any overzealous iconv action.
*/
public static function testEncodingSupportsASCII($encoding, $bypass = false) {
static $encodings = array();
if (!$bypass) {
if (isset($encodings[$encoding])) return $encodings[$encoding];
$lenc = strtolower($encoding);
switch ($lenc) {
case 'shift_jis':
return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
case 'johab':
return array("\xE2\x82\xA9" => '\\');
}
if (strpos($lenc, 'iso-8859-') === 0) return array();
}
$ret = array();
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
if (iconv('UTF-8', $encoding, 'a') === false) return false;
for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars
$c = chr($i);
if (iconv('UTF-8', "$encoding//IGNORE", $c) === '') {
// Reverse engineer: what's the UTF-8 equiv of this byte
// sequence? This assumes that there's no variable width
// encoding that doesn't support ASCII.
$ret[iconv($encoding, 'UTF-8//IGNORE', $c)] = $c;
}
}
restore_error_handler();
$encodings[$encoding] = $ret;
return $ret;
}
} }

View File

@ -18,7 +18,20 @@ class HTMLPurifier_AttrDef_CSS_FontFamilyTest extends HTMLPurifier_AttrDefHarnes
$this->assertDef($d = "'\xE5\xAE\x8B\xE4\xBD\x93'"); $this->assertDef($d = "'\xE5\xAE\x8B\xE4\xBD\x93'");
$this->assertDef("\xE5\xAE\x8B\xE4\xBD\x93", $d); $this->assertDef("\xE5\xAE\x8B\xE4\xBD\x93", $d);
$this->assertDef("'\\','f'", "'\\\\', f"); $this->assertDef("'\\','f'", "'\\\\', f");
$this->assertDef("'\\01'", "''");
$this->assertDef("'\\20'", "' '");
$this->assertDef("\\0020", "'\\\\0020'");
$this->assertDef("'\\000045'", "E");
$this->assertDef("','", false);
$this->assertDef("',' foobar','", "' foobar'");
$this->assertDef("'\\27'", "'\''");
$this->assertDef('"\\22"', "'\"'");
$this->assertDef('"\\""', "'\"'");
$this->assertDef('"\'"', "'\\''");
$this->assertDef("'\\000045a'", "Ea");
$this->assertDef("'\\00045 a'", "Ea");
$this->assertDef("'\\00045 a'", "'E a'");
$this->assertDef("'\\\nf'", "f");
} }
} }

View File

@ -15,8 +15,7 @@ class HTMLPurifier_AttrDefTest extends HTMLPurifier_Harness
$this->assertIdentical('', $def->parseCDATA('')); $this->assertIdentical('', $def->parseCDATA(''));
$this->assertIdentical('', $def->parseCDATA("\t\n\r \t\t")); $this->assertIdentical('', $def->parseCDATA("\t\n\r \t\t"));
$this->assertIdentical('foo', $def->parseCDATA("\t\n\r foo\t\t")); $this->assertIdentical('foo', $def->parseCDATA("\t\n\r foo\t\t"));
$this->assertIdentical('ignorelinefeeds', $def->parseCDATA("ignore\nline\nfeeds")); $this->assertIdentical('translate to space', $def->parseCDATA("translate\nto\tspace"));
$this->assertIdentical('translate to space', $def->parseCDATA("translate\rto\tspace"));
} }

View File

@ -7,6 +7,7 @@ class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
function setUp() { function setUp() {
$this->_entity_lookup = HTMLPurifier_EntityLookup::instance(); $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
parent::setUp();
} }
function assertCleanUTF8($string, $expect = null) { function assertCleanUTF8($string, $expect = null) {
@ -26,93 +27,89 @@ class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
$this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8 $this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
} }
function test_convertToUTF8() { function test_convertToUTF8_noConvert() {
$config = HTMLPurifier_Config::createDefault();
$context = new HTMLPurifier_Context();
// UTF-8 means that we don't touch it // UTF-8 means that we don't touch it
$this->assertIdentical( $this->assertIdentical(
HTMLPurifier_Encoder::convertToUTF8("\xF6", $config, $context), HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
"\xF6", // this is invalid "\xF6", // this is invalid
'Expected identical [Binary: F6]' 'Expected identical [Binary: F6]'
); );
}
$config = HTMLPurifier_Config::create(array( function test_convertToUTF8_iso8859_1() {
'Core.Encoding' => 'ISO-8859-1' $this->config->set('Core', 'Encoding', 'ISO-8859-1');
));
// Now it gets converted
$this->assertIdentical( $this->assertIdentical(
HTMLPurifier_Encoder::convertToUTF8("\xF6", $config, $context), HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
"\xC3\xB6" "\xC3\xB6"
); );
}
$config = HTMLPurifier_Config::create(array( function test_convertToUTF8_withoutIconv() {
'Core.Encoding' => 'ISO-8859-1', $this->config->set('Core', 'Encoding', 'ISO-8859-1');
'Test.ForceNoIconv' => true $this->config->set('Test', 'ForceNoIconv', true);
));
$this->assertIdentical( $this->assertIdentical(
HTMLPurifier_Encoder::convertToUTF8("\xF6", $config, $context), HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
"\xC3\xB6" "\xC3\xB6"
); );
} }
function test_convertFromUTF8() { function getZhongWen() {
$config = HTMLPurifier_Config::createDefault(); return "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";
$context = new HTMLPurifier_Context(); }
// zhong-wen
$chinese = "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";
function test_convertFromUTF8_utf8() {
// UTF-8 means that we don't touch it // UTF-8 means that we don't touch it
$this->assertIdentical( $this->assertIdentical(
HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context), HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
"\xC3\xB6" "\xC3\xB6"
); );
}
$config = HTMLPurifier_Config::create(array( function test_convertFromUTF8_iso8859_1() {
'Core.Encoding' => 'ISO-8859-1' $this->config->set('Core', 'Encoding', 'ISO-8859-1');
));
// Now it gets converted
$this->assertIdentical( $this->assertIdentical(
HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context), HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
"\xF6", "\xF6",
'Expected identical [Binary: F6]' 'Expected identical [Binary: F6]'
); );
}
if (function_exists('iconv')) { function test_convertFromUTF8_iconvNoChars() {
// iconv has it's own way if (!function_exists('iconv')) return;
$this->config->set('Core', 'Encoding', 'ISO-8859-1');
$this->assertIdentical( $this->assertIdentical(
HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context), HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
" (Chinese)" " (Chinese)"
); );
} }
function test_convertFromUTF8_phpNormal() {
// Plain PHP implementation has slightly different behavior // Plain PHP implementation has slightly different behavior
$config = HTMLPurifier_Config::create(array( $this->config->set('Core', 'Encoding', 'ISO-8859-1');
'Core.Encoding' => 'ISO-8859-1', $this->config->set('Test', 'ForceNoIconv', true);
'Test.ForceNoIconv' => true
));
$this->assertIdentical( $this->assertIdentical(
HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context), HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
"\xF6", "\xF6",
'Expected identical [Binary: F6]' 'Expected identical [Binary: F6]'
); );
}
function test_convertFromUTF8_phpNoChars() {
$this->config->set('Core', 'Encoding', 'ISO-8859-1');
$this->config->set('Test', 'ForceNoIconv', true);
$this->assertIdentical( $this->assertIdentical(
HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context), HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
"?? (Chinese)" "?? (Chinese)"
); );
}
function test_convertFromUTF8_withProtection() {
// Preserve the characters! // Preserve the characters!
$config = HTMLPurifier_Config::create(array( $this->config->set('Core', 'Encoding', 'ISO-8859-1');
'Core.Encoding' => 'ISO-8859-1', $this->config->set('Core', 'EscapeNonASCIICharacters', true);
'Core.EscapeNonASCIICharacters' => true
));
$this->assertIdentical( $this->assertIdentical(
HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context), HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
"&#20013;&#25991; (Chinese)" "&#20013;&#25991; (Chinese)"
); );
@ -139,5 +136,39 @@ class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
} }
function assertASCIISupportCheck($enc, $ret) {
$test = HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true);
if ($test === false) return;
$this->assertIdentical(
HTMLPurifier_Encoder::testEncodingSupportsASCII($enc),
$ret
);
$this->assertIdentical(
HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true),
$ret
);
}
function test_testEncodingSupportsASCII() {
$this->assertASCIISupportCheck('Shift_JIS', array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~'));
$this->assertASCIISupportCheck('JOHAB', array("\xE2\x82\xA9" => '\\'));
$this->assertASCIISupportCheck('ISO-8859-1', array());
$this->assertASCIISupportCheck('dontexist', array()); // canary
}
function testShiftJIS() {
if (!function_exists('iconv')) return;
$this->config->set('Core', 'Encoding', 'Shift_JIS');
// This actually looks like a Yen, but we're going to treat it differently
$this->assertIdentical(
HTMLPurifier_Encoder::convertFromUTF8('\\~', $this->config, $this->context),
'\\~'
);
$this->assertIdentical(
HTMLPurifier_Encoder::convertToUTF8('\\~', $this->config, $this->context),
'\\~'
);
}
} }

View File

@ -167,5 +167,23 @@ alert("<This is compatible with XHTML>");
$this->purifier->purify('foo'); $this->purifier->purify('foo');
} }
function test_shiftJis() {
if (!function_exists('iconv')) return;
$this->config->set('Core', 'Encoding', 'Shift_JIS');
$this->config->set('Core', 'EscapeNonASCIICharacters', true);
$this->assertPurification(
"<b style=\"font-family:'&#165;';\">111</b>"
);
}
function test_shiftJisWorstCase() {
if (!function_exists('iconv')) return;
$this->config->set('Core', 'Encoding', 'Shift_JIS');
$this->assertPurification( // Notice how Yen disappears
"<b style=\"font-family:'&#165;';\">111</b>",
"<b style=\"font-family:'';\">111</b>"
);
}
} }