2006-08-29 19:36:40 +00:00
|
|
|
<?php
|
|
|
|
|
2007-08-01 14:06:59 +00:00
|
|
|
/**
 * Tests for the static helpers of HTMLPurifier_Encoder: UTF-8 cleaning,
 * conversion to and from UTF-8, ASCII-only escaping, ASCII-support probing
 * and the iconv truncation/chunking workarounds.
 *
 * NOTE(review): $this->config and $this->context are not defined in this
 * class; presumably HTMLPurifier_Harness provides them -- confirm there.
 */
class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
{

    /**
     * Entity lookup instance fetched in setUp(). It is not read anywhere
     * else in this class.
     */
    protected $_entity_lookup;

    /**
     * Fetches the entity lookup and then defers to the harness set-up.
     */
    public function setUp() {
        $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        parent::setUp();
    }

    /**
     * Asserts that HTMLPurifier_Encoder::cleanUTF8() turns $string into
     * $expect, both when called with one argument and when called with
     * true as the second argument (labelled 'iconv' and 'PHP' in the
     * failure messages respectively).
     *
     * @param string      $string Input to clean.
     * @param string|null $expect Expected result; null means "unchanged".
     */
    public function assertCleanUTF8($string, $expect = null) {
        if ($expect === null) {
            $expect = $string;
        }
        $this->assertIdentical(HTMLPurifier_Encoder::cleanUTF8($string), $expect, 'iconv: %s');
        $this->assertIdentical(HTMLPurifier_Encoder::cleanUTF8($string, true), $expect, 'PHP: %s');
    }

    /**
     * cleanUTF8() keeps valid text and drops disallowed or malformed bytes.
     */
    public function test_cleanUTF8() {
        $this->assertCleanUTF8('Normal string.');
        $this->assertCleanUTF8("Test\tAllowed\nControl\rCharacters");
        $this->assertCleanUTF8("null byte: \0", 'null byte: ');
        $this->assertCleanUTF8("\1\2\3\4\5\6\7", '');
        $this->assertCleanUTF8("\x7F", ''); // one byte invalid SGML char
        $this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML
        $this->assertCleanUTF8("\xF3\xBF\xBF\xBF"); // valid four byte
        $this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
        // invalid codepoints (this byte sequence encodes the surrogate U+DC00)
        $this->assertCleanUTF8("\xED\xB0\x80", '');
    }

    /**
     * With the encoding left at its default, convertToUTF8() returns the
     * input untouched even when it is not valid UTF-8.
     */
    public function test_convertToUTF8_noConvert() {
        // UTF-8 means that we don't touch it
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
            "\xF6", // this is invalid
            'Expected identical [Binary: F6]'
        );
    }

    /**
     * An unknown encoding name triggers an 'Invalid encoding' error and
     * yields an empty string. Skipped when iconv is unavailable.
     */
    public function test_convertToUTF8_spuriousEncoding() {
        if (!HTMLPurifier_Encoder::iconvAvailable()) {
            return;
        }
        $this->config->set('Core.Encoding', 'utf99');
        $this->expectError('Invalid encoding utf99');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
            ''
        );
    }

    /**
     * ISO-8859-1 byte F6 converts to its two-byte UTF-8 form.
     */
    public function test_convertToUTF8_iso8859_1() {
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
            "\xC3\xB6"
        );
    }

    /**
     * Same conversion as above with Test.ForceNoIconv set.
     */
    public function test_convertToUTF8_withoutIconv() {
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->config->set('Test.ForceNoIconv', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
            "\xC3\xB6"
        );
    }

    /**
     * Fixture: two three-byte UTF-8 characters (U+4E2D, U+6587) followed by
     * the ASCII text " (Chinese)".
     *
     * @return string
     */
    public function getZhongWen() {
        return "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";
    }

    /**
     * With the encoding left at its default, convertFromUTF8() is a no-op.
     */
    public function test_convertFromUTF8_utf8() {
        // UTF-8 means that we don't touch it
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
            "\xC3\xB6"
        );
    }

    /**
     * UTF-8 C3 B6 converts to the single ISO-8859-1 byte F6.
     */
    public function test_convertFromUTF8_iso8859_1() {
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
            "\xF6",
            'Expected identical [Binary: F6]'
        );
    }

    /**
     * Characters with no ISO-8859-1 equivalent are dropped when iconv is
     * used. Skipped when iconv is unavailable.
     */
    public function test_convertFromUTF8_iconvNoChars() {
        if (!HTMLPurifier_Encoder::iconvAvailable()) {
            return;
        }
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
            " (Chinese)"
        );
    }

    /**
     * Representable characters convert normally with Test.ForceNoIconv set.
     */
    public function test_convertFromUTF8_phpNormal() {
        // Plain PHP implementation has slightly different behavior
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->config->set('Test.ForceNoIconv', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
            "\xF6",
            'Expected identical [Binary: F6]'
        );
    }

    /**
     * With Test.ForceNoIconv set, unrepresentable characters become '?'
     * instead of being dropped.
     */
    public function test_convertFromUTF8_phpNoChars() {
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->config->set('Test.ForceNoIconv', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
            "?? (Chinese)"
        );
    }

    /**
     * With Core.EscapeNonASCIICharacters enabled, characters that cannot be
     * represented in ISO-8859-1 survive as decimal numeric character
     * references (U+4E2D = 20013, U+6587 = 25991).
     */
    public function test_convertFromUTF8_withProtection() {
        // Preserve the characters!
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->config->set('Core.EscapeNonASCIICharacters', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
            "&#20013;&#25991; (Chinese)"
        );
    }

    /**
     * Core.EscapeNonASCIICharacters also applies when the encoding is left
     * at its default: the output is pure ASCII with numeric references.
     */
    public function test_convertFromUTF8_withProtectionButUtf8() {
        // Preserve the characters!
        $this->config->set('Core.EscapeNonASCIICharacters', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
            "&#20013;&#25991; (Chinese)"
        );
    }

    /**
     * convertToASCIIDumbLossless() replaces every non-ASCII character with
     * a decimal numeric character reference and leaves ASCII untouched.
     */
    public function test_convertToASCIIDumbLossless() {

        // Uppercase thorn letter (U+00DE = 222)
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xC3\x9Eorn"),
            "&#222;orn"
        );

        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToASCIIDumbLossless("an"),
            "an"
        );

        // test up to four bytes (F3 A0 80 A0 encodes U+E0020 = 917536)
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xF3\xA0\x80\xA0"),
            "&#917536;"
        );

    }

    /**
     * Asserts that testEncodingSupportsASCII() returns $ret for $enc, both
     * with and without true as the second argument. Returns without
     * asserting when the call with true yields false.
     *
     * @param string $enc Encoding name to probe.
     * @param array  $ret Expected return value.
     */
    public function assertASCIISupportCheck($enc, $ret) {
        $test = HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true);
        if ($test === false) {
            return;
        }
        $this->assertIdentical(
            HTMLPurifier_Encoder::testEncodingSupportsASCII($enc),
            $ret
        );
        $this->assertIdentical(
            HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true),
            $ret
        );
    }

    /**
     * Checks the reported ASCII deviations for a few encodings. The
     * Shift_JIS and JOHAB cases only run when iconv is available.
     */
    public function test_testEncodingSupportsASCII() {
        if (HTMLPurifier_Encoder::iconvAvailable()) {
            $this->assertASCIISupportCheck('Shift_JIS', array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~'));
            $this->assertASCIISupportCheck('JOHAB', array("\xE2\x82\xA9" => '\\'));
        }
        $this->assertASCIISupportCheck('ISO-8859-1', array());
        $this->assertASCIISupportCheck('dontexist', array()); // canary
    }

    /**
     * Backslash and tilde round-trip unchanged through Shift_JIS in both
     * directions. Skipped when iconv is unavailable.
     */
    public function testShiftJIS() {
        if (!HTMLPurifier_Encoder::iconvAvailable()) {
            return;
        }
        $this->config->set('Core.Encoding', 'Shift_JIS');
        // This actually looks like a Yen, but we're going to treat it differently
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8('\\~', $this->config, $this->context),
            '\\~'
        );
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8('\\~', $this->config, $this->context),
            '\\~'
        );
    }

    /**
     * On an iconv that reports ICONV_TRUNCATES, a long string following an
     * unrepresentable character must still be converted in full. Skipped
     * when iconv is unavailable or does not report that result.
     */
    public function testIconvTruncateBug() {
        if (!HTMLPurifier_Encoder::iconvAvailable()) {
            return;
        }
        if (HTMLPurifier_Encoder::testIconvTruncateBug() !== HTMLPurifier_Encoder::ICONV_TRUNCATES) {
            return;
        }
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xE4\xB8\xAD" . str_repeat('a', 10000), $this->config, $this->context),
            str_repeat('a', 10000)
        );
    }

    /**
     * Calls HTMLPurifier_Encoder::iconv() with a fourth argument of 4 and
     * multi-byte characters at different offsets; each unrepresentable
     * character must be dropped while the surrounding ASCII is kept.
     * Same skip conditions as testIconvTruncateBug().
     */
    public function testIconvChunking() {
        if (!HTMLPurifier_Encoder::iconvAvailable()) {
            return;
        }
        if (HTMLPurifier_Encoder::testIconvTruncateBug() !== HTMLPurifier_Encoder::ICONV_TRUNCATES) {
            return;
        }
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "a\xF3\xA0\x80\xA0b", 4), 'ab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aa\xE4\xB8\xADb", 4), 'aab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaa\xCE\xB1b", 4), 'aaab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xF3\xA0\x80\xA0b", 4), 'aaaab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xE4\xB8\xADb", 4), 'aaaab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xCE\xB1b", 4), 'aaaab');
    }

}
|
|
|
|
|
2008-12-06 09:24:59 +00:00
|
|
|
// vim: et sw=4 sts=4
|