2006-08-29 19:36:40 +00:00
|
|
|
<?php
|
|
|
|
|
|
|
|
require_once 'HTMLPurifier/Encoder.php';
|
|
|
|
|
|
|
|
/**
 * Unit tests for the static helpers of HTMLPurifier_Encoder: cleanUTF8(),
 * convertToUTF8(), convertFromUTF8() and convertToASCIIDumbLossless().
 */
class HTMLPurifier_EncoderTest extends UnitTestCase
{

    /**
     * Entity lookup instance fetched in setUp().
     */
    var $_entity_lookup;

    /**
     * Fetches the shared HTMLPurifier_EntityLookup instance before each test.
     */
    function setUp() {
        $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
    }

    /**
     * Asserts that HTMLPurifier_Encoder::cleanUTF8() turns $string into
     * $expect, checking both the default call and the call with the second
     * argument set to true. Failures are labelled 'iconv' and 'PHP'
     * respectively.
     *
     * NOTE(review): the second argument presumably forces the pure-PHP
     * code path -- confirm against HTMLPurifier_Encoder::cleanUTF8().
     *
     * @param string      $string Input handed to cleanUTF8().
     * @param string|null $expect Expected result; when null the input is
     *                            expected to come back unchanged.
     */
    function assertCleanUTF8($string, $expect = null) {
        if ($expect === null) {
            $expect = $string;
        }
        $this->assertIdentical(HTMLPurifier_Encoder::cleanUTF8($string), $expect, 'iconv: %s');
        $this->assertIdentical(HTMLPurifier_Encoder::cleanUTF8($string, true), $expect, 'PHP: %s');
    }

    /**
     * cleanUTF8() keeps well-formed text and strips disallowed or malformed
     * byte sequences.
     */
    function test_cleanUTF8() {
        $this->assertCleanUTF8('Normal string.');
        $this->assertCleanUTF8("Test\tAllowed\nControl\rCharacters");
        $this->assertCleanUTF8("null byte: \0", 'null byte: ');
        $this->assertCleanUTF8("\1\2\3\4\5\6\7", '');
        $this->assertCleanUTF8("\x7F", ''); // one byte invalid SGML char
        $this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML
        $this->assertCleanUTF8("\xF3\xBF\xBF\xBF"); // valid four byte
        $this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
    }

    /**
     * convertToUTF8() leaves input untouched under the default configuration
     * and converts it when Core.Encoding is ISO-8859-1, with and without
     * Test.ForceNoIconv.
     */
    function test_convertToUTF8() {
        $config = HTMLPurifier_Config::createDefault();
        $context = new HTMLPurifier_Context();

        // UTF-8 means that we don't touch it
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $config, $context),
            "\xF6" // this is invalid
        );
        $this->assertNoErrors();

        $config = HTMLPurifier_Config::create(array(
            'Core.Encoding' => 'ISO-8859-1'
        ));

        // Now it gets converted
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $config, $context),
            "\xC3\xB6"
        );

        // Same conversion with Test.ForceNoIconv enabled
        $config = HTMLPurifier_Config::create(array(
            'Core.Encoding' => 'ISO-8859-1',
            'Test.ForceNoIconv' => true
        ));
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $config, $context),
            "\xC3\xB6"
        );
    }

    /**
     * convertFromUTF8() leaves input untouched under the default
     * configuration, converts to ISO-8859-1 when configured, and handles
     * characters the target encoding cannot represent differently depending
     * on the configuration in effect.
     */
    function test_convertFromUTF8() {
        $config = HTMLPurifier_Config::createDefault();
        $context = new HTMLPurifier_Context();

        // zhong-wen
        $chinese = "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";

        // UTF-8 means that we don't touch it
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context),
            "\xC3\xB6"
        );

        $config = HTMLPurifier_Config::create(array(
            'Core.Encoding' => 'ISO-8859-1'
        ));

        // Now it gets converted
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context),
            "\xF6"
        );

        if (function_exists('iconv')) {
            // iconv has its own way: the unrepresentable characters are
            // expected to be dropped
            $this->assertIdentical(
                HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),
                " (Chinese)"
            );
        }

        // Plain PHP implementation has slightly different behavior
        $config = HTMLPurifier_Config::create(array(
            'Core.Encoding' => 'ISO-8859-1',
            'Test.ForceNoIconv' => true
        ));
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context),
            "\xF6"
        );

        // Each unrepresentable character is expected to become '?'
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),
            "?? (Chinese)"
        );

        // Preserve the characters!
        $config = HTMLPurifier_Config::create(array(
            'Core.Encoding' => 'ISO-8859-1',
            'Core.EscapeNonASCIICharacters' => true
        ));
        // Expected output uses decimal numeric character references:
        // U+4E2D = 20013, U+6587 = 25991.
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),
            "&#20013;&#25991; (Chinese)"
        );
    }

    /**
     * convertToASCIIDumbLossless() leaves ASCII alone and rewrites every
     * multi-byte UTF-8 character as a decimal numeric character reference.
     */
    function test_convertToASCIIDumbLossless() {

        // Uppercase thorn letter (U+00DE = 222)
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xC3\x9Eorn"),
            "&#222;orn"
        );

        // Pure ASCII passes through unchanged
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToASCIIDumbLossless("an"),
            "an"
        );

        // test up to four bytes (U+E0020 = 917536)
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xF3\xA0\x80\xA0"),
            "&#917536;"
        );
    }

}
|
|
|
|
|
|
|
|
?>
|