htmlpurifier/tests/HTMLPurifier/EncoderTest.php

<?php

require_once 'HTMLPurifier/Encoder.php';

/**
 * Unit tests for the static helpers on HTMLPurifier_Encoder: cleanUTF8(),
 * convertToUTF8(), convertFromUTF8() and convertToASCIIDumbLossless().
 */
class HTMLPurifier_EncoderTest extends UnitTestCase
{
    
    /**
     * Value of HTMLPurifier_EntityLookup::instance(), re-fetched by setUp()
     * before each test. No test in this class reads it.
     */
    var $_entity_lookup;
    
    /**
     * Per-test fixture: stores the entity lookup instance on the test case.
     */
    function setUp() {
        $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
    }
    
    /**
     * Asserts that cleanUTF8() yields the expected string, both when called
     * with its default arguments and when called with the second argument
     * set to true.
     *
     * @param $string Input handed to HTMLPurifier_Encoder::cleanUTF8()
     * @param $expect Expected output; when null, the input itself is
     *                expected back unchanged
     */
    function assertCleanUTF8($string, $expect = null) {
        $expected = ($expect === null) ? $string : $expect;
        
        // Default call; the failure message labels this variant "iconv".
        $default_result = HTMLPurifier_Encoder::cleanUTF8($string);
        $this->assertIdentical($default_result, $expected, 'iconv: %s');
        
        // Second argument true; labelled "PHP" in the failure message
        // (presumably forces the pure PHP code path -- see the Encoder).
        $forced_result = HTMLPurifier_Encoder::cleanUTF8($string, true);
        $this->assertIdentical($forced_result, $expected, 'PHP: %s');
    }
    
    /**
     * Runs cleanUTF8() over a table of inputs that must either pass through
     * unchanged or be stripped.
     */
    function test_cleanUTF8() {
        // Each entry is array(input, expected); a null expectation means the
        // input must come back exactly as given.
        $cases = array(
            array('Normal string.', null),
            array("Test\tAllowed\nControl\rCharacters", null),
            array("null byte: \0", 'null byte: '),
            array("\1\2\3\4\5\6\7", ''),
            array("\x7F", ''),               // one byte invalid SGML char
            array("\xC2\x80", ''),           // two byte invalid SGML
            array("\xF3\xBF\xBF\xBF", null), // valid four byte
            array("\xDF\xFF", ''),           // malformed UTF8
        );
        foreach ($cases as $case) {
            $this->assertCleanUTF8($case[0], $case[1]);
        }
    }
    
    /**
     * Checks convertToUTF8() under the default configuration and under an
     * ISO-8859-1 configuration, with and without Test.ForceNoIconv.
     */
    function test_convertToUTF8() {
        $config = HTMLPurifier_Config::createDefault();
        $context = new HTMLPurifier_Context();
        
        $latin1_byte = "\xF6";
        $utf8_bytes  = "\xC3\xB6";
        
        // Default configuration: the byte is expected back untouched, even
        // though on its own it is not valid UTF-8.
        $untouched = HTMLPurifier_Encoder::convertToUTF8($latin1_byte, $config, $context);
        $this->assertIdentical($untouched, $latin1_byte);
        $this->assertNoErrors();
        
        // ISO-8859-1 input: the single byte becomes a two-byte sequence.
        $settings = array(
            'Core.Encoding' => 'ISO-8859-1'
        );
        $config = HTMLPurifier_Config::create($settings);
        $converted = HTMLPurifier_Encoder::convertToUTF8($latin1_byte, $config, $context);
        $this->assertIdentical($converted, $utf8_bytes);
        
        // Same expectation with Test.ForceNoIconv switched on.
        $settings = array(
            'Core.Encoding' => 'ISO-8859-1',
            'Test.ForceNoIconv' => true
        );
        $config = HTMLPurifier_Config::create($settings);
        $converted = HTMLPurifier_Encoder::convertToUTF8($latin1_byte, $config, $context);
        $this->assertIdentical($converted, $utf8_bytes);
        
    }
    
    /**
     * Checks convertFromUTF8() under the default configuration, under
     * ISO-8859-1 (iconv when available, then Test.ForceNoIconv), and with
     * Core.EscapeNonASCIICharacters enabled.
     */
    function test_convertFromUTF8() {
        $config = HTMLPurifier_Config::createDefault();
        $context = new HTMLPurifier_Context();
        
        // zhong-wen
        $chinese = "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";
        
        $utf8_bytes  = "\xC3\xB6";
        $latin1_byte = "\xF6";
        
        // Default configuration: the string is expected back untouched.
        $untouched = HTMLPurifier_Encoder::convertFromUTF8($utf8_bytes, $config, $context);
        $this->assertIdentical($untouched, $utf8_bytes);
        
        // ISO-8859-1 output: the two-byte sequence collapses to one byte.
        $settings = array(
            'Core.Encoding' => 'ISO-8859-1'
        );
        $config = HTMLPurifier_Config::create($settings);
        $converted = HTMLPurifier_Encoder::convertFromUTF8($utf8_bytes, $config, $context);
        $this->assertIdentical($converted, $latin1_byte);
        
        if (function_exists('iconv')) {
            // With iconv present, the characters that have no ISO-8859-1
            // equivalent are expected to be dropped entirely.
            $dropped = HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context);
            $this->assertIdentical($dropped, " (Chinese)");
        }
        
        // With Test.ForceNoIconv the result differs slightly: each
        // unrepresentable character is expected as a question mark.
        $settings = array(
            'Core.Encoding' => 'ISO-8859-1',
            'Test.ForceNoIconv' => true
        );
        $config = HTMLPurifier_Config::create($settings);
        $converted = HTMLPurifier_Encoder::convertFromUTF8($utf8_bytes, $config, $context);
        $this->assertIdentical($converted, $latin1_byte);
        
        $replaced = HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context);
        $this->assertIdentical($replaced, "?? (Chinese)");
        
        // Core.EscapeNonASCIICharacters keeps the characters by turning them
        // into numeric character references.
        $settings = array(
            'Core.Encoding' => 'ISO-8859-1',
            'Core.EscapeNonASCIICharacters' => true
        );
        $config = HTMLPurifier_Config::create($settings);
        $escaped = HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context);
        $this->assertIdentical($escaped, "&#20013;&#25991; (Chinese)");
        
    }
    
    /**
     * Checks that convertToASCIIDumbLossless() rewrites multi-byte
     * characters as numeric character references and leaves ASCII alone.
     */
    function test_convertToASCIIDumbLossless() {
        
        // Each entry is array(input, expected output).
        $cases = array(
            // Uppercase thorn letter
            array("\xC3\x9Eorn", "&#222;orn"),
            // plain ASCII passes through
            array("an", "an"),
            // test up to four bytes
            array("\xF3\xA0\x80\xA0", "&#917536;"),
        );
        foreach ($cases as $case) {
            $actual = HTMLPurifier_Encoder::convertToASCIIDumbLossless($case[0]);
            $this->assertIdentical($actual, $case[1]);
        }
        
    }
    
}

?>
Refactor encoding and entity specific processing to HTMLPurifier_Encoder. We also need to refactor the escaping to this class too. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@339 48356398-32a2-884e-a903-53898d9a118a 2006-08-29 19:36:40 +00:00			`<?php`

			`require_once 'HTMLPurifier/Encoder.php';`

			`class HTMLPurifier_EncoderTest extends UnitTestCase`
			`{`

[1.4.0] - Added %Core.EscapeNonASCIICharacters to workaround %Core.Encoding misbehavior - Add "All Tests" to test runner title and reorder subfile names - Specific file is now called with ?f= - Link to UTF-8 docs, even though they're not done - 1000th unit test passed! W00t! (that's a third as many as SimpleTest has for itself.) git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@658 48356398-32a2-884e-a903-53898d9a118a 2007-01-19 03:54:55 +00:00			`var $_entity_lookup;`
Refactor encoding and entity specific processing to HTMLPurifier_Encoder. We also need to refactor the escaping to this class too. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@339 48356398-32a2-884e-a903-53898d9a118a 2006-08-29 19:36:40 +00:00
			`function setUp() {`
			`$this->_entity_lookup = HTMLPurifier_EntityLookup::instance();`
			`}`

			`function assertCleanUTF8($string, $expect = null) {`
			`if ($expect === null) $expect = $string;`
[1.4.0] Make all functions in Encoder static. Affects branches/strict git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@656 48356398-32a2-884e-a903-53898d9a118a 2007-01-18 22:55:44 +00:00			`$this->assertIdentical(HTMLPurifier_Encoder::cleanUTF8($string), $expect, 'iconv: %s');`
			`$this->assertIdentical(HTMLPurifier_Encoder::cleanUTF8($string, true), $expect, 'PHP: %s');`
Refactor encoding and entity specific processing to HTMLPurifier_Encoder. We also need to refactor the escaping to this class too. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@339 48356398-32a2-884e-a903-53898d9a118a 2006-08-29 19:36:40 +00:00			`}`

			`function test_cleanUTF8() {`
			`$this->assertCleanUTF8('Normal string.');`
			`$this->assertCleanUTF8("Test\tAllowed\nControl\rCharacters");`
			`$this->assertCleanUTF8("null byte: \0", 'null byte: ');`
			`$this->assertCleanUTF8("\1\2\3\4\5\6\7", '');`
			`$this->assertCleanUTF8("\x7F", ''); // one byte invalid SGML char`
			`$this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML`
			`$this->assertCleanUTF8("\xF3\xBF\xBF\xBF"); // valid four byte`
			`$this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8`
			`}`

Add rudimentary extra encoding support. We are now release-ready! git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@352 48356398-32a2-884e-a903-53898d9a118a 2006-09-01 00:54:38 +00:00			`function test_convertToUTF8() {`
			`$config = HTMLPurifier_Config::createDefault();`
[1.2.0] - Partially finished migrating to new Context object (done in r485). - Created HTMLPurifier_Harness to assist with testing, ChildDefTest migrated to that framework. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@484 48356398-32a2-884e-a903-53898d9a118a 2006-10-01 20:47:07 +00:00			`$context = new HTMLPurifier_Context();`
Add rudimentary extra encoding support. We are now release-ready! git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@352 48356398-32a2-884e-a903-53898d9a118a 2006-09-01 00:54:38 +00:00
			`// UTF-8 means that we don't touch it`
			`$this->assertIdentical(`
[1.4.0] Make all functions in Encoder static. Affects branches/strict git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@656 48356398-32a2-884e-a903-53898d9a118a 2007-01-18 22:55:44 +00:00			`HTMLPurifier_Encoder::convertToUTF8("\xF6", $config, $context),`
Add rudimentary extra encoding support. We are now release-ready! git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@352 48356398-32a2-884e-a903-53898d9a118a 2006-09-01 00:54:38 +00:00			`"\xF6" // this is invalid`
			`);`
			`$this->assertNoErrors();`

[1.7.0] Configuration object now finalizes itself after first read operation git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1075 48356398-32a2-884e-a903-53898d9a118a 2007-05-20 18:06:51 +00:00			`$config = HTMLPurifier_Config::create(array(`
			`'Core.Encoding' => 'ISO-8859-1'`
			`));`
Add rudimentary extra encoding support. We are now release-ready! git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@352 48356398-32a2-884e-a903-53898d9a118a 2006-09-01 00:54:38 +00:00
			`// Now it gets converted`
			`$this->assertIdentical(`
[1.4.0] Make all functions in Encoder static. Affects branches/strict git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@656 48356398-32a2-884e-a903-53898d9a118a 2007-01-18 22:55:44 +00:00			`HTMLPurifier_Encoder::convertToUTF8("\xF6", $config, $context),`
Add rudimentary extra encoding support. We are now release-ready! git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@352 48356398-32a2-884e-a903-53898d9a118a 2006-09-01 00:54:38 +00:00			`"\xC3\xB6"`
			`);`
- Add Test namespace - Further fix the no iconv problem, and extend test cases to cover that case. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@368 48356398-32a2-884e-a903-53898d9a118a 2006-09-01 16:54:23 +00:00
[1.7.0] Configuration object now finalizes itself after first read operation git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1075 48356398-32a2-884e-a903-53898d9a118a 2007-05-20 18:06:51 +00:00			`$config = HTMLPurifier_Config::create(array(`
			`'Core.Encoding' => 'ISO-8859-1',`
			`'Test.ForceNoIconv' => true`
			`));`
- Add Test namespace - Further fix the no iconv problem, and extend test cases to cover that case. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@368 48356398-32a2-884e-a903-53898d9a118a 2006-09-01 16:54:23 +00:00			`$this->assertIdentical(`
[1.4.0] Make all functions in Encoder static. Affects branches/strict git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@656 48356398-32a2-884e-a903-53898d9a118a 2007-01-18 22:55:44 +00:00			`HTMLPurifier_Encoder::convertToUTF8("\xF6", $config, $context),`
- Add Test namespace - Further fix the no iconv problem, and extend test cases to cover that case. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@368 48356398-32a2-884e-a903-53898d9a118a 2006-09-01 16:54:23 +00:00			`"\xC3\xB6"`
			`);`

Add rudimentary extra encoding support. We are now release-ready! git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@352 48356398-32a2-884e-a903-53898d9a118a 2006-09-01 00:54:38 +00:00			`}`

			`function test_convertFromUTF8() {`
			`$config = HTMLPurifier_Config::createDefault();`
[1.2.0] - Partially finished migrating to new Context object (done in r485). - Created HTMLPurifier_Harness to assist with testing, ChildDefTest migrated to that framework. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@484 48356398-32a2-884e-a903-53898d9a118a 2006-10-01 20:47:07 +00:00			`$context = new HTMLPurifier_Context();`
Add rudimentary extra encoding support. We are now release-ready! git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@352 48356398-32a2-884e-a903-53898d9a118a 2006-09-01 00:54:38 +00:00
[1.4.0] - Added %Core.EscapeNonASCIICharacters to workaround %Core.Encoding misbehavior - Add "All Tests" to test runner title and reorder subfile names - Specific file is now called with ?f= - Link to UTF-8 docs, even though they're not done - 1000th unit test passed! W00t! (that's a third as many as SimpleTest has for itself.) git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@658 48356398-32a2-884e-a903-53898d9a118a 2007-01-19 03:54:55 +00:00			`// zhong-wen`
			`$chinese = "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";`

Add rudimentary extra encoding support. We are now release-ready! git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@352 48356398-32a2-884e-a903-53898d9a118a 2006-09-01 00:54:38 +00:00			`// UTF-8 means that we don't touch it`
			`$this->assertIdentical(`
[1.4.0] Make all functions in Encoder static. Affects branches/strict git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@656 48356398-32a2-884e-a903-53898d9a118a 2007-01-18 22:55:44 +00:00			`HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context),`
Add rudimentary extra encoding support. We are now release-ready! git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@352 48356398-32a2-884e-a903-53898d9a118a 2006-09-01 00:54:38 +00:00			`"\xC3\xB6"`
			`);`

[1.7.0] Configuration object now finalizes itself after first read operation git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1075 48356398-32a2-884e-a903-53898d9a118a 2007-05-20 18:06:51 +00:00			`$config = HTMLPurifier_Config::create(array(`
			`'Core.Encoding' => 'ISO-8859-1'`
			`));`
Add rudimentary extra encoding support. We are now release-ready! git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@352 48356398-32a2-884e-a903-53898d9a118a 2006-09-01 00:54:38 +00:00
			`// Now it gets converted`
			`$this->assertIdentical(`
[1.4.0] Make all functions in Encoder static. Affects branches/strict git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@656 48356398-32a2-884e-a903-53898d9a118a 2007-01-18 22:55:44 +00:00			`HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context),`
Add rudimentary extra encoding support. We are now release-ready! git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@352 48356398-32a2-884e-a903-53898d9a118a 2006-09-01 00:54:38 +00:00			`"\xF6"`
			`);`
- Add Test namespace - Further fix the no iconv problem, and extend test cases to cover that case. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@368 48356398-32a2-884e-a903-53898d9a118a 2006-09-01 16:54:23 +00:00
[1.4.0] - Added %Core.EscapeNonASCIICharacters to workaround %Core.Encoding misbehavior - Add "All Tests" to test runner title and reorder subfile names - Specific file is now called with ?f= - Link to UTF-8 docs, even though they're not done - 1000th unit test passed! W00t! (that's a third as many as SimpleTest has for itself.) git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@658 48356398-32a2-884e-a903-53898d9a118a 2007-01-19 03:54:55 +00:00			`if (function_exists('iconv')) {`
			`// iconv has its own way`
			`$this->assertIdentical(`
			`HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),`
			`" (Chinese)"`
			`);`
			`}`
- Add Test namespace - Further fix the no iconv problem, and extend test cases to cover that case. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@368 48356398-32a2-884e-a903-53898d9a118a 2006-09-01 16:54:23 +00:00
[1.4.0] - Added %Core.EscapeNonASCIICharacters to workaround %Core.Encoding misbehavior - Add "All Tests" to test runner title and reorder subfile names - Specific file is now called with ?f= - Link to UTF-8 docs, even though they're not done - 1000th unit test passed! W00t! (that's a third as many as SimpleTest has for itself.) git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@658 48356398-32a2-884e-a903-53898d9a118a 2007-01-19 03:54:55 +00:00			`// Plain PHP implementation has slightly different behavior`
[1.7.0] Configuration object now finalizes itself after first read operation git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1075 48356398-32a2-884e-a903-53898d9a118a 2007-05-20 18:06:51 +00:00			`$config = HTMLPurifier_Config::create(array(`
			`'Core.Encoding' => 'ISO-8859-1',`
			`'Test.ForceNoIconv' => true`
			`));`
- Add Test namespace - Further fix the no iconv problem, and extend test cases to cover that case. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@368 48356398-32a2-884e-a903-53898d9a118a 2006-09-01 16:54:23 +00:00			`$this->assertIdentical(`
[1.4.0] Make all functions in Encoder static. Affects branches/strict git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@656 48356398-32a2-884e-a903-53898d9a118a 2007-01-18 22:55:44 +00:00			`HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context),`
- Add Test namespace - Further fix the no iconv problem, and extend test cases to cover that case. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@368 48356398-32a2-884e-a903-53898d9a118a 2006-09-01 16:54:23 +00:00			`"\xF6"`
			`);`

[1.4.0] - Added %Core.EscapeNonASCIICharacters to workaround %Core.Encoding misbehavior - Add "All Tests" to test runner title and reorder subfile names - Specific file is now called with ?f= - Link to UTF-8 docs, even though they're not done - 1000th unit test passed! W00t! (that's a third as many as SimpleTest has for itself.) git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@658 48356398-32a2-884e-a903-53898d9a118a 2007-01-19 03:54:55 +00:00			`$this->assertIdentical(`
			`HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),`
			`"?? (Chinese)"`
			`);`

			`// Preserve the characters!`
[1.7.0] Configuration object now finalizes itself after first read operation git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1075 48356398-32a2-884e-a903-53898d9a118a 2007-05-20 18:06:51 +00:00			`$config = HTMLPurifier_Config::create(array(`
			`'Core.Encoding' => 'ISO-8859-1',`
			`'Core.EscapeNonASCIICharacters' => true`
			`));`
[1.4.0] - Added %Core.EscapeNonASCIICharacters to workaround %Core.Encoding misbehavior - Add "All Tests" to test runner title and reorder subfile names - Specific file is now called with ?f= - Link to UTF-8 docs, even though they're not done - 1000th unit test passed! W00t! (that's a third as many as SimpleTest has for itself.) git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@658 48356398-32a2-884e-a903-53898d9a118a 2007-01-19 03:54:55 +00:00			`$this->assertIdentical(`
			`HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),`
			`"&#20013;&#25991; (Chinese)"`
			`);`

			`}`

			`function test_convertToASCIIDumbLossless() {`

			`// Uppercase thorn letter`
			`$this->assertIdentical(`
			`HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xC3\x9Eorn"),`
			`"&#222;orn"`
			`);`

			`$this->assertIdentical(`
			`HTMLPurifier_Encoder::convertToASCIIDumbLossless("an"),`
			`"an"`
			`);`

			`// test up to four bytes`
			`$this->assertIdentical(`
			`HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xF3\xA0\x80\xA0"),`
			`"&#917536;"`
			`);`

Add rudimentary extra encoding support. We are now release-ready! git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@352 48356398-32a2-884e-a903-53898d9a118a 2006-09-01 00:54:38 +00:00			`}`

Refactor encoding and entity specific processing to HTMLPurifier_Encoder. We also need to refactor the escaping to this class too. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@339 48356398-32a2-884e-a903-53898d9a118a 2006-08-29 19:36:40 +00:00			`}`

			`?>`