[1.4.0]

- Added %Core.EscapeNonASCIICharacters to workaround %Core.Encoding misbehavior - Add "All Tests" to test runner title and reorder subfile names - Specific file is now called with ?f= - Link to UTF-8 docs, even though they're not done - 1000th unit test passed! W00t! (that's a third as many as SimpleTest has for itself.) git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@658 48356398-32a2-884e-a903-53898d9a118a
2024-12-22 08:21:52 +00:00 · 2007-01-19 03:54:55 +00:00 · 2007-01-19 03:54:55 +00:00 · 0dd866cc15
commit 0dd866cc15
parent ad1169c711
5 changed files with 128 additions and 8 deletions
--- a/2
+++ b/2
@ -16,6 +16,8 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
  to allow these, and background-position IS NOT implemented yet.
 ! Configuration documentation looks nicer
 ! Added smoketest 'all.php', which loads all other smoketests via frames
+! Added %Core.EscapeNonASCIICharacters to work around loss of Unicode
+  characters while %Core.Encoding is set to a non-UTF-8 encoding.
 . Implemented AttrDef_CSSURI for url(http://google.com) style declarations

 1.3.3, unknown release date, likely to be dropped
--- a/docs/index.html
+++ b/docs/index.html
@ -31,6 +31,9 @@ information for casual developers using HTML Purifier.</p>
 <dt><a href="enduser-slow.html">Speeding up HTML Purifier</a></dt>
 <dd>Explains how to speed up HTML Purifier through caching or inbound filtering.</dd>

+<dt><a href="enduser-utf8.html">UTF-8</a></dt>
+<dd>Describes the rationale for using UTF-8, the ramifications otherwise, and how to make the switch.</dd>
+
 </dl>

 <h2>Development</h2>
--- a/library/HTMLPurifier/Encoder.php
+++ b/library/HTMLPurifier/Encoder.php
@ -6,15 +6,29 @@ HTMLPurifier_ConfigSchema::define(
    'Core', 'Encoding', 'utf-8', 'istring', 
    'If for some reason you are unable to convert all webpages to UTF-8, '. 
    'you can use this directive as a stop-gap compatibility change to '. 
-    'let HTMLPurifier deal with non UTF-8 input.  This technique has '. 
+    'let HTML Purifier deal with non UTF-8 input.  This technique has '. 
    'notable deficiencies: absolutely no characters outside of the selected '. 
    'character encoding will be preserved, not even the ones that have '. 
    'been ampersand escaped (this is due to a UTF-8 specific <em>feature</em> '.
    'that automatically resolves all entities), making it pretty useless '.
-    'for anything except the most I18N-blind applications.  This directive '.
+    'for anything except the most I18N-blind applications, although '.
+    '%Core.EscapeNonASCIICharacters offers fixes this trouble with '.
+    'another tradeoff. This directive '.
    'only accepts ISO-8859-1 if iconv is not enabled.'
 );

+HTMLPurifier_ConfigSchema::define(
+    'Core', 'EscapeNonASCIICharacters', false, 'bool',
+    'This directive overcomes a deficiency in %Core.Encoding by blindly '.
+    'converting all non-ASCII characters into decimal numeric entities before '.
+    'converting it to its native encoding. This means that even '.
+    'characters that can be expressed in the non-UTF-8 encoding will '.
+    'be entity-ized, which can be a real downer for encodings like Big5. '.
+    'It also assumes that the ASCII repetoire is available, although '.
+    'this is the case for almost all encodings. Anyway, use UTF-8! This '.
+    'directive has been available since 1.4.0.'
+);
+
 if ( !function_exists('iconv') ) {
    // only encodings with native PHP support
    HTMLPurifier_ConfigSchema::defineAllowedValues(
@ -310,6 +324,7 @@ class HTMLPurifier_Encoder
        } elseif ($encoding === 'iso-8859-1') {
            return @utf8_encode($str);
        }
+        trigger_error('Encoding not supported', E_USER_ERROR);
    }
    
    /**
@ -323,11 +338,63 @@ class HTMLPurifier_Encoder
        if ($iconv === null) $iconv = function_exists('iconv');
        $encoding = $config->get('Core', 'Encoding');
        if ($encoding === 'utf-8') return $str;
+        if ($config->get('Core', 'EscapeNonASCIICharacters')) {
+            $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
+        }
        if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
            return @iconv('utf-8', $encoding . '//IGNORE', $str);
        } elseif ($encoding === 'iso-8859-1') {
            return @utf8_decode($str);
        }
+        trigger_error('Encoding not supported', E_USER_ERROR);
+    }
+    
+    /**
+     * Lossless (character-wise) conversion of HTML to ASCII
+     * @static
+     * @param $str UTF-8 string to be converted to ASCII
+     * @returns ASCII encoded string with non-ASCII character entity-ized
+     * @warning Adapted from MediaWiki, claiming fair use: this is a common
+     *       algorithm. If you disagree with this license fudgery,
+     *       implement it yourself.
+     * @note Uses decimal numeric entities since they are best supported.
+     * @note This is a DUMB function: it has no concept of keeping
+     *       character entities that the projected character encoding
+     *       can allow. We could possibly implement a smart version
+     *       but that would require it to also know which Unicode
+     *       codepoints the charset supported (not an easy task).
+     * @note Sort of like cleanUTF8(), but it assumes that $str is
+     *       well-formed UTF-8
+     */
+    function convertToASCIIDumbLossless($str) {
+        $bytesleft = 0; // continuation bytes still expected for the current character
+        $result = '';
+        $working = 0; // codepoint value accumulated so far
+        $len = strlen($str); // length in bytes; the loop walks byte by byte
+        for( $i = 0; $i < $len; $i++ ) {
+            $bytevalue = ord( $str[$i] );
+            if( $bytevalue <= 0x7F ) { //0xxx xxxx: ASCII, copied through unchanged
+                $result .= chr( $bytevalue );
+                $bytesleft = 0;
+            } elseif( $bytevalue <= 0xBF ) { //10xx xxxx: continuation byte
+                $working = $working << 6; // make room for the next six payload bits
+                $working += ($bytevalue & 0x3F);
+                $bytesleft--;
+                if( $bytesleft <= 0 ) { // character complete: emit decimal entity
+                    $result .= "&#" . $working . ";";
+                }
+            } elseif( $bytevalue <= 0xDF ) { //110x xxxx: lead byte, one continuation follows
+                $working = $bytevalue & 0x1F;
+                $bytesleft = 1;
+            } elseif( $bytevalue <= 0xEF ) { //1110 xxxx: lead byte, two continuations follow
+                $working = $bytevalue & 0x0F;
+                $bytesleft = 2;
+            } else { //1111 0xxx: lead byte, three continuations follow
+                $working = $bytevalue & 0x07;
+                $bytesleft = 3;
+            }
+        }
+        return $result;
    }
    
    
--- a/tests/HTMLPurifier/EncoderTest.php
+++ b/tests/HTMLPurifier/EncoderTest.php
@ -5,7 +5,7 @@ require_once 'HTMLPurifier/Encoder.php';
 class HTMLPurifier_EncoderTest extends UnitTestCase
 {
    
-    var $Encoder;
+    var $_entity_lookup;
    
    function setUp() {
        $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
@ -60,6 +60,9 @@ class HTMLPurifier_EncoderTest extends UnitTestCase
        $config = HTMLPurifier_Config::createDefault();
        $context = new HTMLPurifier_Context();
        
+        // zhong-wen
+        $chinese = "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";
+        
        // UTF-8 means that we don't touch it
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context),
@ -74,13 +77,55 @@ class HTMLPurifier_EncoderTest extends UnitTestCase
            "\xF6"
        );
        
-        $config->set('Test', 'ForceNoIconv', true);
+        if (function_exists('iconv')) {
+            // iconv has its own way
+            $this->assertIdentical(
+                HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),
+                " (Chinese)"
+            );
+        }
        
+        // Plain PHP implementation has slightly different behavior
+        $config->set('Test', 'ForceNoIconv', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context),
            "\xF6"
        );
        
+        $this->assertIdentical(
+            HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),
+            "?? (Chinese)"
+        );
+        
+        // Preserve the characters!
+        
+        $config->set('Core', 'EscapeNonASCIICharacters', true);
+        $this->assertIdentical(
+            HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),
+            "&#20013;&#25991; (Chinese)"
+        );
+        
+    }
+    
+    function test_convertToASCIIDumbLossless() {
+        
+        // Uppercase thorn letter (U+00DE, a two-byte sequence) followed by ASCII
+        $this->assertIdentical(
+            HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xC3\x9Eorn"),
+            "&#222;orn"
+        );
+        
+        // pure ASCII input comes back unchanged
+        $this->assertIdentical(
+            HTMLPurifier_Encoder::convertToASCIIDumbLossless("an"),
+            "an"
+        );
+        
+        // test up to four bytes (U+E0020 is 917536 in decimal)
+        $this->assertIdentical(
+            HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xF3\xA0\x80\xA0"),
+            "&#917536;"
+        );
+        
    }
    
 }
--- a/tests/index.php
+++ b/tests/index.php
@ -1,5 +1,8 @@
 <?php

+// call one file using /?f=FileTest.php , see $test_files array for
+// valid values
+
 error_reporting(E_ALL);

 // wishlist: automated calling of this file from multiple PHP versions so we
@ -118,19 +121,19 @@ function htmlpurifier_path2class($path) {

 // we can't use addTestFile because SimpleTest chokes on E_STRICT warnings

-if (isset($_GET['file']) && isset($test_file_lookup[$_GET['file']])) {
+if (isset($_GET['f']) && isset($test_file_lookup[$_GET['f']])) {
    
    // execute only one test
-    $test_file = $_GET['file'];
+    $test_file = $_GET['f'];
    
-    $test = new GroupTest('HTML Purifier - ' . $test_file);
+    $test = new GroupTest($test_file . ' - HTML Purifier');
    $path = 'HTMLPurifier/' . $test_file;
    require_once $path;
    $test->addTestClass(htmlpurifier_path2class($path));
    
 } else {
    
-    $test = new GroupTest('HTML Purifier');
+    $test = new GroupTest('All Tests - HTML Purifier');

    foreach ($test_files as $test_file) {
        $path = 'HTMLPurifier/' . $test_file;