From bb16d8eae571dd4e30e3a62cce03d436d46cefaf Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <edwardzyang@thewritingpot.com>
Date: Sun, 25 May 2008 05:40:20 +0000
Subject: [PATCH] [3.1.1] Fix Shift_JIS encoding wonkiness with yen symbols and
 whatnot - Improve parseCDATA algorithm to take into account newline
 normalization - Fix regression in FontFamily validator. We now have a legit
 parser in place, albeit somewhat limited in use. Will be superseded by parser
 for entire grammar - Convert EncoderTest to new format

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1769 48356398-32a2-884e-a903-53898d9a118a
---
 NEWS                                          |   8 +-
 library/HTMLPurifier/AttrDef.php              |  11 +-
 .../HTMLPurifier/AttrDef/CSS/FontFamily.php   |  34 ++++-
 library/HTMLPurifier/Encoder.php              | 107 ++++++++++---
 .../AttrDef/CSS/FontFamilyTest.php            |  15 +-
 tests/HTMLPurifier/AttrDef/TextTest.php       |   2 +-
 tests/HTMLPurifier/AttrDefTest.php            |   3 +-
 tests/HTMLPurifier/EncoderTest.php            | 141 +++++++++++-------
 tests/HTMLPurifierTest.php                    |  18 +++
 9 files changed, 242 insertions(+), 97 deletions(-)

diff --git a/NEWS b/NEWS
index a1cdc662..27d1326d 100644
--- a/NEWS
+++ b/NEWS
@@ -17,7 +17,12 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
   manifest in token until end of operations. This prevents naughty internal
   code from directly modifying CurrentToken when they're not supposed to.
 - Percent encoding checks enabled for URI query and fragment
-- Fix stray backslashes in font-family
+- Fix stray backslashes in font-family; CSS Unicode character escapes are
+  now properly resolved (although *only* in font-family).
+- Improve parseCDATA algorithm to take into account newline normalization
+- Account for browser confusion between Yen character and backslash in
+  Shift_JIS encoding. This fix generalizes to any other encoding which is not
+  a strict superset of printable ASCII.
 . Added HTMLPurifier_UnitConverter and HTMLPurifier_Length for convenient
   handling of CSS-style lengths. HTMLPurifier_AttrDef_CSS_Length now uses
   this class.
@@ -38,7 +43,6 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
 . Variable parsing types now are magic integers instead of strings
 . Added benchmark for ConfigSchema
 
-
 3.1.0, released 2008-05-18
 # Unnecessary references to objects (vestiges of PHP4) removed from method
   signatures.  The following methods do not need references when assigning from
diff --git a/library/HTMLPurifier/AttrDef.php b/library/HTMLPurifier/AttrDef.php
index 2c59a8d7..eefe1c5d 100644
--- a/library/HTMLPurifier/AttrDef.php
+++ b/library/HTMLPurifier/AttrDef.php
@@ -51,16 +51,13 @@ abstract class HTMLPurifier_AttrDef
      * 
      * @warning This processing is inconsistent with XML's whitespace handling
      *          as specified by section 3.3.3 and referenced XHTML 1.0 section
-     *          4.7.  Compliant processing requires all line breaks normalized
-     *          to "\n", so the fix is not as simple as fixing it in this
-     *          function.  Trim and whitespace collapsing are supposed to only
-     *          occur in NMTOKENs.  However, note that we are NOT necessarily
-     *          parsing XML, thus, this behavior may still be correct.
+     *          4.7.  However, note that we are NOT necessarily
+     *          parsing XML, thus, this behavior may still be correct. We
+     *          assume that newlines have been normalized.
      */
     public function parseCDATA($string) {
         $string = trim($string);
-        $string = str_replace("\n", '', $string);
-        $string = str_replace(array("\r", "\t"), ' ', $string);
+        $string = str_replace(array("\n", "\t", "\r"), ' ', $string);
         return $string;
     }
     
diff --git a/library/HTMLPurifier/AttrDef/CSS/FontFamily.php b/library/HTMLPurifier/AttrDef/CSS/FontFamily.php
index a3b4341e..6bbf9b8c 100644
--- a/library/HTMLPurifier/AttrDef/CSS/FontFamily.php
+++ b/library/HTMLPurifier/AttrDef/CSS/FontFamily.php
@@ -16,10 +16,10 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
             'cursive' => true
         );
         
-        $string = $this->parseCDATA($string);
         // assume that no font names contain commas in them
         $fonts = explode(',', $string);
         $final = '';
+        $non_sgml = HTMLPurifier_Encoder::getNonSgmlCharacters();
         foreach($fonts as $font) {
             $font = trim($font);
             if ($font === '') continue;
@@ -35,11 +35,33 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
                 $quote = $font[0];
                 if ($font[$length - 1] !== $quote) continue;
                 $font = substr($font, 1, $length - 2);
-                // double-backslash processing is buggy. Namely, it doesn't allow
-                // fonts that contain an adjacent quote, backslash, or comma
-                $font = str_replace("\\$quote", $quote, $font); // de-escape quote
-                $font = str_replace("\\\n", '', $font);       // de-escape newlines
-                $font = str_replace("\\\\", "\\", $font);       // de-escape double backslashes
+                
+                $new_font = '';
+                for ($i = 0, $c = strlen($font); $i < $c; $i++) {
+                    if ($font[$i] === '\\') {
+                        $i++;
+                        if ($i >= $c) {
+                            $new_font .= '\\';
+                            break;
+                        }
+                        if (ctype_xdigit($font[$i])) {
+                            $code = $font[$i];
+                            for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
+                                if (!ctype_xdigit($font[$i])) break;
+                                $code .= $font[$i];
+                            }
+                            $char = HTMLPurifier_Encoder::unichr(hexdec($code));
+                            if (isset($non_sgml[$char])) continue;
+                            $new_font .= $char;
+                            if ($i < $c && trim($font[$i]) !== '') $i--;
+                            continue;
+                        }
+                        if ($font[$i] === "\n") continue;
+                    }
+                    $new_font .= $font[$i];
+                }
+                
+                $font = $new_font;
             }
             // $font is a pure representation of the font name
             
diff --git a/library/HTMLPurifier/Encoder.php b/library/HTMLPurifier/Encoder.php
index 763684f7..c14f7dbd 100644
--- a/library/HTMLPurifier/Encoder.php
+++ b/library/HTMLPurifier/Encoder.php
@@ -7,6 +7,8 @@
 class HTMLPurifier_Encoder
 {
     
+    private static $nonSgmlCharacters;
+    
     /**
      * Constructor throws fatal error if you attempt to instantiate class
      */
@@ -19,6 +21,24 @@ class HTMLPurifier_Encoder
      */
     private static function muteErrorHandler() {}
     
+    /**
+     * Returns a lookup of UTF-8 character byte sequences that are non-SGML.
+     */
+    public static function getNonSgmlCharacters() {
+        if (empty(HTMLPurifier_Encoder::$nonSgmlCharacters)) {
+            for ($i = 0; $i <= 31; $i++) {
+                // non-SGML ASCII chars
+                // save \r, \t and \n
+                if ($i == 9 || $i == 13 || $i == 10) continue;
+                HTMLPurifier_Encoder::$nonSgmlCharacters[chr($i)] = '';
+            }
+            for ($i = 127; $i <= 159; $i++) {
+                HTMLPurifier_Encoder::$nonSgmlCharacters[HTMLPurifier_Encoder::unichr($i)] = '';
+            }
+        }
+        return HTMLPurifier_Encoder::$nonSgmlCharacters;
+    }
+    
     /**
      * Cleans a UTF-8 string for well-formedness and SGML validity
      * 
@@ -46,18 +66,7 @@ class HTMLPurifier_Encoder
      */
     public static function cleanUTF8($str, $force_php = false) {
         
-        static $non_sgml_chars = array();
-        if (empty($non_sgml_chars)) {
-            for ($i = 0; $i <= 31; $i++) {
-                // non-SGML ASCII chars
-                // save \r, \t and \n
-                if ($i == 9 || $i == 13 || $i == 10) continue;
-                $non_sgml_chars[chr($i)] = '';
-            }
-            for ($i = 127; $i <= 159; $i++) {
-                $non_sgml_chars[HTMLPurifier_Encoder::unichr($i)] = '';
-            }
-        }
+        $non_sgml = HTMLPurifier_Encoder::getNonSgmlCharacters();
         
         static $iconv = null;
         if ($iconv === null) $iconv = function_exists('iconv');
@@ -66,7 +75,7 @@ class HTMLPurifier_Encoder
         // This is an optimization: if the string is already valid UTF-8, no
         // need to do iconv/php stuff. 99% of the time, this will be the case.
         if (preg_match('/^.{1}/us', $str)) {
-            return strtr($str, $non_sgml_chars);
+            return strtr($str, $non_sgml);
         }
         
         if ($iconv && !$force_php) {
@@ -74,7 +83,7 @@ class HTMLPurifier_Encoder
             set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
             $str = iconv('UTF-8', 'UTF-8//IGNORE', $str);
             restore_error_handler();
-            return strtr($str, $non_sgml_chars);
+            return strtr($str, $non_sgml);
         }
         
         $mState = 0; // cached expected number of octets after the current octet
@@ -276,17 +285,20 @@ class HTMLPurifier_Encoder
      * Converts a string to UTF-8 based on configuration.
      */
     public static function convertToUTF8($str, $config, $context) {
-        static $iconv = null;
-        if ($iconv === null) $iconv = function_exists('iconv');
         $encoding = $config->get('Core', 'Encoding');
         if ($encoding === 'utf-8') return $str;
+        static $iconv = null;
+        if ($iconv === null) $iconv = function_exists('iconv');
+        set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
         if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
-            set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
             $str = iconv($encoding, 'utf-8//IGNORE', $str);
+            // If the string is bjorked by Shift_JIS or a similar encoding
+            // that doesn't support all of ASCII, convert the naughty
+            // characters to their true byte-wise ASCII/UTF-8 equivalents.
+            $str = strtr($str, HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding));
             restore_error_handler();
             return $str;
         } elseif ($encoding === 'iso-8859-1') {
-            set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
             $str = utf8_encode($str);
             restore_error_handler();
             return $str;
@@ -300,20 +312,28 @@ class HTMLPurifier_Encoder
      *       characters being omitted.
      */
     public static function convertFromUTF8($str, $config, $context) {
-        static $iconv = null;
-        if ($iconv === null) $iconv = function_exists('iconv');
         $encoding = $config->get('Core', 'Encoding');
         if ($encoding === 'utf-8') return $str;
-        if ($config->get('Core', 'EscapeNonASCIICharacters')) {
+        static $iconv = null;
+        if ($iconv === null) $iconv = function_exists('iconv');
+        if ($escape = $config->get('Core', 'EscapeNonASCIICharacters')) {
             $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
         }
+        set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
         if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
-            set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
+            // Undo our previous fix in convertToUTF8, otherwise iconv will barf
+            $ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
+            if (!$escape && !empty($ascii_fix)) {
+                $clear_fix = array();
+                foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = '';
+                $str = strtr($str, $clear_fix);
+            }
+            $str = strtr($str, array_flip($ascii_fix));
+            // Normal stuff
             $str = iconv('utf-8', $encoding . '//IGNORE', $str);
             restore_error_handler();
             return $str;
         } elseif ($encoding === 'iso-8859-1') {
-            set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
             $str = utf8_decode($str);
             restore_error_handler();
             return $str;
@@ -368,6 +388,47 @@ class HTMLPurifier_Encoder
         return $result;
     }
     
+    /**
+     * This expensive function tests whether or not a given character
+     * encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
+     * fail this test, and require special processing. Variable width
+     * encodings shouldn't ever fail.
+     * 
+     * @param string $encoding Encoding name to test, as per iconv format
+     * @param bool $bypass Whether or not to bypass the precompiled arrays.
+     * @return Array of UTF-8 characters to their corresponding ASCII,
+     *      which can be used to "undo" any overzealous iconv action.
+     */
+    public static function testEncodingSupportsASCII($encoding, $bypass = false) {
+        static $encodings = array();
+        if (!$bypass) {
+            if (isset($encodings[$encoding])) return $encodings[$encoding];
+            $lenc = strtolower($encoding);
+            switch ($lenc) {
+                case 'shift_jis':
+                    return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
+                case 'johab':
+                    return array("\xE2\x82\xA9" => '\\');
+            }
+            if (strpos($lenc, 'iso-8859-') === 0) return array();
+        }
+        $ret = array();
+        set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
+        if (iconv('UTF-8', $encoding, 'a') === false) return false;
+        for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars
+            $c = chr($i);
+            if (iconv('UTF-8', "$encoding//IGNORE", $c) === '') {
+                // Reverse engineer: what's the UTF-8 equiv of this byte
+                // sequence? This assumes that there's no variable width
+                // encoding that doesn't support ASCII.
+                $ret[iconv($encoding, 'UTF-8//IGNORE', $c)] = $c;
+            }
+        }
+        restore_error_handler();
+        $encodings[$encoding] = $ret;
+        return $ret;
+    }
+    
     
 }
 
diff --git a/tests/HTMLPurifier/AttrDef/CSS/FontFamilyTest.php b/tests/HTMLPurifier/AttrDef/CSS/FontFamilyTest.php
index 6db47110..99a36dd2 100644
--- a/tests/HTMLPurifier/AttrDef/CSS/FontFamilyTest.php
+++ b/tests/HTMLPurifier/AttrDef/CSS/FontFamilyTest.php
@@ -18,7 +18,20 @@ class HTMLPurifier_AttrDef_CSS_FontFamilyTest extends HTMLPurifier_AttrDefHarnes
         $this->assertDef($d = "'\xE5\xAE\x8B\xE4\xBD\x93'");
         $this->assertDef("\xE5\xAE\x8B\xE4\xBD\x93", $d);
         $this->assertDef("'\\','f'", "'\\\\', f");
-        
+        $this->assertDef("'\\01'", "''");
+        $this->assertDef("'\\20'", "' '");
+        $this->assertDef("\\0020", "'\\\\0020'");
+        $this->assertDef("'\\000045'", "E");
+        $this->assertDef("','", false);
+        $this->assertDef("',' foobar','", "' foobar'");
+        $this->assertDef("'\\27'", "'\''");
+        $this->assertDef('"\\22"', "'\"'");
+        $this->assertDef('"\\""', "'\"'");
+        $this->assertDef('"\'"', "'\\''");
+        $this->assertDef("'\\000045a'", "Ea");
+        $this->assertDef("'\\00045 a'", "Ea");
+        $this->assertDef("'\\00045  a'", "'E a'");
+        $this->assertDef("'\\\nf'", "f");
     }
     
 }
diff --git a/tests/HTMLPurifier/AttrDef/TextTest.php b/tests/HTMLPurifier/AttrDef/TextTest.php
index 59b42465..c6e34493 100644
--- a/tests/HTMLPurifier/AttrDef/TextTest.php
+++ b/tests/HTMLPurifier/AttrDef/TextTest.php
@@ -8,7 +8,7 @@ class HTMLPurifier_AttrDef_TextTest extends HTMLPurifier_AttrDefHarness
         $this->def = new HTMLPurifier_AttrDef_Text();
         
         $this->assertDef('This is spiffy text!');
-        $this->assertDef(" Casual\tCDATA parse\ncheck. ", 'Casual CDATA parsecheck.');
+        $this->assertDef(" Casual\tCDATA parse\ncheck. ", 'Casual CDATA parse check.');
         
     }
     
diff --git a/tests/HTMLPurifier/AttrDefTest.php b/tests/HTMLPurifier/AttrDefTest.php
index dc165971..c299c342 100644
--- a/tests/HTMLPurifier/AttrDefTest.php
+++ b/tests/HTMLPurifier/AttrDefTest.php
@@ -15,8 +15,7 @@ class HTMLPurifier_AttrDefTest extends HTMLPurifier_Harness
         $this->assertIdentical('', $def->parseCDATA(''));
         $this->assertIdentical('', $def->parseCDATA("\t\n\r \t\t"));
         $this->assertIdentical('foo', $def->parseCDATA("\t\n\r foo\t\t"));
-        $this->assertIdentical('ignorelinefeeds', $def->parseCDATA("ignore\nline\nfeeds"));
-        $this->assertIdentical('translate to space', $def->parseCDATA("translate\rto\tspace"));
+        $this->assertIdentical('translate to space', $def->parseCDATA("translate\nto\tspace"));
         
     }
     
diff --git a/tests/HTMLPurifier/EncoderTest.php b/tests/HTMLPurifier/EncoderTest.php
index ac35d223..cc1ddfde 100644
--- a/tests/HTMLPurifier/EncoderTest.php
+++ b/tests/HTMLPurifier/EncoderTest.php
@@ -7,6 +7,7 @@ class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
     
     function setUp() {
         $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
+        parent::setUp();
     }
     
     function assertCleanUTF8($string, $expect = null) {
@@ -26,93 +27,89 @@ class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
         $this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
     }
     
-    function test_convertToUTF8() {
-        $config = HTMLPurifier_Config::createDefault();
-        $context = new HTMLPurifier_Context();
-        
+    function test_convertToUTF8_noConvert() {
         // UTF-8 means that we don't touch it
         $this->assertIdentical(
-            HTMLPurifier_Encoder::convertToUTF8("\xF6", $config, $context),
+            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
             "\xF6", // this is invalid
             'Expected identical [Binary: F6]'
         );
-        
-        $config = HTMLPurifier_Config::create(array(
-            'Core.Encoding' => 'ISO-8859-1'
-        ));
-        
-        // Now it gets converted
+    }
+    
+    function test_convertToUTF8_iso8859_1() {
+        $this->config->set('Core', 'Encoding', 'ISO-8859-1');
         $this->assertIdentical(
-            HTMLPurifier_Encoder::convertToUTF8("\xF6", $config, $context),
+            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
             "\xC3\xB6"
         );
-        
-        $config = HTMLPurifier_Config::create(array(
-            'Core.Encoding' => 'ISO-8859-1',
-            'Test.ForceNoIconv' => true
-        ));
+    }
+    
+    function test_convertToUTF8_withoutIconv() {
+        $this->config->set('Core', 'Encoding', 'ISO-8859-1');
+        $this->config->set('Test', 'ForceNoIconv', true);
         $this->assertIdentical(
-            HTMLPurifier_Encoder::convertToUTF8("\xF6", $config, $context),
+            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
             "\xC3\xB6"
         );
         
     }
     
-    function test_convertFromUTF8() {
-        $config = HTMLPurifier_Config::createDefault();
-        $context = new HTMLPurifier_Context();
-        
-        // zhong-wen
-        $chinese = "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";
-        
+    function getZhongWen() {
+        return "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";
+    }
+    
+    function test_convertFromUTF8_utf8() {
         // UTF-8 means that we don't touch it
         $this->assertIdentical(
-            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context),
+            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
             "\xC3\xB6"
         );
-        
-        $config = HTMLPurifier_Config::create(array(
-            'Core.Encoding' => 'ISO-8859-1'
-        ));
-        
-        // Now it gets converted
+    }
+    
+    function test_convertFromUTF8_iso8859_1() {
+        $this->config->set('Core', 'Encoding', 'ISO-8859-1');
         $this->assertIdentical(
-            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context),
+            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
             "\xF6",
             'Expected identical [Binary: F6]'
         );
-        
-        if (function_exists('iconv')) {
-            // iconv has it's own way
-            $this->assertIdentical(
-                HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),
-                " (Chinese)"
-            );
-        }
-        
+    }
+    
+    function test_convertFromUTF8_iconvNoChars() {
+        if (!function_exists('iconv')) return;
+        $this->config->set('Core', 'Encoding', 'ISO-8859-1');
+        $this->assertIdentical(
+            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
+            " (Chinese)"
+        );
+    }
+    
+    function test_convertFromUTF8_phpNormal() {
         // Plain PHP implementation has slightly different behavior
-        $config = HTMLPurifier_Config::create(array(
-            'Core.Encoding' => 'ISO-8859-1',
-            'Test.ForceNoIconv' => true
-        ));
+        $this->config->set('Core', 'Encoding', 'ISO-8859-1');
+        $this->config->set('Test', 'ForceNoIconv', true);
         $this->assertIdentical(
-            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context),
+            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
             "\xF6",
             'Expected identical [Binary: F6]'
         );
-        
+    }
+    
+    function test_convertFromUTF8_phpNoChars() {
+        $this->config->set('Core', 'Encoding', 'ISO-8859-1');
+        $this->config->set('Test', 'ForceNoIconv', true);
         $this->assertIdentical(
-            HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),
+            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
             "?? (Chinese)"
         );
-        
+    }
+    
+    function test_convertFromUTF8_withProtection() {
         // Preserve the characters!
-        $config = HTMLPurifier_Config::create(array(
-            'Core.Encoding' => 'ISO-8859-1',
-            'Core.EscapeNonASCIICharacters' => true
-        ));
+        $this->config->set('Core', 'Encoding', 'ISO-8859-1');
+        $this->config->set('Core', 'EscapeNonASCIICharacters', true);
         $this->assertIdentical(
-            HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),
+            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
             "&#20013;&#25991; (Chinese)"
         );
         
@@ -139,5 +136,39 @@ class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
         
     }
     
+    function assertASCIISupportCheck($enc, $ret) {
+        $test = HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true);
+        if ($test === false) return;
+        $this->assertIdentical(
+            HTMLPurifier_Encoder::testEncodingSupportsASCII($enc),
+            $ret
+        );
+        $this->assertIdentical(
+            HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true),
+            $ret
+        );
+    }
+    
+    function test_testEncodingSupportsASCII() {
+        $this->assertASCIISupportCheck('Shift_JIS', array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~'));
+        $this->assertASCIISupportCheck('JOHAB', array("\xE2\x82\xA9" => '\\'));
+        $this->assertASCIISupportCheck('ISO-8859-1', array());
+        $this->assertASCIISupportCheck('dontexist', array()); // canary
+    }
+    
+    function testShiftJIS() {
+        if (!function_exists('iconv')) return;
+        $this->config->set('Core', 'Encoding', 'Shift_JIS');
+        // This actually looks like a Yen, but we're going to treat it differently
+        $this->assertIdentical(
+            HTMLPurifier_Encoder::convertFromUTF8('\\~', $this->config, $this->context),
+            '\\~'
+        );
+        $this->assertIdentical(
+            HTMLPurifier_Encoder::convertToUTF8('\\~', $this->config, $this->context),
+            '\\~'
+        );
+    }
+    
 }
 
diff --git a/tests/HTMLPurifierTest.php b/tests/HTMLPurifierTest.php
index f3722e80..685a6f4d 100644
--- a/tests/HTMLPurifierTest.php
+++ b/tests/HTMLPurifierTest.php
@@ -167,5 +167,23 @@ alert("<This is compatible with XHTML>");
         $this->purifier->purify('foo');
     }
     
+    function test_shiftJis() {
+        if (!function_exists('iconv')) return;
+        $this->config->set('Core', 'Encoding', 'Shift_JIS');
+        $this->config->set('Core', 'EscapeNonASCIICharacters', true);
+        $this->assertPurification(
+            "<b style=\"font-family:'&#165;';\">111</b>"
+        );
+    }
+    
+    function test_shiftJisWorstCase() {
+        if (!function_exists('iconv')) return;
+        $this->config->set('Core', 'Encoding', 'Shift_JIS');
+        $this->assertPurification( // Notice how Yen disappears
+            "<b style=\"font-family:'&#165;';\">111</b>",
+            "<b style=\"font-family:'';\">111</b>"
+        );
+    }
+    
 }