Refactor encoding- and entity-specific processing into HTMLPurifier_Encoder. The escaping logic still needs to be moved into this class as well.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@339 48356398-32a2-884e-a903-53898d9a118a
2025-04-25 03:44:37 +00:00 · 2006-08-29 19:36:40 +00:00 · 2006-08-29 19:36:40 +00:00 · 1de3088276
commit 1de3088276
parent 55503744ee
12 changed files with 495 additions and 502 deletions
--- a/docs/examples/demo.php
+++ b/docs/examples/demo.php
@ -62,7 +62,7 @@ if (isset($_GET['profile']) || isset($_GET['XDEBUG_PROFILE'])) {

 if (isset($html)) {
    echo htmlspecialchars(
-            HTMLPurifier_Lexer::cleanUTF8($html), ENT_COMPAT, 'UTF-8');
+            HTMLPurifier_Encoder::cleanUTF8($html), ENT_COMPAT, 'UTF-8');
 }
        ?></textarea>
        <div>
--- a/library/HTMLPurifier.php
+++ b/library/HTMLPurifier.php
@ -28,48 +28,6 @@ require_once 'HTMLPurifier/HTMLDefinition.php';
 require_once 'HTMLPurifier/Generator.php';
 require_once 'HTMLPurifier/Strategy/Core.php';

-/*
-
-// Darn you fellas still using ISO-8859-1!  It would be so easy for me
-// to just drop the characters that can't be expressed this way, but I'm
-// a stickler for code quality, so I won't do that to you.  You'll have
-// to wait for this functionality to be implemented later.
-
-HTMLPurifier_ConfigDef::define(
-    'Core', 'Encoding', 'utf-8', 'istring',
-    'Set this to the encoding your webpages are served as.  This defines '.
-    'the encoding the HTMLPurifier will convert to and from before passing '.
-    'the text back to you.  Note that although we offer full HTML document '.
-    'parsing functionality, we ignore meta tags in such documents, because '.
-    'most modern browsers have already re-encoded the file in the correct '.
-    'encoding (though it did not change the meta tag).  '.
-    'Since browsers do not alter file uploads, '.
-    'HTML from a file will fail fantastically if its real encoding is does '.
-    'match the encoding passed here (which is often the case).'
-);
-
-if ( !function_exists('iconv') ) {
-    
-    // these are the only encodings we offer native PHP support for.
-    // if iconv is enabled, iconv's encoding support dictates what we can
-    // use.
-    
-    HTMLPurifier_ConfigDef::defineAllowedValues(
-        'Core', 'Encoding', array(
-            'utf-8',
-            'iso-8859-1'
-        )
-    );
-    HTMLPurifier_ConfigDef::defineValueAliases(
-        'Core', 'Encoding', array(
-            'iso8859-1' => 'iso-8859-1'
-        )
-    );
-    
-}
-
-*/
-
 /**
 * Main library execution class.
 * 
--- a/library/HTMLPurifier/Encoder.php
+++ b/library/HTMLPurifier/Encoder.php
@ -0,0 +1,366 @@
+<?php
+
+require_once 'HTMLPurifier/EntityLookup.php';
+
+/**
+ * An HTML and UTF-8 specific encoder that cleans, unentity-izes and transforms.
+ */
+class HTMLPurifier_Encoder
+{
+    
+    var $_entity_lookup; // HTMLPurifier_EntityLookup, lazy-loaded by nonSpecialEntityCallback()
+    
+    /**
+     * Callback regex string for parsing entities.
+     * @protected
+     */
+    var $_substituteEntitiesRegex =
+'/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z]+));?/';
+//     1. hex             2. dec      3. string
+    
+    
+    /**
+     * Decimal to parsed string conversion table for special entities.
+     * @protected
+     */
+    var $_special_dec2str =
+            array(
+                    34 => '"',
+                    38 => '&',
+                    39 => "'",
+                    60 => '<',
+                    62 => '>'
+            );
+    
+    /**
+     * Stripped entity names to decimal conversion table for special entities.
+     * @protected
+     */
+    var $_special_ent2dec =
+            array(
+                    'quot' => 34,
+                    'amp'  => 38,
+                    'lt'   => 60,
+                    'gt'   => 62
+            );
+    
+    /**
+     * Substitutes non-special entities with their parsed equivalents. Since
+     * running this on every parsed character chunk would be too slow, we run
+     * it once, before everything else.
+     * 
+     * @protected
+     * @param $string String to have non-special entities parsed.
+     * @returns Parsed string.
+     */
+    function substituteNonSpecialEntities($string) {
+        // it will try to detect missing semicolons, but don't rely on it
+        return preg_replace_callback(
+            $this->_substituteEntitiesRegex,
+            array($this, 'nonSpecialEntityCallback'),
+            $string
+            );
+    }
+    
+    /**
+     * Callback function for substituteNonSpecialEntities() that does the work.
+     * 
+     * @warning Though this is public in order to let the callback happen,
+     *          calling it directly is not recommended.
+     * @note Based on Feyd's function at
+     *       <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
+     *       which is in public domain.
+     * @note While we're going to do code point parsing anyway, a good
+     *       optimization would be to refuse to translate code points that
+     *       are non-SGML characters.  However, this could lead to duplication.
+     * @param $matches  PCRE matches array, with 0 the entire match, and
+     *                  either index 1, 2 or 3 set with a hex value, dec value,
+     *                  or string (respectively).
+     * @returns Replacement string ('' for code points outside Unicode).
+     * @note Named entities are resolved through HTMLPurifier_EntityLookup.
+     */
+    
+    // +----------+----------+----------+----------+
+    // | 33222222 | 22221111 | 111111   |          |
+    // | 10987654 | 32109876 | 54321098 | 76543210 | bit
+    // +----------+----------+----------+----------+
+    // |          |          |          | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
+    // |          |          | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
+    // |          | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
+    // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
+    // +----------+----------+----------+----------+
+    // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
+    // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
+    // +----------+----------+----------+----------+ 
+    
+    function nonSpecialEntityCallback($matches) {
+        // replaces all but big five
+        $entity = $matches[0];
+        $is_num = (@$matches[0][1] === '#');
+        if ($is_num) {
+            $is_hex = (@$entity[2] === 'x');
+            $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
+            
+            // abort for special characters
+            if (isset($this->_special_dec2str[$code]))  return $entity;
+            
+            if($code > 1114111 or $code < 0 or
+              ($code >= 55296 and $code <= 57343) ) {
+                // bits are set outside the "valid" range as defined
+                // by UNICODE 4.1.0 (this also rejects surrogates)
+                return '';
+            }
+            
+            $x = $y = $z = $w = 0; 
+            if ($code < 128) {
+                // regular ASCII character
+                $x = $code;
+            } else {
+                // set up bits for UTF-8
+                $x = ($code & 63) | 128;
+                if ($code < 2048) {
+                    $y = (($code & 2047) >> 6) | 192;
+                } else {
+                    $y = (($code & 4032) >> 6) | 128;
+                    if($code < 65536) {
+                        $z = (($code >> 12) & 15) | 224;
+                    } else {
+                        $z = (($code >> 12) & 63) | 128;
+                        $w = (($code >> 18) & 7)  | 240;
+                    }
+                } 
+            }
+            // set up the actual character
+            $ret = '';
+            if($w) $ret .= chr($w);
+            if($z) $ret .= chr($z);
+            if($y) $ret .= chr($y);
+            $ret .= chr($x); 
+            
+            return $ret;
+        } else {
+            if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
+            if (!$this->_entity_lookup) {
+                require_once 'HTMLPurifier/EntityLookup.php';
+                $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
+            }
+            if (isset($this->_entity_lookup->table[$matches[3]])) {
+                return $this->_entity_lookup->table[$matches[3]];
+            } else {
+                return $entity;
+            }
+        }
+    }
+    
+    /**
+     * Cleans a UTF-8 string for well-formedness and SGML validity.
+     * 
+     * @param $str Byte string to treat as UTF-8.
+     * @returns Well-formed UTF-8 string with non-SGML code points removed.
+     * 
+     * @note This method never touches $this, and docs/examples/demo.php
+     *       calls it statically as HTMLPurifier_Encoder::cleanUTF8().
+     * 
+     * @note Just for reference, the non-SGML code points are 0 to 31 and
+     *       127 to 159, inclusive.  However, we allow code points 9, 10
+     *       and 13, which are the tab, line feed and carriage return
+     *       respectively. 128 and above the code points map to multibyte
+     *       UTF-8 representations.
+     * 
+     * @note Although the original function's behavior could be
+     *       implemented with iconv using 'UTF-8//IGNORE', mbstring, or
+     *       even the PCRE modifier 'u', these do not allow us to strip
+     *       control characters or disallowed code points, and the latter
+     *       does not allow invalid UTF8 characters to be ignored.
+     * 
+     * @note Decomposing the string into Unicode code points is necessary
+     *       because SGML disallows the use of specific code points, not
+     *       necessarily bytes.  A naive implementation that simply strtr
+     *       disallowed code points as bytes will break other Unicode
+     *       characters in which using such bytes is valid.
+     * 
+     * @note Code adapted from utf8ToUnicode by Henri Sivonen and
+     *       hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
+     *       LGPL license.  Notes on what changed are inside.
+     */
+    function cleanUTF8($str) {
+        $mState = 0; // cached expected number of octets after the current octet
+                     // until the beginning of the next UTF8 character sequence
+        $mUcs4  = 0; // cached Unicode character
+        $mBytes = 1; // cached expected number of octets in the current sequence
+        
+        // original code involved an $out that was an array of Unicode
+        // codepoints.  Instead of having to convert back into UTF-8, we've
+        // decided to directly append valid UTF-8 characters onto a string
+        // $out once they're done.  $char accumulates raw bytes, while $mUcs4
+        // turns into the Unicode code point, so there's some redundancy.
+        
+        $out = '';
+        $char = '';
+        
+        $len = strlen($str);
+        for($i = 0; $i < $len; $i++) {
+            $in = ord($str[$i]); // [] offset: the {} form is a parse error in PHP 8
+            $char .= $str[$i]; // append byte to char
+            if (0 == $mState) {
+                // When mState is zero we expect either a US-ASCII character 
+                // or a multi-octet sequence.
+                if (0 == (0x80 & ($in))) {
+                    // US-ASCII, pass straight through.
+                    if (($in <= 31 || $in == 127) && 
+                        !($in == 9 || $in == 13 || $in == 10) // save \r\t\n
+                    ) {
+                        // control characters, remove
+                    } else {
+                        $out .= $char;
+                    }
+                    // reset
+                    $char = '';
+                    $mBytes = 1;
+                } elseif (0xC0 == (0xE0 & ($in))) {
+                    // First octet of 2 octet sequence
+                    $mUcs4 = ($in);
+                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
+                    $mState = 1;
+                    $mBytes = 2;
+                } elseif (0xE0 == (0xF0 & ($in))) {
+                    // First octet of 3 octet sequence
+                    $mUcs4 = ($in);
+                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
+                    $mState = 2;
+                    $mBytes = 3;
+                } elseif (0xF0 == (0xF8 & ($in))) {
+                    // First octet of 4 octet sequence
+                    $mUcs4 = ($in);
+                    $mUcs4 = ($mUcs4 & 0x07) << 18;
+                    $mState = 3;
+                    $mBytes = 4;
+                } elseif (0xF8 == (0xFC & ($in))) {
+                    // First octet of 5 octet sequence.
+                    // 
+                    // This is illegal because the encoded codepoint must be 
+                    // either:
+                    // (a) not the shortest form or
+                    // (b) outside the Unicode range of 0-0x10FFFF.
+                    // Rather than trying to resynchronize, we will carry on 
+                    // until the end of the sequence and let the later error
+                    // handling code catch it.
+                    $mUcs4 = ($in);
+                    $mUcs4 = ($mUcs4 & 0x03) << 24;
+                    $mState = 4;
+                    $mBytes = 5;
+                } elseif (0xFC == (0xFE & ($in))) {
+                    // First octet of 6 octet sequence, see comments for 5
+                    // octet sequence.
+                    $mUcs4 = ($in);
+                    $mUcs4 = ($mUcs4 & 1) << 30;
+                    $mState = 5;
+                    $mBytes = 6;
+                } else {
+                    // Current octet is neither in the US-ASCII range nor a 
+                    // legal first octet of a multi-octet sequence.
+                    $mState = 0;
+                    $mUcs4  = 0;
+                    $mBytes = 1;
+                    $char = '';
+                }
+            } else {
+                // When mState is non-zero, we expect a continuation of the
+                // multi-octet sequence
+                if (0x80 == (0xC0 & ($in))) {
+                    // Legal continuation.
+                    $shift = ($mState - 1) * 6;
+                    $tmp = $in;
+                    $tmp = ($tmp & 0x0000003F) << $shift;
+                    $mUcs4 |= $tmp;
+                    
+                    if (0 == --$mState) {
+                        // End of the multi-octet sequence. mUcs4 now contains
+                        // the final Unicode codepoint to be output
+                        
+                        // Check for illegal sequences and codepoints.
+                        
+                        // From Unicode 3.1, non-shortest form is illegal
+                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
+                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
+                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
+                            (4 < $mBytes) ||
+                            // From Unicode 3.2, surrogate characters = illegal
+                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
+                            // Codepoints outside the Unicode range are illegal
+                            ($mUcs4 > 0x10FFFF)
+                        ) {
+                            // illegal sequence or code point: drop it
+                        } elseif (0xFEFF != $mUcs4 && // omit BOM
+                            !($mUcs4 >= 128 && $mUcs4 <= 159) // omit non-SGML
+                        ) {
+                            $out .= $char;
+                        }
+                        // initialize UTF8 cache (reset)
+                        $mState = 0;
+                        $mUcs4  = 0;
+                        $mBytes = 1;
+                        $char = '';
+                    }
+                } else {
+                    // ((0xC0 & (*in) != 0x80) && (mState != 0))
+                    // Incomplete multi-octet sequence.
+                    // The original aborted here; we reset (this byte is dropped too)
+                    $mState = 0;
+                    $mUcs4  = 0;
+                    $mBytes = 1;
+                    $char ='';
+                }
+            }
+        }
+        return $out;
+    }
+    
+    /**
+     * Substitutes only special entities with their parsed equivalents.
+     * 
+     * @notice We try to avoid calling this function because otherwise, it
+     * would have to be called a lot (for every parsed section).
+     * 
+     * @protected
+     * @param $string String to have special entities parsed.
+     * @returns Parsed string.
+     */
+    function substituteSpecialEntities($string) {
+        return preg_replace_callback(
+            $this->_substituteEntitiesRegex,
+            array($this, 'specialEntityCallback'), // callback reads $this tables
+            $string);
+    }
+    
+    /**
+     * Callback function for substituteSpecialEntities() that does the work.
+     * 
+     * This callback has same syntax as nonSpecialEntityCallback().
+     * 
+     * @warning Though this is public in order to let the callback happen,
+     *          calling it directly is not recommended.
+     * @param $matches  PCRE-style matches array, with 0 the entire match, and
+     *                  either index 1, 2 or 3 set with a hex value, dec value,
+     *                  or string (respectively).
+     * @returns Raw character for a special entity, else the entity unchanged.
+     */
+    function specialEntityCallback($matches) {
+        $entity = $matches[0];
+        $is_num = (@$matches[0][1] === '#');
+        if ($is_num) {
+            $is_hex = (@$entity[2] === 'x');
+            $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
+            return isset($this->_special_dec2str[$int]) ?
+                $this->_special_dec2str[$int] :
+                $entity;
+        } else {
+            return isset($this->_special_ent2dec[$matches[3]]) ?
+                $this->_special_dec2str[$this->_special_ent2dec[$matches[3]]] :
+                $entity;
+        }
+    }
+    
+}
+
+?>
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@ -1,6 +1,7 @@
 <?php

 require_once 'HTMLPurifier/Token.php';
+require_once 'HTMLPurifier/Encoder.php';

 HTMLPurifier_ConfigDef::define(
    'Core', 'AcceptFullDocuments', true, 'bool',
@ -54,6 +55,12 @@ HTMLPurifier_ConfigDef::define(
 class HTMLPurifier_Lexer
 {
    
+    function HTMLPurifier_Lexer() { // named after the class (PHP 4 constructor convention)
+        $this->_encoder = new HTMLPurifier_Encoder();
+    }
+    
+    var $_encoder; // HTMLPurifier_Encoder that DOMLex/DirectLex call for entity and UTF-8 work
+    
    /**
     * Lexes an HTML string into tokens.
     * 
@ -101,168 +108,6 @@ class HTMLPurifier_Lexer
        return $lexer;
    }
    
-    /**
-     * Decimal to parsed string conversion table for special entities.
-     * @protected
-     */
-    var $_special_dec2str =
-            array(
-                    34 => '"',
-                    38 => '&',
-                    39 => "'",
-                    60 => '<',
-                    62 => '>'
-            );
-    
-    /**
-     * Stripped entity names to decimal conversion table for special entities.
-     * @protected
-     */
-    var $_special_ent2dec =
-            array(
-                    'quot' => 34,
-                    'amp'  => 38,
-                    'lt'   => 60,
-                    'gt'   => 62
-            );
-    
-    /**
-     * Most common entity to raw value conversion table for special entities.
-     * @protected
-     */
-    var $_special_entity2str =
-            array(
-                    '&quot;' => '"',
-                    '&amp;'  => '&',
-                    '&lt;'   => '<',
-                    '&gt;'   => '>',
-                    '&#39;'  => "'",
-                    '&#039;' => "'",
-                    '&#x27;' => "'"
-            );
-    
-    /**
-     * Callback regex string for parsing entities.
-     * @protected
-     */                             
-    var $_substituteEntitiesRegex =
-'/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z]+));?/';
-//     1. hex             2. dec      3. string
-    
-    /**
-     * Substitutes non-special entities with their parsed equivalents. Since
-     * running this whenever you have parsed character is t3h 5uck, we run
-     * it before everything else.
-     * 
-     * @protected
-     * @param $string String to have non-special entities parsed.
-     * @returns Parsed string.
-     */
-    function substituteNonSpecialEntities($string) {
-        // it will try to detect missing semicolons, but don't rely on it
-        return preg_replace_callback(
-            $this->_substituteEntitiesRegex,
-            array($this, 'nonSpecialEntityCallback'),
-            $string
-            );
-    }
-    
-    /**
-     * Callback function for substituteNonSpecialEntities() that does the work.
-     * 
-     * @warning Though this is public in order to let the callback happen,
-     *          calling it directly is not recommended.
-     * @note Based on Feyd's function at
-     *       <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
-     *       which is in public domain.
-     * @note While we're going to do code point parsing anyway, a good
-     *       optimization would be to refuse to translate code points that
-     *       are non-SGML characters.  However, this could lead to duplication.
-     * @param $matches  PCRE matches array, with 0 the entire match, and
-     *                  either index 1, 2 or 3 set with a hex value, dec value,
-     *                  or string (respectively).
-     * @returns Replacement string.
-     * @todo Implement string translations
-     */
-    
-    // +----------+----------+----------+----------+
-    // | 33222222 | 22221111 | 111111   |          |
-    // | 10987654 | 32109876 | 54321098 | 76543210 | bit
-    // +----------+----------+----------+----------+
-    // |          |          |          | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
-    // |          |          | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
-    // |          | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
-    // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
-    // +----------+----------+----------+----------+
-    // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
-    // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
-    // +----------+----------+----------+----------+ 
-    
-    function nonSpecialEntityCallback($matches) {
-        // replaces all but big five
-        $entity = $matches[0];
-        $is_num = (@$matches[0][1] === '#');
-        if ($is_num) {
-            $is_hex = (@$entity[2] === 'x');
-            $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
-            
-            // abort for special characters
-            if (isset($this->_special_dec2str[$code]))  return $entity;
-            
-            if($code > 1114111 or $code < 0 or
-              ($code >= 55296 and $code <= 57343) ) {
-                // bits are set outside the "valid" range as defined
-                // by UNICODE 4.1.0 
-                return '';
-            }
-            
-            $x = $y = $z = $w = 0; 
-            if ($code < 128) {
-                // regular ASCII character
-                $x = $code;
-            } else {
-                // set up bits for UTF-8
-                $x = ($code & 63) | 128;
-                if ($code < 2048) {
-                    $y = (($code & 2047) >> 6) | 192;
-                } else {
-                    $y = (($code & 4032) >> 6) | 128;
-                    if($code < 65536) {
-                        $z = (($code >> 12) & 15) | 224;
-                    } else {
-                        $z = (($code >> 12) & 63) | 128;
-                        $w = (($code >> 18) & 7)  | 240;
-                    }
-                } 
-            }
-            // set up the actual character
-            $ret = '';
-            if($w) $ret .= chr($w);
-            if($z) $ret .= chr($z);
-            if($y) $ret .= chr($y);
-            $ret .= chr($x); 
-            
-            return $ret;
-        } else {
-            if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
-            if (!$this->_entity_lookup) {
-                require_once 'HTMLPurifier/EntityLookup.php';
-                $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
-            }
-            if (isset($this->_entity_lookup->table[$matches[3]])) {
-                return $this->_entity_lookup->table[$matches[3]];
-            } else {
-                return $entity;
-            }
-        }
-    }
-    
-    /**
-     * Contains a copy of the EntityLookup table.
-     * @protected
-     */
-    var $_entity_lookup;
-    
    /**
     * Translates CDATA sections into regular sections (through escaping).
     * 
@ -305,170 +150,6 @@ class HTMLPurifier_Lexer
        }
    }
    
-    /**
-     * Cleans a UTF-8 string for well-formedness and SGML validity
-     * 
-     * It will parse according to UTF-8 and return a valid UTF8 string, with
-     * non-SGML codepoints excluded.
-     * 
-     * @warning This function can find a lot of use, so we may be moving
-     *          it to a dedicated class.
-     * 
-     * @note Just for reference, the non-SGML code points are 0 to 31 and
-     *       127 to 159, inclusive.  However, we allow code points 9, 10
-     *       and 13, which are the tab, line feed and carriage return
-     *       respectively. 128 and above the code points map to multibyte
-     *       UTF-8 representations.
-     * 
-     * @note The functionality provided by the original function could be
-     *       implemented with iconv using 'UTF-8//IGNORE', mbstring, or
-     *       even the PCRE modifier 'u', these do not allow us to strip
-     *       control characters or disallowed code points, and the latter
-     *       does not allow invalid UTF8 characters to be ignored.
-     * 
-     * @note Decomposing the string into Unicode code points is necessary
-     *       because SGML disallows the use of specific code points, not
-     *       necessarily bytes.  A naive implementation that simply strtr
-     *       disallowed code points as bytes will break other Unicode
-     *       characters in which using such bytes is valid.
-     * 
-     * @note Code adapted from utf8ToUnicode by Henri Sivonen and
-     *       hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
-     *       LGPL license.  Notes on what changed are inside.
-     */
-    function cleanUTF8($str) {
-        $mState = 0; // cached expected number of octets after the current octet
-                     // until the beginning of the next UTF8 character sequence
-        $mUcs4  = 0; // cached Unicode character
-        $mBytes = 1; // cached expected number of octets in the current sequence
-        
-        // original code involved an $out that was an array of Unicode
-        // codepoints.  Instead of having to convert back into UTF-8, we've
-        // decided to directly append valid UTF-8 characters onto a string
-        // $out once they're done.  $char accumulates raw bytes, while $mUcs4
-        // turns into the Unicode code point, so there's some redundancy.
-        
-        $out = '';
-        $char = '';
-        
-        $len = strlen($str);
-        for($i = 0; $i < $len; $i++) {
-            $in = ord($str{$i});
-            $char .= $str[$i]; // append byte to char
-            if (0 == $mState) {
-                // When mState is zero we expect either a US-ASCII character 
-                // or a multi-octet sequence.
-                if (0 == (0x80 & ($in))) {
-                    // US-ASCII, pass straight through.
-                    if (($in <= 31 || $in == 127) && 
-                        !($in == 9 || $in == 13 || $in == 10) // save \r\t\n
-                    ) {
-                        // control characters, remove
-                    } else {
-                        $out .= $char;
-                    }
-                    // reset
-                    $char = '';
-                    $mBytes = 1;
-                } elseif (0xC0 == (0xE0 & ($in))) {
-                    // First octet of 2 octet sequence
-                    $mUcs4 = ($in);
-                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
-                    $mState = 1;
-                    $mBytes = 2;
-                } elseif (0xE0 == (0xF0 & ($in))) {
-                    // First octet of 3 octet sequence
-                    $mUcs4 = ($in);
-                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
-                    $mState = 2;
-                    $mBytes = 3;
-                } elseif (0xF0 == (0xF8 & ($in))) {
-                    // First octet of 4 octet sequence
-                    $mUcs4 = ($in);
-                    $mUcs4 = ($mUcs4 & 0x07) << 18;
-                    $mState = 3;
-                    $mBytes = 4;
-                } elseif (0xF8 == (0xFC & ($in))) {
-                    // First octet of 5 octet sequence.
-                    // 
-                    // This is illegal because the encoded codepoint must be 
-                    // either:
-                    // (a) not the shortest form or
-                    // (b) outside the Unicode range of 0-0x10FFFF.
-                    // Rather than trying to resynchronize, we will carry on 
-                    // until the end of the sequence and let the later error
-                    // handling code catch it.
-                    $mUcs4 = ($in);
-                    $mUcs4 = ($mUcs4 & 0x03) << 24;
-                    $mState = 4;
-                    $mBytes = 5;
-                } elseif (0xFC == (0xFE & ($in))) {
-                    // First octet of 6 octet sequence, see comments for 5
-                    // octet sequence.
-                    $mUcs4 = ($in);
-                    $mUcs4 = ($mUcs4 & 1) << 30;
-                    $mState = 5;
-                    $mBytes = 6;
-                } else {
-                    // Current octet is neither in the US-ASCII range nor a 
-                    // legal first octet of a multi-octet sequence.
-                    $mState = 0;
-                    $mUcs4  = 0;
-                    $mBytes = 1;
-                    $char = '';
-                }
-            } else {
-                // When mState is non-zero, we expect a continuation of the
-                // multi-octet sequence
-                if (0x80 == (0xC0 & ($in))) {
-                    // Legal continuation.
-                    $shift = ($mState - 1) * 6;
-                    $tmp = $in;
-                    $tmp = ($tmp & 0x0000003F) << $shift;
-                    $mUcs4 |= $tmp;
-                    
-                    if (0 == --$mState) {
-                        // End of the multi-octet sequence. mUcs4 now contains
-                        // the final Unicode codepoint to be output
-                        
-                        // Check for illegal sequences and codepoints.
-                        
-                        // From Unicode 3.1, non-shortest form is illegal
-                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
-                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
-                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
-                            (4 < $mBytes) ||
-                            // From Unicode 3.2, surrogate characters = illegal
-                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
-                            // Codepoints outside the Unicode range are illegal
-                            ($mUcs4 > 0x10FFFF)
-                        ) {
-                            
-                        } elseif (0xFEFF != $mUcs4 && // omit BOM
-                            !($mUcs4 >= 128 && $mUcs4 <= 159) // omit non-SGML
-                        ) {
-                            $out .= $char;
-                        }
-                        // initialize UTF8 cache (reset)
-                        $mState = 0;
-                        $mUcs4  = 0;
-                        $mBytes = 1;
-                        $char = '';
-                    }
-                } else {
-                    // ((0xC0 & (*in) != 0x80) && (mState != 0))
-                    // Incomplete multi-octet sequence.
-                    // used to result in complete fail, but we'll reset
-                    $mState = 0;
-                    $mUcs4  = 0;
-                    $mBytes = 1;
-                    $char ='';
-                }
-            }
-        }
-        return $out;
-    }
-    
 }

 ?>
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@ -30,6 +30,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
    
    public function __construct() {
        // setup the factory
+        parent::HTMLPurifier_Lexer(); // PHP 4-style parent constructor call; presumably initialises $this->_encoder, which this lexer reads -- confirm in HTMLPurifier_Lexer
        $this->factory = new HTMLPurifier_TokenFactory();
    }
    
@ -50,10 +51,10 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
        // substitute non-special entities. While DOM is perfectly capable
        // of doing this, we need to get at the UTF-8 characters in
        // cleanUTF8
-        $string = $this->substituteNonSpecialEntities($string);
+        $string = $this->_encoder->substituteNonSpecialEntities($string);
        
        // clean it into well-formed UTF-8 string
-        $string = $this->cleanUTF8($string);
+        $string = $this->_encoder->cleanUTF8($string);
        
        // preprocess string, essential for UTF-8
        $string =
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@ -20,6 +20,21 @@ require_once 'HTMLPurifier/Lexer.php';
 class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
 {
    
+    /**
+     * Maps the most common special entities to the raw characters they stand for.
+     * @protected
+     */
+    var $_special_entity2str =
+            array(
+                    '&quot;' => '"',
+                    '&amp;'  => '&',
+                    '&lt;'   => '<',
+                    '&gt;'   => '>',
+                    '&#39;'  => "'", // apostrophe, decimal
+                    '&#039;' => "'", // apostrophe, zero-padded decimal
+                    '&#x27;' => "'"  // apostrophe, hexadecimal
+            );
+    
    /**
     * Parses special entities into the proper characters.
     * 
@ -51,7 +66,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
        if ($num_amp_2 <= $num_esc_amp) return $string;
        
        // hmm... now we have some uncommon entities. Use the callback.
-        $string = $this->substituteSpecialEntities($string);
+        $string = $this->_encoder->substituteSpecialEntities($string);
        return $string;
    }
    
@ -61,51 +76,6 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
     */
    var $_whitespace = "\x20\x09\x0D\x0A";
    
-    /**
-     * Substitutes only special entities with their parsed equivalents.
-     * 
-     * @notice We try to avoid calling this function because otherwise, it
-     * would have to be called a lot (for every parsed section).
-     * 
-     * @protected
-     * @param $string String to have non-special entities parsed.
-     * @returns Parsed string.
-     */
-    function substituteSpecialEntities($string) {
-        return preg_replace_callback(
-            $this->_substituteEntitiesRegex,
-            array('HTMLPurifier_Lexer_DirectLex', 'specialEntityCallback'),
-            $string);
-    }
-    
-    /**
-     * Callback function for substituteSpecialEntities() that does the work.
-     * 
-     * This callback has same syntax as nonSpecialEntityCallback().
-     * 
-     * @warning Though this is public in order to let the callback happen,
-     *          calling it directly is not recommended.
-     * @param $matches  PCRE-style matches array, with 0 the entire match, and
-     *                  either index 1, 2 or 3 set with a hex value, dec value,
-     *                  or string (respectively).
-     * @returns Replacement string.
-     */
-    function specialEntityCallback($matches) {
-        $entity = $matches[0];
-        $is_num = (@$matches[0][1] === '#');
-        if ($is_num) {
-            $is_hex = (@$entity[2] === 'x');
-            $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
-            return isset($this->_special_dec2str[$int]) ?
-                $this->_special_dec2str[$int] :
-                $entity;
-        } else {
-            return isset($this->_special_ent2dec[$matches[3]]) ?
-                $this->_special_ent2dec[$matches[3]] :
-                $entity;
-        }
-    }
-    
    function tokenizeHTML($string, $config = null) {
        
        if (!$config) $config = HTMLPurifier_Config::createDefault();
@ -126,10 +96,10 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
        $string = $this->escapeCDATA($string);
        
        // expand entities THAT AREN'T THE BIG FIVE
-        $string = $this->substituteNonSpecialEntities($string);
+        $string = $this->_encoder->substituteNonSpecialEntities($string);
        
        // clean it into wellformed UTF-8 string
-        $string = $this->cleanUTF8($string);
+        $string = $this->_encoder->cleanUTF8($string);
        
        // infinite loop protection
        // has to be pretty big, since html docs can be big
--- a/library/HTMLPurifier/Lexer/PEARSax3.php
+++ b/library/HTMLPurifier/Lexer/PEARSax3.php
@ -35,8 +35,8 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
        if ($config->get('Core', 'AcceptFullDocuments')) {
            $string = $this->extractBody($string);
        }
-        $string = $this->substituteNonSpecialEntities($string);
-        $string = $this->cleanUTF8($string);
+        $string = $this->_encoder->substituteNonSpecialEntities($string);
+        $string = $this->_encoder->cleanUTF8($string);
        $parser=& new XML_HTMLSax3();
        $parser->set_object($this);
        $parser->set_element_handler('openHandler','closeHandler');
--- a/smoketests/common.php
+++ b/smoketests/common.php
@ -6,7 +6,7 @@ set_include_path('../library' . PATH_SEPARATOR . get_include_path());
 require_once 'HTMLPurifier.php';

 function escapeHTML($string) {
-    $string = HTMLPurifier_Lexer::cleanUTF8($string);
+    $string = HTMLPurifier_Encoder::cleanUTF8($string); // NOTE(review): static call, while the unit tests call cleanUTF8() on an instance -- confirm it does not rely on $this
    $string = htmlspecialchars($string, ENT_COMPAT, 'UTF-8');
    return $string;
 }
--- a/tests/HTMLPurifier/EncoderTest.php
+++ b/tests/HTMLPurifier/EncoderTest.php
@ -0,0 +1,135 @
+<?php
+
+require_once 'HTMLPurifier/Encoder.php';
+
+/**
+ * Unit tests for HTMLPurifier_Encoder: UTF-8 cleaning and entity substitution.
+ */
+class HTMLPurifier_EncoderTest extends UnitTestCase
+{
+    
+    /**
+     * Encoder instance under test, created in setUp().
+     */
+    var $Encoder;
+    
+    /**
+     * Entity lookup whose table supplies the expected value for named
+     * entities. Declared explicitly so that setUp() does not create it
+     * as an undeclared (dynamic) property.
+     */
+    var $_entity_lookup;
+    
+    /**
+     * Creates the encoder and fetches the entity lookup instance.
+     */
+    function setUp() {
+        $this->Encoder = new HTMLPurifier_Encoder();
+        // NOTE(review): assumes HTMLPurifier_EntityLookup is loaded by
+        // HTMLPurifier/Encoder.php -- confirm, or require it in this file.
+        $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
+    }
+    
+    /**
+     * Asserts that cleanUTF8() turns $string into $expect.
+     * @param $string String to pass through cleanUTF8().
+     * @param $expect Expected result; when null, $string is expected to
+     *                come back unchanged.
+     */
+    function assertCleanUTF8($string, $expect = null) {
+        if ($expect === null) $expect = $string;
+        $this->assertIdentical($this->Encoder->cleanUTF8($string), $expect);
+    }
+    
+    /**
+     * Checks which byte sequences cleanUTF8() keeps and which it drops.
+     */
+    function test_cleanUTF8() {
+        $this->assertCleanUTF8('Normal string.');
+        $this->assertCleanUTF8("Test\tAllowed\nControl\rCharacters");
+        $this->assertCleanUTF8("null byte: \0", 'null byte: ');
+        $this->assertCleanUTF8("\1\2\3\4\5\6\7", '');
+        $this->assertCleanUTF8("\x7F", ''); // one byte invalid SGML char
+        $this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML
+        $this->assertCleanUTF8("\xF3\xBF\xBF\xBF"); // valid four byte
+        $this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
+    }
+    
+    /**
+     * Checks named and numeric entity expansion. Each numeric case is
+     * array(codepoint, expected UTF-8 bytes as hex), with false meaning
+     * the entity is expected to expand to an empty string.
+     */
+    function test_substituteNonSpecialEntities() {
+        $char_theta = $this->_entity_lookup->table['theta'];
+        $this->assertIdentical($char_theta,
+            $this->Encoder->substituteNonSpecialEntities('&theta;') );
+        $this->assertIdentical('"',
+            $this->Encoder->substituteNonSpecialEntities('"') );
+        
+        // numeric tests, adapted from Feyd
+        $args = array();
+        $args[] = array(1114112,false     );
+        $args[] = array(1114111,'F48FBFBF'); // 0x0010FFFF
+        $args[] = array(1048576,'F4808080'); // 0x00100000
+        $args[] = array(1048575,'F3BFBFBF'); // 0x000FFFFF
+        $args[] = array(262144, 'F1808080'); // 0x00040000
+        $args[] = array(262143, 'F0BFBFBF'); // 0x0003FFFF
+        $args[] = array(65536,  'F0908080'); // 0x00010000
+        $args[] = array(65535,  'EFBFBF'  ); // 0x0000FFFF
+        $args[] = array(57344,  'EE8080'  ); // 0x0000E000
+        $args[] = array(57343,  false     ); // 0x0000DFFF  these are ill-formed
+        $args[] = array(56040,  false     ); // 0x0000DAE8  these are ill-formed
+        $args[] = array(55296,  false     ); // 0x0000D800  these are ill-formed
+        $args[] = array(55295,  'ED9FBF'  ); // 0x0000D7FF
+        $args[] = array(53248,  'ED8080'  ); // 0x0000D000
+        $args[] = array(53247,  'ECBFBF'  ); // 0x0000CFFF
+        $args[] = array(4096,   'E18080'  ); // 0x00001000
+        $args[] = array(4095,   'E0BFBF'  ); // 0x00000FFF
+        $args[] = array(2048,   'E0A080'  ); // 0x00000800
+        $args[] = array(2047,   'DFBF'    ); // 0x000007FF
+        // entity substitution is expected to leave these as raw bytes;
+        // test_cleanUTF8() covers cleanUTF8() removing such bytes
+        $args[] = array(128,    'C280'    ); // 0x00000080  invalid SGML char
+        $args[] = array(127,    '7F'      ); // 0x0000007F  invalid SGML char
+        $args[] = array(0,      '00'      ); // 0x00000000  invalid SGML char
+
+        $args[] = array(20108,  'E4BA8C'  ); // 0x00004E8C
+        $args[] = array(77,     '4D'      ); // 0x0000004D
+        $args[] = array(66306,  'F0908C82'); // 0x00010302
+        $args[] = array(1072,   'D0B0'    ); // 0x00000430
+        
+        foreach ($args as $arg) {
+            // same codepoint twice: once as a decimal and once as a hex entity
+            $string = '&#' . $arg[0] . ';' . // decimal
+                      '&#x' . dechex($arg[0]) . ';'; // hex
+            $expect = '';
+            if ($arg[1] !== false) {
+                $chars = str_split($arg[1], 2);
+                foreach ($chars as $char) {
+                    $expect .= chr(hexdec($char));
+                }
+                $expect .= $expect; // double it
+            }
+            $this->assertIdentical(
+                $this->Encoder->substituteNonSpecialEntities($string),
+                $expect,
+                $arg[0] . ': %s'
+            );
+        }
+        
+    }
+    
+    /**
+     * Checks that specialEntityCallback() maps the decimal apostrophe
+     * entity (decimal value in match index 2) to a literal apostrophe.
+     */
+    function test_specialEntityCallback() {
+        
+        $this->assertIdentical("'",$this->Encoder->specialEntityCallback(
+            array('&#39;', null, '39', null) ));
+    }
+    
+}
+
+?>
--- a/tests/HTMLPurifier/Lexer/DirectLexTest.php
+++ b/tests/HTMLPurifier/Lexer/DirectLexTest.php
@ -11,13 +11,6 @@ class HTMLPurifier_Lexer_DirectLexTest extends UnitTestCase
        $this->DirectLex = new HTMLPurifier_Lexer_DirectLex();
    }
    
-    function test_specialEntityCallback() {
-        $HP =& $this->DirectLex;
-        
-        $this->assertIdentical("'",$HP->specialEntityCallback(
-            array('&#39;', null, '39', null) ));
-    }
-    
    function test_parseData() {
        $HP =& $this->DirectLex;
        
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@ -32,79 +32,6 @@ class HTMLPurifier_LexerTest extends UnitTestCase
        
    }
    
-    function assertCleanUTF8($string, $expect = null) {
-        if ($expect === null) $expect = $string;
-        $this->assertIdentical($this->Lexer->cleanUTF8($string), $expect);
-    }
-    
-    function test_cleanUTF8() {
-        $this->assertCleanUTF8('Normal string.');
-        $this->assertCleanUTF8("Test\tAllowed\nControl\rCharacters");
-        $this->assertCleanUTF8("null byte: \0", 'null byte: ');
-        $this->assertCleanUTF8("\1\2\3\4\5\6\7", '');
-        $this->assertCleanUTF8("\x7F", ''); // one byte invalid SGML char
-        $this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML
-        $this->assertCleanUTF8("\xF3\xBF\xBF\xBF"); // valid four byte
-        $this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
-    }
-    
-    function test_substituteNonSpecialEntities() {
-        $char_theta = $this->_entity_lookup->table['theta'];
-        $this->assertIdentical($char_theta,
-            $this->Lexer->substituteNonSpecialEntities('&theta;') );
-        $this->assertIdentical('"',
-            $this->Lexer->substituteNonSpecialEntities('"') );
-        
-        // numeric tests, adapted from Feyd
-        $args = array();
-        $args[] = array(1114112,false     );
-        $args[] = array(1114111,'F48FBFBF'); // 0x0010FFFF
-        $args[] = array(1048576,'F4808080'); // 0x00100000
-        $args[] = array(1048575,'F3BFBFBF'); // 0x000FFFFF
-        $args[] = array(262144, 'F1808080'); // 0x00040000
-        $args[] = array(262143, 'F0BFBFBF'); // 0x0003FFFF
-        $args[] = array(65536,  'F0908080'); // 0x00010000
-        $args[] = array(65535,  'EFBFBF'  ); // 0x0000FFFF
-        $args[] = array(57344,  'EE8080'  ); // 0x0000E000
-        $args[] = array(57343,  false     ); // 0x0000DFFF  these are ill-formed
-        $args[] = array(56040,  false     ); // 0x0000DAE8  these are ill-formed
-        $args[] = array(55296,  false     ); // 0x0000D800  these are ill-formed
-        $args[] = array(55295,  'ED9FBF'  ); // 0x0000D7FF
-        $args[] = array(53248,  'ED8080'  ); // 0x0000D000
-        $args[] = array(53247,  'ECBFBF'  ); // 0x0000CFFF
-        $args[] = array(4096,   'E18080'  ); // 0x00001000
-        $args[] = array(4095,   'E0BFBF'  ); // 0x00000FFF
-        $args[] = array(2048,   'E0A080'  ); // 0x00000800
-        $args[] = array(2047,   'DFBF'    ); // 0x000007FF
-        $args[] = array(128,    'C280'    ); // 0x00000080  invalid SGML char
-        $args[] = array(127,    '7F'      ); // 0x0000007F  invalid SGML char
-        $args[] = array(0,      '00'      ); // 0x00000000  invalid SGML char
-
-        $args[] = array(20108,  'E4BA8C'  ); // 0x00004E8C
-        $args[] = array(77,     '4D'      ); // 0x0000004D
-        $args[] = array(66306,  'F0908C82'); // 0x00010302
-        $args[] = array(1072,   'D0B0'    ); // 0x00000430 
-        
-        foreach ($args as $arg) {
-            $string = '&#' . $arg[0] . ';' . // decimal
-                      '&#x' . dechex($arg[0]) . ';'; // hex
-            $expect = '';
-            if ($arg[1] !== false) {
-                $chars = str_split($arg[1], 2);
-                foreach ($chars as $char) {
-                    $expect .= chr(hexdec($char));
-                }
-                $expect .= $expect; // double it
-            }
-            $this->assertIdentical(
-                $this->Lexer->substituteNonSpecialEntities($string),
-                $expect,
-                $arg[0] . ': %s'
-            );
-        }
-        
-    }
-    
    function assertExtractBody($text, $extract = true) {
        $result = $this->Lexer->extractBody($text);
        if ($extract === true) $extract = $text;
--- a/tests/index.php
+++ b/tests/index.php
@ -86,6 +86,7 @@ $test_files[] = 'AttrTransform/BdoDirTest.php';
 $test_files[] = 'AttrTransform/ImgRequiredTest.php';
 $test_files[] = 'URISchemeRegistryTest.php';
 $test_files[] = 'URISchemeTest.php';
+$test_files[] = 'EncoderTest.php';

 if (version_compare(PHP_VERSION, '5', '>=')) {
    $test_files[] = 'TokenFactoryTest.php';