diff --git a/NEWS b/NEWS
index 39955416..00139015 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,7 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
 
 1.0.0rc1, released 2006-??-??
 - Fixed broken numeric entity conversion
+- Malformed UTF-8 and non-SGML character detection and cleaning implemented
 
 1.0.0beta, released 2006-08-16
 - First public release, most functionality implemented. Notable omissions are:
diff --git a/library/HTMLPurifier/Generator.php b/library/HTMLPurifier/Generator.php
index d1ac6d40..aa36896a 100644
--- a/library/HTMLPurifier/Generator.php
+++ b/library/HTMLPurifier/Generator.php
@@ -2,35 +2,51 @@
 
 // pretty-printing with indentation would be pretty cool
 
+require_once 'HTMLPurifier/Lexer.php';
+
+HTMLPurifier_ConfigDef::define(
+    'Core', 'CleanUTF8DuringGeneration', false,
+    'When true, HTMLPurifier_Generator will also check all strings it '.
+    'escapes for UTF-8 well-formedness as a defense in depth measure. '.
+    'This could cause a considerable performance impact, and is not '.
+    'strictly necessary due to the fact that the Lexers should have '.
+    'ensured that all the UTF-8 strings were well-formed.  Note that '.
+    'the configuration value is only read at the beginning of '.
+    'generateFromTokens.'
+);
+
 class HTMLPurifier_Generator
 {
     
+    var $clean_utf8 = false;
+    
     // only unit tests may omit configuration: internals MUST pass config
     function generateFromTokens($tokens, $config = null) {
         $html = '';
         if (!$config) $config = HTMLPurifier_Config::createDefault();
+        $this->clean_utf8 = $config->get('Core', 'CleanUTF8DuringGeneration');
         if (!$tokens) return '';
         foreach ($tokens as $token) {
-            $html .= $this->generateFromToken($token, $config);
+            $html .= $this->generateFromToken($token);
         }
         return $html;
     }
     
-    function generateFromToken($token, $config) {
+    function generateFromToken($token) {
         if (!isset($token->type)) return '';
         if ($token->type == 'start') {
-            $attr = $this->generateAttributes($token->attributes, $config);
+            $attr = $this->generateAttributes($token->attributes);
             return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';
             
         } elseif ($token->type == 'end') {
             return '</' . $token->name . '>';
             
         } elseif ($token->type == 'empty') {
-            $attr = $this->generateAttributes($token->attributes, $config);
+            $attr = $this->generateAttributes($token->attributes);
              return '<' . $token->name . ($attr ? ' ' : '') . $attr . ' />';
             
         } elseif ($token->type == 'text') {
-            return htmlspecialchars($token->data, ENT_COMPAT, 'UTF-8');
+            return $this->escape($token->data);
             
         } else {
             return '';
@@ -38,14 +54,19 @@ class HTMLPurifier_Generator
         }
     }
     
-    function generateAttributes($assoc_array_of_attributes, $config) {
+    function generateAttributes($assoc_array_of_attributes) {
         $html = '';
         foreach ($assoc_array_of_attributes as $key => $value) {
-            $html .= $key.'="'.htmlspecialchars($value, ENT_COMPAT, 'UTF-8').'" ';
+            $html .= $key.'="'.$this->escape($value).'" ';
         }
         return rtrim($html);
     }
     
+    function escape($string) {
+        if ($this->clean_utf8) $string = HTMLPurifier_Lexer::cleanUTF8($string);
+        return htmlspecialchars($string, ENT_COMPAT, 'UTF-8');
+    }
+    
 }
 
 ?>
\ No newline at end of file
diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php
index 2f7225a3..35235658 100644
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@@ -307,16 +307,19 @@ class HTMLPurifier_Lexer
     }
     
     /**
-     * Currently converts UTF8 into an array of Unicode codepoints. (changing)
+     * Cleans a UTF-8 string for well-formedness and SGML validity
      * 
-     * We're going to convert this into a multi-purpose UTF-8 well-formedness
-     * checker as well as handler for the control characters that are illegal
-     * in SGML documents.  But *after* we draw up some unit-tests.  This means
-     * that the function, in the end, will not return an array of codepoints
-     * but a valid UTF8 string, with non-SGML codepoints excluded.
+     * It will parse according to UTF-8 and return a valid UTF8 string, with
+     * non-SGML codepoints excluded.
+     * 
+     * @warning This function can find a lot of use, so we may be moving
+     *          it to a dedicated class.
      * 
      * @note Just for reference, the non-SGML code points are 0 to 31 and
-     *       127 to 159, inclusive.
+     *       127 to 159, inclusive.  However, we allow code points 9, 10
+     *       and 13, which are the tab, line feed and carriage return
+     *       respectively. 128 and above the code points map to multibyte
+     *       UTF-8 representations.
      * 
      * @note The functionality provided by the original function could be
      *       implemented with iconv using 'UTF-8//IGNORE', mbstring, or
@@ -332,7 +335,7 @@ class HTMLPurifier_Lexer
      * 
      * @note Code adapted from utf8ToUnicode by Henri Sivonen and
      *       hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
-     *       LGPL license.
+     *       LGPL license.  Notes on what changed are inside.
      */
     function cleanUTF8($str) {
         $mState = 0; // cached expected number of octets after the current octet
@@ -340,17 +343,33 @@ class HTMLPurifier_Lexer
         $mUcs4  = 0; // cached Unicode character
         $mBytes = 1; // cached expected number of octets in the current sequence
         
-        $out = array();
+        // original code involved an $out that was an array of Unicode
+        // codepoints.  Instead of having to convert back into UTF-8, we've
+        // decided to directly append valid UTF-8 characters onto a string
+        // $out once they're done.  $char accumulates raw bytes, while $mUcs4
+        // turns into the Unicode code point, so there's some redundancy.
+        
+        $out = '';
+        $char = '';
         
         $len = strlen($str);
         for($i = 0; $i < $len; $i++) {
             $in = ord($str{$i});
+            $char .= $str[$i]; // append byte to char
             if (0 == $mState) {
                 // When mState is zero we expect either a US-ASCII character 
                 // or a multi-octet sequence.
                 if (0 == (0x80 & ($in))) {
                     // US-ASCII, pass straight through.
-                    $out[] = $in;
+                    if (($in <= 31 || $in == 127) && 
+                        !($in == 9 || $in == 13 || $in == 10) // save \r\t\n
+                    ) {
+                        // control characters, remove
+                    } else {
+                        $out .= $char;
+                    }
+                    // reset
+                    $char = '';
                     $mBytes = 1;
                 } elseif (0xC0 == (0xE0 & ($in))) {
                     // First octet of 2 octet sequence
@@ -394,7 +413,10 @@ class HTMLPurifier_Lexer
                 } else {
                     // Current octet is neither in the US-ASCII range nor a 
                     // legal first octet of a multi-octet sequence.
-                    return false;
+                    $mState = 0;
+                    $mUcs4  = 0;
+                    $mBytes = 1;
+                    $char = '';
                 }
             } else {
                 // When mState is non-zero, we expect a continuation of the
@@ -422,23 +444,26 @@ class HTMLPurifier_Lexer
                             // Codepoints outside the Unicode range are illegal
                             ($mUcs4 > 0x10FFFF)
                         ) {
-                            return false;
+                            
+                        } elseif (0xFEFF != $mUcs4 && // omit BOM
+                            !($mUcs4 >= 128 && $mUcs4 <= 159) // omit non-SGML
+                        ) {
+                            $out .= $char;
                         }
-                        if (0xFEFF != $mUcs4) {
-                            // BOM is legal but we don't want to output it
-                            $out[] = $mUcs4;
-                        }
-                        //initialize UTF8 cache
+                        // initialize UTF8 cache (reset)
                         $mState = 0;
                         $mUcs4  = 0;
                         $mBytes = 1;
+                        $char = '';
                     }
                 } else {
                     // ((0xC0 & (*in) != 0x80) && (mState != 0))
-                    //
                     // Incomplete multi-octet sequence.
-                    //
-                    return false;
+                    // used to result in complete fail, but we'll reset
+                    $mState = 0;
+                    $mUcs4  = 0;
+                    $mBytes = 1;
+                    $char ='';
                 }
             }
         }
diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php
index 4e85575c..9def5257 100644
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@@ -47,6 +47,14 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
         // mode won't get 'em.
         $string = $this->escapeCDATA($string);
         
+        // substitute non-special entities. While DOM is perfectly capable
+        // of doing this, we need to get at the UTF-8 characters in
+        // cleanUTF8
+        $string = $this->substituteNonSpecialEntities($string);
+        
+        // clean it into well-formed UTF-8 string
+        $string = $this->cleanUTF8($string);
+        
         if (!$is_full) {
         // preprocess string, essential for UTF-8
           $string =
diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php
index 535b3866..aa1250df 100644
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@@ -128,6 +128,9 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
         // expand entities THAT AREN'T THE BIG FIVE
         $string = $this->substituteNonSpecialEntities($string);
         
+        // clean it into wellformed UTF-8 string
+        $string = $this->cleanUTF8($string);
+        
         // infinite loop protection
         // has to be pretty big, since html docs can be big
         // we're allow two hundred thousand tags... more than enough?
diff --git a/library/HTMLPurifier/Lexer/PEARSax3.php b/library/HTMLPurifier/Lexer/PEARSax3.php
index 02b3a484..a0bdfe66 100644
--- a/library/HTMLPurifier/Lexer/PEARSax3.php
+++ b/library/HTMLPurifier/Lexer/PEARSax3.php
@@ -29,20 +29,21 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
      */
     var $tokens = array();
     
-    function tokenizeHTML($html, $config = null) {
+    function tokenizeHTML($string, $config = null) {
         if (!$config) $config = HTMLPurifier_Config::createDefault();
-        $html = $this->escapeCDATA($html);
+        $string = $this->escapeCDATA($string);
         if ($config->get('Core', 'AcceptFullDocuments')) {
-            $html = $this->extractBody($html);
+            $string = $this->extractBody($string);
         }
-        $html = $this->substituteNonSpecialEntities($html);
+        $string = $this->substituteNonSpecialEntities($string);
+        $string = $this->cleanUTF8($string);
         $parser=& new XML_HTMLSax3();
         $parser->set_object($this);
         $parser->set_element_handler('openHandler','closeHandler');
         $parser->set_data_handler('dataHandler');
         $parser->set_escape_handler('escapeHandler');
         $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
-        $parser->parse($html);
+        $parser->parse($string);
         $tokens = $this->tokens;
         $this->tokens = array();
         return $tokens;
diff --git a/smoketests/common.php b/smoketests/common.php
new file mode 100644
index 00000000..a6a8c146
--- /dev/null
+++ b/smoketests/common.php
@@ -0,0 +1,14 @@
+<?php
+
+header('Content-type: text/html; charset=UTF-8');
+
+set_include_path('../library' . PATH_SEPARATOR . get_include_path());
+require_once 'HTMLPurifier.php';
+
+function escapeHTML($string) {
+    $string = HTMLPurifier_Lexer::cleanUTF8($string);
+    $string = htmlspecialchars($string, ENT_COMPAT, 'UTF-8');
+    return $string;
+}
+
+?>
\ No newline at end of file
diff --git a/smoketests/utf8.php b/smoketests/utf8.php
index 09b3d7af..6e62f03c 100644
--- a/smoketests/utf8.php
+++ b/smoketests/utf8.php
@@ -1,7 +1,6 @@
 <?php
 
-// there must not be a byte order mark
-header('Content-type: text/html; charset=UTF-8');
+require_once 'common.php';
 
 ?><!DOCTYPE html 
      PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
@@ -15,9 +14,6 @@ header('Content-type: text/html; charset=UTF-8');
 <h1>HTMLPurifier UTF-8 Smoketest</h1>
 <?php
 
-set_include_path('../library' . PATH_SEPARATOR . get_include_path());
-require_once 'HTMLPurifier.php';
-
 $purifier = new HTMLPurifier();
 $string = '
 <ul>
diff --git a/smoketests/variableWidthAttack.php b/smoketests/variableWidthAttack.php
index c9bbcb1e..73ce6f0c 100644
--- a/smoketests/variableWidthAttack.php
+++ b/smoketests/variableWidthAttack.php
@@ -1,6 +1,6 @@
 <?php
 
-header('Content-type: text/html; charset=UTF-8');
+require_once 'common.php';
 
 ?><!DOCTYPE html 
      PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
@@ -19,16 +19,8 @@ in Internet Explorer, if it works at all.</p>
 <h2>Test</h2>
 <?php
 
-set_include_path('../library' . PATH_SEPARATOR . get_include_path());
-require_once 'HTMLPurifier.php';
 $purifier = new HTMLPurifier();
 
-function escape($string) {
-    $string = htmlspecialchars($string, ENT_COMPAT, 'UTF-8');
-    $string = iconv('UTF-8', 'UTF-8//IGNORE', $string);
-    return $string;
-}
-
 ?>
 <table>
 <thead><tr><th>ASCII</th><th width="30%">Raw</th><th>Output</th><th>Render</th></tr></thead>
@@ -44,8 +36,8 @@ for ($i = 0; $i < 256; $i++) {
 ?>
 <tr>
     <td><?php echo $i; ?></td>
-    <td style="font-size:8pt;"><?php echo escape($html); ?></td>
-    <td style="font-size:8pt;"><?php echo escape($pure_html); ?></td>
+    <td style="font-size:8pt;"><?php echo escapeHTML($html); ?></td>
+    <td style="font-size:8pt;"><?php echo escapeHTML($pure_html); ?></td>
     <td><?php echo $pure_html; ?></td>
 </tr>
 <?php } ?>
@@ -54,9 +46,8 @@ for ($i = 0; $i < 256; $i++) {
 
 <h2>Analysis</h2>
 
-<p>This test currently passes the XSS aspect but fails the validation aspect
-due to generalized encoding issues.  An augmented UTF-8 smoketest is 
-pending, until then, consider this a pass.</p>
+<p>By making sure that UTF-8 is well formed and non-SGML codepoints are
+removed, as well as escaping quotes outside of tags, this is a non-threat.</p>
 
 </body>
 </html>
\ No newline at end of file
diff --git a/smoketests/xssAttacks.php b/smoketests/xssAttacks.php
index bd6800a6..48d020a1 100644
--- a/smoketests/xssAttacks.php
+++ b/smoketests/xssAttacks.php
@@ -1,6 +1,6 @@
 <?php
 
-header('Content-type: text/html; charset=UTF-8');
+require_once('common.php');
 
 ?><!DOCTYPE html 
      PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
@@ -23,9 +23,6 @@ relevant.</p>
 
 if (version_compare(PHP_VERSION, '5', '<')) exit('<p>Requires PHP 5.</p>');
 
-set_include_path('../library' . PATH_SEPARATOR . get_include_path());
-require_once 'HTMLPurifier.php';
-
 $xml = simplexml_load_file('xssAttacks.xml');
 $purifier = new HTMLPurifier();
 
@@ -43,10 +40,10 @@ foreach ($xml->attack as $attack) {
     if ($attack->name == 'US-ASCII encoding') $code = urldecode($code);
 ?>
     <tr>
-        <td><?php echo htmlspecialchars($attack->name); ?></td>
-        <td><textarea readonly="readonly" cols="20" rows="2"><?php echo htmlspecialchars($code); ?></textarea></td>
+        <td><?php echo escapeHTML($attack->name); ?></td>
+        <td><textarea readonly="readonly" cols="20" rows="2"><?php echo escapeHTML($code); ?></textarea></td>
         <?php $pure_html = $purifier->purify($code); ?>
-        <td><textarea readonly="readonly" cols="20" rows="2"><?php echo htmlspecialchars($pure_html); ?></textarea></td>
+        <td><textarea readonly="readonly" cols="20" rows="2"><?php echo escapeHTML($pure_html); ?></textarea></td>
         <td><?php echo $pure_html ?></td>
     </tr>
 <?php
diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php
index 8f65f88a..09eec396 100644
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@@ -32,6 +32,22 @@ class HTMLPurifier_LexerTest extends UnitTestCase
         
     }
     
+    function assertCleanUTF8($string, $expect = null) {
+        if ($expect === null) $expect = $string;
+        $this->assertIdentical($this->Lexer->cleanUTF8($string), $expect);
+    }
+    
+    function test_cleanUTF8() {
+        $this->assertCleanUTF8('Normal string.');
+        $this->assertCleanUTF8("Test\tAllowed\nControl\rCharacters");
+        $this->assertCleanUTF8("null byte: \0", 'null byte: ');
+        $this->assertCleanUTF8("\1\2\3\4\5\6\7", '');
+        $this->assertCleanUTF8("\x7F", ''); // one byte invalid SGML char
+        $this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML
+        $this->assertCleanUTF8("\xF3\xBF\xBF\xBF"); // valid four byte
+        $this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
+    }
+    
     function test_substituteNonSpecialEntities() {
         $char_theta = $this->_entity_lookup->table['theta'];
         $this->assertIdentical($char_theta,