diff --git a/NEWS b/NEWS index 39955416..00139015 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier 1.0.0rc1, released 2006-??-?? - Fixed broken numeric entity conversion +- Malformed UTF-8 and non-SGML character detection and cleaning implemented 1.0.0beta, released 2006-08-16 - First public release, most functionality implemented. Notable omissions are: diff --git a/library/HTMLPurifier/Generator.php b/library/HTMLPurifier/Generator.php index d1ac6d40..aa36896a 100644 --- a/library/HTMLPurifier/Generator.php +++ b/library/HTMLPurifier/Generator.php @@ -2,35 +2,51 @@ // pretty-printing with indentation would be pretty cool +require_once 'HTMLPurifier/Lexer.php'; + +HTMLPurifier_ConfigDef::define( + 'Core', 'CleanUTF8DuringGeneration', false, + 'When true, HTMLPurifier_Generator will also check all strings it '. + 'escapes for UTF-8 well-formedness as a defense in depth measure. '. + 'This could cause a considerable performance impact, and is not '. + 'strictly necessary due to the fact that the Lexers should have '. + 'ensured that all the UTF-8 strings were well-formed. Note that '. + 'the configuration value is only read at the beginning of '. + 'generateFromTokens.' +); + class HTMLPurifier_Generator { + var $clean_utf8 = false; + // only unit tests may omit configuration: internals MUST pass config function generateFromTokens($tokens, $config = null) { $html = ''; if (!$config) $config = HTMLPurifier_Config::createDefault(); + $this->clean_utf8 = $config->get('Core', 'CleanUTF8DuringGeneration'); if (!$tokens) return ''; foreach ($tokens as $token) { - $html .= $this->generateFromToken($token, $config); + $html .= $this->generateFromToken($token); } return $html; } - function generateFromToken($token, $config) { + function generateFromToken($token) { if (!isset($token->type)) return ''; if ($token->type == 'start') { - $attr = $this->generateAttributes($token->attributes, $config); + $attr = $this->generateAttributes($token->attributes); return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>'; } elseif ($token->type == 'end') { return 'name . '>'; } elseif ($token->type == 'empty') { - $attr = $this->generateAttributes($token->attributes, $config); + $attr = $this->generateAttributes($token->attributes); return '<' . $token->name . ($attr ? ' ' : '') . $attr . ' />'; } elseif ($token->type == 'text') { - return htmlspecialchars($token->data, ENT_COMPAT, 'UTF-8'); + return $this->escape($token->data); } else { return ''; @@ -38,14 +54,19 @@ class HTMLPurifier_Generator } } - function generateAttributes($assoc_array_of_attributes, $config) { + function generateAttributes($assoc_array_of_attributes) { $html = ''; foreach ($assoc_array_of_attributes as $key => $value) { - $html .= $key.'="'.htmlspecialchars($value, ENT_COMPAT, 'UTF-8').'" '; + $html .= $key.'="'.$this->escape($value).'" '; } return rtrim($html); } + function escape($string) { + if ($this->clean_utf8) $string = HTMLPurifier_Lexer::cleanUTF8($string); + return htmlspecialchars($string, ENT_COMPAT, 'UTF-8'); + } + } ?> \ No newline at end of file diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php index 2f7225a3..35235658 100644 --- a/library/HTMLPurifier/Lexer.php +++ b/library/HTMLPurifier/Lexer.php @@ -307,16 +307,19 @@ class HTMLPurifier_Lexer } /** - * Currently converts UTF8 into an array of Unicode codepoints. (changing) + * Cleans a UTF-8 string for well-formedness and SGML validity * - * We're going to convert this into a multi-purpose UTF-8 well-formedness - * checker as well as handler for the control characters that are illegal - * in SGML documents. But *after* we draw up some unit-tests. This means - * that the function, in the end, will not return an array of codepoints - * but a valid UTF8 string, with non-SGML codepoints excluded. + * It will parse according to UTF-8 and return a valid UTF8 string, with + * non-SGML codepoints excluded. + * + * @warning This function can find a lot of use, so we may be moving + * it to a dedicated class. * * @note Just for reference, the non-SGML code points are 0 to 31 and - * 127 to 159, inclusive. + * 127 to 159, inclusive. However, we allow code points 9, 10 + * and 13, which are the tab, line feed and carriage return + * respectively. 128 and above the code points map to multibyte + * UTF-8 representations. * * @note The functionality provided by the original function could be * implemented with iconv using 'UTF-8//IGNORE', mbstring, or @@ -332,7 +335,7 @@ class HTMLPurifier_Lexer * * @note Code adapted from utf8ToUnicode by Henri Sivonen and * hsivonen@iki.fi at under the - * LGPL license. + * LGPL license. Notes on what changed are inside. */ function cleanUTF8($str) { $mState = 0; // cached expected number of octets after the current octet @@ -340,17 +343,33 @@ class HTMLPurifier_Lexer $mUcs4 = 0; // cached Unicode character $mBytes = 1; // cached expected number of octets in the current sequence - $out = array(); + // original code involved an $out that was an array of Unicode + // codepoints. Instead of having to convert back into UTF-8, we've + // decided to directly append valid UTF-8 characters onto a string + // $out once they're done. $char accumulates raw bytes, while $mUcs4 + // turns into the Unicode code point, so there's some redundancy. + + $out = ''; + $char = ''; $len = strlen($str); for($i = 0; $i < $len; $i++) { $in = ord($str{$i}); + $char .= $str[$i]; // append byte to char if (0 == $mState) { // When mState is zero we expect either a US-ASCII character // or a multi-octet sequence. if (0 == (0x80 & ($in))) { // US-ASCII, pass straight through. - $out[] = $in; + if (($in <= 31 || $in == 127) && + !($in == 9 || $in == 13 || $in == 10) // save \r\t\n + ) { + // control characters, remove + } else { + $out .= $char; + } + // reset + $char = ''; $mBytes = 1; } elseif (0xC0 == (0xE0 & ($in))) { // First octet of 2 octet sequence @@ -394,7 +413,10 @@ class HTMLPurifier_Lexer } else { // Current octet is neither in the US-ASCII range nor a // legal first octet of a multi-octet sequence. - return false; + $mState = 0; + $mUcs4 = 0; + $mBytes = 1; + $char = ''; } } else { // When mState is non-zero, we expect a continuation of the @@ -422,23 +444,26 @@ class HTMLPurifier_Lexer // Codepoints outside the Unicode range are illegal ($mUcs4 > 0x10FFFF) ) { - return false; + + } elseif (0xFEFF != $mUcs4 && // omit BOM + !($mUcs4 >= 128 && $mUcs4 <= 159) // omit non-SGML + ) { + $out .= $char; } - if (0xFEFF != $mUcs4) { - // BOM is legal but we don't want to output it - $out[] = $mUcs4; - } - //initialize UTF8 cache + // initialize UTF8 cache (reset) $mState = 0; $mUcs4 = 0; $mBytes = 1; + $char = ''; } } else { // ((0xC0 & (*in) != 0x80) && (mState != 0)) - // // Incomplete multi-octet sequence. - // - return false; + // used to result in complete fail, but we'll reset + $mState = 0; + $mUcs4 = 0; + $mBytes = 1; + $char =''; } } } diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php index 4e85575c..9def5257 100644 --- a/library/HTMLPurifier/Lexer/DOMLex.php +++ b/library/HTMLPurifier/Lexer/DOMLex.php @@ -47,6 +47,14 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer // mode won't get 'em. $string = $this->escapeCDATA($string); + // substitute non-special entities. While DOM is perfectly capable + // of doing this, we need to get at the UTF-8 characters in + // cleanUTF8 + $string = $this->substituteNonSpecialEntities($string); + + // clean it into well-formed UTF-8 string + $string = $this->cleanUTF8($string); + if (!$is_full) { // preprocess string, essential for UTF-8 $string = diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php index 535b3866..aa1250df 100644 --- a/library/HTMLPurifier/Lexer/DirectLex.php +++ b/library/HTMLPurifier/Lexer/DirectLex.php @@ -128,6 +128,9 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer // expand entities THAT AREN'T THE BIG FIVE $string = $this->substituteNonSpecialEntities($string); + // clean it into wellformed UTF-8 string + $string = $this->cleanUTF8($string); + // infinite loop protection // has to be pretty big, since html docs can be big // we're allow two hundred thousand tags... more than enough? diff --git a/library/HTMLPurifier/Lexer/PEARSax3.php b/library/HTMLPurifier/Lexer/PEARSax3.php index 02b3a484..a0bdfe66 100644 --- a/library/HTMLPurifier/Lexer/PEARSax3.php +++ b/library/HTMLPurifier/Lexer/PEARSax3.php @@ -29,20 +29,21 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer */ var $tokens = array(); - function tokenizeHTML($html, $config = null) { + function tokenizeHTML($string, $config = null) { if (!$config) $config = HTMLPurifier_Config::createDefault(); - $html = $this->escapeCDATA($html); + $string = $this->escapeCDATA($string); if ($config->get('Core', 'AcceptFullDocuments')) { - $html = $this->extractBody($html); + $string = $this->extractBody($string); } - $html = $this->substituteNonSpecialEntities($html); + $string = $this->substituteNonSpecialEntities($string); + $string = $this->cleanUTF8($string); $parser=& new XML_HTMLSax3(); $parser->set_object($this); $parser->set_element_handler('openHandler','closeHandler'); $parser->set_data_handler('dataHandler'); $parser->set_escape_handler('escapeHandler'); $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1); - $parser->parse($html); + $parser->parse($string); $tokens = $this->tokens; $this->tokens = array(); return $tokens; diff --git a/smoketests/common.php b/smoketests/common.php new file mode 100644 index 00000000..a6a8c146 --- /dev/null +++ b/smoketests/common.php @@ -0,0 +1,14 @@ + \ No newline at end of file diff --git a/smoketests/utf8.php b/smoketests/utf8.php index 09b3d7af..6e62f03c 100644 --- a/smoketests/utf8.php +++ b/smoketests/utf8.php @@ -1,7 +1,6 @@ HTMLPurifier UTF-8 Smoketest diff --git a/smoketests/variableWidthAttack.php b/smoketests/variableWidthAttack.php index c9bbcb1e..73ce6f0c 100644 --- a/smoketests/variableWidthAttack.php +++ b/smoketests/variableWidthAttack.php @@ -1,6 +1,6 @@

Test

@@ -44,8 +36,8 @@ for ($i = 0; $i < 256; $i++) { ?> - - + + @@ -54,9 +46,8 @@ for ($i = 0; $i < 256; $i++) {

Analysis

-

This test currently passes the XSS aspect but fails the validation aspect -due to generalized encoding issues. An augmented UTF-8 smoketest is -pending, until then, consider this a pass.

+

By making sure that UTF-8 is well formed and non-SGML codepoints are +removed, as well as escaping quotes outside of tags, this is a non-threat.

\ No newline at end of file diff --git a/smoketests/xssAttacks.php b/smoketests/xssAttacks.php index bd6800a6..48d020a1 100644 --- a/smoketests/xssAttacks.php +++ b/smoketests/xssAttacks.php @@ -1,6 +1,6 @@ if (version_compare(PHP_VERSION, '5', '<')) exit('

Requires PHP 5.

'); -set_include_path('../library' . PATH_SEPARATOR . get_include_path()); -require_once 'HTMLPurifier.php'; - $xml = simplexml_load_file('xssAttacks.xml'); $purifier = new HTMLPurifier(); @@ -43,10 +40,10 @@ foreach ($xml->attack as $attack) { if ($attack->name == 'US-ASCII encoding') $code = urldecode($code); ?> - - + + purify($code); ?> - + assertIdentical($this->Lexer->cleanUTF8($string), $expect); + } + + function test_cleanUTF8() { + $this->assertCleanUTF8('Normal string.'); + $this->assertCleanUTF8("Test\tAllowed\nControl\rCharacters"); + $this->assertCleanUTF8("null byte: \0", 'null byte: '); + $this->assertCleanUTF8("\1\2\3\4\5\6\7", ''); + $this->assertCleanUTF8("\x7F", ''); // one byte invalid SGML char + $this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML + $this->assertCleanUTF8("\xF3\xBF\xBF\xBF"); // valid four byte + $this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8 + } + function test_substituteNonSpecialEntities() { $char_theta = $this->_entity_lookup->table['theta']; $this->assertIdentical($char_theta,
ASCIIRawOutputRender
name); ?>name); ?>