diff --git a/NEWS b/NEWS
index 39955416..00139015 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
1.0.0rc1, released 2006-??-??
- Fixed broken numeric entity conversion
+- Malformed UTF-8 and non-SGML character detection and cleaning implemented
1.0.0beta, released 2006-08-16
- First public release, most functionality implemented. Notable omissions are:
diff --git a/library/HTMLPurifier/Generator.php b/library/HTMLPurifier/Generator.php
index d1ac6d40..aa36896a 100644
--- a/library/HTMLPurifier/Generator.php
+++ b/library/HTMLPurifier/Generator.php
@@ -2,35 +2,51 @@
// pretty-printing with indentation would be pretty cool
+require_once 'HTMLPurifier/Lexer.php';
+
+HTMLPurifier_ConfigDef::define(
+ 'Core', 'CleanUTF8DuringGeneration', false,
+ 'When true, HTMLPurifier_Generator will also check all strings it '.
+ 'escapes for UTF-8 well-formedness as a defense in depth measure. '.
+ 'This could cause a considerable performance impact, and is not '.
+ 'strictly necessary due to the fact that the Lexers should have '.
+ 'ensured that all the UTF-8 strings were well-formed. Note that '.
+ 'the configuration value is only read at the beginning of '.
+ 'generateFromTokens.'
+);
+
class HTMLPurifier_Generator
{
+ var $clean_utf8 = false;
+
// only unit tests may omit configuration: internals MUST pass config
function generateFromTokens($tokens, $config = null) {
$html = '';
if (!$config) $config = HTMLPurifier_Config::createDefault();
+ $this->clean_utf8 = $config->get('Core', 'CleanUTF8DuringGeneration');
if (!$tokens) return '';
foreach ($tokens as $token) {
- $html .= $this->generateFromToken($token, $config);
+ $html .= $this->generateFromToken($token);
}
return $html;
}
- function generateFromToken($token, $config) {
+ function generateFromToken($token) {
if (!isset($token->type)) return '';
if ($token->type == 'start') {
- $attr = $this->generateAttributes($token->attributes, $config);
+ $attr = $this->generateAttributes($token->attributes);
return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';
} elseif ($token->type == 'end') {
return '</' . $token->name . '>';
} elseif ($token->type == 'empty') {
- $attr = $this->generateAttributes($token->attributes, $config);
+ $attr = $this->generateAttributes($token->attributes);
return '<' . $token->name . ($attr ? ' ' : '') . $attr . ' />';
} elseif ($token->type == 'text') {
- return htmlspecialchars($token->data, ENT_COMPAT, 'UTF-8');
+ return $this->escape($token->data);
} else {
return '';
@@ -38,14 +54,19 @@ class HTMLPurifier_Generator
}
}
- function generateAttributes($assoc_array_of_attributes, $config) {
+ function generateAttributes($assoc_array_of_attributes) {
$html = '';
foreach ($assoc_array_of_attributes as $key => $value) {
- $html .= $key.'="'.htmlspecialchars($value, ENT_COMPAT, 'UTF-8').'" ';
+ $html .= $key.'="'.$this->escape($value).'" ';
}
return rtrim($html);
}
+ function escape($string) {
+ if ($this->clean_utf8) $string = HTMLPurifier_Lexer::cleanUTF8($string);
+ return htmlspecialchars($string, ENT_COMPAT, 'UTF-8');
+ }
+
}
?>
\ No newline at end of file
diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php
index 2f7225a3..35235658 100644
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@@ -307,16 +307,19 @@ class HTMLPurifier_Lexer
}
/**
- * Currently converts UTF8 into an array of Unicode codepoints. (changing)
+ * Cleans a UTF-8 string for well-formedness and SGML validity
*
- * We're going to convert this into a multi-purpose UTF-8 well-formedness
- * checker as well as handler for the control characters that are illegal
- * in SGML documents. But *after* we draw up some unit-tests. This means
- * that the function, in the end, will not return an array of codepoints
- * but a valid UTF8 string, with non-SGML codepoints excluded.
+ * It will parse according to UTF-8 and return a valid UTF8 string, with
+ * non-SGML codepoints excluded.
+ *
+ * @warning This function can find a lot of use, so we may be moving
+ * it to a dedicated class.
*
* @note Just for reference, the non-SGML code points are 0 to 31 and
- * 127 to 159, inclusive.
+ * 127 to 159, inclusive. However, we allow code points 9, 10
+ * and 13, which are the tab, line feed and carriage return
+ * respectively. Code points 128 and above map to multibyte
+ * UTF-8 representations.
*
* @note The functionality provided by the original function could be
* implemented with iconv using 'UTF-8//IGNORE', mbstring, or
@@ -332,7 +335,7 @@ class HTMLPurifier_Lexer
*
* @note Code adapted from utf8ToUnicode by Henri Sivonen and
* hsivonen@iki.fi at
ASCII | Raw | Output | Render | ||
---|---|---|---|---|---|
- | - | + | + |