mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-01-20 12:31:53 +00:00
Malformed UTF-8 and non-SGML character detection and cleaning implemented
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@303 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
parent
53808ee34a
commit
973cc43b64
1
NEWS
1
NEWS
@ -3,6 +3,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
|||||||
|
|
||||||
1.0.0rc1, released 2006-??-??
|
1.0.0rc1, released 2006-??-??
|
||||||
- Fixed broken numeric entity conversion
|
- Fixed broken numeric entity conversion
|
||||||
|
- Malformed UTF-8 and non-SGML character detection and cleaning implemented
|
||||||
|
|
||||||
1.0.0beta, released 2006-08-16
|
1.0.0beta, released 2006-08-16
|
||||||
- First public release, most functionality implemented. Notable omissions are:
|
- First public release, most functionality implemented. Notable omissions are:
|
||||||
|
@ -2,35 +2,51 @@
|
|||||||
|
|
||||||
// pretty-printing with indentation would be pretty cool
|
// pretty-printing with indentation would be pretty cool
|
||||||
|
|
||||||
|
require_once 'HTMLPurifier/Lexer.php';
|
||||||
|
|
||||||
|
HTMLPurifier_ConfigDef::define(
|
||||||
|
'Core', 'CleanUTF8DuringGeneration', false,
|
||||||
|
'When true, HTMLPurifier_Generator will also check all strings it '.
|
||||||
|
'escapes for UTF-8 well-formedness as a defense in depth measure. '.
|
||||||
|
'This could cause a considerable performance impact, and is not '.
|
||||||
|
'strictly necessary due to the fact that the Lexers should have '.
|
||||||
|
'ensured that all the UTF-8 strings were well-formed. Note that '.
|
||||||
|
'the configuration value is only read at the beginning of '.
|
||||||
|
'generateFromTokens.'
|
||||||
|
);
|
||||||
|
|
||||||
class HTMLPurifier_Generator
|
class HTMLPurifier_Generator
|
||||||
{
|
{
|
||||||
|
|
||||||
|
var $clean_utf8 = false;
|
||||||
|
|
||||||
// only unit tests may omit configuration: internals MUST pass config
|
// only unit tests may omit configuration: internals MUST pass config
|
||||||
function generateFromTokens($tokens, $config = null) {
|
function generateFromTokens($tokens, $config = null) {
|
||||||
$html = '';
|
$html = '';
|
||||||
if (!$config) $config = HTMLPurifier_Config::createDefault();
|
if (!$config) $config = HTMLPurifier_Config::createDefault();
|
||||||
|
$this->clean_utf8 = $config->get('Core', 'CleanUTF8DuringGeneration');
|
||||||
if (!$tokens) return '';
|
if (!$tokens) return '';
|
||||||
foreach ($tokens as $token) {
|
foreach ($tokens as $token) {
|
||||||
$html .= $this->generateFromToken($token, $config);
|
$html .= $this->generateFromToken($token);
|
||||||
}
|
}
|
||||||
return $html;
|
return $html;
|
||||||
}
|
}
|
||||||
|
|
||||||
function generateFromToken($token, $config) {
|
function generateFromToken($token) {
|
||||||
if (!isset($token->type)) return '';
|
if (!isset($token->type)) return '';
|
||||||
if ($token->type == 'start') {
|
if ($token->type == 'start') {
|
||||||
$attr = $this->generateAttributes($token->attributes, $config);
|
$attr = $this->generateAttributes($token->attributes);
|
||||||
return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';
|
return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';
|
||||||
|
|
||||||
} elseif ($token->type == 'end') {
|
} elseif ($token->type == 'end') {
|
||||||
return '</' . $token->name . '>';
|
return '</' . $token->name . '>';
|
||||||
|
|
||||||
} elseif ($token->type == 'empty') {
|
} elseif ($token->type == 'empty') {
|
||||||
$attr = $this->generateAttributes($token->attributes, $config);
|
$attr = $this->generateAttributes($token->attributes);
|
||||||
return '<' . $token->name . ($attr ? ' ' : '') . $attr . ' />';
|
return '<' . $token->name . ($attr ? ' ' : '') . $attr . ' />';
|
||||||
|
|
||||||
} elseif ($token->type == 'text') {
|
} elseif ($token->type == 'text') {
|
||||||
return htmlspecialchars($token->data, ENT_COMPAT, 'UTF-8');
|
return $this->escape($token->data);
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
return '';
|
return '';
|
||||||
@ -38,14 +54,19 @@ class HTMLPurifier_Generator
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function generateAttributes($assoc_array_of_attributes, $config) {
|
function generateAttributes($assoc_array_of_attributes) {
|
||||||
$html = '';
|
$html = '';
|
||||||
foreach ($assoc_array_of_attributes as $key => $value) {
|
foreach ($assoc_array_of_attributes as $key => $value) {
|
||||||
$html .= $key.'="'.htmlspecialchars($value, ENT_COMPAT, 'UTF-8').'" ';
|
$html .= $key.'="'.$this->escape($value).'" ';
|
||||||
}
|
}
|
||||||
return rtrim($html);
|
return rtrim($html);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function escape($string) {
|
||||||
|
if ($this->clean_utf8) $string = HTMLPurifier_Lexer::cleanUTF8($string);
|
||||||
|
return htmlspecialchars($string, ENT_COMPAT, 'UTF-8');
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
?>
|
@ -307,16 +307,19 @@ class HTMLPurifier_Lexer
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Currently converts UTF8 into an array of Unicode codepoints. (changing)
|
* Cleans a UTF-8 string for well-formedness and SGML validity
|
||||||
*
|
*
|
||||||
* We're going to convert this into a multi-purpose UTF-8 well-formedness
|
* It will parse according to UTF-8 and return a valid UTF8 string, with
|
||||||
* checker as well as handler for the control characters that are illegal
|
* non-SGML codepoints excluded.
|
||||||
* in SGML documents. But *after* we draw up some unit-tests. This means
|
*
|
||||||
* that the function, in the end, will not return an array of codepoints
|
* @warning This function can find a lot of use, so we may be moving
|
||||||
* but a valid UTF8 string, with non-SGML codepoints excluded.
|
* it to a dedicated class.
|
||||||
*
|
*
|
||||||
* @note Just for reference, the non-SGML code points are 0 to 31 and
|
* @note Just for reference, the non-SGML code points are 0 to 31 and
|
||||||
* 127 to 159, inclusive.
|
* 127 to 159, inclusive. However, we allow code points 9, 10
|
||||||
|
* and 13, which are the tab, line feed and carriage return
|
||||||
|
* respectively. Code points 128 and above map to multibyte
|
||||||
|
* UTF-8 representations.
|
||||||
*
|
*
|
||||||
* @note The functionality provided by the original function could be
|
* @note The functionality provided by the original function could be
|
||||||
* implemented with iconv using 'UTF-8//IGNORE', mbstring, or
|
* implemented with iconv using 'UTF-8//IGNORE', mbstring, or
|
||||||
@ -332,7 +335,7 @@ class HTMLPurifier_Lexer
|
|||||||
*
|
*
|
||||||
* @note Code adapted from utf8ToUnicode by Henri Sivonen and
|
* @note Code adapted from utf8ToUnicode by Henri Sivonen and
|
||||||
* hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
|
* hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
|
||||||
* LGPL license.
|
* LGPL license. Notes on what changed are inside.
|
||||||
*/
|
*/
|
||||||
function cleanUTF8($str) {
|
function cleanUTF8($str) {
|
||||||
$mState = 0; // cached expected number of octets after the current octet
|
$mState = 0; // cached expected number of octets after the current octet
|
||||||
@ -340,17 +343,33 @@ class HTMLPurifier_Lexer
|
|||||||
$mUcs4 = 0; // cached Unicode character
|
$mUcs4 = 0; // cached Unicode character
|
||||||
$mBytes = 1; // cached expected number of octets in the current sequence
|
$mBytes = 1; // cached expected number of octets in the current sequence
|
||||||
|
|
||||||
$out = array();
|
// original code involved an $out that was an array of Unicode
|
||||||
|
// codepoints. Instead of having to convert back into UTF-8, we've
|
||||||
|
// decided to directly append valid UTF-8 characters onto a string
|
||||||
|
// $out once they're done. $char accumulates raw bytes, while $mUcs4
|
||||||
|
// turns into the Unicode code point, so there's some redundancy.
|
||||||
|
|
||||||
|
$out = '';
|
||||||
|
$char = '';
|
||||||
|
|
||||||
$len = strlen($str);
|
$len = strlen($str);
|
||||||
for($i = 0; $i < $len; $i++) {
|
for($i = 0; $i < $len; $i++) {
|
||||||
$in = ord($str{$i});
|
$in = ord($str{$i});
|
||||||
|
$char .= $str[$i]; // append byte to char
|
||||||
if (0 == $mState) {
|
if (0 == $mState) {
|
||||||
// When mState is zero we expect either a US-ASCII character
|
// When mState is zero we expect either a US-ASCII character
|
||||||
// or a multi-octet sequence.
|
// or a multi-octet sequence.
|
||||||
if (0 == (0x80 & ($in))) {
|
if (0 == (0x80 & ($in))) {
|
||||||
// US-ASCII, pass straight through.
|
// US-ASCII, pass straight through.
|
||||||
$out[] = $in;
|
if (($in <= 31 || $in == 127) &&
|
||||||
|
!($in == 9 || $in == 13 || $in == 10) // save \r\t\n
|
||||||
|
) {
|
||||||
|
// control characters, remove
|
||||||
|
} else {
|
||||||
|
$out .= $char;
|
||||||
|
}
|
||||||
|
// reset
|
||||||
|
$char = '';
|
||||||
$mBytes = 1;
|
$mBytes = 1;
|
||||||
} elseif (0xC0 == (0xE0 & ($in))) {
|
} elseif (0xC0 == (0xE0 & ($in))) {
|
||||||
// First octet of 2 octet sequence
|
// First octet of 2 octet sequence
|
||||||
@ -394,7 +413,10 @@ class HTMLPurifier_Lexer
|
|||||||
} else {
|
} else {
|
||||||
// Current octet is neither in the US-ASCII range nor a
|
// Current octet is neither in the US-ASCII range nor a
|
||||||
// legal first octet of a multi-octet sequence.
|
// legal first octet of a multi-octet sequence.
|
||||||
return false;
|
$mState = 0;
|
||||||
|
$mUcs4 = 0;
|
||||||
|
$mBytes = 1;
|
||||||
|
$char = '';
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// When mState is non-zero, we expect a continuation of the
|
// When mState is non-zero, we expect a continuation of the
|
||||||
@ -422,23 +444,26 @@ class HTMLPurifier_Lexer
|
|||||||
// Codepoints outside the Unicode range are illegal
|
// Codepoints outside the Unicode range are illegal
|
||||||
($mUcs4 > 0x10FFFF)
|
($mUcs4 > 0x10FFFF)
|
||||||
) {
|
) {
|
||||||
return false;
|
|
||||||
|
} elseif (0xFEFF != $mUcs4 && // omit BOM
|
||||||
|
!($mUcs4 >= 128 && $mUcs4 <= 159) // omit non-SGML
|
||||||
|
) {
|
||||||
|
$out .= $char;
|
||||||
}
|
}
|
||||||
if (0xFEFF != $mUcs4) {
|
// initialize UTF8 cache (reset)
|
||||||
// BOM is legal but we don't want to output it
|
|
||||||
$out[] = $mUcs4;
|
|
||||||
}
|
|
||||||
//initialize UTF8 cache
|
|
||||||
$mState = 0;
|
$mState = 0;
|
||||||
$mUcs4 = 0;
|
$mUcs4 = 0;
|
||||||
$mBytes = 1;
|
$mBytes = 1;
|
||||||
|
$char = '';
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// ((0xC0 & (*in) != 0x80) && (mState != 0))
|
// ((0xC0 & (*in) != 0x80) && (mState != 0))
|
||||||
//
|
|
||||||
// Incomplete multi-octet sequence.
|
// Incomplete multi-octet sequence.
|
||||||
//
|
// used to abort with a complete failure; now we reset and continue
|
||||||
return false;
|
$mState = 0;
|
||||||
|
$mUcs4 = 0;
|
||||||
|
$mBytes = 1;
|
||||||
|
$char ='';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -47,6 +47,14 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
|||||||
// mode won't get 'em.
|
// mode won't get 'em.
|
||||||
$string = $this->escapeCDATA($string);
|
$string = $this->escapeCDATA($string);
|
||||||
|
|
||||||
|
// substitute non-special entities. While DOM is perfectly capable
|
||||||
|
// of doing this, we need to get at the UTF-8 characters in
|
||||||
|
// cleanUTF8
|
||||||
|
$string = $this->substituteNonSpecialEntities($string);
|
||||||
|
|
||||||
|
// clean it into well-formed UTF-8 string
|
||||||
|
$string = $this->cleanUTF8($string);
|
||||||
|
|
||||||
if (!$is_full) {
|
if (!$is_full) {
|
||||||
// preprocess string, essential for UTF-8
|
// preprocess string, essential for UTF-8
|
||||||
$string =
|
$string =
|
||||||
|
@ -128,6 +128,9 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
|||||||
// expand entities THAT AREN'T THE BIG FIVE
|
// expand entities THAT AREN'T THE BIG FIVE
|
||||||
$string = $this->substituteNonSpecialEntities($string);
|
$string = $this->substituteNonSpecialEntities($string);
|
||||||
|
|
||||||
|
// clean it into well-formed UTF-8 string
|
||||||
|
$string = $this->cleanUTF8($string);
|
||||||
|
|
||||||
// infinite loop protection
|
// infinite loop protection
|
||||||
// has to be pretty big, since html docs can be big
|
// has to be pretty big, since html docs can be big
|
||||||
// we allow two hundred thousand tags... more than enough?
|
// we allow two hundred thousand tags... more than enough?
|
||||||
|
@ -29,20 +29,21 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
|
|||||||
*/
|
*/
|
||||||
var $tokens = array();
|
var $tokens = array();
|
||||||
|
|
||||||
function tokenizeHTML($html, $config = null) {
|
function tokenizeHTML($string, $config = null) {
|
||||||
if (!$config) $config = HTMLPurifier_Config::createDefault();
|
if (!$config) $config = HTMLPurifier_Config::createDefault();
|
||||||
$html = $this->escapeCDATA($html);
|
$string = $this->escapeCDATA($string);
|
||||||
if ($config->get('Core', 'AcceptFullDocuments')) {
|
if ($config->get('Core', 'AcceptFullDocuments')) {
|
||||||
$html = $this->extractBody($html);
|
$string = $this->extractBody($string);
|
||||||
}
|
}
|
||||||
$html = $this->substituteNonSpecialEntities($html);
|
$string = $this->substituteNonSpecialEntities($string);
|
||||||
|
$string = $this->cleanUTF8($string);
|
||||||
$parser=& new XML_HTMLSax3();
|
$parser=& new XML_HTMLSax3();
|
||||||
$parser->set_object($this);
|
$parser->set_object($this);
|
||||||
$parser->set_element_handler('openHandler','closeHandler');
|
$parser->set_element_handler('openHandler','closeHandler');
|
||||||
$parser->set_data_handler('dataHandler');
|
$parser->set_data_handler('dataHandler');
|
||||||
$parser->set_escape_handler('escapeHandler');
|
$parser->set_escape_handler('escapeHandler');
|
||||||
$parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
|
$parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
|
||||||
$parser->parse($html);
|
$parser->parse($string);
|
||||||
$tokens = $this->tokens;
|
$tokens = $this->tokens;
|
||||||
$this->tokens = array();
|
$this->tokens = array();
|
||||||
return $tokens;
|
return $tokens;
|
||||||
|
14
smoketests/common.php
Normal file
14
smoketests/common.php
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
header('Content-type: text/html; charset=UTF-8');
|
||||||
|
|
||||||
|
set_include_path('../library' . PATH_SEPARATOR . get_include_path());
|
||||||
|
require_once 'HTMLPurifier.php';
|
||||||
|
|
||||||
|
function escapeHTML($string) {
|
||||||
|
$string = HTMLPurifier_Lexer::cleanUTF8($string);
|
||||||
|
$string = htmlspecialchars($string, ENT_COMPAT, 'UTF-8');
|
||||||
|
return $string;
|
||||||
|
}
|
||||||
|
|
||||||
|
?>
|
@ -1,7 +1,6 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
// there must not be a byte order mark
|
require_once 'common.php';
|
||||||
header('Content-type: text/html; charset=UTF-8');
|
|
||||||
|
|
||||||
?><!DOCTYPE html
|
?><!DOCTYPE html
|
||||||
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||||
@ -15,9 +14,6 @@ header('Content-type: text/html; charset=UTF-8');
|
|||||||
<h1>HTMLPurifier UTF-8 Smoketest</h1>
|
<h1>HTMLPurifier UTF-8 Smoketest</h1>
|
||||||
<?php
|
<?php
|
||||||
|
|
||||||
set_include_path('../library' . PATH_SEPARATOR . get_include_path());
|
|
||||||
require_once 'HTMLPurifier.php';
|
|
||||||
|
|
||||||
$purifier = new HTMLPurifier();
|
$purifier = new HTMLPurifier();
|
||||||
$string = '
|
$string = '
|
||||||
<ul>
|
<ul>
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
header('Content-type: text/html; charset=UTF-8');
|
require_once 'common.php';
|
||||||
|
|
||||||
?><!DOCTYPE html
|
?><!DOCTYPE html
|
||||||
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||||
@ -19,16 +19,8 @@ in Internet Explorer, if it works at all.</p>
|
|||||||
<h2>Test</h2>
|
<h2>Test</h2>
|
||||||
<?php
|
<?php
|
||||||
|
|
||||||
set_include_path('../library' . PATH_SEPARATOR . get_include_path());
|
|
||||||
require_once 'HTMLPurifier.php';
|
|
||||||
$purifier = new HTMLPurifier();
|
$purifier = new HTMLPurifier();
|
||||||
|
|
||||||
function escape($string) {
|
|
||||||
$string = htmlspecialchars($string, ENT_COMPAT, 'UTF-8');
|
|
||||||
$string = iconv('UTF-8', 'UTF-8//IGNORE', $string);
|
|
||||||
return $string;
|
|
||||||
}
|
|
||||||
|
|
||||||
?>
|
?>
|
||||||
<table>
|
<table>
|
||||||
<thead><tr><th>ASCII</th><th width="30%">Raw</th><th>Output</th><th>Render</th></tr></thead>
|
<thead><tr><th>ASCII</th><th width="30%">Raw</th><th>Output</th><th>Render</th></tr></thead>
|
||||||
@ -44,8 +36,8 @@ for ($i = 0; $i < 256; $i++) {
|
|||||||
?>
|
?>
|
||||||
<tr>
|
<tr>
|
||||||
<td><?php echo $i; ?></td>
|
<td><?php echo $i; ?></td>
|
||||||
<td style="font-size:8pt;"><?php echo escape($html); ?></td>
|
<td style="font-size:8pt;"><?php echo escapeHTML($html); ?></td>
|
||||||
<td style="font-size:8pt;"><?php echo escape($pure_html); ?></td>
|
<td style="font-size:8pt;"><?php echo escapeHTML($pure_html); ?></td>
|
||||||
<td><?php echo $pure_html; ?></td>
|
<td><?php echo $pure_html; ?></td>
|
||||||
</tr>
|
</tr>
|
||||||
<?php } ?>
|
<?php } ?>
|
||||||
@ -54,9 +46,8 @@ for ($i = 0; $i < 256; $i++) {
|
|||||||
|
|
||||||
<h2>Analysis</h2>
|
<h2>Analysis</h2>
|
||||||
|
|
||||||
<p>This test currently passes the XSS aspect but fails the validation aspect
|
<p>By making sure that UTF-8 is well formed and non-SGML codepoints are
|
||||||
due to generalized encoding issues. An augmented UTF-8 smoketest is
|
removed, as well as escaping quotes outside of tags, this is a non-threat.</p>
|
||||||
pending, until then, consider this a pass.</p>
|
|
||||||
|
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
@ -1,6 +1,6 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
header('Content-type: text/html; charset=UTF-8');
|
require_once('common.php');
|
||||||
|
|
||||||
?><!DOCTYPE html
|
?><!DOCTYPE html
|
||||||
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||||
@ -23,9 +23,6 @@ relevant.</p>
|
|||||||
|
|
||||||
if (version_compare(PHP_VERSION, '5', '<')) exit('<p>Requires PHP 5.</p>');
|
if (version_compare(PHP_VERSION, '5', '<')) exit('<p>Requires PHP 5.</p>');
|
||||||
|
|
||||||
set_include_path('../library' . PATH_SEPARATOR . get_include_path());
|
|
||||||
require_once 'HTMLPurifier.php';
|
|
||||||
|
|
||||||
$xml = simplexml_load_file('xssAttacks.xml');
|
$xml = simplexml_load_file('xssAttacks.xml');
|
||||||
$purifier = new HTMLPurifier();
|
$purifier = new HTMLPurifier();
|
||||||
|
|
||||||
@ -43,10 +40,10 @@ foreach ($xml->attack as $attack) {
|
|||||||
if ($attack->name == 'US-ASCII encoding') $code = urldecode($code);
|
if ($attack->name == 'US-ASCII encoding') $code = urldecode($code);
|
||||||
?>
|
?>
|
||||||
<tr>
|
<tr>
|
||||||
<td><?php echo htmlspecialchars($attack->name); ?></td>
|
<td><?php echo escapeHTML($attack->name); ?></td>
|
||||||
<td><textarea readonly="readonly" cols="20" rows="2"><?php echo htmlspecialchars($code); ?></textarea></td>
|
<td><textarea readonly="readonly" cols="20" rows="2"><?php echo escapeHTML($code); ?></textarea></td>
|
||||||
<?php $pure_html = $purifier->purify($code); ?>
|
<?php $pure_html = $purifier->purify($code); ?>
|
||||||
<td><textarea readonly="readonly" cols="20" rows="2"><?php echo htmlspecialchars($pure_html); ?></textarea></td>
|
<td><textarea readonly="readonly" cols="20" rows="2"><?php echo escapeHTML($pure_html); ?></textarea></td>
|
||||||
<td><?php echo $pure_html ?></td>
|
<td><?php echo $pure_html ?></td>
|
||||||
</tr>
|
</tr>
|
||||||
<?php
|
<?php
|
||||||
|
@ -32,6 +32,22 @@ class HTMLPurifier_LexerTest extends UnitTestCase
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function assertCleanUTF8($string, $expect = null) {
|
||||||
|
if ($expect === null) $expect = $string;
|
||||||
|
$this->assertIdentical($this->Lexer->cleanUTF8($string), $expect);
|
||||||
|
}
|
||||||
|
|
||||||
|
function test_cleanUTF8() {
|
||||||
|
$this->assertCleanUTF8('Normal string.');
|
||||||
|
$this->assertCleanUTF8("Test\tAllowed\nControl\rCharacters");
|
||||||
|
$this->assertCleanUTF8("null byte: \0", 'null byte: ');
|
||||||
|
$this->assertCleanUTF8("\1\2\3\4\5\6\7", '');
|
||||||
|
$this->assertCleanUTF8("\x7F", ''); // one byte invalid SGML char
|
||||||
|
$this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML
|
||||||
|
$this->assertCleanUTF8("\xF3\xBF\xBF\xBF"); // valid four byte
|
||||||
|
$this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
|
||||||
|
}
|
||||||
|
|
||||||
function test_substituteNonSpecialEntities() {
|
function test_substituteNonSpecialEntities() {
|
||||||
$char_theta = $this->_entity_lookup->table['theta'];
|
$char_theta = $this->_entity_lookup->table['theta'];
|
||||||
$this->assertIdentical($char_theta,
|
$this->assertIdentical($char_theta,
|
||||||
|
Loading…
Reference in New Issue
Block a user