0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-12-22 16:31:53 +00:00
- Added %Core.EscapeNonASCIICharacters to workaround %Core.Encoding misbehavior
- Add "All Tests" to test runner title and reorder subfile names
- Specific file is now called with ?f=
- Link to UTF-8 docs, even though they're not done
- 1000th unit test passed! W00t! (that's a third as many as SimpleTest has for itself.)

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@658 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2007-01-19 03:54:55 +00:00
parent ad1169c711
commit 0dd866cc15
5 changed files with 128 additions and 8 deletions

2
NEWS
View File

@ -16,6 +16,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
to allow these, and background-position IS NOT implemented yet. to allow these, and background-position IS NOT implemented yet.
! Configuration documentation looks nicer ! Configuration documentation looks nicer
! Added smoketest 'all.php', which loads all other smoketests via frames ! Added smoketest 'all.php', which loads all other smoketests via frames
! Added %Core.EscapeNonASCIICharacters to workaround loss of Unicode
characters while %Core.Encoding is set to a non-UTF-8 encoding.
. Implemented AttrDef_CSSURI for url(http://google.com) style declarations . Implemented AttrDef_CSSURI for url(http://google.com) style declarations
1.3.3, unknown release date, likely to be dropped 1.3.3, unknown release date, likely to be dropped

View File

@ -31,6 +31,9 @@ information for casual developers using HTML Purifier.</p>
<dt><a href="enduser-slow.html">Speeding up HTML Purifier</a></dt> <dt><a href="enduser-slow.html">Speeding up HTML Purifier</a></dt>
<dd>Explains how to speed up HTML Purifier through caching or inbound filtering.</dd> <dd>Explains how to speed up HTML Purifier through caching or inbound filtering.</dd>
<dt><a href="enduser-utf8.html">UTF-8</a></dt>
<dd>Describes the rationale for using UTF-8, the ramifications otherwise, and how to make the switch.</dd>
</dl> </dl>
<h2>Development</h2> <h2>Development</h2>

View File

@ -6,15 +6,29 @@ HTMLPurifier_ConfigSchema::define(
'Core', 'Encoding', 'utf-8', 'istring', 'Core', 'Encoding', 'utf-8', 'istring',
'If for some reason you are unable to convert all webpages to UTF-8, '. 'If for some reason you are unable to convert all webpages to UTF-8, '.
'you can use this directive as a stop-gap compatibility change to '. 'you can use this directive as a stop-gap compatibility change to '.
'let HTMLPurifier deal with non UTF-8 input. This technique has '. 'let HTML Purifier deal with non UTF-8 input. This technique has '.
'notable deficiencies: absolutely no characters outside of the selected '. 'notable deficiencies: absolutely no characters outside of the selected '.
'character encoding will be preserved, not even the ones that have '. 'character encoding will be preserved, not even the ones that have '.
'been ampersand escaped (this is due to a UTF-8 specific <em>feature</em> '. 'been ampersand escaped (this is due to a UTF-8 specific <em>feature</em> '.
'that automatically resolves all entities), making it pretty useless '. 'that automatically resolves all entities), making it pretty useless '.
'for anything except the most I18N-blind applications. This directive '. 'for anything except the most I18N-blind applications, although '.
'%Core.EscapeNonASCIICharacters offers fixes this trouble with '.
'another tradeoff. This directive '.
'only accepts ISO-8859-1 if iconv is not enabled.' 'only accepts ISO-8859-1 if iconv is not enabled.'
); );
HTMLPurifier_ConfigSchema::define(
'Core', 'EscapeNonASCIICharacters', false, 'bool',
'This directive overcomes a deficiency in %Core.Encoding by blindly '.
'converting all non-ASCII characters into decimal numeric entities before '.
'converting it to its native encoding. This means that even '.
'characters that can be expressed in the non-UTF-8 encoding will '.
'be entity-ized, which can be a real downer for encodings like Big5. '.
'It also assumes that the ASCII repetoire is available, although '.
'this is the case for almost all encodings. Anyway, use UTF-8! This '.
'directive has been available since 1.4.0.'
);
if ( !function_exists('iconv') ) { if ( !function_exists('iconv') ) {
// only encodings with native PHP support // only encodings with native PHP support
HTMLPurifier_ConfigSchema::defineAllowedValues( HTMLPurifier_ConfigSchema::defineAllowedValues(
@ -310,6 +324,7 @@ class HTMLPurifier_Encoder
} elseif ($encoding === 'iso-8859-1') { } elseif ($encoding === 'iso-8859-1') {
return @utf8_encode($str); return @utf8_encode($str);
} }
trigger_error('Encoding not supported', E_USER_ERROR);
} }
/** /**
@ -323,11 +338,63 @@ class HTMLPurifier_Encoder
if ($iconv === null) $iconv = function_exists('iconv'); if ($iconv === null) $iconv = function_exists('iconv');
$encoding = $config->get('Core', 'Encoding'); $encoding = $config->get('Core', 'Encoding');
if ($encoding === 'utf-8') return $str; if ($encoding === 'utf-8') return $str;
if ($config->get('Core', 'EscapeNonASCIICharacters')) {
$str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
}
if ($iconv && !$config->get('Test', 'ForceNoIconv')) { if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
return @iconv('utf-8', $encoding . '//IGNORE', $str); return @iconv('utf-8', $encoding . '//IGNORE', $str);
} elseif ($encoding === 'iso-8859-1') { } elseif ($encoding === 'iso-8859-1') {
return @utf8_decode($str); return @utf8_decode($str);
} }
trigger_error('Encoding not supported', E_USER_ERROR);
}
/**
* Lossless (character-wise) conversion of HTML to ASCII
* @static
* @param $str UTF-8 string to be converted to ASCII
* @returns ASCII encoded string with non-ASCII character entity-ized
* @warning Adapted from MediaWiki, claiming fair use: this is a common
* algorithm. If you disagree with this license fudgery,
* implement it yourself.
* @note Uses decimal numeric entities since they are best supported.
* @note This is a DUMB function: it has no concept of keeping
* character entities that the projected character encoding
* can allow. We could possibly implement a smart version
* but that would require it to also know which Unicode
* codepoints the charset supported (not an easy task).
* @note Sort of with cleanUTF8() but it assumes that $str is
* well-formed UTF-8
*/
function convertToASCIIDumbLossless($str) {
$bytesleft = 0;
$result = '';
$working = 0;
$len = strlen($str);
for( $i = 0; $i < $len; $i++ ) {
$bytevalue = ord( $str[$i] );
if( $bytevalue <= 0x7F ) { //0xxx xxxx
$result .= chr( $bytevalue );
$bytesleft = 0;
} elseif( $bytevalue <= 0xBF ) { //10xx xxxx
$working = $working << 6;
$working += ($bytevalue & 0x3F);
$bytesleft--;
if( $bytesleft <= 0 ) {
$result .= "&#" . $working . ";";
}
} elseif( $bytevalue <= 0xDF ) { //110x xxxx
$working = $bytevalue & 0x1F;
$bytesleft = 1;
} elseif( $bytevalue <= 0xEF ) { //1110 xxxx
$working = $bytevalue & 0x0F;
$bytesleft = 2;
} else { //1111 0xxx
$working = $bytevalue & 0x07;
$bytesleft = 3;
}
}
return $result;
} }

View File

@ -5,7 +5,7 @@ require_once 'HTMLPurifier/Encoder.php';
class HTMLPurifier_EncoderTest extends UnitTestCase class HTMLPurifier_EncoderTest extends UnitTestCase
{ {
var $Encoder; var $_entity_lookup;
function setUp() { function setUp() {
$this->_entity_lookup = HTMLPurifier_EntityLookup::instance(); $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
@ -60,6 +60,9 @@ class HTMLPurifier_EncoderTest extends UnitTestCase
$config = HTMLPurifier_Config::createDefault(); $config = HTMLPurifier_Config::createDefault();
$context = new HTMLPurifier_Context(); $context = new HTMLPurifier_Context();
// zhong-wen
$chinese = "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";
// UTF-8 means that we don't touch it // UTF-8 means that we don't touch it
$this->assertIdentical( $this->assertIdentical(
HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context), HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context),
@ -74,13 +77,55 @@ class HTMLPurifier_EncoderTest extends UnitTestCase
"\xF6" "\xF6"
); );
$config->set('Test', 'ForceNoIconv', true); if (function_exists('iconv')) {
// iconv has it's own way
$this->assertIdentical(
HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),
" (Chinese)"
);
}
// Plain PHP implementation has slightly different behavior
$config->set('Test', 'ForceNoIconv', true);
$this->assertIdentical( $this->assertIdentical(
HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context), HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context),
"\xF6" "\xF6"
); );
$this->assertIdentical(
HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),
"?? (Chinese)"
);
// Preserve the characters!
$config->set('Core', 'EscapeNonASCIICharacters', true);
$this->assertIdentical(
HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),
"&#20013;&#25991; (Chinese)"
);
}
function test_convertToASCIIDumbLossless() {
// Uppercase thorn letter
$this->assertIdentical(
HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xC3\x9Eorn"),
"&#222;orn"
);
$this->assertIdentical(
HTMLPurifier_Encoder::convertToASCIIDumbLossless("an"),
"an"
);
// test up to four bytes
$this->assertIdentical(
HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xF3\xA0\x80\xA0"),
"&#917536;"
);
} }
} }

View File

@ -1,5 +1,8 @@
<?php <?php
// call one file using /?f=FileTest.php , see $test_files array for
// valid values
error_reporting(E_ALL); error_reporting(E_ALL);
// wishlist: automated calling of this file from multiple PHP versions so we // wishlist: automated calling of this file from multiple PHP versions so we
@ -118,19 +121,19 @@ function htmlpurifier_path2class($path) {
// we can't use addTestFile because SimpleTest chokes on E_STRICT warnings // we can't use addTestFile because SimpleTest chokes on E_STRICT warnings
if (isset($_GET['file']) && isset($test_file_lookup[$_GET['file']])) { if (isset($_GET['f']) && isset($test_file_lookup[$_GET['f']])) {
// execute only one test // execute only one test
$test_file = $_GET['file']; $test_file = $_GET['f'];
$test = new GroupTest('HTML Purifier - ' . $test_file); $test = new GroupTest($test_file . ' - HTML Purifier');
$path = 'HTMLPurifier/' . $test_file; $path = 'HTMLPurifier/' . $test_file;
require_once $path; require_once $path;
$test->addTestClass(htmlpurifier_path2class($path)); $test->addTestClass(htmlpurifier_path2class($path));
} else { } else {
$test = new GroupTest('HTML Purifier'); $test = new GroupTest('All Tests - HTML Purifier');
foreach ($test_files as $test_file) { foreach ($test_files as $test_file) {
$path = 'HTMLPurifier/' . $test_file; $path = 'HTMLPurifier/' . $test_file;