mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2024-12-22 16:31:53 +00:00
[1.4.0]
- Added %Core.EscapeNonASCIICharacters to workaround %Core.Encoding misbehavior - Add "All Tests" to test runner title and reorder subfile names - Specific file is now called with ?f= - Link to UTF-8 docs, even though they're not done - 1000th unit test passed! W00t! (that's a third as many as SimpleTest has for itself.) git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@658 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
parent
ad1169c711
commit
0dd866cc15
2
NEWS
2
NEWS
@ -16,6 +16,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
|||||||
to allow these, and background-position IS NOT implemented yet.
|
to allow these, and background-position IS NOT implemented yet.
|
||||||
! Configuration documentation looks nicer
|
! Configuration documentation looks nicer
|
||||||
! Added smoketest 'all.php', which loads all other smoketests via frames
|
! Added smoketest 'all.php', which loads all other smoketests via frames
|
||||||
|
! Added %Core.EscapeNonASCIICharacters to work around loss of Unicode
|
||||||
|
characters while %Core.Encoding is set to a non-UTF-8 encoding.
|
||||||
. Implemented AttrDef_CSSURI for url(http://google.com) style declarations
|
. Implemented AttrDef_CSSURI for url(http://google.com) style declarations
|
||||||
|
|
||||||
1.3.3, unknown release date, likely to be dropped
|
1.3.3, unknown release date, likely to be dropped
|
||||||
|
@ -31,6 +31,9 @@ information for casual developers using HTML Purifier.</p>
|
|||||||
<dt><a href="enduser-slow.html">Speeding up HTML Purifier</a></dt>
|
<dt><a href="enduser-slow.html">Speeding up HTML Purifier</a></dt>
|
||||||
<dd>Explains how to speed up HTML Purifier through caching or inbound filtering.</dd>
|
<dd>Explains how to speed up HTML Purifier through caching or inbound filtering.</dd>
|
||||||
|
|
||||||
|
<dt><a href="enduser-utf8.html">UTF-8</a></dt>
|
||||||
|
<dd>Describes the rationale for using UTF-8, the ramifications otherwise, and how to make the switch.</dd>
|
||||||
|
|
||||||
</dl>
|
</dl>
|
||||||
|
|
||||||
<h2>Development</h2>
|
<h2>Development</h2>
|
||||||
|
@ -6,15 +6,29 @@ HTMLPurifier_ConfigSchema::define(
|
|||||||
'Core', 'Encoding', 'utf-8', 'istring',
|
'Core', 'Encoding', 'utf-8', 'istring',
|
||||||
'If for some reason you are unable to convert all webpages to UTF-8, '.
|
'If for some reason you are unable to convert all webpages to UTF-8, '.
|
||||||
'you can use this directive as a stop-gap compatibility change to '.
|
'you can use this directive as a stop-gap compatibility change to '.
|
||||||
'let HTMLPurifier deal with non UTF-8 input. This technique has '.
|
'let HTML Purifier deal with non UTF-8 input. This technique has '.
|
||||||
'notable deficiencies: absolutely no characters outside of the selected '.
|
'notable deficiencies: absolutely no characters outside of the selected '.
|
||||||
'character encoding will be preserved, not even the ones that have '.
|
'character encoding will be preserved, not even the ones that have '.
|
||||||
'been ampersand escaped (this is due to a UTF-8 specific <em>feature</em> '.
|
'been ampersand escaped (this is due to a UTF-8 specific <em>feature</em> '.
|
||||||
'that automatically resolves all entities), making it pretty useless '.
|
'that automatically resolves all entities), making it pretty useless '.
|
||||||
'for anything except the most I18N-blind applications. This directive '.
|
'for anything except the most I18N-blind applications, although '.
|
||||||
|
'%Core.EscapeNonASCIICharacters fixes this trouble with '.
|
||||||
|
'another tradeoff. This directive '.
|
||||||
'only accepts ISO-8859-1 if iconv is not enabled.'
|
'only accepts ISO-8859-1 if iconv is not enabled.'
|
||||||
);
|
);
|
||||||
|
|
||||||
|
HTMLPurifier_ConfigSchema::define(
|
||||||
|
'Core', 'EscapeNonASCIICharacters', false, 'bool',
|
||||||
|
'This directive overcomes a deficiency in %Core.Encoding by blindly '.
|
||||||
|
'converting all non-ASCII characters into decimal numeric entities before '.
|
||||||
|
'converting it to its native encoding. This means that even '.
|
||||||
|
'characters that can be expressed in the non-UTF-8 encoding will '.
|
||||||
|
'be entity-ized, which can be a real downer for encodings like Big5. '.
|
||||||
|
'It also assumes that the ASCII repertoire is available, although '.
|
||||||
|
'this is the case for almost all encodings. Anyway, use UTF-8! This '.
|
||||||
|
'directive has been available since 1.4.0.'
|
||||||
|
);
|
||||||
|
|
||||||
if ( !function_exists('iconv') ) {
|
if ( !function_exists('iconv') ) {
|
||||||
// only encodings with native PHP support
|
// only encodings with native PHP support
|
||||||
HTMLPurifier_ConfigSchema::defineAllowedValues(
|
HTMLPurifier_ConfigSchema::defineAllowedValues(
|
||||||
@ -310,6 +324,7 @@ class HTMLPurifier_Encoder
|
|||||||
} elseif ($encoding === 'iso-8859-1') {
|
} elseif ($encoding === 'iso-8859-1') {
|
||||||
return @utf8_encode($str);
|
return @utf8_encode($str);
|
||||||
}
|
}
|
||||||
|
trigger_error('Encoding not supported', E_USER_ERROR);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -323,11 +338,63 @@ class HTMLPurifier_Encoder
|
|||||||
if ($iconv === null) $iconv = function_exists('iconv');
|
if ($iconv === null) $iconv = function_exists('iconv');
|
||||||
$encoding = $config->get('Core', 'Encoding');
|
$encoding = $config->get('Core', 'Encoding');
|
||||||
if ($encoding === 'utf-8') return $str;
|
if ($encoding === 'utf-8') return $str;
|
||||||
|
if ($config->get('Core', 'EscapeNonASCIICharacters')) {
|
||||||
|
$str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
|
||||||
|
}
|
||||||
if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
|
if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
|
||||||
return @iconv('utf-8', $encoding . '//IGNORE', $str);
|
return @iconv('utf-8', $encoding . '//IGNORE', $str);
|
||||||
} elseif ($encoding === 'iso-8859-1') {
|
} elseif ($encoding === 'iso-8859-1') {
|
||||||
return @utf8_decode($str);
|
return @utf8_decode($str);
|
||||||
}
|
}
|
||||||
|
trigger_error('Encoding not supported', E_USER_ERROR);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Lossless (character-wise) conversion of HTML to ASCII
|
||||||
|
* @static
|
||||||
|
* @param $str UTF-8 string to be converted to ASCII
|
||||||
|
* @returns ASCII encoded string with non-ASCII character entity-ized
|
||||||
|
* @warning Adapted from MediaWiki, claiming fair use: this is a common
|
||||||
|
* algorithm. If you disagree with this license fudgery,
|
||||||
|
* implement it yourself.
|
||||||
|
* @note Uses decimal numeric entities since they are best supported.
|
||||||
|
* @note This is a DUMB function: it has no concept of keeping
|
||||||
|
* character entities that the projected character encoding
|
||||||
|
* can allow. We could possibly implement a smart version
|
||||||
|
* but that would require it to also know which Unicode
|
||||||
|
* codepoints the charset supported (not an easy task).
|
||||||
|
* @note Sort of like cleanUTF8() but it assumes that $str is
|
||||||
|
* well-formed UTF-8
|
||||||
|
*/
|
||||||
|
function convertToASCIIDumbLossless($str) {
|
||||||
|
$bytesleft = 0;
|
||||||
|
$result = '';
|
||||||
|
$working = 0;
|
||||||
|
$len = strlen($str);
|
||||||
|
for( $i = 0; $i < $len; $i++ ) {
|
||||||
|
$bytevalue = ord( $str[$i] );
|
||||||
|
if( $bytevalue <= 0x7F ) { //0xxx xxxx
|
||||||
|
$result .= chr( $bytevalue );
|
||||||
|
$bytesleft = 0;
|
||||||
|
} elseif( $bytevalue <= 0xBF ) { //10xx xxxx
|
||||||
|
$working = $working << 6;
|
||||||
|
$working += ($bytevalue & 0x3F);
|
||||||
|
$bytesleft--;
|
||||||
|
if( $bytesleft <= 0 ) {
|
||||||
|
$result .= "&#" . $working . ";";
|
||||||
|
}
|
||||||
|
} elseif( $bytevalue <= 0xDF ) { //110x xxxx
|
||||||
|
$working = $bytevalue & 0x1F;
|
||||||
|
$bytesleft = 1;
|
||||||
|
} elseif( $bytevalue <= 0xEF ) { //1110 xxxx
|
||||||
|
$working = $bytevalue & 0x0F;
|
||||||
|
$bytesleft = 2;
|
||||||
|
} else { //1111 0xxx
|
||||||
|
$working = $bytevalue & 0x07;
|
||||||
|
$bytesleft = 3;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return $result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -5,7 +5,7 @@ require_once 'HTMLPurifier/Encoder.php';
|
|||||||
class HTMLPurifier_EncoderTest extends UnitTestCase
|
class HTMLPurifier_EncoderTest extends UnitTestCase
|
||||||
{
|
{
|
||||||
|
|
||||||
var $Encoder;
|
var $_entity_lookup;
|
||||||
|
|
||||||
function setUp() {
|
function setUp() {
|
||||||
$this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
|
$this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
|
||||||
@ -60,6 +60,9 @@ class HTMLPurifier_EncoderTest extends UnitTestCase
|
|||||||
$config = HTMLPurifier_Config::createDefault();
|
$config = HTMLPurifier_Config::createDefault();
|
||||||
$context = new HTMLPurifier_Context();
|
$context = new HTMLPurifier_Context();
|
||||||
|
|
||||||
|
// zhong-wen
|
||||||
|
$chinese = "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";
|
||||||
|
|
||||||
// UTF-8 means that we don't touch it
|
// UTF-8 means that we don't touch it
|
||||||
$this->assertIdentical(
|
$this->assertIdentical(
|
||||||
HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context),
|
HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context),
|
||||||
@ -74,13 +77,55 @@ class HTMLPurifier_EncoderTest extends UnitTestCase
|
|||||||
"\xF6"
|
"\xF6"
|
||||||
);
|
);
|
||||||
|
|
||||||
$config->set('Test', 'ForceNoIconv', true);
|
if (function_exists('iconv')) {
|
||||||
|
// iconv has its own way
|
||||||
|
$this->assertIdentical(
|
||||||
|
HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),
|
||||||
|
" (Chinese)"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Plain PHP implementation has slightly different behavior
|
||||||
|
$config->set('Test', 'ForceNoIconv', true);
|
||||||
$this->assertIdentical(
|
$this->assertIdentical(
|
||||||
HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context),
|
HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context),
|
||||||
"\xF6"
|
"\xF6"
|
||||||
);
|
);
|
||||||
|
|
||||||
|
$this->assertIdentical(
|
||||||
|
HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),
|
||||||
|
"?? (Chinese)"
|
||||||
|
);
|
||||||
|
|
||||||
|
// Preserve the characters!
|
||||||
|
|
||||||
|
$config->set('Core', 'EscapeNonASCIICharacters', true);
|
||||||
|
$this->assertIdentical(
|
||||||
|
HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),
|
||||||
|
"&#20013;&#25991; (Chinese)"
|
||||||
|
);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
function test_convertToASCIIDumbLossless() {
|
||||||
|
|
||||||
|
// Uppercase thorn letter
|
||||||
|
$this->assertIdentical(
|
||||||
|
HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xC3\x9Eorn"),
|
||||||
|
"&#222;orn"
|
||||||
|
);
|
||||||
|
|
||||||
|
$this->assertIdentical(
|
||||||
|
HTMLPurifier_Encoder::convertToASCIIDumbLossless("an"),
|
||||||
|
"an"
|
||||||
|
);
|
||||||
|
|
||||||
|
// test up to four bytes
|
||||||
|
$this->assertIdentical(
|
||||||
|
HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xF3\xA0\x80\xA0"),
|
||||||
|
"&#917536;"
|
||||||
|
);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,8 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
|
// call one file using /?f=FileTest.php , see $test_files array for
|
||||||
|
// valid values
|
||||||
|
|
||||||
error_reporting(E_ALL);
|
error_reporting(E_ALL);
|
||||||
|
|
||||||
// wishlist: automated calling of this file from multiple PHP versions so we
|
// wishlist: automated calling of this file from multiple PHP versions so we
|
||||||
@ -118,19 +121,19 @@ function htmlpurifier_path2class($path) {
|
|||||||
|
|
||||||
// we can't use addTestFile because SimpleTest chokes on E_STRICT warnings
|
// we can't use addTestFile because SimpleTest chokes on E_STRICT warnings
|
||||||
|
|
||||||
if (isset($_GET['file']) && isset($test_file_lookup[$_GET['file']])) {
|
if (isset($_GET['f']) && isset($test_file_lookup[$_GET['f']])) {
|
||||||
|
|
||||||
// execute only one test
|
// execute only one test
|
||||||
$test_file = $_GET['file'];
|
$test_file = $_GET['f'];
|
||||||
|
|
||||||
$test = new GroupTest('HTML Purifier - ' . $test_file);
|
$test = new GroupTest($test_file . ' - HTML Purifier');
|
||||||
$path = 'HTMLPurifier/' . $test_file;
|
$path = 'HTMLPurifier/' . $test_file;
|
||||||
require_once $path;
|
require_once $path;
|
||||||
$test->addTestClass(htmlpurifier_path2class($path));
|
$test->addTestClass(htmlpurifier_path2class($path));
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
$test = new GroupTest('HTML Purifier');
|
$test = new GroupTest('All Tests - HTML Purifier');
|
||||||
|
|
||||||
foreach ($test_files as $test_file) {
|
foreach ($test_files as $test_file) {
|
||||||
$path = 'HTMLPurifier/' . $test_file;
|
$path = 'HTMLPurifier/' . $test_file;
|
||||||
|
Loading…
Reference in New Issue
Block a user