0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-11-09 23:28:42 +00:00

Add rudimentary extra encoding support. We are now release-ready!

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@352 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2006-09-01 00:54:38 +00:00
parent b621602ac1
commit f4f636a09c
6 changed files with 121 additions and 22 deletions

16
INSTALL
View File

@ -44,18 +44,26 @@ in docs/security.txt, in the meantime, try to change your output so this is
the case.
If, for some reason, you are unable to switch to UTF-8 immediately, you can
use iconv to convert the output of HTMLPurifier to your desired encoding.
We may integrate support for other encodings in later releases, but for now,
UTF-8 is all you should need. (If you're not using UTF-8, switch now!)
switch HTMLPurifier's encoding. Note that the availability of encodings is
dependent on iconv, and you'll be missing characters if the charset you
choose doesn't have them.
$config = HTMLPurifier_Config::createDefault();
$config->set('Core', 'Encoding', $encoding);
3. Using the code
The interface is mind-numbingly simple.
The interface is mind-numbingly simple:
$purifier = new HTMLPurifier();
$clean_html = $purifier->purify($dirty_html);
Or, if you're using the configuration object:
$purifier = new HTMLPurifier($config);
$clean_html = $purifier->purify($dirty_html);
That's it. For more examples, check out docs/examples/. Also, SLOW gives
advice on what to do if HTMLPurifier is slowing down your application.

3
NEWS
View File

@ -1,7 +1,7 @@
NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
1.0.0rc1, released 2006-??-??
1.0.0rc1, released 2006-08-31
- Fixed broken numeric entity conversion
- Malformed UTF-8 and non-SGML character detection and cleaning implemented
- API documentation completed
@ -9,6 +9,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
- Basic color keywords translated into hexadecimal values
- Table CSS properties implemented
- (HTML|CSS)Definition de-singleton-ized
- Support for charsets other than UTF-8 (defined by iconv)
1.0.0beta, released 2006-08-16
- First public release, most functionality implemented. Notable omissions are:

4
TODO
View File

@ -5,10 +5,6 @@ Ongoing
- Lots of profiling, make it faster!
- Plugins for major CMSes (very tricky issue)
1.0 release
- Lossy alternate character encoding support (characters not in the encoding
will get silently dropped).
1.1 release
- Directive documentation generation
- Rewrite table's child definition to be faster, smart, and regexp free

View File

@ -27,6 +27,7 @@ require_once 'HTMLPurifier/Lexer.php';
require_once 'HTMLPurifier/HTMLDefinition.php';
require_once 'HTMLPurifier/Generator.php';
require_once 'HTMLPurifier/Strategy/Core.php';
require_once 'HTMLPurifier/Encoder.php';
/**
* Main library execution class.
@ -58,6 +59,7 @@ class HTMLPurifier
$this->lexer = HTMLPurifier_Lexer::create();
$this->strategy = new HTMLPurifier_Strategy_Core();
$this->generator = new HTMLPurifier_Generator();
$this->encoder = new HTMLPurifier_Encoder();
}
@ -72,7 +74,8 @@ class HTMLPurifier
*/
function purify($html, $config = null) {
$config = $config ? $config : $this->config;
return
$html = $this->encoder->convertToUTF8($html, $config);
$html =
$this->generator->generateFromTokens(
$this->strategy->execute(
$this->lexer->tokenizeHTML($html, $config),
@ -80,6 +83,8 @@ class HTMLPurifier
),
$config
);
$html = $this->encoder->convertFromUTF8($html, $config);
return $html;
}
}

View File

@ -2,6 +2,29 @@
require_once 'HTMLPurifier/EntityLookup.php';
// Register the Core.Encoding directive. The third argument ('utf-8') appears
// to be the default value and the fourth ('istring') the directive type --
// presumably a case-insensitive string; confirm against HTMLPurifier_ConfigDef.
// The remaining argument is the human-readable description of the directive.
HTMLPurifier_ConfigDef::define(
'Core', 'Encoding', 'utf-8', 'istring',
'If for some reason you are unable to convert all webpages to UTF-8, '.
'you can use this directive as a stop-gap compatibility change to '.
'let HTMLPurifier deal with non UTF-8 input. This technique has '.
'notable deficiencies: absolutely no characters outside of the selected '.
'character encoding will be preserved, not even the ones that have '.
'been ampersand escaped (this is due to a UTF-8 specific <em>feature</em> '.
'that automatically resolves all entities), making it pretty useless '.
'for anything except the most I18N-blind applications. This directive '.
'only accepts ISO-8859-1 if iconv is not enabled.'
);
// When the iconv extension is missing, limit Core.Encoding to the two
// encodings listed below.
if ( !function_exists('iconv') ) {
// only encodings with native PHP support
HTMLPurifier_ConfigDef::defineAllowedValues(
'Core', 'Encoding', array(
'utf-8',
'iso-8859-1'
)
);
}
/**
* A UTF-8 specific character encoder that handles cleaning and transforming.
*/
@ -36,8 +59,6 @@ class HTMLPurifier_Encoder
function cleanUTF8($str, $force_php = false) {
static $non_sgml_chars = array();
static $iconv = null;
if (empty($non_sgml_chars)) {
for ($i = 0; $i <= 31; $i++) {
// non-SGML ASCII chars
@ -50,9 +71,8 @@ class HTMLPurifier_Encoder
}
}
if ($iconv === null) {
$iconv = function_exists('iconv');
}
static $iconv = null;
if ($iconv === null) $iconv = function_exists('iconv');
if ($iconv && !$force_php) {
// do the shortcut way
@ -232,6 +252,38 @@ class HTMLPurifier_Encoder
return $ret;
}
/**
 * Converts a string to UTF-8 based on configuration.
 *
 * The source encoding is read from the Core.Encoding directive. The
 * comparisons below use lower-case names; this assumes the directive
 * hands back a lower-cased value (NOTE(review): confirm for 'istring').
 *
 * @param $str    String in the configured encoding.
 * @param $config Configuration object; only get('Core', 'Encoding') is used.
 * @return The string converted to UTF-8. The input is returned unchanged
 *         when the configured encoding is already 'utf-8'. Returns null
 *         (after raising a warning) when the encoding cannot be handled
 *         without iconv.
 */
function convertToUTF8($str, $config) {
    $encoding = $config->get('Core', 'Encoding');
    if ($encoding === 'utf-8') return $str;

    // function_exists() is checked once and remembered across calls
    static $iconv = null;
    if ($iconv === null) $iconv = function_exists('iconv');

    if ($iconv) {
        return iconv($encoding, 'utf-8//IGNORE', $str);
    } elseif ($encoding === 'iso-8859-1') {
        // native PHP fallback; utf8_encode() only understands ISO-8859-1
        return utf8_encode($str);
    }

    // Neither iconv nor a native fallback can handle this encoding:
    // report it rather than silently returning nothing.
    trigger_error(
        'Encoding ' . $encoding . ' not supported without iconv',
        E_USER_WARNING
    );
    return null;
}
/**
 * Converts a string from UTF-8 based on configuration.
 *
 * The target encoding is read from the Core.Encoding directive. The
 * comparisons below use lower-case names; this assumes the directive
 * hands back a lower-cased value (NOTE(review): confirm for 'istring').
 *
 * @note Currently, this is a lossy conversion: with iconv, characters
 *       the target encoding cannot express are omitted (//IGNORE); the
 *       utf8_decode() fallback substitutes '?' for them instead.
 *
 * @param $str    UTF-8 string.
 * @param $config Configuration object; only get('Core', 'Encoding') is used.
 * @return The string in the configured encoding. The input is returned
 *         unchanged when that encoding is 'utf-8'. Returns null (after
 *         raising a warning) when the encoding cannot be handled without
 *         iconv.
 */
function convertFromUTF8($str, $config) {
    $encoding = $config->get('Core', 'Encoding');
    if ($encoding === 'utf-8') return $str;

    // function_exists() is checked once and remembered across calls
    static $iconv = null;
    if ($iconv === null) $iconv = function_exists('iconv');

    if ($iconv) {
        return iconv('utf-8', $encoding . '//IGNORE', $str);
    } elseif ($encoding === 'iso-8859-1') {
        // going *from* UTF-8 needs utf8_decode(), the inverse of the
        // utf8_encode() call used when converting to UTF-8
        return utf8_decode($str);
    }

    // Neither iconv nor a native fallback can handle this encoding:
    // report it rather than silently returning nothing.
    trigger_error(
        'Encoding ' . $encoding . ' not supported without iconv',
        E_USER_WARNING
    );
    return null;
}
}

View File

@ -29,6 +29,43 @@ class HTMLPurifier_EncoderTest extends UnitTestCase
$this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
}
function test_convertToUTF8() {
    $config = HTMLPurifier_Config::createDefault();

    // With the default (UTF-8) encoding the input must come back exactly
    // as given, even though a lone 0xF6 byte is not valid UTF-8.
    $lone_byte = "\xF6";
    $untouched = $this->Encoder->convertToUTF8($lone_byte, $config);
    $this->assertIdentical($untouched, $lone_byte);
    $this->assertNoErrors();

    // Once the encoding is switched, the same byte is converted to its
    // two-byte UTF-8 form.
    $config->set('Core', 'Encoding', 'ISO-8859-1');
    $converted = $this->Encoder->convertToUTF8($lone_byte, $config);
    $this->assertIdentical($converted, "\xC3\xB6");
}
function test_convertFromUTF8() {
    $config = HTMLPurifier_Config::createDefault();
    $o_umlaut = "\xC3\xB6";

    // With the default (UTF-8) encoding the string is handed back as-is.
    $untouched = $this->Encoder->convertFromUTF8($o_umlaut, $config);
    $this->assertIdentical($untouched, $o_umlaut);

    // Once the encoding is switched, the two-byte sequence collapses to
    // the single target-encoding byte.
    $config->set('Core', 'Encoding', 'ISO-8859-1');
    $converted = $this->Encoder->convertFromUTF8($o_umlaut, $config);
    $this->assertIdentical($converted, "\xF6");
}
}
?>