0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-12-22 08:21:52 +00:00

Add rudimentary extra encoding support. We are now release-ready!

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@352 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2006-09-01 00:54:38 +00:00
parent b621602ac1
commit f4f636a09c
6 changed files with 121 additions and 22 deletions

24
INSTALL
View File

@ -14,7 +14,7 @@ things you should be mindful of.
The library/ directory must be added to your path: HTMLPurifier will not be The library/ directory must be added to your path: HTMLPurifier will not be
able to find the necessary includes otherwise. This is as simple as: able to find the necessary includes otherwise. This is as simple as:
set_include_path('/path/to/htmlpurifier/library' . PATH_SEPARATOR . get_include_path()); set_include_path('/path/to/htmlpurifier/library' . PATH_SEPARATOR . get_include_path());
...replacing /path/to/htmlpurifier with the actual location of the folder. Don't ...replacing /path/to/htmlpurifier with the actual location of the folder. Don't
worry, HTMLPurifier is namespaced so unless you have another file named worry, HTMLPurifier is namespaced so unless you have another file named
@ -22,7 +22,7 @@ HTMLPurifier.php, the files won't collide with any of your includes.
Then, it's a simple matter of including the base file: Then, it's a simple matter of including the base file:
require_once 'HTMLPurifier.php'; require_once 'HTMLPurifier.php';
...and you're good to go. ...and you're good to go.
@ -44,18 +44,26 @@ in docs/security.txt, in the meantime, try to change your output so this is
the case. the case.
If, for some reason, you are unable to switch to UTF-8 immediately, you can If, for some reason, you are unable to switch to UTF-8 immediately, you can
use iconv to convert the output of HTMLPurifier to your desired encoding. switch HTMLPurifier's encoding. Note that the availability of encodings is
We may integrate support for other encodings in later releases, but for now, dependent on iconv, and you'll be missing characters if the charset you
UTF-8 is all you should need. (If you're not using UTF-8, switch now!) choose doesn't have them.
$config = HTMLPurifier_Config::createDefault();
$config->set('Core', 'Encoding', $encoding);
3. Using the code 3. Using the code
The interface is mind-numbingly simple. The interface is mind-numbingly simple:
$purifier = new HTMLPurifier(); $purifier = new HTMLPurifier();
$clean_html = $purifier->purify($dirty_html); $clean_html = $purifier->purify($dirty_html);
Or, if you're using the configuration object:
$purifier = new HTMLPurifier($config);
$clean_html = $purifier->purify($dirty_html);
That's it. For more examples, check out docs/examples/. Also, SLOW gives That's it. For more examples, check out docs/examples/. Also, SLOW gives
advice on what to do if HTMLPurifier is slowing down your application. advice on what to do if HTMLPurifier is slowing down your application.

3
NEWS
View File

@ -1,7 +1,7 @@
NEWS ( CHANGELOG and HISTORY ) HTMLPurifier NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
1.0.0rc1, released 2006-??-?? 1.0.0rc1, released 2006-08-31
- Fixed broken numeric entity conversion - Fixed broken numeric entity conversion
- Malformed UTF-8 and non-SGML character detection and cleaning implemented - Malformed UTF-8 and non-SGML character detection and cleaning implemented
- API documentation completed - API documentation completed
@ -9,6 +9,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
- Basic color keywords translated into hexadecimal values - Basic color keywords translated into hexadecimal values
- Table CSS properties implemented - Table CSS properties implemented
- (HTML|CSS)Definition de-singleton-ized - (HTML|CSS)Definition de-singleton-ized
- Support for charsets other than UTF-8 (defined by iconv)
1.0.0beta, released 2006-08-16 1.0.0beta, released 2006-08-16
- First public release, most functionality implemented. Notable omissions are: - First public release, most functionality implemented. Notable omissions are:

4
TODO
View File

@ -5,10 +5,6 @@ Ongoing
- Lots of profiling, make it faster! - Lots of profiling, make it faster!
- Plugins for major CMSes (very tricky issue) - Plugins for major CMSes (very tricky issue)
1.0 release
- Lossy alternate character encoding support (characters not in the encoding
will get silently dropped).
1.1 release 1.1 release
- Directive documentation generation - Directive documentation generation
- Rewrite table's child definition to be faster, smart, and regexp free - Rewrite table's child definition to be faster, smart, and regexp free

View File

@ -27,6 +27,7 @@ require_once 'HTMLPurifier/Lexer.php';
require_once 'HTMLPurifier/HTMLDefinition.php'; require_once 'HTMLPurifier/HTMLDefinition.php';
require_once 'HTMLPurifier/Generator.php'; require_once 'HTMLPurifier/Generator.php';
require_once 'HTMLPurifier/Strategy/Core.php'; require_once 'HTMLPurifier/Strategy/Core.php';
require_once 'HTMLPurifier/Encoder.php';
/** /**
* Main library execution class. * Main library execution class.
@ -58,6 +59,7 @@ class HTMLPurifier
$this->lexer = HTMLPurifier_Lexer::create(); $this->lexer = HTMLPurifier_Lexer::create();
$this->strategy = new HTMLPurifier_Strategy_Core(); $this->strategy = new HTMLPurifier_Strategy_Core();
$this->generator = new HTMLPurifier_Generator(); $this->generator = new HTMLPurifier_Generator();
$this->encoder = new HTMLPurifier_Encoder();
} }
@ -72,14 +74,17 @@ class HTMLPurifier
*/ */
function purify($html, $config = null) { function purify($html, $config = null) {
$config = $config ? $config : $this->config; $config = $config ? $config : $this->config;
return $html = $this->encoder->convertToUTF8($html, $config);
$html =
$this->generator->generateFromTokens( $this->generator->generateFromTokens(
$this->strategy->execute( $this->strategy->execute(
$this->lexer->tokenizeHTML($html, $config), $this->lexer->tokenizeHTML($html, $config),
$config
),
$config $config
), );
$config $html = $this->encoder->convertFromUTF8($html, $config);
); return $html;
} }
} }

View File

@ -2,6 +2,29 @@
require_once 'HTMLPurifier/EntityLookup.php'; require_once 'HTMLPurifier/EntityLookup.php';
// Register the Core.Encoding directive: a case-insensitive string ('istring')
// that defaults to 'utf-8'. The long string is the directive's description.
HTMLPurifier_ConfigDef::define(
    'Core',
    'Encoding',
    'utf-8',
    'istring',
    'If for some reason you are unable to convert all webpages to UTF-8,' .
    ' you can use this directive as a stop-gap compatibility change to' .
    ' let HTMLPurifier deal with non UTF-8 input. This technique has' .
    ' notable deficiencies: absolutely no characters outside of the selected' .
    ' character encoding will be preserved, not even the ones that have' .
    ' been ampersand escaped (this is due to a UTF-8 specific <em>feature</em>' .
    ' that automatically resolves all entities), making it pretty useless' .
    ' for anything except the most I18N-blind applications. This directive' .
    ' only accepts ISO-8859-1 if iconv is not enabled.'
);

// When the iconv extension is missing, limit the directive to the encodings
// that can be handled with PHP's built-in functions alone.
if (function_exists('iconv') === false) {
    HTMLPurifier_ConfigDef::defineAllowedValues(
        'Core',
        'Encoding',
        array('utf-8', 'iso-8859-1')
    );
}
/** /**
* A UTF-8 specific character encoder that handles cleaning and transforming. * A UTF-8 specific character encoder that handles cleaning and transforming.
*/ */
@ -36,8 +59,6 @@ class HTMLPurifier_Encoder
function cleanUTF8($str, $force_php = false) { function cleanUTF8($str, $force_php = false) {
static $non_sgml_chars = array(); static $non_sgml_chars = array();
static $iconv = null;
if (empty($non_sgml_chars)) { if (empty($non_sgml_chars)) {
for ($i = 0; $i <= 31; $i++) { for ($i = 0; $i <= 31; $i++) {
// non-SGML ASCII chars // non-SGML ASCII chars
@ -50,9 +71,8 @@ class HTMLPurifier_Encoder
} }
} }
if ($iconv === null) { static $iconv = null;
$iconv = function_exists('iconv'); if ($iconv === null) $iconv = function_exists('iconv');
}
if ($iconv && !$force_php) { if ($iconv && !$force_php) {
// do the shortcut way // do the shortcut way
@ -232,6 +252,38 @@ class HTMLPurifier_Encoder
return $ret; return $ret;
} }
/**
 * Converts a string to UTF-8 based on configuration.
 *
 * @param $str    String encoded in the charset named by the Core.Encoding
 *                directive.
 * @param $config Configuration object; only Core.Encoding is read from it.
 * @return The string as UTF-8. The input is returned untouched when the
 *         configured encoding is already 'utf-8'. When iconv is available
 *         its result is returned as-is (iconv() yields false on failure).
 *         Without iconv only 'iso-8859-1' can be converted; any other
 *         encoding raises a warning and returns null.
 */
function convertToUTF8($str, $config) {
    // the function_exists() lookup is done once and remembered across calls
    static $iconv = null;
    if ($iconv === null) $iconv = function_exists('iconv');
    $encoding = $config->get('Core', 'Encoding');
    if ($encoding === 'utf-8') return $str;
    if ($iconv) {
        // //IGNORE makes iconv discard characters it cannot represent
        return iconv($encoding, 'utf-8//IGNORE', $str);
    } elseif ($encoding === 'iso-8859-1') {
        // native fallback: utf8_encode() converts ISO-8859-1 to UTF-8
        return utf8_encode($str);
    }
    // No converter for this encoding: report it instead of silently
    // falling off the end of the function.
    trigger_error('Encoding not supported', E_USER_WARNING);
    return null;
}
/**
 * Converts a string from UTF-8 based on configuration.
 *
 * @note Currently, this is a lossy conversion, with inexpressible
 *       characters being omitted.
 *
 * @param $str    UTF-8 encoded string.
 * @param $config Configuration object; only Core.Encoding is read from it.
 * @return The string in the charset named by Core.Encoding. The input is
 *         returned untouched when that encoding is 'utf-8'. When iconv is
 *         available its result is returned as-is (iconv() yields false on
 *         failure). Without iconv only 'iso-8859-1' can be produced; any
 *         other encoding raises a warning and returns null.
 */
function convertFromUTF8($str, $config) {
    // the function_exists() lookup is done once and remembered across calls
    static $iconv = null;
    if ($iconv === null) $iconv = function_exists('iconv');
    $encoding = $config->get('Core', 'Encoding');
    if ($encoding === 'utf-8') return $str;
    if ($iconv) {
        // //IGNORE makes iconv drop characters the target charset lacks
        return iconv('utf-8', $encoding . '//IGNORE', $str);
    } elseif ($encoding === 'iso-8859-1') {
        // native fallback: utf8_decode() converts UTF-8 to ISO-8859-1
        // (utf8_encode() would go the wrong way and double-encode)
        return utf8_decode($str);
    }
    // No converter for this encoding: report it instead of silently
    // falling off the end of the function.
    trigger_error('Encoding not supported', E_USER_WARNING);
    return null;
}
} }

View File

@ -29,6 +29,43 @@ class HTMLPurifier_EncoderTest extends UnitTestCase
$this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8 $this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
} }
/**
 * Checks convertToUTF8() under the default (UTF-8) encoding and after
 * switching the Core.Encoding directive to ISO-8859-1.
 */
function test_convertToUTF8() {
    $config = HTMLPurifier_Config::createDefault();

    // With the default UTF-8 encoding the input must pass through
    // unchanged, even though a lone \xF6 byte is not valid UTF-8.
    $passedThrough = $this->Encoder->convertToUTF8("\xF6", $config);
    $this->assertIdentical($passedThrough, "\xF6");
    $this->assertNoErrors();

    // After selecting ISO-8859-1, the same byte is re-encoded as UTF-8.
    $config->set('Core', 'Encoding', 'ISO-8859-1');
    $converted = $this->Encoder->convertToUTF8("\xF6", $config);
    $this->assertIdentical($converted, "\xC3\xB6");
}
/**
 * Checks convertFromUTF8() under the default (UTF-8) encoding and after
 * switching the Core.Encoding directive to ISO-8859-1.
 */
function test_convertFromUTF8() {
    $config = HTMLPurifier_Config::createDefault();

    // With the default UTF-8 encoding the string is returned unchanged.
    $passedThrough = $this->Encoder->convertFromUTF8("\xC3\xB6", $config);
    $this->assertIdentical($passedThrough, "\xC3\xB6");

    // After selecting ISO-8859-1, the two-byte UTF-8 sequence collapses
    // to the single byte \xF6.
    $config->set('Core', 'Encoding', 'ISO-8859-1');
    $converted = $this->Encoder->convertFromUTF8("\xC3\xB6", $config);
    $this->assertIdentical($converted, "\xF6");
}
} }
?> ?>