diff --git a/INSTALL b/INSTALL index 969aeb8b..dc697e11 100644 --- a/INSTALL +++ b/INSTALL @@ -14,7 +14,7 @@ things you should be mindful of. The library/ directory must be added to your path: HTMLPurifier will not be able to find the necessary includes otherwise. This is as simple as: -set_include_path('/path/to/htmlpurifier/library' . PATH_SEPARATOR . get_include_path()); + set_include_path('/path/to/htmlpurifier/library' . PATH_SEPARATOR . get_include_path()); ...replacing /path/to/htmlpurifier with the actual location of the folder. Don't worry, HTMLPurifier is namespaced so unless you have another file named @@ -22,7 +22,7 @@ HTMLPurifier.php, the files won't collide with any of your includes. Then, it's a simple matter of including the base file: -require_once 'HTMLPurifier.php'; + require_once 'HTMLPurifier.php'; ...and you're good to go. @@ -44,18 +44,26 @@ in docs/security.txt, in the meantime, try to change your output so this is the case. If, for some reason, you are unable to switch to UTF-8 immediately, you can -use iconv to convert the output of HTMLPurifier to your desired encoding. -We may integrate support for other encodings in later releases, but for now, -UTF-8 is all you should need. (If you're not using UTF-8, switch now!) +switch HTMLPurifier's encoding. Note that the availability of encodings is +dependent on iconv, and you'll be missing characters if the charset you +choose doesn't have them. + + $config = HTMLPurifier_Config::createDefault(); + $config->set('Core', 'Encoding', $encoding); 3. Using the code -The interface is mind-numbingly simple. +The interface is mind-numbingly simple: -$purifier = new HTMLPurifier(); -$clean_html = $purifier->purify($dirty_html); + $purifier = new HTMLPurifier(); + $clean_html = $purifier->purify($dirty_html); + +Or, if you're using the configuration object: + + $purifier = new HTMLPurifier($config); + $clean_html = $purifier->purify($dirty_html); That's it. For more examples, check out docs/examples/. 
Also, SLOW gives advice on what to do if HTMLPurifier is slowing down your application. diff --git a/NEWS b/NEWS index c3cf3d6d..61f366ea 100644 --- a/NEWS +++ b/NEWS @@ -1,7 +1,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| -1.0.0rc1, released 2006-??-?? +1.0.0rc1, released 2006-08-31 - Fixed broken numeric entity conversion - Malformed UTF-8 and non-SGML character detection and cleaning implemented - API documentation completed @@ -9,6 +9,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier - Basic color keywords translated into hexadecimal values - Table CSS properties implemented - (HTML|CSS)Definition de-singleton-ized +- Support for charsets other than UTF-8 (defined by iconv) 1.0.0beta, released 2006-08-16 - First public release, most functionality implemented. Notable omissions are: diff --git a/TODO b/TODO index fa18414c..28fc95d0 100644 --- a/TODO +++ b/TODO @@ -5,10 +5,6 @@ Ongoing - Lots of profiling, make it faster! - Plugins for major CMSes (very tricky issue) -1.0 release - - Lossy alternate character encoding support (characters not in the encoding - will get silently dropped). - 1.1 release - Directive documentation generation - Rewrite table's child definition to be faster, smart, and regexp free diff --git a/library/HTMLPurifier.php b/library/HTMLPurifier.php index 25edd724..266a8e7a 100644 --- a/library/HTMLPurifier.php +++ b/library/HTMLPurifier.php @@ -27,6 +27,7 @@ require_once 'HTMLPurifier/Lexer.php'; require_once 'HTMLPurifier/HTMLDefinition.php'; require_once 'HTMLPurifier/Generator.php'; require_once 'HTMLPurifier/Strategy/Core.php'; +require_once 'HTMLPurifier/Encoder.php'; /** * Main library execution class. 
@@ -58,6 +59,7 @@ class HTMLPurifier $this->lexer = HTMLPurifier_Lexer::create(); $this->strategy = new HTMLPurifier_Strategy_Core(); $this->generator = new HTMLPurifier_Generator(); + $this->encoder = new HTMLPurifier_Encoder(); } @@ -72,14 +74,17 @@ class HTMLPurifier */ function purify($html, $config = null) { $config = $config ? $config : $this->config; - return + $html = $this->encoder->convertToUTF8($html, $config); + $html = $this->generator->generateFromTokens( $this->strategy->execute( $this->lexer->tokenizeHTML($html, $config), + $config + ), $config - ), - $config - ); + ); + $html = $this->encoder->convertFromUTF8($html, $config); + return $html; } } diff --git a/library/HTMLPurifier/Encoder.php b/library/HTMLPurifier/Encoder.php index 3315b67c..e748ad1a 100644 --- a/library/HTMLPurifier/Encoder.php +++ b/library/HTMLPurifier/Encoder.php @@ -2,6 +2,29 @@ require_once 'HTMLPurifier/EntityLookup.php'; +HTMLPurifier_ConfigDef::define( + 'Core', 'Encoding', 'utf-8', 'istring', + 'If for some reason you are unable to convert all webpages to UTF-8, '. + 'you can use this directive as a stop-gap compatibility change to '. + 'let HTMLPurifier deal with non UTF-8 input. This technique has '. + 'notable deficiencies: absolutely no characters outside of the selected '. + 'character encoding will be preserved, not even the ones that have '. + 'been ampersand escaped (this is due to a UTF-8 specific feature '. + 'that automatically resolves all entities), making it pretty useless '. + 'for anything except the most I18N-blind applications. This directive '. + 'only accepts ISO-8859-1 if iconv is not enabled.' +); + +if ( !function_exists('iconv') ) { + // only encodings with native PHP support + HTMLPurifier_ConfigDef::defineAllowedValues( + 'Core', 'Encoding', array( + 'utf-8', + 'iso-8859-1' + ) + ); +} + /** * A UTF-8 specific character encoder that handles cleaning and transforming. 
*/ @@ -36,8 +59,6 @@ class HTMLPurifier_Encoder function cleanUTF8($str, $force_php = false) { static $non_sgml_chars = array(); - static $iconv = null; - if (empty($non_sgml_chars)) { for ($i = 0; $i <= 31; $i++) { // non-SGML ASCII chars @@ -50,9 +71,8 @@ class HTMLPurifier_Encoder } } - if ($iconv === null) { - $iconv = function_exists('iconv'); - } + static $iconv = null; + if ($iconv === null) $iconv = function_exists('iconv'); if ($iconv && !$force_php) { // do the shortcut way @@ -232,6 +252,38 @@ class HTMLPurifier_Encoder return $ret; } + /** + * Converts a string from the encoding named by the Core.Encoding directive to UTF-8; the string is returned untouched when that encoding is already utf-8. + */ + function convertToUTF8($str, $config) { + static $iconv = null; + if ($iconv === null) $iconv = function_exists('iconv'); + $encoding = $config->get('Core', 'Encoding'); + if ($encoding === 'utf-8') return $str; + if ($iconv) { + return iconv($encoding, 'utf-8//IGNORE', $str); + } elseif ($encoding === 'iso-8859-1') { + return utf8_encode($str); + } + } + + /** + * Converts a UTF-8 string to the encoding named by the Core.Encoding directive; the string is returned untouched when that encoding is utf-8. + * @note Currently, this is a lossy conversion, with inexpressible + * characters being omitted (iconv //IGNORE) or replaced by '?' (utf8_decode fallback without iconv). + */ + function convertFromUTF8($str, $config) { + static $iconv = null; + if ($iconv === null) $iconv = function_exists('iconv'); + $encoding = $config->get('Core', 'Encoding'); + if ($encoding === 'utf-8') return $str; + if ($iconv) { + return iconv('utf-8', $encoding . 
'//IGNORE', $str); + } elseif ($encoding === 'iso-8859-1') { + return utf8_decode($str); + } + } + } diff --git a/tests/HTMLPurifier/EncoderTest.php b/tests/HTMLPurifier/EncoderTest.php index 1c45d1ef..3d8a2af8 100644 --- a/tests/HTMLPurifier/EncoderTest.php +++ b/tests/HTMLPurifier/EncoderTest.php @@ -29,6 +29,43 @@ class HTMLPurifier_EncoderTest extends UnitTestCase $this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8 } + function test_convertToUTF8() { + $config = HTMLPurifier_Config::createDefault(); + + // UTF-8 means that we don't touch it + $this->assertIdentical( + $this->Encoder->convertToUTF8("\xF6", $config), + "\xF6" // this is invalid + ); + $this->assertNoErrors(); + + $config->set('Core', 'Encoding', 'ISO-8859-1'); + + // Now it gets converted + $this->assertIdentical( + $this->Encoder->convertToUTF8("\xF6", $config), + "\xC3\xB6" + ); + } + + function test_convertFromUTF8() { + $config = HTMLPurifier_Config::createDefault(); + + // UTF-8 means that we don't touch it + $this->assertIdentical( + $this->Encoder->convertFromUTF8("\xC3\xB6", $config), + "\xC3\xB6" + ); + + $config->set('Core', 'Encoding', 'ISO-8859-1'); + + // Now it gets converted + $this->assertIdentical( + $this->Encoder->convertFromUTF8("\xC3\xB6", $config), + "\xF6" + ); + } + } ?> \ No newline at end of file