mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2024-12-22 08:21:52 +00:00
Add rudimentary extra encoding support. We are now release-ready!
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@352 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
parent
b621602ac1
commit
f4f636a09c
24
INSTALL
24
INSTALL
@ -14,7 +14,7 @@ things you should be mindful of.
|
||||
The library/ directory must be added to your path: HTMLPurifier will not be
|
||||
able to find the necessary includes otherwise. This is as simple as:
|
||||
|
||||
set_include_path('/path/to/htmlpurifier/library' . PATH_SEPARATOR . get_include_path());
|
||||
set_include_path('/path/to/htmlpurifier/library' . PATH_SEPARATOR . get_include_path());
|
||||
|
||||
...replacing /path/to/htmlpurifier with the actual location of the folder. Don't
|
||||
worry, HTMLPurifier is namespaced so unless you have another file named
|
||||
@ -22,7 +22,7 @@ HTMLPurifier.php, the files won't collide with any of your includes.
|
||||
|
||||
Then, it's a simple matter of including the base file:
|
||||
|
||||
require_once 'HTMLPurifier.php';
|
||||
require_once 'HTMLPurifier.php';
|
||||
|
||||
...and you're good to go.
|
||||
|
||||
@ -44,18 +44,26 @@ in docs/security.txt, in the meantime, try to change your output so this is
|
||||
the case.
|
||||
|
||||
If, for some reason, you are unable to switch to UTF-8 immediately, you can
|
||||
use iconv to convert the output of HTMLPurifier to your desired encoding.
|
||||
We may integrate support for other encodings in later releases, but for now,
|
||||
UTF-8 is all you should need. (If you're not using UTF-8, switch now!)
|
||||
switch HTMLPurifier's encoding. Note that the availability of encodings is
|
||||
dependent on iconv, and you'll be missing characters if the charset you
|
||||
choose doesn't have them.
|
||||
|
||||
$config = HTMLPurifier_Config::createDefault();
|
||||
$config->set('Core', 'Encoding', $encoding);
|
||||
|
||||
|
||||
|
||||
3. Using the code
|
||||
|
||||
The interface is mind-numbingly simple.
|
||||
The interface is mind-numbingly simple:
|
||||
|
||||
$purifier = new HTMLPurifier();
|
||||
$clean_html = $purifier->purify($dirty_html);
|
||||
$purifier = new HTMLPurifier();
|
||||
$clean_html = $purifier->purify($dirty_html);
|
||||
|
||||
Or, if you're using the configuration object:
|
||||
|
||||
$purifier = new HTMLPurifier($config);
|
||||
$clean_html = $purifier->purify($dirty_html);
|
||||
|
||||
That's it. For more examples, check out docs/examples/. Also, SLOW gives
|
||||
advice on what to do if HTMLPurifier is slowing down your application.
|
||||
|
3
NEWS
3
NEWS
@ -1,7 +1,7 @@
|
||||
NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
||||
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||
|
||||
1.0.0rc1, released 2006-??-??
|
||||
1.0.0rc1, released 2006-08-31
|
||||
- Fixed broken numeric entity conversion
|
||||
- Malformed UTF-8 and non-SGML character detection and cleaning implemented
|
||||
- API documentation completed
|
||||
@ -9,6 +9,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
||||
- Basic color keywords translated into hexadecimal values
|
||||
- Table CSS properties implemented
|
||||
- (HTML|CSS)Definition de-singleton-ized
|
||||
- Support for charsets other than UTF-8 (defined by iconv)
|
||||
|
||||
1.0.0beta, released 2006-08-16
|
||||
- First public release, most functionality implemented. Notable omissions are:
|
||||
|
4
TODO
4
TODO
@ -5,10 +5,6 @@ Ongoing
|
||||
- Lots of profiling, make it faster!
|
||||
- Plugins for major CMSes (very tricky issue)
|
||||
|
||||
1.0 release
|
||||
- Lossy alternate character encoding support (characters not in the encoding
|
||||
will get silently dropped).
|
||||
|
||||
1.1 release
|
||||
- Directive documentation generation
|
||||
- Rewrite table's child definition to be faster, smart, and regexp free
|
||||
|
@ -27,6 +27,7 @@ require_once 'HTMLPurifier/Lexer.php';
|
||||
require_once 'HTMLPurifier/HTMLDefinition.php';
|
||||
require_once 'HTMLPurifier/Generator.php';
|
||||
require_once 'HTMLPurifier/Strategy/Core.php';
|
||||
require_once 'HTMLPurifier/Encoder.php';
|
||||
|
||||
/**
|
||||
* Main library execution class.
|
||||
@ -58,6 +59,7 @@ class HTMLPurifier
|
||||
$this->lexer = HTMLPurifier_Lexer::create();
|
||||
$this->strategy = new HTMLPurifier_Strategy_Core();
|
||||
$this->generator = new HTMLPurifier_Generator();
|
||||
$this->encoder = new HTMLPurifier_Encoder();
|
||||
|
||||
}
|
||||
|
||||
@ -72,14 +74,17 @@ class HTMLPurifier
|
||||
*/
|
||||
function purify($html, $config = null) {
|
||||
$config = $config ? $config : $this->config;
|
||||
return
|
||||
$html = $this->encoder->convertToUTF8($html, $config);
|
||||
$html =
|
||||
$this->generator->generateFromTokens(
|
||||
$this->strategy->execute(
|
||||
$this->lexer->tokenizeHTML($html, $config),
|
||||
$config
|
||||
),
|
||||
$config
|
||||
),
|
||||
$config
|
||||
);
|
||||
);
|
||||
$html = $this->encoder->convertFromUTF8($html, $config);
|
||||
return $html;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -2,6 +2,29 @@
|
||||
|
||||
require_once 'HTMLPurifier/EntityLookup.php';
|
||||
|
||||
HTMLPurifier_ConfigDef::define(
|
||||
'Core', 'Encoding', 'utf-8', 'istring',
|
||||
'If for some reason you are unable to convert all webpages to UTF-8, '.
|
||||
'you can use this directive as a stop-gap compatibility change to '.
|
||||
'let HTMLPurifier deal with non UTF-8 input. This technique has '.
|
||||
'notable deficiencies: absolutely no characters outside of the selected '.
|
||||
'character encoding will be preserved, not even the ones that have '.
|
||||
'been ampersand escaped (this is due to a UTF-8 specific <em>feature</em> '.
|
||||
'that automatically resolves all entities), making it pretty useless '.
|
||||
'for anything except the most I18N-blind applications. This directive '.
|
||||
'only accepts ISO-8859-1 if iconv is not enabled.'
|
||||
);
|
||||
|
||||
if ( !function_exists('iconv') ) {
|
||||
// only encodings with native PHP support
|
||||
HTMLPurifier_ConfigDef::defineAllowedValues(
|
||||
'Core', 'Encoding', array(
|
||||
'utf-8',
|
||||
'iso-8859-1'
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* A UTF-8 specific character encoder that handles cleaning and transforming.
|
||||
*/
|
||||
@ -36,8 +59,6 @@ class HTMLPurifier_Encoder
|
||||
function cleanUTF8($str, $force_php = false) {
|
||||
|
||||
static $non_sgml_chars = array();
|
||||
static $iconv = null;
|
||||
|
||||
if (empty($non_sgml_chars)) {
|
||||
for ($i = 0; $i <= 31; $i++) {
|
||||
// non-SGML ASCII chars
|
||||
@ -50,9 +71,8 @@ class HTMLPurifier_Encoder
|
||||
}
|
||||
}
|
||||
|
||||
if ($iconv === null) {
|
||||
$iconv = function_exists('iconv');
|
||||
}
|
||||
static $iconv = null;
|
||||
if ($iconv === null) $iconv = function_exists('iconv');
|
||||
|
||||
if ($iconv && !$force_php) {
|
||||
// do the shortcut way
|
||||
@ -232,6 +252,38 @@ class HTMLPurifier_Encoder
|
||||
return $ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a string to UTF-8 based on configuration.
|
||||
*/
|
||||
function convertToUTF8($str, $config) {
|
||||
static $iconv = null;
|
||||
if ($iconv === null) $iconv = function_exists('iconv');
|
||||
$encoding = $config->get('Core', 'Encoding');
|
||||
if ($encoding === 'utf-8') return $str;
|
||||
if ($iconv) {
|
||||
return iconv($encoding, 'utf-8//IGNORE', $str);
|
||||
} elseif ($encoding === 'iso-8895-1') {
|
||||
return utf8_encode($str);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a string from UTF-8 based on configuration.
|
||||
* @note Currently, this is a lossy conversion, with unexpressable
|
||||
* characters being omitted.
|
||||
*/
|
||||
function convertFromUTF8($str, $config) {
|
||||
static $iconv = null;
|
||||
if ($iconv === null) $iconv = function_exists('iconv');
|
||||
$encoding = $config->get('Core', 'Encoding');
|
||||
if ($encoding === 'utf-8') return $str;
|
||||
if ($iconv) {
|
||||
return iconv('utf-8', $encoding . '//IGNORE', $str);
|
||||
} elseif ($encoding === 'iso-8895-1') {
|
||||
return utf8_encode($str);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
@ -29,6 +29,43 @@ class HTMLPurifier_EncoderTest extends UnitTestCase
|
||||
$this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
|
||||
}
|
||||
|
||||
function test_convertToUTF8() {
|
||||
$config = HTMLPurifier_Config::createDefault();
|
||||
|
||||
// UTF-8 means that we don't touch it
|
||||
$this->assertIdentical(
|
||||
$this->Encoder->convertToUTF8("\xF6", $config),
|
||||
"\xF6" // this is invalid
|
||||
);
|
||||
$this->assertNoErrors();
|
||||
|
||||
$config->set('Core', 'Encoding', 'ISO-8859-1');
|
||||
|
||||
// Now it gets converted
|
||||
$this->assertIdentical(
|
||||
$this->Encoder->convertToUTF8("\xF6", $config),
|
||||
"\xC3\xB6"
|
||||
);
|
||||
}
|
||||
|
||||
function test_convertFromUTF8() {
|
||||
$config = HTMLPurifier_Config::createDefault();
|
||||
|
||||
// UTF-8 means that we don't touch it
|
||||
$this->assertIdentical(
|
||||
$this->Encoder->convertFromUTF8("\xC3\xB6", $config),
|
||||
"\xC3\xB6"
|
||||
);
|
||||
|
||||
$config->set('Core', 'Encoding', 'ISO-8859-1');
|
||||
|
||||
// Now it gets converted
|
||||
$this->assertIdentical(
|
||||
$this->Encoder->convertFromUTF8("\xC3\xB6", $config),
|
||||
"\xF6"
|
||||
);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
?>
|
Loading…
Reference in New Issue
Block a user