mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2024-12-22 08:21:52 +00:00
Add rudimentary extra encoding support. We are now release-ready!
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@352 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
parent
b621602ac1
commit
f4f636a09c
24
INSTALL
24
INSTALL
@ -14,7 +14,7 @@ things you should be mindful of.
|
|||||||
The library/ directory must be added to your path: HTMLPurifier will not be
|
The library/ directory must be added to your path: HTMLPurifier will not be
|
||||||
able to find the necessary includes otherwise. This is as simple as:
|
able to find the necessary includes otherwise. This is as simple as:
|
||||||
|
|
||||||
set_include_path('/path/to/htmlpurifier/library' . PATH_SEPARATOR . get_include_path());
|
set_include_path('/path/to/htmlpurifier/library' . PATH_SEPARATOR . get_include_path());
|
||||||
|
|
||||||
...replacing /path/to/htmlpurifier with the actual location of the folder. Don't
|
...replacing /path/to/htmlpurifier with the actual location of the folder. Don't
|
||||||
worry, HTMLPurifier is namespaced so unless you have another file named
|
worry, HTMLPurifier is namespaced so unless you have another file named
|
||||||
@ -22,7 +22,7 @@ HTMLPurifier.php, the files won't collide with any of your includes.
|
|||||||
|
|
||||||
Then, it's a simple matter of including the base file:
|
Then, it's a simple matter of including the base file:
|
||||||
|
|
||||||
require_once 'HTMLPurifier.php';
|
require_once 'HTMLPurifier.php';
|
||||||
|
|
||||||
...and you're good to go.
|
...and you're good to go.
|
||||||
|
|
||||||
@ -44,18 +44,26 @@ in docs/security.txt, in the meantime, try to change your output so this is
|
|||||||
the case.
|
the case.
|
||||||
|
|
||||||
If, for some reason, you are unable to switch to UTF-8 immediately, you can
|
If, for some reason, you are unable to switch to UTF-8 immediately, you can
|
||||||
use iconv to convert the output of HTMLPurifier to your desired encoding.
|
switch HTMLPurifier's encoding. Note that the availability of encodings is
|
||||||
We may integrate support for other encodings in later releases, but for now,
|
dependent on iconv, and you'll be missing characters if the charset you
|
||||||
UTF-8 is all you should need. (If you're not using UTF-8, switch now!)
|
choose doesn't have them.
|
||||||
|
|
||||||
|
$config = HTMLPurifier_Config::createDefault();
|
||||||
|
$config->set('Core', 'Encoding', $encoding);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
3. Using the code
|
3. Using the code
|
||||||
|
|
||||||
The interface is mind-numbingly simple.
|
The interface is mind-numbingly simple:
|
||||||
|
|
||||||
$purifier = new HTMLPurifier();
|
$purifier = new HTMLPurifier();
|
||||||
$clean_html = $purifier->purify($dirty_html);
|
$clean_html = $purifier->purify($dirty_html);
|
||||||
|
|
||||||
|
Or, if you're using the configuration object:
|
||||||
|
|
||||||
|
$purifier = new HTMLPurifier($config);
|
||||||
|
$clean_html = $purifier->purify($dirty_html);
|
||||||
|
|
||||||
That's it. For more examples, check out docs/examples/. Also, SLOW gives
|
That's it. For more examples, check out docs/examples/. Also, SLOW gives
|
||||||
advice on what to do if HTMLPurifier is slowing down your application.
|
advice on what to do if HTMLPurifier is slowing down your application.
|
||||||
|
3
NEWS
3
NEWS
@ -1,7 +1,7 @@
|
|||||||
NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
||||||
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||||
|
|
||||||
1.0.0rc1, released 2006-??-??
|
1.0.0rc1, released 2006-08-31
|
||||||
- Fixed broken numeric entity conversion
|
- Fixed broken numeric entity conversion
|
||||||
- Malformed UTF-8 and non-SGML character detection and cleaning implemented
|
- Malformed UTF-8 and non-SGML character detection and cleaning implemented
|
||||||
- API documentation completed
|
- API documentation completed
|
||||||
@ -9,6 +9,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
|||||||
- Basic color keywords translated into hexadecimal values
|
- Basic color keywords translated into hexadecimal values
|
||||||
- Table CSS properties implemented
|
- Table CSS properties implemented
|
||||||
- (HTML|CSS)Definition de-singleton-ized
|
- (HTML|CSS)Definition de-singleton-ized
|
||||||
|
- Support for charsets other than UTF-8 (defined by iconv)
|
||||||
|
|
||||||
1.0.0beta, released 2006-08-16
|
1.0.0beta, released 2006-08-16
|
||||||
- First public release, most functionality implemented. Notable omissions are:
|
- First public release, most functionality implemented. Notable omissions are:
|
||||||
|
4
TODO
4
TODO
@ -5,10 +5,6 @@ Ongoing
|
|||||||
- Lots of profiling, make it faster!
|
- Lots of profiling, make it faster!
|
||||||
- Plugins for major CMSes (very tricky issue)
|
- Plugins for major CMSes (very tricky issue)
|
||||||
|
|
||||||
1.0 release
|
|
||||||
- Lossy alternate character encoding support (characters not in the encoding
|
|
||||||
will get silently dropped).
|
|
||||||
|
|
||||||
1.1 release
|
1.1 release
|
||||||
- Directive documentation generation
|
- Directive documentation generation
|
||||||
- Rewrite table's child definition to be faster, smart, and regexp free
|
- Rewrite table's child definition to be faster, smart, and regexp free
|
||||||
|
@ -27,6 +27,7 @@ require_once 'HTMLPurifier/Lexer.php';
|
|||||||
require_once 'HTMLPurifier/HTMLDefinition.php';
|
require_once 'HTMLPurifier/HTMLDefinition.php';
|
||||||
require_once 'HTMLPurifier/Generator.php';
|
require_once 'HTMLPurifier/Generator.php';
|
||||||
require_once 'HTMLPurifier/Strategy/Core.php';
|
require_once 'HTMLPurifier/Strategy/Core.php';
|
||||||
|
require_once 'HTMLPurifier/Encoder.php';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Main library execution class.
|
* Main library execution class.
|
||||||
@ -58,6 +59,7 @@ class HTMLPurifier
|
|||||||
$this->lexer = HTMLPurifier_Lexer::create();
|
$this->lexer = HTMLPurifier_Lexer::create();
|
||||||
$this->strategy = new HTMLPurifier_Strategy_Core();
|
$this->strategy = new HTMLPurifier_Strategy_Core();
|
||||||
$this->generator = new HTMLPurifier_Generator();
|
$this->generator = new HTMLPurifier_Generator();
|
||||||
|
$this->encoder = new HTMLPurifier_Encoder();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -72,14 +74,17 @@ class HTMLPurifier
|
|||||||
*/
|
*/
|
||||||
function purify($html, $config = null) {
|
function purify($html, $config = null) {
|
||||||
$config = $config ? $config : $this->config;
|
$config = $config ? $config : $this->config;
|
||||||
return
|
$html = $this->encoder->convertToUTF8($html, $config);
|
||||||
|
$html =
|
||||||
$this->generator->generateFromTokens(
|
$this->generator->generateFromTokens(
|
||||||
$this->strategy->execute(
|
$this->strategy->execute(
|
||||||
$this->lexer->tokenizeHTML($html, $config),
|
$this->lexer->tokenizeHTML($html, $config),
|
||||||
|
$config
|
||||||
|
),
|
||||||
$config
|
$config
|
||||||
),
|
);
|
||||||
$config
|
$html = $this->encoder->convertFromUTF8($html, $config);
|
||||||
);
|
return $html;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -2,6 +2,29 @@
|
|||||||
|
|
||||||
require_once 'HTMLPurifier/EntityLookup.php';
|
require_once 'HTMLPurifier/EntityLookup.php';
|
||||||
|
|
||||||
|
// Register the Core.Encoding directive: the character encoding of the
// documents handed to (and returned from) the purifier. Defaults to UTF-8.
HTMLPurifier_ConfigDef::define(
    'Core',
    'Encoding',
    'utf-8',
    'istring',
    'If for some reason you are unable to convert all webpages to UTF-8, '.
    'you can use this directive as a stop-gap compatibility change to '.
    'let HTMLPurifier deal with non UTF-8 input. This technique has '.
    'notable deficiencies: absolutely no characters outside of the selected '.
    'character encoding will be preserved, not even the ones that have '.
    'been ampersand escaped (this is due to a UTF-8 specific <em>feature</em> '.
    'that automatically resolves all entities), making it pretty useless '.
    'for anything except the most I18N-blind applications. This directive '.
    'only accepts ISO-8859-1 if iconv is not enabled.'
);

// When the iconv extension is missing, restrict the directive to the
// encodings that can be handled with native PHP functions alone.
if (function_exists('iconv') === false) {
    HTMLPurifier_ConfigDef::defineAllowedValues(
        'Core',
        'Encoding',
        array('utf-8', 'iso-8859-1')
    );
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A UTF-8 specific character encoder that handles cleaning and transforming.
|
* A UTF-8 specific character encoder that handles cleaning and transforming.
|
||||||
*/
|
*/
|
||||||
@ -36,8 +59,6 @@ class HTMLPurifier_Encoder
|
|||||||
function cleanUTF8($str, $force_php = false) {
|
function cleanUTF8($str, $force_php = false) {
|
||||||
|
|
||||||
static $non_sgml_chars = array();
|
static $non_sgml_chars = array();
|
||||||
static $iconv = null;
|
|
||||||
|
|
||||||
if (empty($non_sgml_chars)) {
|
if (empty($non_sgml_chars)) {
|
||||||
for ($i = 0; $i <= 31; $i++) {
|
for ($i = 0; $i <= 31; $i++) {
|
||||||
// non-SGML ASCII chars
|
// non-SGML ASCII chars
|
||||||
@ -50,9 +71,8 @@ class HTMLPurifier_Encoder
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($iconv === null) {
|
static $iconv = null;
|
||||||
$iconv = function_exists('iconv');
|
if ($iconv === null) $iconv = function_exists('iconv');
|
||||||
}
|
|
||||||
|
|
||||||
if ($iconv && !$force_php) {
|
if ($iconv && !$force_php) {
|
||||||
// do the shortcut way
|
// do the shortcut way
|
||||||
@ -232,6 +252,38 @@ class HTMLPurifier_Encoder
|
|||||||
return $ret;
|
return $ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Converts a string to UTF-8 based on configuration.
 *
 * Reads the Core.Encoding directive from $config. When it is 'utf-8' the
 * string is returned untouched (no validation is performed here). Otherwise
 * iconv is used when available; without iconv only ISO-8859-1 can be
 * converted, via utf8_encode().
 *
 * @param $str    String in the configured source encoding.
 * @param $config Configuration object exposing get('Core', 'Encoding').
 * @return The UTF-8 string. Nothing (null) is returned if iconv is missing
 *         and the encoding is neither 'utf-8' nor 'iso-8859-1'.
 * @note The comparisons assume the directive value arrives lower-cased
 *       (it is declared as an 'istring') -- TODO confirm.
 */
function convertToUTF8($str, $config) {
    // function_exists() is checked once per request and then cached
    static $iconv = null;
    if ($iconv === null) $iconv = function_exists('iconv');

    $encoding = $config->get('Core', 'Encoding');
    if ($encoding === 'utf-8') return $str;

    if ($iconv) {
        // //IGNORE drops byte sequences that cannot be converted
        return iconv($encoding, 'utf-8//IGNORE', $str);
    } elseif ($encoding === 'iso-8859-1') {
        // native fallback; was misspelled 'iso-8895-1' and never matched
        return utf8_encode($str);
    }
}
|
||||||
|
|
||||||
|
/**
 * Converts a string from UTF-8 based on configuration.
 *
 * Reads the Core.Encoding directive from $config. When it is 'utf-8' the
 * string is returned untouched. Otherwise iconv is used when available;
 * without iconv only ISO-8859-1 can be produced, via utf8_decode().
 *
 * @param $str    UTF-8 string to convert.
 * @param $config Configuration object exposing get('Core', 'Encoding').
 * @return The string in the configured encoding. Nothing (null) is returned
 *         if iconv is missing and the encoding is neither 'utf-8' nor
 *         'iso-8859-1'.
 * @note Currently, this is a lossy conversion: with iconv, inexpressible
 *       characters are omitted (//IGNORE); with the utf8_decode() fallback
 *       they are replaced by '?'.
 * @note The comparisons assume the directive value arrives lower-cased
 *       (it is declared as an 'istring') -- TODO confirm.
 */
function convertFromUTF8($str, $config) {
    // function_exists() is checked once per request and then cached
    static $iconv = null;
    if ($iconv === null) $iconv = function_exists('iconv');

    $encoding = $config->get('Core', 'Encoding');
    if ($encoding === 'utf-8') return $str;

    if ($iconv) {
        return iconv('utf-8', $encoding . '//IGNORE', $str);
    } elseif ($encoding === 'iso-8859-1') {
        // Native fallback. Previously compared against the misspelled
        // 'iso-8895-1' and called utf8_encode(), which converts in the
        // wrong direction (ISO-8859-1 -> UTF-8).
        return utf8_decode($str);
    }
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -29,6 +29,43 @@ class HTMLPurifier_EncoderTest extends UnitTestCase
|
|||||||
$this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
|
$this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * convertToUTF8() must pass input through under the default (UTF-8)
 * encoding and transcode it once Core.Encoding is ISO-8859-1.
 */
function test_convertToUTF8() {
    $config = HTMLPurifier_Config::createDefault();
    $input  = "\xF6"; // o-umlaut in ISO-8859-1; invalid as UTF-8

    // default encoding is UTF-8: the string is handed back untouched,
    // even though it is not valid UTF-8
    $untouched = $this->Encoder->convertToUTF8($input, $config);
    $this->assertIdentical($untouched, "\xF6");
    $this->assertNoErrors();

    // after switching the encoding, the byte is converted to UTF-8
    $config->set('Core', 'Encoding', 'ISO-8859-1');
    $converted = $this->Encoder->convertToUTF8($input, $config);
    $this->assertIdentical($converted, "\xC3\xB6");
}
|
||||||
|
|
||||||
|
/**
 * convertFromUTF8() must pass input through under the default (UTF-8)
 * encoding and transcode it once Core.Encoding is ISO-8859-1.
 */
function test_convertFromUTF8() {
    $config = HTMLPurifier_Config::createDefault();
    $input  = "\xC3\xB6"; // o-umlaut encoded as UTF-8

    // default encoding is UTF-8: the string is handed back untouched
    $untouched = $this->Encoder->convertFromUTF8($input, $config);
    $this->assertIdentical($untouched, "\xC3\xB6");

    // after switching the encoding, it is converted to the single
    // ISO-8859-1 byte
    $config->set('Core', 'Encoding', 'ISO-8859-1');
    $converted = $this->Encoder->convertFromUTF8($input, $config);
    $this->assertIdentical($converted, "\xF6");
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
?>
|
Loading…
Reference in New Issue
Block a user