mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-01-23 13:51:54 +00:00
Finish up with a few more files that didn't get updated. Hrmm..
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/strict@1181 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
parent
5ecb11f19a
commit
42858ad594
57
INSTALL
57
INSTALL
@ -1,4 +1,3 @@
|
|||||||
|
|
||||||
Install
|
Install
|
||||||
How to install HTML Purifier
|
How to install HTML Purifier
|
||||||
|
|
||||||
@ -8,13 +7,13 @@ installation GUI, you've come to the wrong place!) The impatient can scroll
|
|||||||
down to the bottom of this INSTALL document to see the code, but you really
|
down to the bottom of this INSTALL document to see the code, but you really
|
||||||
should make sure a few things are properly done.
|
should make sure a few things are properly done.
|
||||||
|
|
||||||
Todo: Convert to using the array syntax for configuration.
|
|
||||||
|
|
||||||
|
|
||||||
1. Compatibility
|
1. Compatibility
|
||||||
|
|
||||||
HTML Purifier works in both PHP 4 and PHP 5, from PHP 4.3.9 and up. It has no
|
HTML Purifier works in both PHP 4 and PHP 5, from PHP 4.3.2 and up. It has no
|
||||||
core dependencies with other libraries. (Whoopee!)
|
core dependencies on other libraries.
|
||||||
|
|
||||||
Optional extensions are iconv (usually installed) and tidy (also common).
|
Optional extensions are iconv (usually installed) and tidy (also common).
|
||||||
If you use UTF-8 and don't plan on pretty-printing HTML, you can get away with
|
If you use UTF-8 and don't plan on pretty-printing HTML, you can get away with
|
||||||
@ -50,6 +49,7 @@ be standards compliant. HTML Purifier can deal with these doctypes:
|
|||||||
* XHTML 1.0 Strict
|
* XHTML 1.0 Strict
|
||||||
* HTML 4.01 Transitional
|
* HTML 4.01 Transitional
|
||||||
* HTML 4.01 Strict
|
* HTML 4.01 Strict
|
||||||
|
* XHTML 1.1 sans Ruby
|
||||||
|
|
||||||
...and these character encodings:
|
...and these character encodings:
|
||||||
|
|
||||||
@ -68,11 +68,11 @@ the doctype from this code in your HTML documents:
|
|||||||
<meta http-equiv="Content-type" content="text/html;charset=ENCODING">
|
<meta http-equiv="Content-type" content="text/html;charset=ENCODING">
|
||||||
|
|
||||||
For legacy codebases these declarations may be missing. If that is the case,
|
For legacy codebases these declarations may be missing. If that is the case,
|
||||||
STOP, and read up on character encodings and doctypes (in that order). Here
|
STOP, and read docs/enduser-utf8.html.
|
||||||
are some links:
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
* http://www.joelonsoftware.com/articles/Unicode.html
|
|
||||||
* http://alistapart.com/stories/doctype/
|
|
||||||
|
|
||||||
You may currently be vulnerable to XSS and other security threats, and HTML
|
You may currently be vulnerable to XSS and other security threats, and HTML
|
||||||
Purifier won't be able to fix that.
|
Purifier won't be able to fix that.
|
||||||
@ -116,27 +116,30 @@ websites):
|
|||||||
|
|
||||||
Note that HTML Purifier's support for non-Unicode encodings is crippled by the
|
Note that HTML Purifier's support for non-Unicode encodings is crippled by the
|
||||||
fact that any character not supported by that encoding will be silently
|
fact that any character not supported by that encoding will be silently
|
||||||
dropped, EVEN if it is ampersand escaped. This is a current limitation of
|
dropped, EVEN if it is ampersand escaped. If you want to work around
|
||||||
HTML Purifier that we are NOT actively working to fix. Patches are welcome,
|
this, you are welcome to read docs/enduser-utf8.html for a workaround,
|
||||||
but there are so many other gotchas and problems in I18N for non-Unicode
|
but please be cognizant of the issues the "solution" creates.
|
||||||
encodings that this functionality is low priority. See
|
|
||||||
<http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html> for a more
|
|
||||||
detailed lowdown on the topic.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
4.2. Setting a different doctype
|
4.2. Setting a different doctype
|
||||||
|
|
||||||
For those of you stuck using HTML 4.01 Transitional, you can disable
|
For those of you using HTML 4.01 Transitional, you can disable
|
||||||
XHTML output like this:
|
XHTML output like this:
|
||||||
|
|
||||||
$config->set('Core', 'XHTML', false);
|
$config->set('HTML', 'Doctype', 'HTML 4.01 Transitional');
|
||||||
|
|
||||||
I recommend that you use XHTML, although not as much as I recommend UTF-8. If
|
Other supported doctypes include:
|
||||||
your HTML 4.01 page validates, good for you!
|
|
||||||
|
|
||||||
Currently, we can only guarantee transitional-complaint output, future
|
|
||||||
versions will also allow strict-compliant output.
|
* HTML 4.01 Strict
|
||||||
|
* HTML 4.01 Transitional
|
||||||
|
* XHTML 1.0 Strict
|
||||||
|
* XHTML 1.0 Transitional
|
||||||
|
* XHTML 1.1
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -184,9 +187,17 @@ If your website is in a different encoding or doctype, use this code:
|
|||||||
require_once '/path/to/htmlpurifier/library/HTMLPurifier.auto.php';
|
require_once '/path/to/htmlpurifier/library/HTMLPurifier.auto.php';
|
||||||
|
|
||||||
$config = HTMLPurifier_Config::createDefault();
|
$config = HTMLPurifier_Config::createDefault();
|
||||||
$config->set('Core', 'Encoding', 'ISO-8859-1'); //replace with your encoding
|
$config->set('Core', 'Encoding', 'ISO-8859-1'); // replace with your encoding
|
||||||
$config->set('Core', 'XHTML', true); //replace with false if HTML 4.01
|
$config->set('HTML', 'Doctype', 'HTML 4.01 Transitional'); // replace with your doctype
|
||||||
$purifier = new HTMLPurifier($config);
|
$purifier = new HTMLPurifier($config);
|
||||||
|
|
||||||
$clean_html = $purifier->purify($dirty_html);
|
$clean_html = $purifier->purify($dirty_html);
|
||||||
?>
|
?>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
7. Caching
|
||||||
|
|
||||||
|
HTML Purifier generates some cache files to speed up its execution. For
|
||||||
|
maximum performance, make sure that library/HTMLPurifier/DefinitionCache/Serializer
|
||||||
|
is writeable by the webserver.
|
61
NEWS
61
NEWS
@ -9,7 +9,62 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
|||||||
. Internal change
|
. Internal change
|
||||||
==========================
|
==========================
|
||||||
|
|
||||||
1.7.0, unknown release date
|
2.0.0, released 2007-06-20
|
||||||
|
# Completely refactored HTMLModuleManager, decentralizing safety
|
||||||
|
information
|
||||||
|
# Transform modules changed to Tidy modules, which offer more flexibility
|
||||||
|
and better modularization
|
||||||
|
# Configuration object now finalizes itself when a read operation is
|
||||||
|
performed on it, ensuring that its internal state stays consistent.
|
||||||
|
To revert this behavior, you can set the $autoFinalize member variable
|
||||||
|
off, but it's not recommended.
|
||||||
|
# New compact syntax for AttrDef objects that can be used to instantiate
|
||||||
|
new objects via make()
|
||||||
|
# Definitions (esp. HTMLDefinition) are now cached for a significant
|
||||||
|
performance boost. You can disable caching by setting %Core.DefinitionCache
|
||||||
|
to null. You CANNOT edit raw definitions without setting the corresponding
|
||||||
|
DefinitionID directive (%HTML.DefinitionID for HTMLDefinition).
|
||||||
|
# Contents between <script> tags are now completely removed if <script>
|
||||||
|
is not allowed
|
||||||
|
# Prototype-declarations for Lexer removed in favor of configuration
|
||||||
|
determination of Lexer implementations.
|
||||||
|
! HTML Purifier now works in PHP 4.3.2.
|
||||||
|
! Configuration form-editing API makes tweaking HTMLPurifier_Config a
|
||||||
|
breeze!
|
||||||
|
! Configuration directives that accept hashes now allow new string
|
||||||
|
format: key1:value1,key2:value2
|
||||||
|
! ConfigDoc now factored into OOP design
|
||||||
|
! All deprecated elements now natively supported
|
||||||
|
! Implement TinyMCE styled whitelist specification format in
|
||||||
|
%HTML.Allowed
|
||||||
|
! Config object gives more friendly error messages when things go wrong
|
||||||
|
! Advanced API implemented: easy functions for creating elements (addElement)
|
||||||
|
and attributes (addAttribute) on HTMLDefinition
|
||||||
|
! Add native support for required attributes
|
||||||
|
- Deprecated and removed EnableRedundantUTF8Cleaning. It didn't even work!
|
||||||
|
- DOMLex will not emit errors when a custom error handler that does not
|
||||||
|
honor error_reporting is used
|
||||||
|
- StrictBlockquote child definition refrains from wrapping whitespace
|
||||||
|
in tags now.
|
||||||
|
- Bug resulting from tag transforms to non-allowed elements fixed
|
||||||
|
- ChildDef_Custom's regex generation has been improved, removing several
|
||||||
|
false positives
|
||||||
|
. Unit test for ElementDef created, ElementDef behavior modified to
|
||||||
|
be more flexible
|
||||||
|
. Added convenience functions for HTMLModule constructors
|
||||||
|
. AttrTypes now has accessor functions that should be used instead
|
||||||
|
of directly manipulating info
|
||||||
|
. TagTransform_Center deprecated in favor of generic TagTransform_Simple
|
||||||
|
. Add extra protection in AttrDef_URI against phantom Schemes
|
||||||
|
. Doctype object added to HTMLDefinition which describes certain aspects
|
||||||
|
of the operational document type
|
||||||
|
. Lexer is now pre-emptively included, with a conditional include for the
|
||||||
|
PHP5 only version.
|
||||||
|
. HTMLDefinition and CSSDefinition have a common parent class: Definition.
|
||||||
|
. DirectLex can now track line-numbers
|
||||||
|
. Preliminary error collector is in place, although no code actually reports
|
||||||
|
errors yet
|
||||||
|
. Factor out most of ValidateAttributes to new AttrValidator class
|
||||||
|
|
||||||
1.6.1, released 2007-05-05
|
1.6.1, released 2007-05-05
|
||||||
! Support for more deprecated attributes via transformations:
|
! Support for more deprecated attributes via transformations:
|
||||||
@ -61,7 +116,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
|||||||
- Error messages are emitted when you attempt to "allow" elements or
|
- Error messages are emitted when you attempt to "allow" elements or
|
||||||
attributes that HTML Purifier does not support
|
attributes that HTML Purifier does not support
|
||||||
|
|
||||||
1.5.1, unknown release date
|
|
||||||
- Fix segfault in unit test. The problem is not very reproducible and
|
- Fix segfault in unit test. The problem is not very reproducible and
|
||||||
I don't know what causes it, but a six line patch fixed it.
|
I don't know what causes it, but a six line patch fixed it.
|
||||||
|
|
||||||
@ -260,4 +315,4 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
|||||||
! First public release, most functionality implemented. Notable omissions are:
|
! First public release, most functionality implemented. Notable omissions are:
|
||||||
+ Shorthand CSS properties
|
+ Shorthand CSS properties
|
||||||
+ Table CSS properties
|
+ Table CSS properties
|
||||||
+ Deprecated attribute transformations
|
+ Deprecated attribute transformations
|
47
TODO
47
TODO
@ -1,4 +1,3 @@
|
|||||||
|
|
||||||
TODO List
|
TODO List
|
||||||
|
|
||||||
= KEY ====================
|
= KEY ====================
|
||||||
@ -7,33 +6,34 @@ TODO List
|
|||||||
? Maybe I'll Do It
|
? Maybe I'll Do It
|
||||||
==========================
|
==========================
|
||||||
|
|
||||||
1.7 release [Advanced API]
|
2.1 release [Refactor, refactor!]
|
||||||
# Complete advanced API, and fully document it
|
|
||||||
# Implement all edge-case attribute transforms
|
|
||||||
# Implement all deprecated tags and attributes
|
|
||||||
- Parse TinyMCE-style whitelist into our %HTML.Allow* whitelists (possibly
|
|
||||||
do this earlier)
|
|
||||||
? HTML interface for tweaking configuration to see changes
|
|
||||||
|
|
||||||
|
|
||||||
1.8 release [Refactor, refactor!]
|
|
||||||
# URI validation routines tighter (see docs/dev-code-quality.html) (COMPLEX)
|
# URI validation routines tighter (see docs/dev-code-quality.html) (COMPLEX)
|
||||||
# Advanced URI filtering schemes (see docs/proposal-new-directives.txt)
|
# Advanced URI filtering schemes (see docs/proposal-new-directives.txt)
|
||||||
- Configuration profiles: predefined directives set with one func call
|
- Configuration profiles: predefined directives set with one func call
|
||||||
- Implement IDREF support (harder than it seems, since you cannot have
|
- Implement IDREF support (harder than it seems, since you cannot have
|
||||||
IDREFs to non-existent IDs)
|
IDREFs to non-existent IDs)
|
||||||
- Allow non-ASCII characters in font names
|
- Allow non-ASCII characters in font names
|
||||||
|
- Genericize special cases in RemoveForeignElements
|
||||||
|
|
||||||
1.9 release [Error'ed]
|
2.2 release [Error'ed]
|
||||||
# Error logging for filtering/cleanup procedures
|
# Error logging for filtering/cleanup procedures
|
||||||
- Requires I18N facilities to be created first (COMPLEX)
|
- Requires I18N facilities to be created first (COMPLEX)
|
||||||
- XSS-attempt detection
|
- XSS-attempt detection
|
||||||
- More fine-grained control over escaping behavior
|
- More fine-grained control over escaping behavior
|
||||||
- Silently drop content inbetween SCRIPT tags (can be generalized to allow
|
|
||||||
specification of elements that, when detected as foreign, trigger removal
|
|
||||||
of children, although unbalanced tags could wreck havoc (or at least
|
|
||||||
delete the rest of the document)).
|
|
||||||
|
|
||||||
1.10 release [Do What I Mean, Not What I Say]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
2.3 release [Do What I Mean, Not What I Say]
|
||||||
# Additional support for poorly written HTML
|
# Additional support for poorly written HTML
|
||||||
- Microsoft Word HTML cleaning (i.e. MsoNormal, but research essential!)
|
- Microsoft Word HTML cleaning (i.e. MsoNormal, but research essential!)
|
||||||
- Friendly strict handling of <address> (block -> <br>)
|
- Friendly strict handling of <address> (block -> <br>)
|
||||||
@ -48,9 +48,14 @@ TODO List
|
|||||||
- Append something to duplicate IDs so they're still usable (impl. note: the
|
- Append something to duplicate IDs so they're still usable (impl. note: the
|
||||||
dupe detector would also need to detect the suffix as well)
|
dupe detector would also need to detect the suffix as well)
|
||||||
|
|
||||||
2.0 release [Beyond HTML]
|
2.4 release [It's All About Trust] (floating)
|
||||||
|
# Implement untrusted, dangerous elements/attributes
|
||||||
|
|
||||||
|
3.0 release [Beyond HTML]
|
||||||
# Legit token based CSS parsing (will require revamping almost every
|
# Legit token based CSS parsing (will require revamping almost every
|
||||||
AttrDef class)
|
AttrDef class)
|
||||||
|
# More control over allowed CSS properties (maybe modularize it in the
|
||||||
|
same fashion!)
|
||||||
# Formatters for plaintext (COMPLEX)
|
# Formatters for plaintext (COMPLEX)
|
||||||
- Auto-paragraphing (be sure to leverage fact that we know when things
|
- Auto-paragraphing (be sure to leverage fact that we know when things
|
||||||
shouldn't be paragraphed, such as lists and tables).
|
shouldn't be paragraphed, such as lists and tables).
|
||||||
@ -65,7 +70,7 @@ TODO List
|
|||||||
- Convert RTL/LTR override characters to <bdo> tags, or vice versa on demand.
|
- Convert RTL/LTR override characters to <bdo> tags, or vice versa on demand.
|
||||||
Also, enable disabling of directionality
|
Also, enable disabling of directionality
|
||||||
|
|
||||||
3.0 release [To XML and Beyond]
|
4.0 release [To XML and Beyond]
|
||||||
- Extended HTML capabilities based on namespacing and tag transforms (COMPLEX)
|
- Extended HTML capabilities based on namespacing and tag transforms (COMPLEX)
|
||||||
- Hooks for adding custom processors to custom namespaced tags and
|
- Hooks for adding custom processors to custom namespaced tags and
|
||||||
attributes, offer default implementation
|
attributes, offer default implementation
|
||||||
@ -78,12 +83,18 @@ Ongoing
|
|||||||
- WordPress (mostly written, needs beta-testing)
|
- WordPress (mostly written, needs beta-testing)
|
||||||
- eFiction
|
- eFiction
|
||||||
- more! (look for ones that use WYSIWYGs)
|
- more! (look for ones that use WYSIWYGs)
|
||||||
|
- Complete basic smoketests
|
||||||
|
|
||||||
Unknown release (on a scratch-an-itch basis)
|
Unknown release (on a scratch-an-itch basis)
|
||||||
? Semi-lossy dumb alternate character encoding transformations, achieved by
|
? Semi-lossy dumb alternate character encoding transformations, achieved by
|
||||||
encoding all characters that have string entity equivalents
|
encoding all characters that have string entity equivalents
|
||||||
? Have 'lang' attribute be checked against official lists
|
? Have 'lang' attribute be checked against official lists
|
||||||
- Explain how to use HTML Purifier in non-PHP languages
|
- Explain how to use HTML Purifier in non-PHP languages
|
||||||
|
- Abstract ChildDef_BlockQuote to work with all elements that only
|
||||||
|
allow blocks in them, required or optional
|
||||||
|
- Reorganize Unit Tests
|
||||||
|
- Refactor loop tests (esp. AttrDef_URI)
|
||||||
|
- Reorganize configuration directives (Create more namespaces! Get messy!)
|
||||||
|
|
||||||
Requested
|
Requested
|
||||||
? Native content compression, whitespace stripping (don't rely on Tidy, make
|
? Native content compression, whitespace stripping (don't rely on Tidy, make
|
||||||
@ -92,4 +103,4 @@ Requested
|
|||||||
Wontfix
|
Wontfix
|
||||||
- Non-lossy smart alternate character encoding transformations (unless
|
- Non-lossy smart alternate character encoding transformations (unless
|
||||||
patch provided)
|
patch provided)
|
||||||
- Pretty-printing HTML, users can use Tidy on the output on entire page
|
- Pretty-printing HTML, users can use Tidy on the output on entire page
|
95
release1-update.php
Normal file
95
release1-update.php
Normal file
@ -0,0 +1,95 @@
|
|||||||
|
<?php

/**
 * HTML Purifier release script.
 *
 * Bumps the version number recorded in:
 *   - VERSION
 *   - NEWS ("<version>, unknown release date" becomes "<version>, released <today>")
 *   - Doxyfile (PROJECT_NUMBER)
 *   - library/HTMLPurifier.php (header comment and var $version)
 *   - library/HTMLPurifier/Config.php (var $version)
 *
 * All paths are relative to the current working directory.
 *
 * Usage: php release1-update.php [version]
 *
 * Requires PHP 5.1 or later: preg_replace() only accepts its by-reference
 * $count argument from 5.1.0 onwards.
 *
 * Every replacement is computed and validated in memory first; files are only
 * written once all of them succeeded, so a failed run does not leave some
 * files bumped and others untouched. Failures exit with a non-zero status.
 */

if (php_sapi_name() !== 'cli') {
    // STDERR is not defined outside the CLI SAPI, so plain echo is used here.
    echo 'Release script cannot be called from web-browser.';
    exit(1);
}

/**
 * Prints an error message to STDERR and aborts with exit status 1.
 *
 * @param string $message Text to print (a newline is appended).
 */
function release_fail($message)
{
    fwrite(STDERR, $message . PHP_EOL);
    exit(1);
}

/**
 * Reads a whole file, aborting the script if it cannot be read.
 *
 * @param string $path File to read.
 * @return string File contents.
 */
function release_read($path)
{
    $contents = is_file($path) ? file_get_contents($path) : false;
    if ($contents === false) {
        release_fail("Could not read $path.");
    }
    return $contents;
}

/**
 * Replaces the first match of $pattern in $subject, aborting the script with
 * $error when nothing matched or the regex engine reported a failure.
 *
 * @param string $pattern     PCRE pattern.
 * @param string $replacement Replacement string.
 * @param string $subject     Text to operate on.
 * @param string $error       Message printed when no replacement happened.
 * @return string $subject with the first match replaced.
 */
function release_replace_once($pattern, $replacement, $subject, $error)
{
    $count = 0;
    $result = preg_replace($pattern, $replacement, $subject, 1, $count);
    if ($result === null || !$count) {
        release_fail($error);
    }
    return $result;
}

if (!isset($argv[1])) {
    echo
'php release1-update.php [version]
    HTML Purifier release script
';
    exit(1);
}

$version = trim($argv[1]);

// The version is spliced into regex replacement strings and into a
// single-quoted PHP literal, so refuse anything that could be misread there
// ("$", "\", quotes, whitespace) instead of trying to escape it.
if (!preg_match('/^[0-9A-Za-z._+-]+$/', $version)) {
    release_fail('Invalid version string: ' . $version);
}

// Compute every new file body first; nothing is written until all succeed.

// ...in NEWS
$date = date('Y-m-d');
$news_marker = "$version, unknown release date";
$news_count = 0;
$news_c = str_replace(
    $news_marker,
    "$version, released $date",
    release_read('NEWS'),
    $news_count
);
if (!$news_count) {
    release_fail('Could not update NEWS, missing ' . $news_marker);
} elseif ($news_count > 1) {
    release_fail('More than one release declaration in NEWS replaced');
}

// ...in Doxyfile
$doxyfile_c = release_replace_once(
    '/(?<=PROJECT_NUMBER {9}= )[^\s]+/m', // brittle: depends on exact column alignment
    $version,
    release_read('Doxyfile'),
    'Could not update Doxyfile, missing PROJECT_NUMBER.'
);

// ...in HTMLPurifier.php
$htmlpurifier_c = release_replace_once(
    '/HTML Purifier .+? - /',
    "HTML Purifier $version - ",
    release_read('library/HTMLPurifier.php'),
    'Could not update HTMLPurifier.php, missing HTML Purifier [version] header.'
);
$htmlpurifier_c = release_replace_once(
    '/var \$version = \'.+?\';/',
    "var \$version = '$version';",
    $htmlpurifier_c,
    'Could not update HTMLPurifier.php, missing var $version.'
);

// ...in Config.php
$config_c = release_replace_once(
    '/var \$version = \'.+?\';/',
    "var \$version = '$version';",
    release_read('library/HTMLPurifier/Config.php'),
    'Could not update Config.php, missing var $version.'
);

// Everything validated: write the files in the same order as before.
$pending = array(
    'VERSION'                         => $version,
    'NEWS'                            => $news_c,
    'Doxyfile'                        => $doxyfile_c,
    'library/HTMLPurifier.php'        => $htmlpurifier_c,
    'library/HTMLPurifier/Config.php' => $config_c,
);
foreach ($pending as $path => $contents) {
    if (file_put_contents($path, $contents) === false) {
        release_fail("Could not write $path.");
    }
}

echo "Review changes, write something in WHATSNEW, and then SVN commit with log 'Release $version.'" . PHP_EOL;

?>
|
Loading…
Reference in New Issue
Block a user