diff --git a/INSTALL b/INSTALL
index ad74f405..45ad662e 100644
--- a/INSTALL
+++ b/INSTALL
@@ -26,7 +26,7 @@ any earlier versions.
I have been unable to get PHP 5.0.5 working on my computer, so if someone
wants to test that, be my guest. All tests were done on Windows XP Home,
-but operating system is quite irrelevant in this particular case.
+but the operating system should not be a major factor for the library.
@@ -70,21 +70,36 @@ I cannot stress the importance of these two bullets enough. Omitting either
of them could have dire consequences not only for security but for plain
old usability. You can find a more in-depth discussion of why this is needed
in docs/security.txt, in the meantime, try to change your output so this is
-the case.
+the case. If you can't, well, we might be able to accommodate you (read
+section 3).
+
+
+
+3. Configuring HTML Purifier
+
+HTML Purifier is designed to run out-of-the-box, but occasionally HTML
+Purifier needs to be told what to do.
If, for some reason, you are unable to switch to UTF-8 immediately, you can
switch HTML Purifier's encoding. Note that the availability of encodings is
dependent on iconv, and you'll be missing characters if the charset you
choose doesn't have them.
- $config = HTMLPurifier_Config::createDefault();
$config->set('Core', 'Encoding', /* put your encoding here */);
An example usage for Latin-1 websites:
- $config = HTMLPurifier_Config::createDefault();
$config->set('Core', 'Encoding', 'ISO-8859-1');
+For those of you stuck using HTML 4.01 Transitional, you can disable
+XHTML output like this:
+
+ $config->set('Core', 'XHTML', false);
+
+However, I strongly recommend that you use XHTML. Currently, we can only
+guarantee transitional-compliant output; future versions will also allow strict
+output.
+
3. Using the code
@@ -106,7 +121,7 @@ advice on what to do if HTML Purifier is slowing down your application.
4. Quick install
-If your website is in UTF-8, use this code:
+If your website is in UTF-8 and XHTML Transitional, use this code:
purify($dirty_html);
-If your website is in a different encoding, use this code:
+If your website is in a different encoding or doctype, use this code:
set('Core', 'Encoding', 'ISO-8859-1'); //replace with your encoding
+ $config->set('Core', 'XHTML', true); //replace with false if HTML 4.01
$purifier = new HTMLPurifier($config);
$clean_html = $purifier->purify($dirty_html);
diff --git a/NEWS b/NEWS
index 493afb8a..a0def4b0 100644
--- a/NEWS
+++ b/NEWS
@@ -1,15 +1,13 @@
NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
-1.1.0, unknown release date
+1.1.0, projected 2006-09-16
- Made URI validator more forgiving: will ignore leading and trailing
quotes, apostrophes and less than or greater than signs.
- Enforce alphanumeric namespace and directive names for configuration.
- Directive documentation generation using XSLT
- Table child definition made more flexible, will fix up poorly ordered elements
-
-1.0.2, unknown release date
-(bugfix release may be dropped if no bugs found)
+- XHTML generation can now be turned off, allowing HTML 4.01 output
1.0.1, released 2006-09-04
- Fixed slight bug in DOMLex attribute parsing
diff --git a/TODO b/TODO
index f98a0cb0..b8b2caa1 100644
--- a/TODO
+++ b/TODO
@@ -5,9 +5,6 @@ Ongoing
- Lots of profiling, make it faster!
- Plugins for major CMSes (very tricky issue)
-1.1 release
- - Allow HTML 4.01 output (cosmetic changes to the generator)
-
1.2 release
- Additional support for poorly written HTML
- Implement all non-essential attribute transforms
diff --git a/WYSIWYG b/WYSIWYG
index 9ce8575c..6fab8bcc 100644
--- a/WYSIWYG
+++ b/WYSIWYG
@@ -1,6 +1,6 @@
WYSIWYG - What You See Is What You Get
- HTMLPurifier: A Pretty Good Fit for TinyMCE and FCKeditor
+ HTML Purifier: A Pretty Good Fit for TinyMCE and FCKeditor
Javascript-based WYSIWYG editors, simply stated, are quite amazing. But I've
always been wary about using them due to security issues: they handle the
@@ -13,6 +13,9 @@ other markup languages still reign supreme. Put simply: filtering HTML is
hard work, and these WYSIWYG authors don't offer anything to alleviate that
trouble. Therein lies the solution:
-HTMLPurifier is perfect for filtering pure-HTML input from WYSIWYG editors.
+HTML Purifier is perfect for filtering pure-HTML input from WYSIWYG editors.
Enough said.
+
+There is a proof-of-concept integration of HTML Purifier with the Mantis
+bugtracker at http://hp.jpsband.org/mantis/
diff --git a/library/HTMLPurifier/AttrDef.php b/library/HTMLPurifier/AttrDef.php
index 56d5101e..31762ec4 100644
--- a/library/HTMLPurifier/AttrDef.php
+++ b/library/HTMLPurifier/AttrDef.php
@@ -15,6 +15,12 @@ require_once 'HTMLPurifier/AttrContext.php';
class HTMLPurifier_AttrDef
{
+ /**
+ * Tells us whether or not an HTML attribute is minimized. Only the
+ * boolean attribute vapourware would use this.
+ */
+ var $minimized = false;
+
/**
* Abstract function defined for functions that validate and clean strings.
*
diff --git a/library/HTMLPurifier/Generator.php b/library/HTMLPurifier/Generator.php
index bb366942..015c5422 100644
--- a/library/HTMLPurifier/Generator.php
+++ b/library/HTMLPurifier/Generator.php
@@ -15,6 +15,14 @@ HTMLPurifier_ConfigDef::define(
'generateFromTokens.'
);
+HTMLPurifier_ConfigDef::define( // registers %Core.XHTML (bool, default true)
+ 'Core', 'XHTML', true, 'bool',
+ 'Determines whether or not output is XHTML. When disabled, HTML Purifier '.
+ 'goes into HTML 4.01 mode and removes XHTML-specific markup constructs, '.
+ 'such as boolean attribute expansion and trailing slashes in empty tags. '.
+ 'This directive has been available since 1.1.'
+);
+
/**
* Generates HTML from tokens.
*/
@@ -22,11 +30,16 @@ class HTMLPurifier_Generator
{
/**
- * Bool cache of the CleanUTF8DuringGeneration directive.
+ * Bool cache of %Core.CleanUTF8DuringGeneration
* @private
*/
var $_clean_utf8 = false;
+ /**
+ * Bool cache of %Core.XHTML
+ */
+ var $_xhtml = true;
+
/**
* Generates HTML from an array of tokens.
* @param $tokens Array of HTMLPurifier_Token
@@ -38,6 +51,7 @@ class HTMLPurifier_Generator
$html = '';
if (!$config) $config = HTMLPurifier_Config::createDefault();
$this->_clean_utf8 = $config->get('Core', 'CleanUTF8DuringGeneration');
+ $this->_xhtml = $config->get('Core', 'XHTML');
if (!$tokens) return '';
foreach ($tokens as $token) {
$html .= $this->generateFromToken($token);
@@ -61,7 +75,9 @@ class HTMLPurifier_Generator
} elseif ($token->type == 'empty') {
$attr = $this->generateAttributes($token->attributes);
- return '<' . $token->name . ($attr ? ' ' : '') . $attr . ' />';
+ return '<' . $token->name . ($attr ? ' ' : '') . $attr .
+ ( $this->_xhtml ? ' /': '' )
+ . '>';
} elseif ($token->type == 'text') {
return $this->escape($token->data);
@@ -80,6 +96,11 @@ class HTMLPurifier_Generator
function generateAttributes($assoc_array_of_attributes) {
$html = '';
foreach ($assoc_array_of_attributes as $key => $value) {
+ if (!$this->_xhtml) {
+ // remove namespaced attributes
+ if (strpos($key, ':') !== false) continue;
+ // also needed: check for attribute minimization
+ }
$html .= $key.'="'.$this->escape($value).'" ';
}
return rtrim($html);
diff --git a/tests/HTMLPurifier/GeneratorTest.php b/tests/HTMLPurifier/GeneratorTest.php
index d567c236..6b85a9ca 100644
--- a/tests/HTMLPurifier/GeneratorTest.php
+++ b/tests/HTMLPurifier/GeneratorTest.php
@@ -52,10 +52,8 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase
$inputs[7] = new HTMLPurifier_Token_Text($theta_char);
$expect[7] = $theta_char;
- $default_config = HTMLPurifier_Config::createDefault();
foreach ($inputs as $i => $input) {
- if (!isset($config[$i])) $config[$i] = $default_config;
- $result = $this->gen->generateFromToken($input, $config[$i]);
+ $result = $this->gen->generateFromToken($input);
$this->assertEqual($result, $expect[$i]);
paintIf($result, $result != $expect[$i]);
}
@@ -122,6 +120,34 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase
}
+ var $config;
+ function assertGeneration($tokens, $expect) {
+ $result = $this->gen->generateFromTokens($tokens, $this->config);
+ $this->assertEqual($expect, $result);
+ }
+
+ function test_generateFromTokens_XHTMLoff() {
+ $this->config = HTMLPurifier_Config::createDefault();
+ $this->config->set('Core', 'XHTML', false);
+
+ // omit trailing slash
+ $this->assertGeneration(
+ array( new HTMLPurifier_Token_Empty('br') ),
+ '
'
+ );
+
+ // there should be a test for attribute minimization, but it is
+ // impossible for something like that to happen due to our current
+ // definitions! fix it later
+
+ // namespaced attributes must be dropped
+ $this->assertGeneration(
+ array( new HTMLPurifier_Token_Start('p', array('xml:lang'=>'fr')) ),
+ '
' + ); + + } + } ?> \ No newline at end of file