diff --git a/INSTALL b/INSTALL index ad74f405..45ad662e 100644 --- a/INSTALL +++ b/INSTALL @@ -26,7 +26,7 @@ any earlier versions. I have been unable to get PHP 5.0.5 working on my computer, so if someone wants to test that, be my guest. All tests were done on Windows XP Home, -but operating system is quite irrelevant in this particular case. +but operating system should not be a major factor in the library. @@ -70,21 +70,36 @@ I cannot stress the importance of these two bullets enough. Omitting either of them could have dire consequences not only for security but for plain old usability. You can find a more in-depth discussion of why this is needed in docs/security.txt, in the meantime, try to change your output so this is -the case. +the case. If you can't, well, we might be able to accommodate you (read +section 3). + + + +3. Configuring HTML Purifier + +HTML Purifier is designed to run out-of-the-box, but occasionally HTML +Purifier needs to be told what to do. If, for some reason, you are unable to switch to UTF-8 immediately, you can switch HTML Purifier's encoding. Note that the availability of encodings is dependent on iconv, and you'll be missing characters if the charset you choose doesn't have them. - $config = HTMLPurifier_Config::createDefault(); $config->set('Core', 'Encoding', /* put your encoding here */); An example usage for Latin-1 websites: - $config = HTMLPurifier_Config::createDefault(); $config->set('Core', 'Encoding', 'ISO-8859-1'); +For those of you stuck using HTML 4.01 Transitional, you can disable +XHTML output like this: + + $config->set('Core', 'XHTML', false); + +However, I strongly recommend that you use XHTML. Currently, we can only +guarantee transitional-compliant output; future versions will also allow strict +output. + 3. Using the code @@ -106,7 +121,7 @@ advice on what to do if HTML Purifier is slowing down your application. 4. 
Quick install -If your website is in UTF-8, use this code: +If your website is in UTF-8 and XHTML Transitional, use this code: purify($dirty_html); -If your website is in a different encoding, use this code: +If your website is in a different encoding or doctype, use this code: set('Core', 'Encoding', 'ISO-8859-1'); //replace with your encoding + $config->set('Core', 'XHTML', true); //replace with false if HTML 4.01 $purifier = new HTMLPurifier($config); $clean_html = $purifier->purify($dirty_html); diff --git a/NEWS b/NEWS index 493afb8a..a0def4b0 100644 --- a/NEWS +++ b/NEWS @@ -1,15 +1,13 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| -1.1.0, unknown release date +1.1.0, projected 2006-09-16 - Made URI validator more forgiving: will ignore leading and trailing quotes, apostrophes and less than or greater than signs. - Enforce alphanumeric namespace and directive names for configuration. - Directive documentation generation using XSLT - Table child definition made more flexible, will fix up poorly ordered elements - -1.0.2, unknown release date -(bugfix release may be dropped if no bugs found) +- XHTML generation can now be turned off, allowing things like
1.0.1, released 2006-09-04 - Fixed slight bug in DOMLex attribute parsing diff --git a/TODO b/TODO index f98a0cb0..b8b2caa1 100644 --- a/TODO +++ b/TODO @@ -5,9 +5,6 @@ Ongoing - Lots of profiling, make it faster! - Plugins for major CMSes (very tricky issue) -1.1 release - - Allow HTML 4.01 output (cosmetic changes to the generator) - 1.2 release - Additional support for poorly written HTML - Implement all non-essential attribute transforms diff --git a/WYSIWYG b/WYSIWYG index 9ce8575c..6fab8bcc 100644 --- a/WYSIWYG +++ b/WYSIWYG @@ -1,6 +1,6 @@ WYSIWYG - What You See Is What You Get - HTMLPurifier: A Pretty Good Fit for TinyMCE and FCKeditor + HTML Purifier: A Pretty Good Fit for TinyMCE and FCKeditor Javascript-based WYSIWYG editors, simply stated, are quite amazing. But I've always been wary about using them due to security issues: they handle the @@ -13,6 +13,9 @@ other markup languages still reign supreme. Put simply: filtering HTML is hard work, and these WYSIWYG authors don't offer anything to alleviate that trouble. Therein lies the solution: -HTMLPurifier is perfect for filtering pure-HTML input from WYSIWYG editors. +HTML Purifier is perfect for filtering pure-HTML input from WYSIWYG editors. Enough said. + +There is a proof-of-concept integration of HTML Purifier with the Mantis +bugtracker at http://hp.jpsband.org/mantis/ diff --git a/library/HTMLPurifier/AttrDef.php b/library/HTMLPurifier/AttrDef.php index 56d5101e..31762ec4 100644 --- a/library/HTMLPurifier/AttrDef.php +++ b/library/HTMLPurifier/AttrDef.php @@ -15,6 +15,12 @@ require_once 'HTMLPurifier/AttrContext.php'; class HTMLPurifier_AttrDef { + /** + * Tells us whether or not an HTML attribute is minimized. Only the + * boolean attribute vapourware would use this. + */ + var $minimized = false; + /** * Abstract function defined for functions that validate and clean strings. 
* diff --git a/library/HTMLPurifier/Generator.php b/library/HTMLPurifier/Generator.php index bb366942..015c5422 100644 --- a/library/HTMLPurifier/Generator.php +++ b/library/HTMLPurifier/Generator.php @@ -15,6 +15,14 @@ HTMLPurifier_ConfigDef::define( 'generateFromTokens.' ); +HTMLPurifier_ConfigDef::define( + 'Core', 'XHTML', true, 'bool', + 'Determines whether or not output is XHTML or not. When disabled, HTML '. + 'Purifier goes into HTML 4.01 removes XHTML-specific markup constructs, '. + 'such as boolean attribute expansion and trailing slashes in empty tags. '. + 'This directive was available since 1.1.' +); + /** * Generates HTML from tokens. */ @@ -22,11 +30,16 @@ class HTMLPurifier_Generator { /** - * Bool cache of the CleanUTF8DuringGeneration directive. + * Bool cache of %Core.CleanUTF8DuringGeneration * @private */ var $_clean_utf8 = false; + /** + * Bool cache of %Core.XHTML + */ + var $_xhtml = true; + /** * Generates HTML from an array of tokens. * @param $tokens Array of HTMLPurifier_Token @@ -38,6 +51,7 @@ class HTMLPurifier_Generator $html = ''; if (!$config) $config = HTMLPurifier_Config::createDefault(); $this->_clean_utf8 = $config->get('Core', 'CleanUTF8DuringGeneration'); + $this->_xhtml = $config->get('Core', 'XHTML'); if (!$tokens) return ''; foreach ($tokens as $token) { $html .= $this->generateFromToken($token); @@ -61,7 +75,9 @@ class HTMLPurifier_Generator } elseif ($token->type == 'empty') { $attr = $this->generateAttributes($token->attributes); - return '<' . $token->name . ($attr ? ' ' : '') . $attr . ' />'; + return '<' . $token->name . ($attr ? ' ' : '') . $attr . + ( $this->_xhtml ? ' /': '' ) + . 
'>'; } elseif ($token->type == 'text') { return $this->escape($token->data); @@ -80,6 +96,11 @@ class HTMLPurifier_Generator function generateAttributes($assoc_array_of_attributes) { $html = ''; foreach ($assoc_array_of_attributes as $key => $value) { + if (!$this->_xhtml) { + // remove namespaced attributes + if (strpos($key, ':') !== false) continue; + // also needed: check for attribute minimization + } $html .= $key.'="'.$this->escape($value).'" '; } return rtrim($html); diff --git a/tests/HTMLPurifier/GeneratorTest.php b/tests/HTMLPurifier/GeneratorTest.php index d567c236..6b85a9ca 100644 --- a/tests/HTMLPurifier/GeneratorTest.php +++ b/tests/HTMLPurifier/GeneratorTest.php @@ -52,10 +52,8 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase $inputs[7] = new HTMLPurifier_Token_Text($theta_char); $expect[7] = $theta_char; - $default_config = HTMLPurifier_Config::createDefault(); foreach ($inputs as $i => $input) { - if (!isset($config[$i])) $config[$i] = $default_config; - $result = $this->gen->generateFromToken($input, $config[$i]); + $result = $this->gen->generateFromToken($input); $this->assertEqual($result, $expect[$i]); paintIf($result, $result != $expect[$i]); } @@ -122,6 +120,34 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase } + var $config; + function assertGeneration($tokens, $expect) { + $result = $this->gen->generateFromTokens($tokens, $this->config); + $this->assertEqual($expect, $result); + } + + function test_generateFromTokens_XHTMLoff() { + $this->config = HTMLPurifier_Config::createDefault(); + $this->config->set('Core', 'XHTML', false); + + // omit trailing slash + $this->assertGeneration( + array( new HTMLPurifier_Token_Empty('br') ), + '
' + ); + + // there should be a test for attribute minimization, but it is + // impossible for something like that to happen due to our current + // definitions! fix it later + + // namespaced attributes must be dropped + $this->assertGeneration( + array( new HTMLPurifier_Token_Start('p', array('xml:lang'=>'fr')) ), + '

' + ); + + } + } ?> \ No newline at end of file