From 24663d65ed9eeb30ac5c678ae55d2e55e6616267 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Sun, 24 Sep 2006 21:23:54 +0000 Subject: [PATCH] [1.1.1] To make up for DOMLex's tendency to drop tags, we've added a configuration option to let Tidy cleanup the HTML afterwards. Good for hand-editors. Also, Tidy is a smart solution for pretty-printed HTML, so we're marking the related TODO wontfix. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@454 48356398-32a2-884e-a903-53898d9a118a --- TODO | 2 +- docs/examples/demo.php | 6 ++++- library/HTMLPurifier/Generator.php | 37 ++++++++++++++++++++++++++++ tests/HTMLPurifier/GeneratorTest.php | 22 +++++++++++++++++ 4 files changed, 65 insertions(+), 2 deletions(-) diff --git a/TODO b/TODO index aad4c467..79c32c89 100644 --- a/TODO +++ b/TODO @@ -43,9 +43,9 @@ Unknown release (on a scratch-an-itch basis) - Fixes for Firefox's inability to handle COL alignment props (Bug 915) - Automatically add non-breaking spaces to empty table cells when empty-cells:show is applied to have compatibility with Internet Explorer - - Pretty-printing HTML (adds dependency of Generator to HTMLDefinition) - Non-lossy dumb alternate character encoding transformations, achieved by numerically encoding all non-ASCII characters Wontfix - Non-lossy smart alternate character encoding transformations + - Pretty-printing HTML, users can use Tidy on the output on entire page diff --git a/docs/examples/demo.php b/docs/examples/demo.php index 07630078..35a47986 100644 --- a/docs/examples/demo.php +++ b/docs/examples/demo.php @@ -21,7 +21,9 @@ if (!empty($_POST['html'])) { $html = get_magic_quotes_gpc() ? stripslashes($_POST['html']) : $_POST['html']; - $purifier = new HTMLPurifier(); + $config = HTMLPurifier_Config::createDefault(); + $config->set('Core', 'TidyFormat', !empty($_POST['tidy'])); + $purifier = new HTMLPurifier($config); $pure_html = $purifier->purify($html); ?> @@ -65,6 +67,8 @@ if (isset($html)) { HTMLPurifier_Encoder::cleanUTF8($html), ENT_COMPAT, 'UTF-8'); } ?> +
Nicely format output with Tidy? />
diff --git a/library/HTMLPurifier/Generator.php b/library/HTMLPurifier/Generator.php index 613ea965..7adfa81a 100644 --- a/library/HTMLPurifier/Generator.php +++ b/library/HTMLPurifier/Generator.php @@ -23,6 +23,19 @@ HTMLPurifier_ConfigSchema::define( 'This directive was available since 1.1.' ); +// extension constraints could be factored into ConfigSchema +HTMLPurifier_ConfigSchema::define( + 'Core', 'TidyFormat', false, 'bool', + 'Determines whether or not to run Tidy on the final output for pretty '. + 'formatting reasons, such as indentation and wrap. This can greatly '. + 'improve readability for editors who are hand-editing the HTML, but is '. + 'by no means necessary as HTML Purifier has already fixed all major '. + 'errors the HTML may have had and could potentially result in data loss '. + 'due to bugs in Tidy. Tidy is a non-default extension, and this directive '. + 'will silently fail if Tidy is not available. This '. + 'directive was available since 1.1.1.' +); + /** * Generates HTML from tokens. */ @@ -56,6 +69,30 @@ class HTMLPurifier_Generator foreach ($tokens as $token) { $html .= $this->generateFromToken($token); } + if ($config->get('Core', 'TidyFormat') && extension_loaded('tidy')) { + + $tidy_options = array( + 'indent'=> true, + 'output-xhtml' => $this->_xhtml, + 'show-body-only' => true, + 'indent-spaces' => 2, + 'wrap' => 68, + ); + if (version_compare(PHP_VERSION, '5', '<')) { + tidy_set_encoding('utf8'); + foreach ($tidy_options as $key => $value) { + tidy_setopt($key, $value); + } + tidy_parse_string($html); + tidy_clean_repair(); + $html = tidy_get_output(); + } else { + $tidy = new Tidy; + $tidy->parseString($html, $tidy_options, 'utf8'); + $tidy->cleanRepair(); + $html = (string) $tidy; + } + } return $html; } diff --git a/tests/HTMLPurifier/GeneratorTest.php b/tests/HTMLPurifier/GeneratorTest.php index 6b85a9ca..a6ca4043 100644 --- a/tests/HTMLPurifier/GeneratorTest.php +++ b/tests/HTMLPurifier/GeneratorTest.php @@ -123,6 +123,9 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase var $config; function assertGeneration($tokens, $expect) { $result = $this->gen->generateFromTokens($tokens, $this->config); + // normalized newlines, this probably should be put somewhere else + $result = str_replace("\r\n", "\n", $result); + $result = str_replace("\r", "\n", $result); $this->assertEqual($expect, $result); } @@ -148,6 +151,25 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase } + function test_generateFromTokens_TidyFormat() { + // abort test if tidy isn't loaded + if (!extension_loaded('tidy')) return; + + $this->config = HTMLPurifier_Config::createDefault(); + $this->config->set('Core', 'TidyFormat', true); + + // nice wrapping please + $this->assertGeneration( + array( + new HTMLPurifier_Token_Start('div'), + new HTMLPurifier_Token_Text('Text'), + new HTMLPurifier_Token_End('div') + ), + "
\n Text\n
\n" + ); + + } + } ?> \ No newline at end of file