From bc5871f389867a7d9ab8139c46a24bd8b58fa44f Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <edwardzyang@thewritingpot.com>
Date: Sun, 24 Sep 2006 22:22:06 +0000
Subject: [PATCH] Merged 438:439, 440:441, and 442:457 from trunk/ to
 branches/1.1/, mostly major work done for 1.1.1 release. - Various
 documentation updates - Fixed fatal error in benchmark scripts, slightly
 augmented - As far as possible, whitespace is preserved in-between table
 children - Configuration option to optionally Tidy up output for indentation
 to make up for dropped whitespace by DOMLex (pretty-printing for the entire
 application should be done by a page-wide Tidy) - Sample test-settings.php
 file included

Unrelated unmerged edit: removed irrelevant 1.2.0 release notes, those only exist in the trunk.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/1.1@458 48356398-32a2-884e-a903-53898d9a118a
---
 INSTALL                               |  8 ++++--
 NEWS                                  | 12 ++++++---
 SLOW                                  | 13 ++++++---
 TODO                                  | 28 ++++++++++---------
 benchmarks/Lexer.php                  | 38 +++++++++++++++++---------
 benchmarks/ProfileDirectLex.php       |  2 +-
 configdoc/generate.php                | 12 ++++-----
 docs/examples/demo.php                |  6 ++++-
 docs/progress.html                    |  8 +++---
 library/HTMLPurifier/AttrDef/Host.php |  9 +++++--
 library/HTMLPurifier/ChildDef.php     | 24 ++++++++++++-----
 library/HTMLPurifier/Encoder.php      |  2 +-
 library/HTMLPurifier/Generator.php    | 39 +++++++++++++++++++++++++++
 library/HTMLPurifier/Lexer/DOMLex.php |  7 +++++
 test-settings.sample.php              | 17 ++++++++++++
 tests/HTMLPurifier/ChildDefTest.php   | 15 ++++++++---
 tests/HTMLPurifier/GeneratorTest.php  | 22 +++++++++++++++
 17 files changed, 202 insertions(+), 60 deletions(-)
 create mode 100644 test-settings.sample.php

diff --git a/INSTALL b/INSTALL
index 45ad662e..b3382056 100644
--- a/INSTALL
+++ b/INSTALL
@@ -46,7 +46,9 @@ Then, it's a simple matter of including the base file:
 
     require_once 'HTMLPurifier.php';
 
-...and you're good to go.
+...and you're good to go. The library/ folder contains all the files you need,
+so you can get rid of most of everything else when using the library in a
+production environment.
 
 
 
@@ -98,7 +100,8 @@ XHTML output like this:
 
 However, I strongly recommend that you use XHTML. Currently, we can only
 guarantee transitional-complaint output, future versions will also allow strict
-output.
+output. There are more configuration directives which can be read about
+here: http://hp.jpsband.org/live/configdoc/plain.html
 
 
 
@@ -130,6 +133,7 @@ If your website is in UTF-8 and XHTML Transitional, use this code:
     $purifier = new HTMLPurifier();
     
     $clean_html = $purifier->purify($dirty_html);
+?>
 
 If your website is in a different encoding or doctype, use this code:
 
diff --git a/NEWS b/NEWS
index e5c368b6..19154667 100644
--- a/NEWS
+++ b/NEWS
@@ -1,11 +1,15 @@
 NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
 
-1.2.0, unknown projected release date
-(major feature release)
-
 1.1.1, unknown projected release date
-(bugfix release)
+- Various documentation updates
+- Fixed parse error in configuration documentation script
+- Fixed fatal error in benchmark scripts, slightly augmented
+- As far as possible, whitespace is preserved in-between table children
+- Configuration option to optionally Tidy up output for indentation to make up
+  for dropped whitespace by DOMLex (pretty-printing for the entire application
+  should be done by a page-wide Tidy)
+- Sample test-settings.php file included
 
 1.1.0, released 2006-09-16
 - Made URI validator more forgiving: will ignore leading and trailing
diff --git a/SLOW b/SLOW
index 00f372fa..9813c0f6 100644
--- a/SLOW
+++ b/SLOW
@@ -17,18 +17,23 @@ second tacked on to the load time probably isn't going to be that huge of
 a problem.  Then, displaying the content is a simple a manner of outputting
 it directly from your database/filesystem.  The trouble with this method is
 that your user loses the original text, and when doing edits, will be
-handling the filtered text.  Of course, maybe that's a good thing.  If you
-don't mind a little extra complexity, you can try...
+handling the filtered text.  While this may be a good thing, especially if
+you're using a WYSIWYG editor, it can also result in data-loss if a user
+expects certain markup to be available but it isn't.
 
 2. Caching the filtered output - accept the submitted text and put it
 unaltered into the database, but then also generate a filtered version and
 stash that in the database.  Serve the filtered version to readers, and the
 unaltered version to editors.  If need be, you can invalidate the cache and
 have the cached filtered version be regenerated on the first page view.  Pros?
-Full data retention.  Cons?  It's more complicated.
+Full data retention.  Cons?  It's more complicated, and opens other editors
+up to XSS if they are using a WYSIWYG editor (to fix that, they'd have to
+be able to get their hands on the *really* original text served in plaintext
+mode).
 
 In short, inbound filtering is almost as simple as outbound filtering, but
 it has some drawbacks which cannot be fixed unless you save both the original
 and the filtered versions.
 
-There is a third option: profile and optimize HTMLPurifier yourself.  ;-)
+There is a third option: profile and optimize HTMLPurifier yourself.  Be sure
+to tell me if you decide to do that!  ;-)
diff --git a/TODO b/TODO
index 71ffb20c..79c32c89 100644
--- a/TODO
+++ b/TODO
@@ -6,25 +6,29 @@ Ongoing
  - Plugins for major CMSes (very tricky issue)
 
 1.2 release
- - Additional support for poorly written HTML
-    - Implement all non-essential attribute transforms
-    - Microsoft Word HTML cleaning (i.e. MsoNormal)
- - Error logging for filtering and cleanup procedures
-
-1.3 release
- - Formatters for plaintext
-    - Auto-paragraphing (be sure to leverage fact that we know when things
-      shouldn't be paragraphed, such as lists and tables).
  - Make URI validation routines tighter (especially mailto)
  - More extensive URI filtering schemes
  - Allow for background-image and list-style-image (see above)
  - Distinguish between different types of URIs, for instance, a mailto URI
    in IMG SRC is nonsensical
+ - Error logging for filtering/cleanup procedures
 
-2.0 release
+1.3 release
  - Add various "levels" of cleaning
     - Related: Allow strict (X)HTML
 
+1.4 release
+ - Additional support for poorly written HTML
+    - Implement all non-essential attribute transforms
+    - Microsoft Word HTML cleaning (i.e. MsoNormal)
+
+2.0 release
+ - Formatters for plaintext
+    - Auto-paragraphing (be sure to leverage fact that we know when things
+      shouldn't be paragraphed, such as lists and tables).
+    - Linkify URLs
+    - Smileys
+
 3.0 release
  - Extended HTML capabilities based on namespacing and tag transforms
     - Hooks for adding custom processors to custom namespaced tags and
@@ -39,11 +43,9 @@ Unknown release (on a scratch-an-itch basis)
  - Fixes for Firefox's inability to handle COL alignment props (Bug 915)
  - Automatically add non-breaking spaces to empty table cells when
    empty-cells:show is applied to have compatibility with Internet Explorer
- - Pretty-printing HTML (adds dependency of Generator to HTMLDefinition)
  - Non-lossy dumb alternate character encoding transformations, achieved by
    numerically encoding all non-ASCII characters
- - Preservation of indentation in tables (tricky since the contents can be
-   shuffled around)
 
 Wontfix
  - Non-lossy smart alternate character encoding transformations
+ - Pretty-printing HTML, users can run Tidy on the output of the entire page
diff --git a/benchmarks/Lexer.php b/benchmarks/Lexer.php
index da112fe4..9e13b54b 100644
--- a/benchmarks/Lexer.php
+++ b/benchmarks/Lexer.php
@@ -3,15 +3,24 @@
 // emulates inserting a dir called HTMLPurifier into your class dir
 set_include_path(get_include_path() . PATH_SEPARATOR . '../library/');
 
-require_once 'HTMLPurifier/ConfigDef.php';
-require_once 'HTMLPurifier/Config.php';
-require_once 'HTMLPurifier/Lexer/DirectLex.php';
-require_once 'HTMLPurifier/Lexer/PEARSax3.php';
+@include_once '../test-settings.php';
 
-$LEXERS = array(
-    'DirectLex' => new HTMLPurifier_Lexer_DirectLex(),
-    'PEARSax3'  => new HTMLPurifier_Lexer_PEARSax3()
-);
+require_once 'HTMLPurifier/ConfigSchema.php';
+require_once 'HTMLPurifier/Config.php';
+
+$LEXERS = array();
+$RUNS = isset($GLOBALS['HTMLPurifierTest']['Runs'])
+    ? $GLOBALS['HTMLPurifierTest']['Runs'] : 2; 
+
+require_once 'HTMLPurifier/Lexer/DirectLex.php';
+$LEXERS['DirectLex'] = new HTMLPurifier_Lexer_DirectLex();
+
+if (!empty($GLOBALS['HTMLPurifierTest']['PEAR'])) {
+    require_once 'HTMLPurifier/Lexer/PEARSax3.php';
+    $LEXERS['PEARSax3'] = new HTMLPurifier_Lexer_PEARSax3();
+} else {
+    exit('PEAR required to perform benchmark.');
+}
 
 if (version_compare(PHP_VERSION, '5', '>=')) {
     require_once 'HTMLPurifier/Lexer/DOMLex.php';
@@ -56,9 +65,12 @@ class RowTimer extends Benchmark_Timer
             if ($standard == false) $standard = $v['diff'];
             
             $perc = $v['diff'] * 100 / $standard;
+            $bad_run = ($v['diff'] < 0);
             
-            $out .= '<td align="right">' . number_format($perc, 2, '.', '') .
-                   '%</td>';
+            $out .= '<td align="right"'.
+                   ($bad_run ? ' style="color:#AAA;"' : '').
+                   '>' . number_format($perc, 2, '.', '') .
+                   '%</td><td>'.number_format($v['diff'],4,'.','').'</td>';
             
         }
         
@@ -79,13 +91,13 @@ function print_lexers() {
 }
 
 function do_benchmark($name, $document) {
-    global $LEXERS;
+    global $LEXERS, $RUNS;
     
     $timer = new RowTimer($name);
     $timer->start();
     
     foreach($LEXERS as $key => $lexer) {
-        $tokens = $lexer->tokenizeHTML($document);
+        for ($i=0; $i<$RUNS; $i++) $tokens = $lexer->tokenizeHTML($document);
         $timer->setMarker($key);
     }
     
@@ -103,7 +115,7 @@ function do_benchmark($name, $document) {
 <table border="1">
 <tr><th>Case</th><?php
 foreach ($LEXERS as $key => $value) {
-    echo '<th>' . htmlspecialchars($key) . '</th>';
+    echo '<th colspan="2">' . htmlspecialchars($key) . '</th>';
 }
 ?></tr>
 <?php
diff --git a/benchmarks/ProfileDirectLex.php b/benchmarks/ProfileDirectLex.php
index 175d2894..faf9bef5 100644
--- a/benchmarks/ProfileDirectLex.php
+++ b/benchmarks/ProfileDirectLex.php
@@ -2,7 +2,7 @@
 
 set_include_path(get_include_path() . PATH_SEPARATOR . '../library/');
 
-require_once 'HTMLPurifier/ConfigDef.php';
+require_once 'HTMLPurifier/ConfigSchema.php';
 require_once 'HTMLPurifier/Config.php';
 require_once 'HTMLPurifier/Lexer/DirectLex.php';
 
diff --git a/configdoc/generate.php b/configdoc/generate.php
index ebef6777..a1231ba2 100644
--- a/configdoc/generate.php
+++ b/configdoc/generate.php
@@ -50,7 +50,7 @@ function appendHTMLDiv($document, $node, $html) {
 // ---------------------------------------------------------------------------
 // Load copies of HTMLPurifier_ConfigDef and HTMLPurifier
 
-$definition = HTMLPurifier_ConfigDef::instance();
+$schema = HTMLPurifier_ConfigSchema::instance();
 $purifier = new HTMLPurifier();
 
 
@@ -61,7 +61,7 @@ $types_document = new DOMDocument('1.0', 'UTF-8');
 $types_root = $types_document->createElement('types');
 $types_document->appendChild($types_root);
 $types_document->formatOutput = true;
-foreach ($definition->types as $name => $expanded_name) {
+foreach ($schema->types as $name => $expanded_name) {
     $types_type = $types_document->createElement('type', $expanded_name);
     $types_type->setAttribute('id', $name);
     $types_root->appendChild($types_type);
@@ -88,7 +88,7 @@ TODO for XML format:
 - create a definition (DTD or other) once interface stabilizes
 */
 
-foreach($definition->info as $namespace_name => $namespace_info) {
+foreach($schema->info as $namespace_name => $namespace_info) {
     
     $dom_namespace = $dom_document->createElement('namespace');
     $dom_root->appendChild($dom_namespace);
@@ -100,7 +100,7 @@ foreach($definition->info as $namespace_name => $namespace_info) {
     $dom_namespace_description = $dom_document->createElement('description');
     $dom_namespace->appendChild($dom_namespace_description);
     appendHTMLDiv($dom_document, $dom_namespace_description,
-        $definition->info_namespace[$namespace_name]->description);
+        $schema->info_namespace[$namespace_name]->description);
     
     foreach ($namespace_info as $name => $info) {
         
@@ -128,14 +128,14 @@ foreach($definition->info as $namespace_name => $namespace_info) {
             }
         }
         
-        $raw_default = $definition->defaults[$namespace_name][$name];
+        $raw_default = $schema->defaults[$namespace_name][$name];
         if (is_bool($raw_default)) {
             $default = $raw_default ? 'true' : 'false';
         } elseif (is_string($raw_default)) {
             $default = "\"$raw_default\"";
         } else {
             $default = print_r(
-                    $definition->defaults[$namespace_name][$name], true
+                    $schema->defaults[$namespace_name][$name], true
                 );
         }
         $dom_constraints->appendChild(
diff --git a/docs/examples/demo.php b/docs/examples/demo.php
index 07630078..35a47986 100644
--- a/docs/examples/demo.php
+++ b/docs/examples/demo.php
@@ -21,7 +21,9 @@ if (!empty($_POST['html'])) {
     
     $html = get_magic_quotes_gpc() ? stripslashes($_POST['html']) : $_POST['html'];
     
-    $purifier = new HTMLPurifier();
+    $config = HTMLPurifier_Config::createDefault();
+    $config->set('Core', 'TidyFormat', !empty($_POST['tidy']));
+    $purifier = new HTMLPurifier($config);
     $pure_html = $purifier->purify($html);
     
 ?>
@@ -65,6 +67,8 @@ if (isset($html)) {
             HTMLPurifier_Encoder::cleanUTF8($html), ENT_COMPAT, 'UTF-8');
 }
         ?></textarea>
+        <div>Nicely format output with Tidy? <input type="checkbox" value="1"
+        name="tidy"<?php if (!empty($_POST['tidy'])) echo ' checked="checked"'; ?> /></div>
         <div>
             <input type="submit" value="Submit" name="submit" class="button" />
         </div>
diff --git a/docs/progress.html b/docs/progress.html
index 749129bb..a15386e9 100644
--- a/docs/progress.html
+++ b/docs/progress.html
@@ -86,7 +86,7 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;}
     Well-supported values are: disc, circle, square,
     decimal, lower-roman, upper-roman, lower-alpha and upper-alpha. See also
     CSS 3. Mostly IE lack of support.</td></tr>
-<tr class="css1 impl-yes"><td>list-style</td><td>SHORTHAND, target milestone 1.0</td></tr>
+<tr class="css1 impl-yes"><td>list-style</td><td>SHORTHAND</td></tr>
 <tr class="css1 impl-yes"><td>margin</td><td>MULTIPLE</td></tr>
 <tr class="css1 impl-yes"><td>margin-*</td><td>COMPOSITE(&lt;length&gt;,
     &lt;percentage&gt;, auto)</td></tr>
@@ -134,7 +134,7 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;}
 
 <tbody>
 <tr><th colspan="2">Unknown</th></tr>
-<tr class="danger css1"><td>background-image</td><td>Dangerous, target milestone 1.3</td></tr>
+<tr class="danger css1"><td>background-image</td><td>Dangerous, target milestone 1.2</td></tr>
 <tr class="css1"><td>background-attachment</td><td>ENUM(scroll, fixed),
     Depends on background-image</td></tr>
 <tr class="css1"><td>background-position</td><td>Depends on background-image</td></tr>
@@ -144,7 +144,7 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;}
     inline-block has incomplete IE6 support and requires -moz-inline-box
     for Mozilla. Unknown target milestone.</td></tr>
 <tr><td class="css1">height</td><td>Interesting, why use it? Unknown target milestone.</td></tr>
-<tr class="danger css1"><td>list-style-image</td><td>Dangerous? Target milestone 1.3</td></tr>
+<tr class="danger css1"><td>list-style-image</td><td>Dangerous? Target milestone 1.2</td></tr>
 <tr class="impl-no"><td>max-height</td><td rowspan="4">No IE 5/6</td></tr>
 <tr class="impl-no"><td>min-height</td></tr>
 <tr class="impl-no"><td>max-width</td></tr>
@@ -254,7 +254,7 @@ Mozilla on inside and needs -moz-outline, no IE support.</td></tr>
 </tbody>
 
 <tbody>
-<tr><th colspan="3">Transform, target milestone 1.2</th></tr>
+<tr><th colspan="3">Transform, target milestone 1.4</th></tr>
 <tr><td rowspan="5">align</td><td>CAPTION</td><td>Near-equiv style 'caption-side', drop left and right</td></tr>
     <tr><td>IMG</td><td rowspan="2">Margin-left and margin-right = auto or parent div</td></tr>
     <tr><td>TABLE</td></tr>
diff --git a/library/HTMLPurifier/AttrDef/Host.php b/library/HTMLPurifier/AttrDef/Host.php
index c373d9ef..47bce063 100644
--- a/library/HTMLPurifier/AttrDef/Host.php
+++ b/library/HTMLPurifier/AttrDef/Host.php
@@ -11,9 +11,14 @@ class HTMLPurifier_AttrDef_Host extends HTMLPurifier_AttrDef
 {
     
     /**
-     * Instances of HTMLPurifier_AttrDef_IPv4 and HTMLPurifier_AttrDef_IPv6
+     * Instance of HTMLPurifier_AttrDef_IPv4 sub-validator
      */
-    var $ipv4, $ipv6;
+    var $ipv4;
+    
+    /**
+     * Instance of HTMLPurifier_AttrDef_IPv6 sub-validator
+     */
+    var $ipv6;
     
     function HTMLPurifier_AttrDef_Host() {
         $this->ipv4 = new HTMLPurifier_AttrDef_IPv4();
diff --git a/library/HTMLPurifier/ChildDef.php b/library/HTMLPurifier/ChildDef.php
index 1147645f..e6cc93f8 100644
--- a/library/HTMLPurifier/ChildDef.php
+++ b/library/HTMLPurifier/ChildDef.php
@@ -327,6 +327,8 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
         $is_collecting = false; // are we globbing together tokens to package
                                 // into one of the collectors?
         $collection = array(); // collected nodes
+        $tag_index = 0; // the first node might be whitespace,
+                            // so this tells us where the start tag is
         
         foreach ($tokens_of_children as $token) {
             $is_child = ($nesting == 0);
@@ -344,7 +346,7 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
                 if ($is_child) {
                     // okay, let's stash the tokens away
                     // first token tells us the type of the collection
-                    switch ($collection[0]->name) {
+                    switch ($collection[$tag_index]->name) {
                         case 'tr':
                         case 'tbody':
                             $content[] = $collection;
@@ -356,13 +358,13 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
                         case 'thead':
                         case 'tfoot':
                             // access the appropriate variable, $thead or $tfoot
-                            $var = $collection[0]->name;
+                            $var = $collection[$tag_index]->name;
                             if ($$var === false) {
                                 $$var = $collection;
                             } else {
                                 // transmutate the first and less entries into
                                 // tbody tags, and then put into content
-                                $collection[0]->name = 'tbody';
+                                $collection[$tag_index]->name = 'tbody';
                                 $collection[count($collection)-1]->name = 'tbody';
                                 $content[] = $collection;
                             }
@@ -373,6 +375,7 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
                     }
                     $collection = array();
                     $is_collecting = false;
+                    $tag_index = 0;
                 } else {
                     // add the node to the collection
                     $collection[] = $token;
@@ -387,7 +390,9 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
                 if ($token->name == 'col') {
                     // the only empty tag in the possie, we can handle it
                     // immediately
-                    $cols[] = array($token);
+                    $cols[] = array_merge($collection, array($token));
+                    $collection = array();
+                    $tag_index = 0;
                     continue;
                 }
                 switch($token->name) {
@@ -401,7 +406,10 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
                         $collection[] = $token;
                         continue;
                     default:
-                        // unrecognized, drop silently
+                        if ($token->type == 'text' && $token->is_whitespace) {
+                            $collection[] = $token;
+                            $tag_index++;
+                        }
                         continue;
                 }
             }
@@ -415,6 +423,10 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
         if ($thead !== false)   $ret = array_merge($ret, $thead);
         if ($tfoot !== false)   $ret = array_merge($ret, $tfoot);
         foreach ($content as $token_array) $ret = array_merge($ret, $token_array);
+        if (!empty($collection) && $is_collecting == false){
+            // grab the trailing space
+            $ret = array_merge($ret, $collection);
+        }
         
         array_pop($tokens_of_children); // remove phantom token
         
@@ -423,4 +435,4 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
     }
 }
 
-?>
\ No newline at end of file
+?>
diff --git a/library/HTMLPurifier/Encoder.php b/library/HTMLPurifier/Encoder.php
index a2795297..9d7a35ef 100644
--- a/library/HTMLPurifier/Encoder.php
+++ b/library/HTMLPurifier/Encoder.php
@@ -88,7 +88,7 @@ class HTMLPurifier_Encoder
         if ($iconv && !$force_php) {
             // do the shortcut way
             $str = @iconv('UTF-8', 'UTF-8//IGNORE', $str);
-            return strtr($str, $non_sgml_chars);;
+            return strtr($str, $non_sgml_chars);
         }
         
         $mState = 0; // cached expected number of octets after the current octet
diff --git a/library/HTMLPurifier/Generator.php b/library/HTMLPurifier/Generator.php
index 613ea965..72ca3921 100644
--- a/library/HTMLPurifier/Generator.php
+++ b/library/HTMLPurifier/Generator.php
@@ -23,6 +23,21 @@ HTMLPurifier_ConfigSchema::define(
     'This directive was available since 1.1.'
 );
 
+// extension constraints could be factored into ConfigSchema
+HTMLPurifier_ConfigSchema::define(
+    'Core', 'TidyFormat', false, 'bool',
+    '<p>Determines whether or not to run Tidy on the final output for pretty '.
+    'formatting reasons, such as indentation and wrap.</p><p>This can greatly '.
+    'improve readability for editors who are hand-editing the HTML, but is '.
+    'by no means necessary as HTML Purifier has already fixed all major '.
+    'errors the HTML may have had. Tidy is a non-default extension, and this directive '.
+    'will silently fail if Tidy is not available.</p><p>If you are looking to make '.
+    'the overall look of your page\'s source better, I recommend running Tidy '.
+    'on the entire page rather than just user-content (after all, the '.
+    'indentation relative to the containing blocks will be incorrect).</p><p>This '.
+    'directive was available since 1.1.1.</p>'
+);
+
 /**
  * Generates HTML from tokens.
  */
@@ -56,6 +71,30 @@ class HTMLPurifier_Generator
         foreach ($tokens as $token) {
             $html .= $this->generateFromToken($token);
         }
+        if ($config->get('Core', 'TidyFormat') && extension_loaded('tidy')) {
+            
+            $tidy_options = array(
+               'indent'=> true,
+               'output-xhtml' => $this->_xhtml,
+               'show-body-only' => true,
+               'indent-spaces' => 2,
+               'wrap' => 68,
+            );
+            if (version_compare(PHP_VERSION, '5', '<')) {
+                tidy_set_encoding('utf8');
+                foreach ($tidy_options as $key => $value) {
+                    tidy_setopt($key, $value);
+                }
+                tidy_parse_string($html);
+                tidy_clean_repair();
+                $html = tidy_get_output();
+            } else {
+                $tidy = new Tidy;
+                $tidy->parseString($html, $tidy_options, 'utf8');
+                $tidy->cleanRepair();
+                $html = (string) $tidy;
+            }
+        }
         return $html;
     }
     
diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php
index a24d1014..fbdecb8f 100644
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@@ -18,6 +18,13 @@ require_once 'HTMLPurifier/TokenFactory.php';
  * 
  * @note PHP's DOM extension does not actually parse any entities, we use
  *       our own function to do that.
+ * 
+ * @warning DOM tends to drop whitespace, which may wreak havoc on indenting.
+ *          If this is a huge problem, due to the fact that HTML is hand
+ *          edited and you are unable to get a parser cache that caches the
+ *          output of HTML Purifier while keeping the original HTML lying
+ *          around, you may want to run Tidy on the resulting output or use
+ *          HTMLPurifier_Lexer_DirectLex
  */
 
 class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
diff --git a/test-settings.sample.php b/test-settings.sample.php
new file mode 100644
index 00000000..1fbcfe4e
--- /dev/null
+++ b/test-settings.sample.php
@@ -0,0 +1,17 @@
+<?php
+
+// This file is necessary to run the unit tests and profiling
+// scripts.
+
+// Is PEAR available on your system? If it isn't, set to false. If PEAR
+// is not part of the default include_path, add it.
+$GLOBALS['HTMLPurifierTest']['PEAR'] = true;
+
+// How many times should profiling scripts iterate over the function? More runs 
+// means more accurate results, but they'll take longer to perform.
+$GLOBALS['HTMLPurifierTest']['Runs'] = 2;
+
+// Where is SimpleTest located?
+$simpletest_location = '/path/to/simpletest/';
+
+?>
\ No newline at end of file
diff --git a/tests/HTMLPurifier/ChildDefTest.php b/tests/HTMLPurifier/ChildDefTest.php
index 33c9907b..be81831f 100644
--- a/tests/HTMLPurifier/ChildDefTest.php
+++ b/tests/HTMLPurifier/ChildDefTest.php
@@ -1,7 +1,7 @@
 <?php
 
 require_once 'HTMLPurifier/ChildDef.php';
-require_once 'HTMLPurifier/Lexer.php';
+require_once 'HTMLPurifier/Lexer/DirectLex.php';
 require_once 'HTMLPurifier/Generator.php';
 
 class HTMLPurifier_ChildDefTest extends UnitTestCase
@@ -12,7 +12,8 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase
     var $gen;
     
     function HTMLPurifier_ChildDefTest() {
-        $this->lex = HTMLPurifier_Lexer::create();
+        // it is vital that the tags be treated as literally as possible
+        $this->lex = new HTMLPurifier_Lexer_DirectLex();
         $this->gen = new HTMLPurifier_Generator();
         parent::UnitTestCase();
     }
@@ -98,6 +99,14 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase
         $inputs[6] = 'foo';
         $expect[6] = false;
         
+        // whitespace sticks to the element that follows it, trailing
+        // whitespace is stationary
+        $inputs[7] = "\n   <tr />\n  <tr />\n ";
+        $expect[7] = true;
+        
+        $inputs[8] = "\n\t<tbody />\n\t\t<tfoot />\n\t\t\t";
+        $expect[8] = "\n\t\t<tfoot />\n\t<tbody />\n\t\t\t";
+        
         $this->assertSeries($inputs, $expect, $config);
         
     }
@@ -209,4 +218,4 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase
     
 }
 
-?>
\ No newline at end of file
+?>
diff --git a/tests/HTMLPurifier/GeneratorTest.php b/tests/HTMLPurifier/GeneratorTest.php
index 6b85a9ca..a6ca4043 100644
--- a/tests/HTMLPurifier/GeneratorTest.php
+++ b/tests/HTMLPurifier/GeneratorTest.php
@@ -123,6 +123,9 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase
     var $config;
     function assertGeneration($tokens, $expect) {
         $result = $this->gen->generateFromTokens($tokens, $this->config);
+        // normalize newlines; this probably should be put somewhere else
+        $result = str_replace("\r\n", "\n", $result);
+        $result = str_replace("\r", "\n", $result);
         $this->assertEqual($expect, $result);
     }
     
@@ -148,6 +151,25 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase
         
     }
     
+    function test_generateFromTokens_TidyFormat() {
+        // abort test if tidy isn't loaded
+        if (!extension_loaded('tidy')) return;
+        
+        $this->config = HTMLPurifier_Config::createDefault();
+        $this->config->set('Core', 'TidyFormat', true);
+        
+        // nice wrapping please
+        $this->assertGeneration(
+            array(
+                new HTMLPurifier_Token_Start('div'),
+                new HTMLPurifier_Token_Text('Text'),
+                new HTMLPurifier_Token_End('div')
+            ),
+            "<div>\n  Text\n</div>\n"
+        );
+        
+    }
+    
 }
 
 ?>
\ No newline at end of file