From 086dc9177b077431ffb4c75df8aebb6e5fa4c121 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Sun, 20 May 2007 02:12:01 +0000 Subject: [PATCH] [1.7.0] Add documentation for the Tidy functionality - Make specifying the child property for ElementDef unnecessary when overloading content_model or content_model_type - Add necessary includes to Tidy module files - Move div@align fix to Tidy_Proprietary - Future proof attrTransform.php by setting doctype to strict git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1071 48356398-32a2-884e-a903-53898d9a118a --- docs/enduser-tidy.html | 230 ++++++++++++++++++ docs/index.html | 3 + docs/style.css | 1 + library/HTMLPurifier/ElementDef.php | 10 +- .../HTMLModule/Tidy/Proprietary.php | 27 ++ .../HTMLPurifier/HTMLModule/Tidy/XHTML.php | 1 + .../HTMLModule/Tidy/XHTMLAndHTML4.php | 13 +- .../HTMLModule/Tidy/XHTMLStrict.php | 2 +- library/HTMLPurifier/HTMLModuleManager.php | 11 +- smoketests/attrTransform.php | 1 + 10 files changed, 290 insertions(+), 9 deletions(-) create mode 100644 docs/enduser-tidy.html create mode 100644 library/HTMLPurifier/HTMLModule/Tidy/Proprietary.php diff --git a/docs/enduser-tidy.html b/docs/enduser-tidy.html new file mode 100644 index 00000000..51b52128 --- /dev/null +++ b/docs/enduser-tidy.html @@ -0,0 +1,230 @@ + + + + + + + +Tidy - HTML Purifier + + + +

Tidy

+ +
Filed under Development
+
Return to the index.
+
HTML Purifier End-User Documentation
+ +

You've probably heard of HTML Tidy, Dave Raggett's little piece +of software that cleans up poorly written HTML. Let me say it straight +out:

+ +

This ain't HTML Tidy!

+ +

Rather, Tidy stands for a cool set of Tidy-inspired features in HTML Purifier +that allows users to submit deprecated elements and attributes and get +valid strict markup back. For example:

+ +
<center>Centered</center>
+ +

...becomes:

+ +
<div style="text-align:center;">Centered</div>
+ +

...when this particular fix is run on the HTML. This tutorial will give +you the lowdown on what exactly HTML Purifier will do when Tidy +is on, and how to fine-tune this behavior. Once again, you do +not need Tidy installed on your PHP to use these features!

+ +

What does it do?

+ +

Tidy will do several things to your HTML:

+ + + +

What are levels?

+ +

Levels describe how aggressive the Tidy module should be when +cleaning up HTML. There are four levels to pick: none, light, medium +and heavy. Each of these levels has a well-defined set of behaviors +associated with it, although it may change depending on your doctype.

+ +
+
light
+
This is the lenient level. If a tag or attribute + is about to be removed because it isn't supported by the + doctype, Tidy will step in and change it into an alternative that + is supported.
+
medium
+
This is the correctional level. At this level, + all the functions of light are performed, as well as some extra, + non-essential best practices enforcement. Changes made on this + level are very benign and are unlikely to cause problems.
+
heavy
+
This is the aggressive level. If a tag or + attribute is deprecated, it will be converted into a non-deprecated + version, no ifs, ands or buts.
+
+ +

By default, Tidy operates on the medium level. You can +change the level of cleaning by setting the %HTML.TidyLevel configuration +directive:

+ +
$config->set('HTML', 'TidyLevel', 'heavy'); // burn baby burn!
+ +

Is the light level really light?

+ +

It depends on what doctype you're using. If your documents are HTML +4.01 Transitional, HTML Purifier will be lazy +and won't clean up your center +or font tags. But if you're using HTML 4.01 Strict, +HTML Purifier has no choice: it has to convert them, or they will +be nuked out of existence. So while light on Transitional will result +in little to no changes, light on Strict will still result in quite +a lot of fixes.

+ +

This is different behavior from 1.6 or before, where deprecated +tags in transitional documents would +always be cleaned up regardless. This is also better behavior.

+ +

My pages look different!

+ +

HTML Purifier is tasked with converting deprecated tags and +attributes to standards-compliant alternatives, which usually +need copious amounts of CSS. It's also not foolproof: sometimes +things do get lost in the translation. This is why when HTML Purifier +can get away with not doing cleaning, it won't; this is why +the default value is medium and not heavy.

+ +

Fortunately, only a few attributes have problems with the switch +over. They are described below:

+ + + + + + + + + + + + + + + + + + + + + + + + +
Element@AttrChanges
caption@alignFirefox supports stuffing the caption on the + left and right side of the table, a feature that + Internet Explorer, understandably, does not have. + When align equals right or left, the text will simply + be aligned on the left or right side.
img@alignThe implementation for align bottom is good, but not + perfect. There are a few pixel differences.
br@clearClear both gets a little wonky in Internet Explorer. Haven't + really been able to figure out why.
hr@noshadeAll browsers implement this slightly differently: we've + chosen to make noshade horizontal rules gray.
+ +

There are a few more minor, although irritating, bugs. +Some older browsers support deprecated attributes, +but not CSS. Transformed elements and attributes will look unstyled +to said browsers. Also, CSS precedence is slightly different for +inline styles versus presentational markup. In increasing precedence:

+ +
    +
  1. Presentational attributes
  2. +
  3. External style sheets
  4. +
  5. Inline styling
  6. +
+ +

This means that styling that may have been masked by external CSS +declarations will start showing up (a good thing, perhaps). Finally, +if you've turned off the style attribute, almost all of +these transformations will not work. Sorry mates.

+ +

You can review the before-and-after rendering of these transformations +by consulting the attrTransform.php +smoketest.

+ +

I like the general idea, but the specifics bug me!

+ +

So you want HTML Purifier to clean up your HTML, but you're not +so happy about the br@clear implementation. That's perfectly fine! +HTML Purifier will make accommodations:

+ +
$config->set('HTML', 'Doctype', 'XHTML 1.0 Transitional');
+$config->set('HTML', 'TidyLevel', 'heavy'); // all changes, minus...
+$config->set('HTML', 'TidyRemove', 'br@clear');
+ +

That third line does the magic, removing the br@clear fix +from the module, ensuring that <br clear="all" /> +will pass through unharmed. The reverse is possible too:

+ +
$config->set('HTML', 'Doctype', 'XHTML 1.0 Transitional');
+$config->set('HTML', 'TidyLevel', 'none'); // no changes, plus...
+$config->set('HTML', 'TidyAdd', 'p@align');
+ +

In this case, all transformations are shut off, except for the p@align +one, which you found handy.

+ +

To find out the names of the fixes you want to turn on or off, +you'll have to consult the source code, specifically the files in +HTMLPurifier/HTMLModule/Tidy/. There is, however, a +general syntax:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameExampleInterpretation
elementfontTag transform for element
element@attrbr@clearAttribute transform for attr on element
@attr@langGlobal attribute transform for attr
e#content_model_typeblockquote#content_model_typeChange of child processing implementation for e
+ +

So... what's the lowdown?

+ +

The lowdown is, quite frankly, HTML Purifier's default settings are +probably good enough. The next step is to bump the level up to heavy, +and if that still doesn't satisfy your appetite, do some fine tuning. +Other than that, don't worry about it: this all works silently and +effectively in the background.

+ +
$Id: $
+ + \ No newline at end of file diff --git a/docs/index.html b/docs/index.html index 7a7ec0a3..dde340cc 100644 --- a/docs/index.html +++ b/docs/index.html @@ -34,6 +34,9 @@ information for casual developers using HTML Purifier.

UTF-8: The Secret of Character Encoding
Describes the rationale for using UTF-8, the ramifications otherwise, and how to make the switch.
+
Tidy
+
Tutorial for tweaking HTML Purifier's Tidy-like behavior.
+

Development

diff --git a/docs/style.css b/docs/style.css index 75a3e2f7..9751a026 100644 --- a/docs/style.css +++ b/docs/style.css @@ -25,6 +25,7 @@ h4 {font-family:sans-serif; font-size:0.9em; font-weight:bold; } .aside {margin-left:2em; font-family:sans-serif; font-size:0.9em; } blockquote .label {font-weight:bold; font-size:1em; margin:0 0 .1em; border-bottom:1px solid #CCC;} +.emphasis {font-weight:bold; text-align:center; font-size:1.3em;} /* A regular table */ .table {border-collapse:collapse; border-bottom:2px solid #888; margin-left:2em; } diff --git a/library/HTMLPurifier/ElementDef.php b/library/HTMLPurifier/ElementDef.php index ec559a3e..d99bfab5 100644 --- a/library/HTMLPurifier/ElementDef.php +++ b/library/HTMLPurifier/ElementDef.php @@ -141,9 +141,15 @@ class HTMLPurifier_ElementDef $this->_mergeAssocArray($this->auto_close, $def->auto_close); $this->_mergeAssocArray($this->excludes, $def->excludes); + if(!empty($def->content_model)) { + $this->content_model .= ' | ' . $def->content_model; + $this->child = false; + } + if(!empty($def->content_model_type)) { + $this->content_model_type = $def->content_model_type; + $this->child = false; + } if(!is_null($def->child)) $this->child = $def->child; - if(!empty($def->content_model)) $this->content_model .= ' | ' . 
$def->content_model; - if(!empty($def->content_model_type)) $this->content_model_type = $def->content_model_type; if(!is_null($def->descendants_are_inline)) $this->descendants_are_inline = $def->descendants_are_inline; if(!is_null($def->safe)) $this->safe = $def->safe; diff --git a/library/HTMLPurifier/HTMLModule/Tidy/Proprietary.php b/library/HTMLPurifier/HTMLModule/Tidy/Proprietary.php new file mode 100644 index 00000000..2412f0fb --- /dev/null +++ b/library/HTMLPurifier/HTMLModule/Tidy/Proprietary.php @@ -0,0 +1,27 @@ + \ No newline at end of file diff --git a/library/HTMLPurifier/HTMLModule/Tidy/XHTML.php b/library/HTMLPurifier/HTMLModule/Tidy/XHTML.php index 24b084e8..78ddef47 100644 --- a/library/HTMLPurifier/HTMLModule/Tidy/XHTML.php +++ b/library/HTMLPurifier/HTMLModule/Tidy/XHTML.php @@ -1,6 +1,7 @@ doctypes->register( 'HTML 4.01 Transitional', false, array_merge($common, $transitional, $non_xml), - array('Tidy_Transitional') + array('Tidy_Transitional', 'Tidy_Proprietary') ); $this->doctypes->register( 'HTML 4.01 Strict', false, array_merge($common, $non_xml), - array('Tidy_Strict') + array('Tidy_Strict', 'Tidy_Proprietary') ); $this->doctypes->register( 'XHTML 1.0 Transitional', true, array_merge($common, $transitional, $xml, $non_xml), - array('Tidy_Transitional', 'Tidy_XHTML') + array('Tidy_Transitional', 'Tidy_XHTML', 'Tidy_Proprietary') ); $this->doctypes->register( 'XHTML 1.0 Strict', true, array_merge($common, $xml, $non_xml), - array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_XHTMLStrict') + array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_XHTMLStrict', 'Tidy_Proprietary') ); $this->doctypes->register( 'XHTML 1.1', true, array_merge($common, $xml), - array('Tidy_Strict', 'Tidy_XHTML') // Tidy_XHTML1_1 + array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary') // Tidy_XHTML1_1 ); } diff --git a/smoketests/attrTransform.php b/smoketests/attrTransform.php index 7b6a4fd1..05f61813 100644 --- a/smoketests/attrTransform.php +++ b/smoketests/attrTransform.php @@ -42,6 
+42,7 @@ $xml = simplexml_load_file('attrTransform.xml'); // attr transform enabled HTML Purifier $config = HTMLPurifier_Config::createDefault(); +$config->set('HTML', 'Doctype', 'XHTML 1.0 Strict'); $purifier = new HTMLPurifier($config); $title = isset($_GET['title']) ? $_GET['title'] : true;