diff --git a/docs/enduser-customize.html b/docs/enduser-customize.html new file mode 100644 index 00000000..6821e561 --- /dev/null +++ b/docs/enduser-customize.html @@ -0,0 +1,403 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head> +<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> +<meta name="description" content="Tutorial for customizing HTML Purifier's tag and attribute sets." /> +<link rel="stylesheet" type="text/css" href="style.css" /> + +<title>Customize - HTML Purifier</title> + +</head><body> + +<h1 class="subtitled">Customize!</h1> +<div class="subtitle">HTML Purifier is a Swiss-Army Knife</div> + +<div id="filing">Filed under End-User</div> +<div id="index">Return to the <a href="index.html">index</a>.</div> +<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div> + +<div id="applicability"> + This document covers currently unreleased functionality and + only applies to recent SVN checkouts. +</div> + +<p> + You may have heard of the <a href="dev-advanced-api.html">Advanced API</a>. + If you're interested in reading dry prose and boring functional + specifications, feel free to click that link to get a no-nonsense overview + on the Advanced API. For the rest of us, there's this tutorial. By the time + you're finished reading this, you should have a pretty good idea on + how to implement custom tags and attributes that HTML Purifier may not have. +</p> + +<h2>Is it necessary?</h2> + +<p> + Before we even write any code, it is paramount to consider whether or + not the code we're writing is necessary or not. 
HTML Purifier, by default, + contains a large set of elements and attributes: large enough so that + <em>any</em> element or attribute in XHTML 1.0 (and its HTML variant) + that can be safely used by the general public is implemented. +</p> + +<p> + So what needs to be implemented? (Feel free to skip this section if + you know what you want). +</p> + +<h3>XHTML 1.0</h3> + +<p> + All of the modules listed below are based off of the + <a href="http://www.w3.org/TR/2001/REC-xhtml-modularization-20010410/abstract_modules.html#sec_5.2.">modularization of + XHTML</a>, which, while technically for XHTML 1.1, is quite a useful + resource. +</p> + +<ul> + <li>Structure</li> + <li>Frames</li> + <li>Applets (deprecated)</li> + <li>Forms</li> + <li>Image maps</li> + <li>Objects</li> + <li>Frames</li> + <li>Events</li> + <li>Meta-information</li> + <li>Style sheets</li> + <li>Link (not hypertext)</li> + <li>Base</li> + <li>Name</li> +</ul> + +<p> + If you don't recognize it, you probably don't need it. But the curious + can look all of these modules up in the above-mentioned document. Note + that inline scripting comes packaged with HTML Purifier (more on this + later). +</p> + +<h3>XHTML 1.1</h3> + +<p> + We have not implemented the + <a href="http://www.w3.org/TR/2001/REC-ruby-20010531/">Ruby module</a>, + which defines a set of tags + for publishing short annotations for text, used mostly in Japanese + and Chinese school texts. +</p> + +<h3>XHTML 2.0</h3> + +<p> + <a href="http://www.w3.org/TR/xhtml2/">XHTML 2.0</a> is still a + working draft, so any elements introduced in the + specification have not been implemented and will not be implemented + until we get a recommendation or proposal. Because XHTML 2.0 is + an entirely new markup language, implementing rules for it will be + no easy task. 
+</p> + +<h3>HTML 5</h3> + +<p> + <a href="http://www.whatwg.org/specs/web-apps/current-work/">HTML 5</a> + is a fork of HTML 4.01 by WHATWG, who believed that XHTML 2.0 was headed + in the wrong direction. It too is a working draft, and may change + drastically before publication, but it should be noted that the + <code>canvas</code> tag has been implemented by many browser vendors. +</p> + +<h3>Proprietary</h3> + +<p> + There are a number of proprietary tags still in the wild. Many of them + have been documented in <a href="ref-proprietary-tags.txt">ref-proprietary-tags.txt</a>, + but there is currently no implementation for any of them. +</p> + +<h3>Extensions</h3> + +<p> + There are also a number of other XML languages out there that can + be embedded in HTML documents: two of the most popular are MathML and + SVG, and I frequently get requests to implement these. But they are + expansive, comprehensive specifications, and it would take far too long + to implement them <em>correctly</em> (most systems I've seen go as far + as whitelisting tags and no further; come on, what about nesting!) +</p> + +<p> + Word of warning: HTML Purifier is currently <em>not</em> namespace + aware. +</p> + +<h2>Giving back</h2> + +<p> + As you may imagine from the details above (don't be abashed if you didn't + read it all: a glance over would have done), there's quite a bit that + HTML Purifier doesn't implement. Recent architectural changes have + allowed HTML Purifier to implement elements and attributes that are not + safe! Don't worry, they won't be activated unless you set %HTML.Trusted + to true, but they certainly help out users who need to put, say, forms + on their page and don't want to go through the trouble of reading this + and implementing it themselves. +</p> + +<p> + So any of the above that you implement for your own application could + help out some other poor sap on the other side of the globe. 
Help us + out, and send back code so that it can be hammered into a module and + released with the core. Any code would be greatly appreciated! +</p> + +<h2>And now...</h2> + +<p> + Enough philosophical talk, time for some code: +</p> + +<pre>$config = HTMLPurifier_Config::createDefault(); +$config->set('HTML', 'DefinitionID', 'enduser-customize.html tutorial'); +$config->set('HTML', 'DefinitionRev', 1); +$def =&amp; $config->getHTMLDefinition(true);</pre> + +<p> + Assuming that HTML Purifier has already been properly loaded (hint: + include <code>HTMLPurifier.auto.php</code>), this code will set up + the environment that you need to start customizing the HTML definition. + What's going on? +</p> + +<ul> + <li> + The first three lines are regular configuration code: + <ul> + <li> + %HTML.DefinitionID is set to a unique identifier for your + custom HTML definition. This prevents it from clobbering + other custom definitions on the same installation. + </li> + <li> + %HTML.DefinitionRev is a revision integer of your HTML + definition. Because HTML definitions are cached, you'll need + to increment this whenever you make a change in order to flush + the cache. + </li> + </ul> + </li> + <li> + The fourth line retrieves a raw <code>HTMLPurifier_HTMLDefinition</code> + object that we will be tweaking. If the parameter was removed, we + would be retrieving a fully formed definition object, which is somewhat + useless for customization purposes. + </li> +</ul> + +<h3>Broken backwards-compatibility</h3> + +<p> + Those of you who have already been twiddling around with the raw + HTML definition object will notice that you now get an error + when you attempt to retrieve the raw definition object without specifying + a DefinitionID. It is vital to caching (see below) that you make a unique + name for your customized definition, so make up something right now and + things will operate again. 
+</p> + +<h2>Turn off caching</h2> + +<p> + To make development easier, we're going to temporarily turn off + definition caching: +</p> + +<pre>$config = HTMLPurifier_Config::createDefault(); +$config->set('HTML', 'DefinitionID', 'enduser-customize.html tutorial'); +$config->set('HTML', 'DefinitionRev', 1); +<strong>$config->set('Core', 'DefinitionCache', null); // remove this later!</strong> +$def =&amp; $config->getHTMLDefinition(true);</pre> + +<p> + A few things should be mentioned about the caching mechanism before + we move on. For performance reasons, HTML Purifier caches generated + <code>HTMLPurifier_Definition</code> objects in serialized files + stored (by default) in <code>library/HTMLPurifier/DefinitionCache/Serializer</code>. + A lot of processing is done in order to create these objects, so it + makes little sense to repeat the same processing over and over again + whenever HTML Purifier is called. +</p> + +<p> + In order to identify a cache entry, HTML Purifier uses three variables: + the library's version number, the value of %HTML.DefinitionRev and + a serial of relevant configuration. Whenever any of these changes, + a new HTML definition is generated. Notice that there is no way + for the definition object to track changes to customizations: here, it + is up to you to supply appropriate information to DefinitionID and + DefinitionRev. +</p> + +<h2>Add an attribute</h2> + +<p> + For this example, we're going to implement the <code>target</code> attribute found + on <code>a</code> elements. To implement an attribute, we have to + ask a few questions: +</p> + +<ol> + <li>What element is it found on?</li> + <li>What is its name?</li> + <li>What are valid values for it?</li> +</ol> + +<p> + The first two are easy: the element is <code>a</code> and the attribute + is <code>target</code>. The third question is a little trickier. + Let's allow the special values: _blank, _self, _parent and _top. 
+ The form of this is called an <strong>enumeration</strong>, a list of + valid values, although only one can be used at a time. To translate + this into code form, we write: +</p> + +<pre>$config = HTMLPurifier_Config::createDefault(); +$config->set('HTML', 'DefinitionID', 'enduser-customize.html tutorial'); +$config->set('HTML', 'DefinitionRev', 1); +$config->set('Core', 'DefinitionCache', null); // remove this later! +$def =&amp; $config->getHTMLDefinition(true); +<strong>$def->addAttribute('a', 'target', 'Enum#_blank,_self,_parent,_top');</strong></pre> + +<p> + The <code>Enum#_blank,_self,_parent,_top</code> does all the magic. + The string is split into two parts, separated by a hash mark (#): +</p> + +<ol> + <li>The first part is the name of what we call an <code>AttrDef</code></li> + <li>The second part is the parameter of the above-mentioned <code>AttrDef</code></li> +</ol> + +<p> + If that sounds vague and generic, it's because it is! HTML Purifier defines + an assortment of different attribute types one can use, and each of these + has its own specialized parameter format. Here are some of the more useful + ones: +</p> + +<table class="table"> + <thead> + <tr> + <th>Type</th> + <th>Format</th> + <th>Description</th> + </tr> + </thead> + <tbody> + <tr> + <th>Enum</th> + <td><em>[s:]</em>value1,value2,...</td> + <td> + Attribute with a number of valid values, one of which may be used. When + s: is present, the enumeration is case sensitive. + </td> + </tr> + <tr> + <th>Bool</th> + <td>attribute_name</td> + <td> + Boolean attribute, with only one valid value: the name + of the attribute. + </td> + </tr> + <tr> + <th>CDATA</th> + <td></td> + <td> + Attribute of arbitrary text. Can also be referred to as <strong>Text</strong> + (the specification makes a semantic distinction between the two). 
+ </td> + </tr> + <tr> + <th>ID</th> + <td></td> + <td> + Attribute that specifies a unique ID + </td> + </tr> + <tr> + <th>Pixels</th> + <td></td> + <td> + Attribute that specifies an integer pixel length + </td> + </tr> + <tr> + <th>Length</th> + <td></td> + <td> + Attribute that specifies a pixel or percentage length + </td> + </tr> + <tr> + <th>NMTOKENS</th> + <td></td> + <td> + Attribute that specifies a number of name tokens, example: the + <code>class</code> attribute + </td> + </tr> + <tr> + <th>URI</th> + <td></td> + <td> + Attribute that specifies a URI, example: the <code>href</code> + attribute + </td> + </tr> + <tr> + <th>Number</th> + <td></td> + <td> + Attribute that specifies a positive integer number + </td> + </tr> + </tbody> +</table> + +<p> + For a complete list, consult <code>library/HTMLPurifier/AttrTypes.php</code>; + more information on attributes that accept parameters can be found on their + respective includes in <code>library/HTMLPurifier/AttrDef</code>. +</p> + +<p> + Sometimes, the restrictive list in AttrTypes just doesn't cut it. Don't + sweat: you can also use a fully instantiated object as the value. The + equivalent, verbose form of the above example is: +</p> + +<pre>$config = HTMLPurifier_Config::createDefault(); +$config->set('HTML', 'DefinitionID', 'enduser-customize.html tutorial'); +$config->set('HTML', 'DefinitionRev', 1); +$config->set('Core', 'DefinitionCache', null); // remove this later! +$def =&amp; $config->getHTMLDefinition(true); +<strong>$def->addAttribute('a', 'target', new HTMLPurifier_AttrDef_Enum( + array('_blank','_self','_parent','_top') +));</strong></pre> + +<p> + Trust me, you'll learn to love the shorthand. +</p> + +<h2>Add an element</h2> + +<p> + To be written... 
+</p> + +<div id="version">$Id: enduser-tidy.html 1158 2007-06-18 19:26:29Z Edward $</div> + +</body></html> \ No newline at end of file diff --git a/docs/index.html b/docs/index.html index 4a8dabb1..69ee6207 100644 --- a/docs/index.html +++ b/docs/index.html @@ -37,6 +37,9 @@ information for casual developers using HTML Purifier.</p> <dt><a href="enduser-tidy.html">Tidy</a></dt> <dd>Tutorial for tweaking HTML Purifier's Tidy-like behavior.</dd> +<dt><a href="enduser-customize.html">Customize</a></dt> +<dd>Tutorial for customizing HTML Purifier's tag and attribute sets.</dd> + </dl> <h2>Development</h2> diff --git a/library/HTMLPurifier/CSSDefinition.php b/library/HTMLPurifier/CSSDefinition.php index e5d963f3..af6c6058 100644 --- a/library/HTMLPurifier/CSSDefinition.php +++ b/library/HTMLPurifier/CSSDefinition.php @@ -17,6 +17,15 @@ require_once 'HTMLPurifier/AttrDef/CSS/TextDecoration.php'; require_once 'HTMLPurifier/AttrDef/CSS/URI.php'; require_once 'HTMLPurifier/AttrDef/Enum.php'; +HTMLPurifier_ConfigSchema::define( + 'CSS', 'DefinitionRev', 1, 'int', ' +<p> + Revision identifier for your custom definition. See + %HTML.DefinitionRev for details. This directive has been available + since 1.7.0. +</p> +'); + /** * Defines allowed CSS attributes and what their values are. * @see HTMLPurifier_HTMLDefinition diff --git a/library/HTMLPurifier/Config.php b/library/HTMLPurifier/Config.php index 6e078eb7..cf5eb66f 100644 --- a/library/HTMLPurifier/Config.php +++ b/library/HTMLPurifier/Config.php @@ -43,16 +43,6 @@ class HTMLPurifier_Config */ var $version = '1.6.1'; - /** - * Integer key users can use to indicate they have manually - * overridden some internal behavior and would like the - * cache to invalidate itself. This is used in conjunction - * with the %NAMESPACE.DefinitionID, the semantic difference is - * that the configuration directive determines "semantic identity", - * while this integer indicates "chronological identity". 
- */ - var $revision = 1; - /** * Two-level associative array of configuration directives */ diff --git a/library/HTMLPurifier/DefinitionCache.php b/library/HTMLPurifier/DefinitionCache.php index bf3f9896..81cd7b33 100644 --- a/library/HTMLPurifier/DefinitionCache.php +++ b/library/HTMLPurifier/DefinitionCache.php @@ -36,7 +36,7 @@ class HTMLPurifier_DefinitionCache */ function generateKey($config) { return $config->version . '-' . // possibly replace with function calls - $config->revision . '-' . + $config->get($this->type, 'DefinitionRev') . '-' . $config->getBatchSerial($this->type); } @@ -50,7 +50,7 @@ class HTMLPurifier_DefinitionCache list($version, $revision, $hash) = explode('-', $key, 3); $compare = version_compare($version, $config->version); if ($compare > 0) return false; - if ($compare == 0 && $revision >= $config->revision) return false; + if ($compare == 0 && $revision >= $config->get($this->type, 'DefinitionRev')) return false; return true; } diff --git a/library/HTMLPurifier/HTMLDefinition.php b/library/HTMLPurifier/HTMLDefinition.php index b621fb6f..c998aed6 100644 --- a/library/HTMLPurifier/HTMLDefinition.php +++ b/library/HTMLPurifier/HTMLDefinition.php @@ -37,6 +37,19 @@ $def->addAttribute(\'a\', \'tabindex\', \'Number\'); </p> '); +HTMLPurifier_ConfigSchema::define( + 'HTML', 'DefinitionRev', 1, 'int', ' +<p> + Revision identifier for your custom definition specified in + %HTML.DefinitionID. This serves the same purpose: uniquely identifying + your custom definition, but this one does so in a chronological + context: revision 3 is more up-to-date than revision 2. Thus, when + this gets incremented, the cache handling is smart enough to clean + up any older revisions of your definition as well as flush the + cache. This directive has been available since 1.7.0. 
+</p> +'); + HTMLPurifier_ConfigSchema::define( 'HTML', 'BlockWrapper', 'p', 'string', ' <p> diff --git a/tests/HTMLPurifier/DefinitionCache/SerializerTest.php b/tests/HTMLPurifier/DefinitionCache/SerializerTest.php index 20d877d0..99867b03 100644 --- a/tests/HTMLPurifier/DefinitionCache/SerializerTest.php +++ b/tests/HTMLPurifier/DefinitionCache/SerializerTest.php @@ -11,10 +11,10 @@ class HTMLPurifier_DefinitionCache_SerializerTest extends HTMLPurifier_Definitio $cache = new HTMLPurifier_DefinitionCache_Serializer('Test'); $config = $this->generateConfigMock('serial'); + $config->setReturnValue('get', 2, array('Test', 'DefinitionRev')); $config->version = '1.0.0'; - $config->revision = 2; - $config_md5 = '1.0.0-' . $config->revision . '-serial'; + $config_md5 = '1.0.0-2-serial'; $file = realpath( $rel_file = dirname(__FILE__) . @@ -120,12 +120,12 @@ class HTMLPurifier_DefinitionCache_SerializerTest extends HTMLPurifier_Definitio $config1 = $this->generateConfigMock(); $config1->version = '0.9.0'; - $config1->revision = 574; + $config1->setReturnValue('get', 574, array('Test', 'DefinitionRev')); $def1 = $this->generateDefinition(array('info' => 1)); $config2 = $this->generateConfigMock(); $config2->version = '1.0.0beta'; - $config2->revision = 1; + $config2->setReturnValue('get', 1, array('Test', 'DefinitionRev')); $def2 = $this->generateDefinition(array('info' => 3)); $cache->set($def1, $config1); @@ -158,7 +158,7 @@ class HTMLPurifier_DefinitionCache_SerializerTest extends HTMLPurifier_Definitio $cache = new HTMLPurifier_DefinitionCache_Serializer('Test'); $config = $this->generateConfigMock('serial'); $config->version = '1.0.0'; - $config->revision = 1; + $config->setReturnValue('get', 1, array('Test', 'DefinitionRev')); $dir = dirname(__FILE__) . 
'/SerializerTest'; $config->setReturnValue('get', $dir, array('Cache', 'SerializerPath')); diff --git a/tests/HTMLPurifier/DefinitionCacheHarness.php b/tests/HTMLPurifier/DefinitionCacheHarness.php index ebc15d68..efcb49a4 100644 --- a/tests/HTMLPurifier/DefinitionCacheHarness.php +++ b/tests/HTMLPurifier/DefinitionCacheHarness.php @@ -13,7 +13,6 @@ class HTMLPurifier_DefinitionCacheHarness extends UnitTestCase $config = new HTMLPurifier_ConfigMock($this); $config->setReturnValue('getBatchSerial', $serial, array('Test')); $config->version = '1.0.0'; - $config->revision = 1; return $config; } diff --git a/tests/HTMLPurifier/DefinitionCacheTest.php b/tests/HTMLPurifier/DefinitionCacheTest.php index eb78a11c..702712ff 100644 --- a/tests/HTMLPurifier/DefinitionCacheTest.php +++ b/tests/HTMLPurifier/DefinitionCacheTest.php @@ -8,9 +8,16 @@ class HTMLPurifier_DefinitionCacheTest extends UnitTestCase function test_isOld() { $cache = new HTMLPurifier_DefinitionCache('Test'); // non-functional + $old_copy = HTMLPurifier_ConfigSchema::instance(); + $o = new HTMLPurifier_ConfigSchema(); + HTMLPurifier_ConfigSchema::instance($o); + + HTMLPurifier_ConfigSchema::defineNamespace('Test', 'Test namespace'); + HTMLPurifier_ConfigSchema::define('Test', 'DefinitionRev', 1, 'int', 'Definition revision.'); + $config = HTMLPurifier_Config::createDefault(); $config->version = '1.0.0'; - $config->revision = 10; + $config->set('Test', 'DefinitionRev', 10); $this->assertIdentical($cache->isOld('1.0.0-10-hashstuffhere', $config), false); $this->assertIdentical($cache->isOld('1.5.0-1-hashstuffhere', $config), false); @@ -18,6 +25,9 @@ class HTMLPurifier_DefinitionCacheTest extends UnitTestCase $this->assertIdentical($cache->isOld('0.9.0-1-hashstuffhere', $config), true); $this->assertIdentical($cache->isOld('1.0.0-1-hashstuffhere', $config), true); $this->assertIdentical($cache->isOld('1.0.0beta-11-hashstuffhere', $config), true); + + HTMLPurifier_ConfigSchema::instance($old_copy); + } }