Relax allowed values of class for certain doctypes, see %Attr.ClassUseCDATA

Signed-off-by: Edward Z. Yang <edwardzyang@thewritingpot.com>
2024-12-22 08:21:52 +00:00 · 2009-05-26 01:07:40 -04:00 · 2009-05-26 01:07:40 -04:00 · 84abae08f5
commit 84abae08f5
parent 10e2d32a79
8 changed files with 64 additions and 7 deletions
--- a/5
+++ b/5
@ -18,8 +18,11 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
    %FilterParam.ExtractStyleBlocksEscaping -> %Filter.ExtractStyleBlocks.Escaping
    %FilterParam.ExtractStyleBlocksScope -> %Filter.ExtractStyleBlocks.Scope
    %FilterParam.ExtractStyleBlocksTidyImpl -> %Filter.ExtractStyleBlocks.TidyImpl
-  As usual, the old directive names will still work, but will through E_NOTICE
+  As usual, the old directive names will still work, but will throw E_NOTICE
  errors.
+# The allowed values for class have been relaxed to allow all of CDATA for
+  doctypes that are not XHTML 1.1 or XHTML 2.0.  For old behavior, set
+  %Attr.ClassUseCDATA to false.
 ! More robust support for name="" and id=""
 ! HTMLPurifier_Config::inherit($config) allows you to inherit one
  configuration, and have changes to that configuration be propagated
--- a/2
+++ b/2
@ -18,8 +18,6 @@ afraid to cast your vote for the next feature to be implemented!
  http://htmlpurifier.org/phorum/read.php?3,3491,3548
 - Fix ImgRequired to handle data correctly
 - Think about allowing explicit order of operations hooks for transforms
- Allow more relaxed "class" definition than NMTOKENS for appropriate
-  doctypes

 FUTURE VERSIONS
 ---------------
--- a/library/HTMLPurifier/AttrDef/HTML/Class.php
+++ b/library/HTMLPurifier/AttrDef/HTML/Class.php
@ -5,6 +5,15 @@
 */
 class HTMLPurifier_AttrDef_HTML_Class extends HTMLPurifier_AttrDef_HTML_Nmtokens
 {
+    protected function split($string, $config, $context) {
+        // really, this twiddle should be lazy loaded
+        $name = $config->getDefinition('HTML')->doctype->name;
+        if ($name == "XHTML 1.1" || $name == "XHTML 2.0") {
+            return parent::split($string, $config, $context);
+        } else {
+            return preg_split('/\s+/', $string);
+        }
+    }
    protected function filter($tokens, $config, $context) {
        $allowed = $config->get('Attr.AllowedClasses');
        $forbidden = $config->get('Attr.ForbiddenClasses');
@ -14,9 +23,9 @@ class HTMLPurifier_AttrDef_HTML_Class extends HTMLPurifier_AttrDef_HTML_Nmtokens
                ($allowed === null || isset($allowed[$token])) &&
                !isset($forbidden[$token])
            ) {
-                $ret[] = $token;
+                $ret[$token] = true;
            }
        }
-        return $ret;
+        return array_keys($ret);
    }
 }
--- a/library/HTMLPurifier/AttrDef/HTML/Nmtokens.php
+++ b/library/HTMLPurifier/AttrDef/HTML/Nmtokens.php
@ -13,7 +13,7 @@ class HTMLPurifier_AttrDef_HTML_Nmtokens extends HTMLPurifier_AttrDef
        // early abort: '' and '0' (strings that convert to false) are invalid
        if (!$string) return false;

-        $tokens = $this->split($string);
+        $tokens = $this->split($string, $config, $context);
        $tokens = $this->filter($tokens, $config, $context);
        if (empty($tokens)) return false;
        return implode(' ', $tokens);
@ -23,7 +23,7 @@ class HTMLPurifier_AttrDef_HTML_Nmtokens extends HTMLPurifier_AttrDef
    /**
     * Splits a space separated list of tokens into its constituent parts.
     */
-    protected function split($string) {
+    protected function split($string, $config, $context) {
        // OPTIMIZABLE!
        // do the preg_match, capture all subpatterns for reformulation

--- a/library/HTMLPurifier/ConfigSchema/schema.ser
+++ b/library/HTMLPurifier/ConfigSchema/schema.ser
--- a/library/HTMLPurifier/ConfigSchema/schema/Attr.ClassUseCDATA.txt
+++ b/library/HTMLPurifier/ConfigSchema/schema/Attr.ClassUseCDATA.txt
@ -0,0 +1,19 @@
+Attr.ClassUseCDATA
+TYPE: bool/null
+DEFAULT: null
+VERSION: 4.0.0
+--DESCRIPTION--
+If null, class will auto-detect the doctype and, if matching XHTML 1.1 or
+XHTML 2.0, will use the restrictive NMTOKENS specification of class. Otherwise,
+it will use a relaxed CDATA definition.  If true, the relaxed CDATA definition
+is forced; if false, the NMTOKENS definition is forced.  To get behavior
+of HTML Purifier prior to 4.0.0, set this directive to false.
+
+Some rationale behind the auto-detection:
+in previous versions of HTML Purifier, it was assumed that the form of
+class was NMTOKENS, as specified by the XHTML Modularization (representing
+XHTML 1.1 and XHTML 2.0).  The DTDs for HTML 4.01 and XHTML 1.0, however,
+specify class as CDATA.  HTML 5 effectively defines it as CDATA, but
+with the additional constraint that each name should be unique (this is not
+explicitly outlined in previous specifications).
+--# vim: et sw=4 sts=4
--- a/tests/HTMLPurifier/AttrDef/HTML/ClassTest.php
+++ b/tests/HTMLPurifier/AttrDef/HTML/ClassTest.php
@ -18,4 +18,31 @@ class HTMLPurifier_AttrDef_HTML_ClassTest extends HTMLPurifier_AttrDef_HTML_Nmto
        $this->assertDef('bar', false);
        $this->assertDef('foo bar', 'foo');
    }
+    function testDefault() {
+        $this->assertDef('valid');
+        $this->assertDef('a0-_');
+        $this->assertDef('-valid');
+        $this->assertDef('_valid');
+        $this->assertDef('double valid');
+
+        $this->assertDef('0stillvalid');
+        $this->assertDef('-0');
+
+        // test conditional replacement
+        $this->assertDef('validassoc 0valid', 'validassoc 0valid');
+
+        // test whitespace leniency
+        $this->assertDef(" double\nvalid\r", 'double valid');
+
+        // test case sensitivity
+        $this->assertDef('VALID');
+
+        // test duplicate removal
+        $this->assertDef('valid valid', 'valid');
+    }
+    function testXHTML11Behavior() {
+        $this->config->set('HTML.Doctype', 'XHTML 1.1');
+        $this->assertDef('0invalid', false);
+        $this->assertDef('valid valid', 'valid');
+    }
 }
--- a/tests/HTMLPurifier/Strategy/ValidateAttributesTest.php
+++ b/tests/HTMLPurifier/Strategy/ValidateAttributesTest.php
@ -32,6 +32,7 @@ class HTMLPurifier_Strategy_ValidateAttributesTest extends
    }

    function testSelectivelyRemoveInvalidClasses() {
+        $this->config->set('HTML.Doctype', 'XHTML 1.1');
        $this->assertResult(
            '<div class="valid 0invalid">Keep valid.</div>',
            '<div class="valid">Keep valid.</div>'