diff --git a/NEWS b/NEWS
index 7cb1117c..8dd7d144 100644
--- a/NEWS
+++ b/NEWS
@@ -19,6 +19,9 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
spaces. This constitutes a slight semantic change, which can be
reverted using %Output.FixInnerHTML. Reported by Neike Taika-Tessaro
and Mario Heiderich.
+# Protect against cssText/innerHTML by restricting allowed characters
+ used in fonts further than mandated by the specification. Reported
+ by Neike Taika-Tessaro and Mario Heiderich.
! Added %HTML.Nofollow to add rel="nofollow" to external links.
! More types of SPL autoloaders allowed on later versions of PHP.
! Implementations for position, top, left, right, bottom, z-index
diff --git a/configdoc/usage.xml b/configdoc/usage.xml
index a401a9ee..5704b095 100644
--- a/configdoc/usage.xml
+++ b/configdoc/usage.xml
@@ -96,27 +96,32 @@
+
+
+ 62
- 57
+ 63
- 58
+ 64
- 87
+ 93
- 101
+ 107
266
@@ -124,7 +129,7 @@
- 102
+ 108
@@ -254,6 +259,9 @@
64
+
+ 75
+
@@ -288,6 +296,11 @@
12
+
+
+ 50
+
+
18
@@ -454,7 +467,7 @@
- 45
+ 53
19
diff --git a/library/HTMLPurifier/AttrDef/CSS/FontFamily.php b/library/HTMLPurifier/AttrDef/CSS/FontFamily.php
index c29834b6..0d9a4e12 100644
--- a/library/HTMLPurifier/AttrDef/CSS/FontFamily.php
+++ b/library/HTMLPurifier/AttrDef/CSS/FontFamily.php
@@ -6,6 +6,39 @@
class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
{
+ protected $mask = null;
+
+ public function __construct() {
+ $this->mask = '- ';
+ for ($c = 'a'; $c <= 'z'; $c++) $this->mask .= $c;
+ for ($c = 'A'; $c <= 'Z'; $c++) $this->mask .= $c;
+ for ($c = '0'; $c <= '9'; $c++) $this->mask .= $c; // cast-y, but should be fine
+ // special bytes used by UTF-8
+ for ($i = 0x80; $i <= 0xFF; $i++) {
+ // We don't bother excluding invalid bytes in this range,
+ // because the our restriction of well-formed UTF-8 will
+ // prevent these from ever occurring.
+ $this->mask .= chr($i);
+ }
+
+ /*
+ PHP's internal strcspn implementation is
+ O(length of string * length of mask), making it inefficient
+ for large masks. However, it's still faster than
+ preg_match 8)
+ for (p = s1;;) {
+ spanp = s2;
+ do {
+ if (*spanp == c || p == s1_end) {
+ return p - s1;
+ }
+ } while (spanp++ < (s2_end - 1));
+ c = *++p;
+ }
+ */
+ // possible optimization: invert the mask.
+ }
+
public function validate($string, $config, $context) {
static $generic_names = array(
'serif' => true,
@@ -56,17 +89,103 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
// shouldn't show up regardless
$font = str_replace(array("\n", "\t", "\r", "\x0C"), ' ', $font);
- // These ugly transforms don't pose a security
- // risk (as \\ and \" might). We could try to be clever and
- // use single-quote wrapping when there is a double quote
- // present, but I have choosen not to implement that.
- // (warning: this code relies on the selection of quotation
- // mark below)
- $font = str_replace('\\', '\\5C ', $font);
- $font = str_replace('"', '\\22 ', $font);
+ // Here, there are various classes of characters which need
+ // to be treated differently:
+ // - Alphanumeric characters are essentially safe. We
+ // handled these above.
+ // - Spaces require quoting, though most parsers will do
+ // the right thing if there aren't any characters that
+ // can be misinterpreted
+ // - Dashes rarely occur, but they fairly unproblematic
+ // for parsing/rendering purposes.
+ // The above characters cover the majority of Western font
+ // names.
+ // - Arbitrary Unicode characters not in ASCII. Because
+ // most parsers give little thought to Unicode, treatment
+ // of these codepoints is basically uniform, even for
+ // punctuation-like codepoints. These characters can
+ // show up in non-Western pages and are supported by most
+ // major browsers, for example: "MS 明朝" is a
+ // legitimate font-name
+ // . See
+ // the CSS3 spec for more examples:
+ //
+ // You can see live samples of these on the Internet:
+ //
+ // However, most of these fonts have ASCII equivalents:
+ // for example, 'MS Mincho', and it's considered
+ // professional to use ASCII font names instead of
+ // Unicode font names. Thanks Takeshi Terada for
+ // providing this information.
+ // The following characters, to my knowledge, have not been
+ // used to name font names.
+ // - Single quote. While theoretically you might find a
+ // font name that has a single quote in its name (serving
+ // as an apostrophe, e.g. Dave's Scribble), I haven't
+ // been able to find any actual examples of this.
+ // Internet Explorer's cssText translation (which I
+ // believe is invoked by innerHTML) normalizes any
+ // quoting to single quotes, and fails to escape single
+ // quotes. (Note that this is not IE's behavior for all
+ // CSS properties, just some sort of special casing for
+ // font-family). So a single quote *cannot* be used
+ // safely in the font-family context if there will be an
+ // innerHTML/cssText translation. Note that Firefox 3.x
+ // does this too.
+ // - Double quote. In IE, these get normalized to
+ // single-quotes, no matter what the encoding. (Fun
+ // fact, in IE8, the 'content' CSS property gained
+ // support, where they special cased to preserve encoded
+ // double quotes, but still translate unadorned double
+ // quotes into single quotes.) So, because their
+ // fixpoint behavior is identical to single quotes, they
+ // cannot be allowed either. Firefox 3.x displays
+ // single-quote style behavior.
+ // - Backslashes are reduced by one (so \\ -> \) every
+ // iteration, so they cannot be used safely. This shows
+ // up in IE7, IE8 and FF3
+ // - Semicolons, commas and backticks are handled properly.
+ // - The rest of the ASCII punctuation is handled properly.
+ // We haven't checked what browsers do to unadorned
+ // versions, but this is not important as long as the
+ // browser doesn't /remove/ surrounding quotes (as IE does
+ // for HTML).
+ //
+ // With these results in hand, we conclude that there are
+ // various levels of safety:
+ // - Paranoid: alphanumeric, spaces and dashes(?)
+ // - International: Paranoid + non-ASCII Unicode
+ // - Edgy: Everything except quotes, backslashes
+ // - NoJS: Standards compliance, e.g. sod IE. Note that
+ // with some judicious character escaping (since certain
+ // types of escaping doesn't work) this is theoretically
+ // OK as long as innerHTML/cssText is not called.
+ // We believe that international is a reasonable default
+ // (that we will implement now), and once we do more
+ // extensive research, we may feel comfortable with dropping
+ // it down to edgy.
- // complicated font, requires quoting
- $final .= "\"$font\", "; // note that this will later get turned into "
+ // Edgy: alphanumeric, spaces, dashes and Unicode. Use of
+ // str(c)spn assumes that the string was already well formed
+ // Unicode (which of course it is).
+ if (strspn($font, $this->mask) !== strlen($font)) {
+ continue;
+ }
+
+ // Historical:
+ // In the absence of innerHTML/cssText, these ugly
+ // transforms don't pose a security risk (as \\ and \"
+ // might--these escapes are not supported by most browsers).
+ // We could try to be clever and use single-quote wrapping
+ // when there is a double quote present, but I have choosen
+ // not to implement that. (NOTE: you can reduce the amount
+ // of escapes by one depending on what quoting style you use)
+ // $font = str_replace('\\', '\\5C ', $font);
+ // $font = str_replace('"', '\\22 ', $font);
+ // $font = str_replace("'", '\\27 ', $font);
+
+ // font possibly with spaces, requires quoting
+ $final .= "'$font', ";
}
$final = rtrim($final, ', ');
if ($final === '') return false;
diff --git a/library/HTMLPurifier/URI.php b/library/HTMLPurifier/URI.php
index 92bff87a..efdfb2c6 100644
--- a/library/HTMLPurifier/URI.php
+++ b/library/HTMLPurifier/URI.php
@@ -185,7 +185,7 @@ class HTMLPurifier_URI
// Reconstruct the result
// One might wonder about parsing quirks from browsers after
- // this reconstruction. Unfortunately, parsing behaviro depends
+ // this reconstruction. Unfortunately, parsing behavior depends
// on what *scheme* was employed (file:///foo is handled *very*
// differently than http:///foo), so unfortunately we have to
// defer to the schemes to do the right thing.
diff --git a/tests/HTMLPurifier/AttrDef/CSS/FontFamilyTest.php b/tests/HTMLPurifier/AttrDef/CSS/FontFamilyTest.php
index 2060076b..fda8e01f 100644
--- a/tests/HTMLPurifier/AttrDef/CSS/FontFamilyTest.php
+++ b/tests/HTMLPurifier/AttrDef/CSS/FontFamilyTest.php
@@ -8,30 +8,32 @@ class HTMLPurifier_AttrDef_CSS_FontFamilyTest extends HTMLPurifier_AttrDefHarnes
$this->def = new HTMLPurifier_AttrDef_CSS_FontFamily();
$this->assertDef('Gill, Helvetica, sans-serif');
- $this->assertDef('"Times New Roman", serif');
- $this->assertDef('\'Times New Roman\'', '"Times New Roman"');
+ $this->assertDef("'Times New Roman', serif");
+ $this->assertDef("\"Times New Roman\"", "'Times New Roman'");
$this->assertDef('01234');
$this->assertDef(',', false);
- $this->assertDef('Times New Roman, serif', '"Times New Roman", serif');
- $this->assertDef($d = '"John\'s Font"');
- $this->assertDef("John's Font", $d);
- $this->assertDef($d = "\"\xE5\xAE\x8B\xE4\xBD\x93\"");
+ $this->assertDef('Times New Roman, serif', "'Times New Roman', serif");
+ $this->assertDef($d = "'\xE5\xAE\x8B\xE4\xBD\x93'");
$this->assertDef("\xE5\xAE\x8B\xE4\xBD\x93", $d);
- $this->assertDef("'\\','f'", "\"\\5C \", f");
- $this->assertDef("'\\01'", "\"\"");
- $this->assertDef("'\\20'", "\" \"");
- $this->assertDef("\\0020", "\" \"");
+ $this->assertDef("'\\01'", "''");
+ $this->assertDef("'\\20'", "' '");
+ $this->assertDef("\\0020", "' '");
$this->assertDef("'\\000045'", "E");
$this->assertDef("','", false);
- $this->assertDef("',' foobar','", "\" foobar\"");
- $this->assertDef("'\\27'", "\"'\"");
- $this->assertDef('"\\22"', "\"\\22 \"");
- $this->assertDef('"\\""', "\"\\22 \"");
- $this->assertDef('"\'"', "\"'\"");
+ $this->assertDef("',' foobar','", "' foobar'");
$this->assertDef("'\\000045a'", "Ea");
$this->assertDef("'\\00045 a'", "Ea");
- $this->assertDef("'\\00045 a'", "\"E a\"");
+ $this->assertDef("'\\00045 a'", "'E a'");
$this->assertDef("'\\\nf'", "f");
+ // No longer supported, except maybe in NoJS mode (see source
+ // file for more explanation)
+ //$this->assertDef($d = '"John\'s Font"');
+ //$this->assertDef("John's Font", $d);
+ //$this->assertDef("'\\','f'", "\"\\5C \", f");
+ //$this->assertDef("'\\27'", "\"'\"");
+ //$this->assertDef('"\\22"', "\"\\22 \"");
+ //$this->assertDef('"\\""', "\"\\22 \"");
+ //$this->assertDef('"\'"', "\"'\"");
}
function testAllowed() {
@@ -40,8 +42,8 @@ class HTMLPurifier_AttrDef_CSS_FontFamilyTest extends HTMLPurifier_AttrDefHarnes
$this->assertDef('serif');
$this->assertDef('sans-serif', false);
$this->assertDef('serif, sans-serif', 'serif');
- $this->assertDef('Times New Roman', '"Times New Roman"');
- $this->assertDef('"Times New Roman"');
+ $this->assertDef('Times New Roman', "'Times New Roman'");
+ $this->assertDef("'Times New Roman'");
$this->assertDef('foo', false);
}
diff --git a/tests/HTMLPurifier/AttrDef/CSS/FontTest.php b/tests/HTMLPurifier/AttrDef/CSS/FontTest.php
index 40559485..91870d13 100644
--- a/tests/HTMLPurifier/AttrDef/CSS/FontTest.php
+++ b/tests/HTMLPurifier/AttrDef/CSS/FontTest.php
@@ -11,10 +11,10 @@ class HTMLPurifier_AttrDef_CSS_FontTest extends HTMLPurifier_AttrDefHarness
// hodgepodge of usage cases from W3C spec, but " -> '
$this->assertDef('12px/14px sans-serif');
$this->assertDef('80% sans-serif');
- $this->assertDef('x-large/110% "New Century Schoolbook", serif');
+ $this->assertDef("x-large/110% 'New Century Schoolbook', serif");
$this->assertDef('bold italic large Palatino, serif');
$this->assertDef('normal small-caps 120%/120% fantasy');
- $this->assertDef('300 italic 1.3em/1.7em "FB Armada", sans-serif');
+ $this->assertDef("300 italic 1.3em/1.7em 'FB Armada', sans-serif");
$this->assertDef('600 9px Charcoal');
$this->assertDef('600 9px/ 12px Charcoal', '600 9px/12px Charcoal');
diff --git a/tests/HTMLPurifier/AttrDef/CSSTest.php b/tests/HTMLPurifier/AttrDef/CSSTest.php
index 72a17e1a..56917aec 100644
--- a/tests/HTMLPurifier/AttrDef/CSSTest.php
+++ b/tests/HTMLPurifier/AttrDef/CSSTest.php
@@ -62,7 +62,7 @@ class HTMLPurifier_AttrDef_CSSTest extends HTMLPurifier_AttrDefHarness
$this->assertDef('width:-50px;', false);
$this->assertDef('text-decoration:underline;');
$this->assertDef('font-family:sans-serif;');
- $this->assertDef('font-family:Gill, "Times New Roman", sans-serif;');
+ $this->assertDef("font-family:Gill, 'Times New Roman', sans-serif;");
$this->assertDef('font:12px serif;');
$this->assertDef('border:1px solid #000;');
$this->assertDef('border-bottom:2em double #FF00FA;');
diff --git a/tests/HTMLPurifier/Filter/ExtractStyleBlocksTest.php b/tests/HTMLPurifier/Filter/ExtractStyleBlocksTest.php
index e93d65e6..d3fb38f1 100644
--- a/tests/HTMLPurifier/Filter/ExtractStyleBlocksTest.php
+++ b/tests/HTMLPurifier/Filter/ExtractStyleBlocksTest.php
@@ -79,10 +79,13 @@ class HTMLPurifier_Filter_ExtractStyleBlocksTest extends HTMLPurifier_Harness
}
function test_cleanCSS_angledBrackets() {
- $this->assertCleanCSS(
- ".class {\nfont-family:'';\n}",
- ".class {\nfont-family:\"\\3C /style\\3E \";\n}"
- );
+ // [Content] No longer can smuggle in angled brackets using
+ // font-family; when we add support for 'content', reinstate
+ // this test.
+ //$this->assertCleanCSS(
+ // ".class {\nfont-family:'';\n}",
+ // ".class {\nfont-family:\"\\3C /style\\3E \";\n}"
+ //);
}
function test_cleanCSS_angledBrackets2() {
@@ -97,6 +100,7 @@ class HTMLPurifier_Filter_ExtractStyleBlocksTest extends HTMLPurifier_Harness
$this->assertCleanCSS("div {bogus:tree;}", "div {\n}");
}
+ /* [CONTENT]
function test_cleanCSS_escapeCodes() {
$this->assertCleanCSS(
".class {\nfont-family:\"\\3C /style\\3E \";\n}"
@@ -109,6 +113,7 @@ class HTMLPurifier_Filter_ExtractStyleBlocksTest extends HTMLPurifier_Harness
".class {\nfont-family:\"\";\n}"
);
}
+ */
function test_cleanCSS_scope() {
$this->config->set('Filter.ExtractStyleBlocks.Scope', '#foo');
diff --git a/tests/HTMLPurifier/HTMLT/shift-jis-preserve-yen.htmlt b/tests/HTMLPurifier/HTMLT/shift-jis-preserve-yen.htmlt
index 6e9640be..f22417c0 100644
--- a/tests/HTMLPurifier/HTMLT/shift-jis-preserve-yen.htmlt
+++ b/tests/HTMLPurifier/HTMLT/shift-jis-preserve-yen.htmlt
@@ -4,5 +4,5 @@ if (!function_exists('iconv')) return true;
Core.Encoding = "Shift_JIS"
Core.EscapeNonASCIICharacters = true
--HTML--
-111
+111
--# vim: et sw=4 sts=4
diff --git a/tests/HTMLPurifier/HTMLT/shift-jis-remove-yen.htmlt b/tests/HTMLPurifier/HTMLT/shift-jis-remove-yen.htmlt
index dd6dee6c..6c2d3bc4 100644
--- a/tests/HTMLPurifier/HTMLT/shift-jis-remove-yen.htmlt
+++ b/tests/HTMLPurifier/HTMLT/shift-jis-remove-yen.htmlt
@@ -3,7 +3,7 @@ if (!function_exists('iconv')) return true;
--INI--
Core.Encoding = Shift_JIS
--HTML--
-111
+111
--EXPECT--
-111
+111
--# vim: et sw=4 sts=4