diff --git a/NEWS b/NEWS index 25eae785..1ac9e0f1 100644 --- a/NEWS +++ b/NEWS @@ -15,6 +15,13 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier to http. Reported by Neike Taika-Tessaro. # Core.EscapeNonASCIICharacters now always transforms entities to entities, even if target encoding is UTF-8. +# Tighten up selector validation in ExtractStyleBlocks. + Non-syntactically valid selectors are now rejected, along with + some of the more obscure ones such as attribute selectors, the + :lang pseudoselector, and anything not in CSS2.1. Furthermore, + ID and class selectors now work properly with the relevant + configuration attributes. Also, mute errors when parsing CSS + with CSS Tidy. ! Added support for 'scope' attribute on tables. ! Added %HTML.TargetBlank, which adds target="blank" to all outgoing links. ! Properly handle sub-lists directly nested inside of lists in diff --git a/configdoc/usage.xml b/configdoc/usage.xml index e53be30d..f2fcea6c 100644 --- a/configdoc/usage.xml +++ b/configdoc/usage.xml @@ -323,23 +323,23 @@ - 20 + 30 - 26 + 36 - 28 - 31 + 38 + 41 - 54 + 64 @@ -411,17 +411,17 @@ - 41 + 53 - 65 + 77 - 123 + 275 @@ -513,7 +513,7 @@ - 8 + 12 diff --git a/library/HTMLPurifier.includes.php b/library/HTMLPurifier.includes.php index 47eba544..76a3bb9a 100644 --- a/library/HTMLPurifier.includes.php +++ b/library/HTMLPurifier.includes.php @@ -91,6 +91,7 @@ require 'HTMLPurifier/AttrDef/CSS/DenyElementDecorator.php'; require 'HTMLPurifier/AttrDef/CSS/Filter.php'; require 'HTMLPurifier/AttrDef/CSS/Font.php'; require 'HTMLPurifier/AttrDef/CSS/FontFamily.php'; +require 'HTMLPurifier/AttrDef/CSS/Ident.php'; require 'HTMLPurifier/AttrDef/CSS/ImportantDecorator.php'; require 'HTMLPurifier/AttrDef/CSS/Length.php'; require 'HTMLPurifier/AttrDef/CSS/ListStyle.php'; diff --git a/library/HTMLPurifier.safe-includes.php b/library/HTMLPurifier.safe-includes.php index 3902b878..d49b196c 100644 --- a/library/HTMLPurifier.safe-includes.php +++ b/library/HTMLPurifier.safe-includes.php @@ -85,6 +85,7 @@ require_once $__dir . '/HTMLPurifier/AttrDef/CSS/DenyElementDecorator.php'; require_once $__dir . '/HTMLPurifier/AttrDef/CSS/Filter.php'; require_once $__dir . '/HTMLPurifier/AttrDef/CSS/Font.php'; require_once $__dir . '/HTMLPurifier/AttrDef/CSS/FontFamily.php'; +require_once $__dir . '/HTMLPurifier/AttrDef/CSS/Ident.php'; require_once $__dir . '/HTMLPurifier/AttrDef/CSS/ImportantDecorator.php'; require_once $__dir . '/HTMLPurifier/AttrDef/CSS/Length.php'; require_once $__dir . '/HTMLPurifier/AttrDef/CSS/ListStyle.php'; diff --git a/library/HTMLPurifier/AttrDef/CSS/Ident.php b/library/HTMLPurifier/AttrDef/CSS/Ident.php new file mode 100644 index 00000000..779794a0 --- /dev/null +++ b/library/HTMLPurifier/AttrDef/CSS/Ident.php @@ -0,0 +1,24 @@ +selector = $selector; + } public function validate($id, $config, $context) { - if (!$config->get('Attr.EnableID')) return false; + if (!$this->selector && !$config->get('Attr.EnableID')) return false; $id = trim($id); // trim it first @@ -33,10 +43,10 @@ class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef '%Attr.IDPrefix is set', E_USER_WARNING); } - //if (!$this->ref) { + if (!$this->selector) { $id_accumulator =& $context->get('IDAccumulator'); if (isset($id_accumulator->ids[$id])) return false; - //} + } // we purposely avoid using regex, hopefully this is faster @@ -56,7 +66,7 @@ class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef return false; } - if (/*!$this->ref && */$result) $id_accumulator->add($id); + if (!$this->selector && $result) $id_accumulator->add($id); // if no change was made to the ID, return the result // else, return the new id if stripping whitespace made it diff --git a/library/HTMLPurifier/Filter/ExtractStyleBlocks.php b/library/HTMLPurifier/Filter/ExtractStyleBlocks.php index bbf78a66..39621ee2 100644 --- a/library/HTMLPurifier/Filter/ExtractStyleBlocks.php +++ b/library/HTMLPurifier/Filter/ExtractStyleBlocks.php @@ -21,10 +21,22 @@ class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter private $_styleMatches = array(); private $_tidy; + private $_id_attrdef; + private $_class_attrdef; + private $_enum_attrdef; + public function __construct() { $this->_tidy = new csstidy(); + $this->_id_attrdef = new HTMLPurifier_AttrDef_HTML_ID(true); + $this->_class_attrdef = new HTMLPurifier_AttrDef_CSS_Ident(); + $this->_enum_attrdef = new HTMLPurifier_AttrDef_Enum(array('first-child', 'link', 'visited', 'active', 'hover', 'focus')); } + /** + * Error-handler that mutes errors, alternative to shut-up operator. + */ + public static function muteErrorHandler() {} + /** * Save the contents of CSS blocks to style matches * @param $matches preg_replace style $matches array @@ -77,27 +89,166 @@ class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter $css = substr($css, 0, -3); } $css = trim($css); + set_error_handler(array('HTMLPurifier_Filter_ExtractStyleBlocks', 'muteErrorHandler')); $this->_tidy->parse($css); + restore_error_handler(); $css_definition = $config->getDefinition('CSS'); + $html_definition = $config->getDefinition('HTML'); + $new_css = array(); foreach ($this->_tidy->css as $k => $decls) { // $decls are all CSS declarations inside an @ selector $new_decls = array(); foreach ($decls as $selector => $style) { $selector = trim($selector); if ($selector === '') continue; // should not happen - if ($selector[0] === '+') { - if ($selector !== '' && $selector[0] === '+') continue; - } - if (!empty($scopes)) { - $new_selector = array(); // because multiple ones are possible - $selectors = array_map('trim', explode(',', $selector)); - foreach ($scopes as $s1) { - foreach ($selectors as $s2) { - $new_selector[] = "$s1 $s2"; + // Parse the selector + // Here is the relevant part of the CSS grammar: + // + // ruleset + // : selector [ ',' S* selector ]* '{' ... + // selector + // : simple_selector [ combinator selector | S+ [ combinator? selector ]? ]? + // combinator + // : '+' S* + // : '>' S* + // simple_selector + // : element_name [ HASH | class | attrib | pseudo ]* + // | [ HASH | class | attrib | pseudo ]+ + // element_name + // : IDENT | '*' + // ; + // class + // : '.' IDENT + // ; + // attrib + // : '[' S* IDENT S* [ [ '=' | INCLUDES | DASHMATCH ] S* + // [ IDENT | STRING ] S* ]? ']' + // ; + // pseudo + // : ':' [ IDENT | FUNCTION S* [IDENT S*]? ')' ] + // ; + // + // For reference, here are the relevant tokens: + // + // HASH #{name} + // IDENT {ident} + // INCLUDES == + // DASHMATCH |= + // STRING {string} + // FUNCTION {ident}\( + // + // And the lexical scanner tokens + // + // name {nmchar}+ + // nmchar [_a-z0-9-]|{nonascii}|{escape} + // nonascii [\240-\377] + // escape {unicode}|\\[^\r\n\f0-9a-f] + // unicode \\{h}}{1,6}(\r\n|[ \t\r\n\f])? + // ident -?{nmstart}{nmchar*} + // nmstart [_a-z]|{nonascii}|{escape} + // string {string1}|{string2} + // string1 \"([^\n\r\f\\"]|\\{nl}|{escape})*\" + // string2 \'([^\n\r\f\\"]|\\{nl}|{escape})*\' + // + // We'll implement a subset (in order to reduce attack + // surface); in particular: + // + // - No Unicode support + // - No escapes support + // - No string support (by proxy no attrib support) + // - element_name is matched against allowed + // elements (some people might find this + // annoying...) + // - Pseudo-elements one of :first-child, :link, + // :visited, :active, :hover, :focus + + // handle ruleset + $selectors = array_map('trim', explode(',', $selector)); + $new_selectors = array(); + foreach ($selectors as $sel) { + // split on +, > and spaces + $basic_selectors = preg_split('/\s*([+> ])\s*/', $sel, -1, PREG_SPLIT_DELIM_CAPTURE); + // even indices are chunks, odd indices are + // delimiters + $nsel = null; + $delim = null; // guaranteed to be non-null after + // two loop iterations + for ($i = 0, $c = count($basic_selectors); $i < $c; $i++) { + $x = $basic_selectors[$i]; + if ($i % 2) { + // delimiter + if ($x === ' ') { + $delim = ' '; + } else { + $delim = ' ' . $x . ' '; + } + } else { + // simple selector + $components = preg_split('/([#.:])/', $x, -1, PREG_SPLIT_DELIM_CAPTURE); + $sdelim = null; + $nx = null; + for ($j = 0, $cc = count($components); $j < $cc; $j ++) { + $y = $components[$j]; + if ($j === 0) { + if ($y === '*' || isset($html_definition->info[$y = strtolower($y)])) { + $nx = $y; + } else { + // $nx stays null; this matters + // if we don't manage to find + // any valid selector content, + // in which case we ignore the + // outer $delim + } + } elseif ($j % 2) { + // set delimiter + $sdelim = $y; + } else { + $attrdef = null; + if ($sdelim === '#') { + $attrdef = $this->_id_attrdef; + } elseif ($sdelim === '.') { + $attrdef = $this->_class_attrdef; + } elseif ($sdelim === ':') { + $attrdef = $this->_enum_attrdef; + } else { + throw new HTMLPurifier_Exception('broken invariant sdelim and preg_split'); + } + $r = $attrdef->validate($y, $config, $context); + if ($r !== false) { + if ($r !== true) { + $y = $r; + } + if ($nx === null) { + $nx = ''; + } + $nx .= $sdelim . $y; + } + } + } + if ($nx !== null) { + if ($nsel === null) { + $nsel = $nx; + } else { + $nsel .= $delim . $nx; + } + } else { + // delimiters to the left of invalid + // basic selector ignored + } + } + } + if ($nsel !== null) { + if (!empty($scopes)) { + foreach ($scopes as $s) { + $new_selectors[] = "$s $nsel"; + } + } else { + $new_selectors[] = $nsel; } } - $selector = implode(', ', $new_selector); // now it's a string } + if (empty($new_selectors)) continue; + $selector = implode(', ', $new_selectors); foreach ($style as $name => $value) { if (!isset($css_definition->info[$name])) { unset($style[$name]); @@ -110,10 +261,11 @@ class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter } $new_decls[$selector] = $style; } - $this->_tidy->css[$k] = $new_decls; + $new_css[$k] = $new_decls; } // remove stuff that shouldn't be used, could be reenabled // after security risks are analyzed + $this->_tidy->css = $new_css; $this->_tidy->import = array(); $this->_tidy->charset = null; $this->_tidy->namespace = null; diff --git a/tests/HTMLPurifier/Filter/ExtractStyleBlocksTest.php b/tests/HTMLPurifier/Filter/ExtractStyleBlocksTest.php index d3fb38f1..3466d6aa 100644 --- a/tests/HTMLPurifier/Filter/ExtractStyleBlocksTest.php +++ b/tests/HTMLPurifier/Filter/ExtractStyleBlocksTest.php @@ -10,7 +10,7 @@ class HTMLPurifier_Filter_ExtractStyleBlocksTest extends HTMLPurifier_Harness function test_tokenizeHTML_extractStyleBlocks() { $this->config->set('Filter.ExtractStyleBlocks', true); $purifier = new HTMLPurifier($this->config); - $result = $purifier->purify('Test'); + $result = $purifier->purify('Test'); $this->assertIdentical($result, 'Test'); $this->assertIdentical($purifier->context->get('StyleBlocks'), array( @@ -153,7 +153,7 @@ class HTMLPurifier_Filter_ExtractStyleBlocksTest extends HTMLPurifier_Harness $this->config->set('Filter.ExtractStyleBlocks.Scope', '#foo, .bar'); $this->assertCleanCSS( "p, div {\ntext-indent:1em;\n}", - "#foo p, #foo div, .bar p, .bar div {\ntext-indent:1em;\n}" + "#foo p, .bar p, #foo div, .bar div {\ntext-indent:1em;\n}" ); } @@ -191,6 +191,41 @@ text-align:right; ); } + function test_atSelector() { + $this->assertCleanCSS( +"{ + b { text-align: center; } +}", +"" + ); + } + + function test_selectorValidation() { + $this->assertCleanCSS( +"&, & { +text-align: center; +}", +"" + ); + $this->assertCleanCSS( +"&, b { +text-align:center; +}", +"b { +text-align:center; +}" + ); + $this->assertCleanCSS( +"& a #foo:hover.bar +b > i { +text-align:center; +}", +"a #foo:hover.bar + b \\3E i { +text-align:center; +}" + ); + $this->assertCleanCSS("doesnt-exist { text-align:center }", ""); + } + } // vim: et sw=4 sts=4