0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-09-18 18:25:18 +00:00

fix: CSSTidy ImportantComments not handled properly (#359)

* fix: CSSTidy ImportantComments not handled properly

Signed-off-by: Francis Lévesque <wolfrank2164@gmail.com>

* fix: CSSTidy ImportantComments not handled properly -> remove comments

Signed-off-by: Francis Lévesque <wolfrank2164@gmail.com>
Co-authored-by: Edward Z. Yang <ezyang@meta.com>
This commit is contained in:
Francis Lévesque 2023-01-21 22:44:44 -05:00 committed by GitHub
parent 9ec687c904
commit 78a9b4d0da
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 175 additions and 158 deletions

View File

@ -146,175 +146,179 @@ class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter
foreach ($this->_tidy->css as $k => $decls) { foreach ($this->_tidy->css as $k => $decls) {
// $decls are all CSS declarations inside an @ selector // $decls are all CSS declarations inside an @ selector
$new_decls = array(); $new_decls = array();
foreach ($decls as $selector => $style) { if (is_array($decls)) {
$selector = trim($selector); foreach ($decls as $selector => $style) {
if ($selector === '') { $selector = trim($selector);
continue; if ($selector === '') {
} // should not happen continue;
// Parse the selector } // should not happen
// Here is the relevant part of the CSS grammar: // Parse the selector
// // Here is the relevant part of the CSS grammar:
// ruleset //
// : selector [ ',' S* selector ]* '{' ... // ruleset
// selector // : selector [ ',' S* selector ]* '{' ...
// : simple_selector [ combinator selector | S+ [ combinator? selector ]? ]? // selector
// combinator // : simple_selector [ combinator selector | S+ [ combinator? selector ]? ]?
// : '+' S* // combinator
// : '>' S* // : '+' S*
// simple_selector // : '>' S*
// : element_name [ HASH | class | attrib | pseudo ]* // simple_selector
// | [ HASH | class | attrib | pseudo ]+ // : element_name [ HASH | class | attrib | pseudo ]*
// element_name // | [ HASH | class | attrib | pseudo ]+
// : IDENT | '*' // element_name
// ; // : IDENT | '*'
// class // ;
// : '.' IDENT // class
// ; // : '.' IDENT
// attrib // ;
// : '[' S* IDENT S* [ [ '=' | INCLUDES | DASHMATCH ] S* // attrib
// [ IDENT | STRING ] S* ]? ']' // : '[' S* IDENT S* [ [ '=' | INCLUDES | DASHMATCH ] S*
// ; // [ IDENT | STRING ] S* ]? ']'
// pseudo // ;
// : ':' [ IDENT | FUNCTION S* [IDENT S*]? ')' ] // pseudo
// ; // : ':' [ IDENT | FUNCTION S* [IDENT S*]? ')' ]
// // ;
// For reference, here are the relevant tokens: //
// // For reference, here are the relevant tokens:
// HASH #{name} //
// IDENT {ident} // HASH #{name}
// INCLUDES == // IDENT {ident}
// DASHMATCH |= // INCLUDES ==
// STRING {string} // DASHMATCH |=
// FUNCTION {ident}\( // STRING {string}
// // FUNCTION {ident}\(
// And the lexical scanner tokens //
// // And the lexical scanner tokens
// name {nmchar}+ //
// nmchar [_a-z0-9-]|{nonascii}|{escape} // name {nmchar}+
// nonascii [\240-\377] // nmchar [_a-z0-9-]|{nonascii}|{escape}
// escape {unicode}|\\[^\r\n\f0-9a-f] // nonascii [\240-\377]
// unicode \\{h}}{1,6}(\r\n|[ \t\r\n\f])? // escape {unicode}|\\[^\r\n\f0-9a-f]
// ident -?{nmstart}{nmchar*} // unicode \\{h}}{1,6}(\r\n|[ \t\r\n\f])?
// nmstart [_a-z]|{nonascii}|{escape} // ident -?{nmstart}{nmchar*}
// string {string1}|{string2} // nmstart [_a-z]|{nonascii}|{escape}
// string1 \"([^\n\r\f\\"]|\\{nl}|{escape})*\" // string {string1}|{string2}
// string2 \'([^\n\r\f\\"]|\\{nl}|{escape})*\' // string1 \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
// // string2 \'([^\n\r\f\\"]|\\{nl}|{escape})*\'
// We'll implement a subset (in order to reduce attack //
// surface); in particular: // We'll implement a subset (in order to reduce attack
// // surface); in particular:
// - No Unicode support //
// - No escapes support // - No Unicode support
// - No string support (by proxy no attrib support) // - No escapes support
// - element_name is matched against allowed // - No string support (by proxy no attrib support)
// elements (some people might find this // - element_name is matched against allowed
// annoying...) // elements (some people might find this
// - Pseudo-elements one of :first-child, :link, // annoying...)
// :visited, :active, :hover, :focus // - Pseudo-elements one of :first-child, :link,
// :visited, :active, :hover, :focus
// handle ruleset // handle ruleset
$selectors = array_map('trim', explode(',', $selector)); $selectors = array_map('trim', explode(',', $selector));
$new_selectors = array(); $new_selectors = array();
foreach ($selectors as $sel) { foreach ($selectors as $sel) {
// split on +, > and spaces // split on +, > and spaces
$basic_selectors = preg_split('/\s*([+> ])\s*/', $sel, -1, PREG_SPLIT_DELIM_CAPTURE); $basic_selectors = preg_split('/\s*([+> ])\s*/', $sel, -1, PREG_SPLIT_DELIM_CAPTURE);
// even indices are chunks, odd indices are // even indices are chunks, odd indices are
// delimiters // delimiters
$nsel = null; $nsel = null;
$delim = null; // guaranteed to be non-null after $delim = null; // guaranteed to be non-null after
// two loop iterations // two loop iterations
for ($i = 0, $c = count($basic_selectors); $i < $c; $i++) { for ($i = 0, $c = count($basic_selectors); $i < $c; $i++) {
$x = $basic_selectors[$i]; $x = $basic_selectors[$i];
if ($i % 2) { if ($i % 2) {
// delimiter // delimiter
if ($x === ' ') { if ($x === ' ') {
$delim = ' '; $delim = ' ';
} else {
$delim = ' ' . $x . ' ';
}
} else {
// simple selector
$components = preg_split('/([#.:])/', $x, -1, PREG_SPLIT_DELIM_CAPTURE);
$sdelim = null;
$nx = null;
for ($j = 0, $cc = count($components); $j < $cc; $j++) {
$y = $components[$j];
if ($j === 0) {
if ($y === '*' || isset($html_definition->info[$y = strtolower($y)])) {
$nx = $y;
} else {
// $nx stays null; this matters
// if we don't manage to find
// any valid selector content,
// in which case we ignore the
// outer $delim
}
} elseif ($j % 2) {
// set delimiter
$sdelim = $y;
} else { } else {
$attrdef = null; $delim = ' ' . $x . ' ';
if ($sdelim === '#') {
$attrdef = $this->_id_attrdef;
} elseif ($sdelim === '.') {
$attrdef = $this->_class_attrdef;
} elseif ($sdelim === ':') {
$attrdef = $this->_enum_attrdef;
} else {
throw new HTMLPurifier_Exception('broken invariant sdelim and preg_split');
}
$r = $attrdef->validate($y, $config, $context);
if ($r !== false) {
if ($r !== true) {
$y = $r;
}
if ($nx === null) {
$nx = '';
}
$nx .= $sdelim . $y;
}
}
}
if ($nx !== null) {
if ($nsel === null) {
$nsel = $nx;
} else {
$nsel .= $delim . $nx;
} }
} else { } else {
// delimiters to the left of invalid // simple selector
// basic selector ignored $components = preg_split('/([#.:])/', $x, -1, PREG_SPLIT_DELIM_CAPTURE);
$sdelim = null;
$nx = null;
for ($j = 0, $cc = count($components); $j < $cc; $j++) {
$y = $components[$j];
if ($j === 0) {
if ($y === '*' || isset($html_definition->info[$y = strtolower($y)])) {
$nx = $y;
} else {
// $nx stays null; this matters
// if we don't manage to find
// any valid selector content,
// in which case we ignore the
// outer $delim
}
} elseif ($j % 2) {
// set delimiter
$sdelim = $y;
} else {
$attrdef = null;
if ($sdelim === '#') {
$attrdef = $this->_id_attrdef;
} elseif ($sdelim === '.') {
$attrdef = $this->_class_attrdef;
} elseif ($sdelim === ':') {
$attrdef = $this->_enum_attrdef;
} else {
throw new HTMLPurifier_Exception('broken invariant sdelim and preg_split');
}
$r = $attrdef->validate($y, $config, $context);
if ($r !== false) {
if ($r !== true) {
$y = $r;
}
if ($nx === null) {
$nx = '';
}
$nx .= $sdelim . $y;
}
}
}
if ($nx !== null) {
if ($nsel === null) {
$nsel = $nx;
} else {
$nsel .= $delim . $nx;
}
} else {
// delimiters to the left of invalid
// basic selector ignored
}
}
}
if ($nsel !== null) {
if (!empty($scopes)) {
foreach ($scopes as $s) {
$new_selectors[] = "$s $nsel";
}
} else {
$new_selectors[] = $nsel;
} }
} }
} }
if ($nsel !== null) { if (empty($new_selectors)) {
if (!empty($scopes)) {
foreach ($scopes as $s) {
$new_selectors[] = "$s $nsel";
}
} else {
$new_selectors[] = $nsel;
}
}
}
if (empty($new_selectors)) {
continue;
}
$selector = implode(', ', $new_selectors);
foreach ($style as $name => $value) {
if (!isset($css_definition->info[$name])) {
unset($style[$name]);
continue; continue;
} }
$def = $css_definition->info[$name]; $selector = implode(', ', $new_selectors);
$ret = $def->validate($value, $config, $context); foreach ($style as $name => $value) {
if ($ret === false) { if (!isset($css_definition->info[$name])) {
unset($style[$name]); unset($style[$name]);
} else { continue;
$style[$name] = $ret; }
$def = $css_definition->info[$name];
$ret = $def->validate($value, $config, $context);
if ($ret === false) {
unset($style[$name]);
} else {
$style[$name] = $ret;
}
} }
$new_decls[$selector] = $style;
} }
$new_decls[$selector] = $style; } else {
continue;
} }
$new_css[$k] = $new_decls; $new_css[$k] = $new_decls;
} }

View File

@ -214,6 +214,19 @@ text-align:right
); );
} }
/**
 * Passes assertCleanCSS() two stylesheets: one containing two comments
 * opened with "/*!" (one ahead of the "div" rule, one after its
 * "text-align:right" declaration), and one holding the same rule and
 * declaration with both comments absent.
 *
 * NOTE(review): assertCleanCSS() is defined outside this view; presumably
 * the first argument is the input and the second is the expected cleaned
 * output -- confirm. If so, despite "keep" in the method name, this test
 * pins that such comments are removed from the output, not preserved.
 */
public function test_keepImportantComments()
{
$this->assertCleanCSS(
// First argument: CSS with a "/*!" comment before the rule and another inside it.
"/*! Important */
div {
text-align:right /*! Important2 */
}",
// Second argument: the same rule with neither comment present.
"div {
text-align:right
}"
);
}
public function test_atSelector() public function test_atSelector()
{ {
$this->assertCleanCSS( $this->assertCleanCSS(