0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-12-22 08:21:52 +00:00

[3.0.0] Fully implement CSS extraction and cleaning. See NEWS for more information, it is now a Filter.

- Some Lexer things were moved around

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1469 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2007-12-12 21:46:30 +00:00
parent 831f552ec5
commit 5b3431d889
9 changed files with 231 additions and 92 deletions

6
NEWS
View File

@ -18,8 +18,10 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
+ PHP4 reference/foreach cruft removed (in progress)
! CSS properties are now case-insensitive
! DefinitionCacheFactory now can register new implementations
! <style> tags can now be extracted from input HTML using %HTML.ExtractStyleBlocks.
These contents can be retrieved from $context->get('StyleBlocks');
! New HTMLPurifier_Filter_ExtractStyleBlocks for extracting <style> from
documents and cleaning their contents up. Requires the CSSTidy library
<http://csstidy.sourceforge.net/>. You can access the blocks with the
'StyleBlocks' Context variable ($purifier->context->get('StyleBlocks'))
. Unit tests for Injector improved
2.1.3, released 2007-11-05

View File

@ -0,0 +1,104 @@
<?php
require_once 'HTMLPurifier/Filter.php';
/**
* This filter extracts <style> blocks from input HTML, cleans them up
* using CSSTidy, and then places them in $purifier->context->get('StyleBlocks')
* so they can be used elsewhere in the document.
* @note See tests/HTMLPurifier/Filter/ExtractStyleBlocksTest.php
* @todo Allow for selectors to be munged/checked
* @todo Expose CSSTidy configuration so that custom changes can be made
*/
class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter
{

    /** Name of this filter. */
    public $name = 'ExtractStyleBlocks';

    /**
     * Raw contents of the <style> blocks collected by styleCallback()
     * during the current preFilter() run; emptied again before
     * preFilter() returns.
     */
    private $_styleMatches = array();

    /** csstidy instance used by cleanCSS(), or false when cleaning is off. */
    private $_tidy;

    /** When true, cleanCSS() does not escape <, > and & in its output. */
    private $_disableCharacterEscaping;

    /**
     * @param $tidy Instance of csstidy to use, false to turn off cleaning,
     *        and null to automatically instantiate (this requires the
     *        csstidy class to be loaded already)
     * @param $disable_character_escaping Whether or not to stop munging
     *        <, > and &. This can be set to true if the CSS will
     *        be placed in an external style and not inline.
     */
    public function __construct($tidy = null, $disable_character_escaping = false) {
        if ($tidy === null) $tidy = new csstidy();
        $this->_tidy = $tidy;
        $this->_disableCharacterEscaping = $disable_character_escaping;
    }

    /**
     * Save the contents of CSS blocks to style matches
     * @param $matches preg_replace style $matches array
     * @return Empty string, so that the matched <style> element is
     *         removed from the HTML
     */
    protected function styleCallback($matches) {
        $this->_styleMatches[] = $matches[1];
        // be explicit about the replacement instead of relying on the
        // implicit null return being converted to an empty string
        return '';
    }

    /**
     * Removes inline <style> tags from HTML, saves them for later use
     * in the 'StyleBlocks' context variable (cleaned with cleanCSS()
     * unless cleaning was turned off in the constructor).
     * @param $html HTML to extract <style> blocks from
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return HTML with the <style> blocks removed
     * @todo Extend to indicate non-text/css style blocks
     */
    public function preFilter($html, $config, &$context) {
        $this->_styleMatches = array();
        // (.*) rather than (.+): an empty <style></style> must match on
        // its own, otherwise the lazy match runs on to the *next*
        // </style> and swallows the document text in between.
        $stripped = preg_replace_callback(
            '#<style(?:\s.*)?>(.*)</style>#isU',
            array($this, 'styleCallback'),
            $html
        );
        if ($stripped === null) {
            // PCRE failure (e.g. backtrack limit): leave the HTML as it
            // was instead of blanking the document, and drop any partial
            // matches since their blocks are still present in $html.
            $this->_styleMatches = array();
        } else {
            $html = $stripped;
        }
        $style_blocks = $this->_styleMatches;
        $this->_styleMatches = array(); // reset
        if ($this->_tidy) {
            foreach ($style_blocks as $i => $style) {
                $style_blocks[$i] = $this->cleanCSS($style, $config, $context);
            }
        }
        // a reused context may still carry the blocks of a previous run;
        // registering over an existing name is not allowed, so clear it first
        if ($context->exists('StyleBlocks')) $context->destroy('StyleBlocks');
        $context->register('StyleBlocks', $style_blocks);
        return $html;
    }

    /**
     * Takes CSS (the stuff found in <style>) and cleans it.
     * @warning Requires CSSTidy <http://csstidy.sourceforge.net/>; must
     *          not be called on a filter constructed with $tidy = false.
     * @param $css     CSS styling to clean
     * @param $config  Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return Cleaned CSS
     */
    public function cleanCSS($css, $config, &$context) {
        $this->_tidy->parse($css);
        $css_definition = $config->getDefinition('CSS');
        foreach ($this->_tidy->css as &$decls) {
            // $decls are all CSS declarations inside an @ selector
            foreach ($decls as &$style) {
                foreach ($style as $name => $value) {
                    if (!isset($css_definition->info[$name])) {
                        // property is not in the CSS definition: drop it
                        unset($style[$name]);
                        continue;
                    }
                    $def = $css_definition->info[$name];
                    $ret = $def->validate($value, $config, $context);
                    if ($ret === false) unset($style[$name]);
                    else $style[$name] = $ret;
                }
            }
            unset($style); // break the dangling reference
        }
        unset($decls); // break the dangling reference
        // remove stuff that shouldn't be used, could be reenabled
        // after security risks are analyzed
        $this->_tidy->import = array();
        $this->_tidy->charset = null;
        $this->_tidy->namespace = null;
        $printer = new csstidy_print($this->_tidy);
        $css = $printer->plain();
        // we are going to escape any special characters <>& to ensure
        // that no funny business occurs (i.e. </style> in a font-family prop).
        if (!$this->_disableCharacterEscaping) {
            $css = str_replace(
                array('<', '>', '&'),
                array('\3C ', '\3E ', '\26 '),
                $css
            );
        }
        return $css;
    }

}

View File

@ -5,7 +5,7 @@ require_once 'HTMLPurifier/Filter.php';
class HTMLPurifier_Filter_YouTube extends HTMLPurifier_Filter
{
public $name = 'YouTube preservation';
public $name = 'YouTube';
public function preFilter($html, $config, &$context) {
$pre_regex = '#<object[^>]+>.+?'.

View File

@ -79,14 +79,6 @@ It is not necessary and will have no effect for PHP 4.
This directive has been available since 2.1.0.
');
HTMLPurifier_ConfigSchema::define(
'HTML', 'ExtractStyleBlocks', false, 'bool', '
This directive enables extraction of <code>style</code> tags contents so
that they can be incorporated in the <code>head</code> of the document,
after sufficient validation.
This directive has been available since 3.0.0.
');
/**
* Forgivingly lexes HTML (SGML-style) markup into tokens.
*
@ -346,34 +338,6 @@ class HTMLPurifier_Lexer
// represent non-SGML characters (horror, horror!)
$html = HTMLPurifier_Encoder::cleanUTF8($html);
if ($config->get('HTML', 'ExtractStyleBlocks')) {
// extract <style> CSS blocks
$html = $this->extractStyleBlocks($html, $config, $context);
}
return $html;
}
private $_styleMatches = array();
/**
* Save the contents of CSS blocks to style matches
*/
protected function styleCallback($matches) {
$this->_styleMatches[] = $matches[1];
}
/**
* Removes inline <style> tags from HTML, saves them for later use
* @todo Extend to indicate non-text/css style blocks
*/
public function extractStyleBlocks($html, $config, $context) {
$html = preg_replace_callback('#<style(?:\s.*)?>(.+)</style>#isU', array($this, 'styleCallback'), $html);
$style_blocks = $this->_styleMatches;
$this->_styleMatches = array(); // reset
// this is a persistent context, so we have to overwrite it with every call
if ($context->exists('StyleBlocks')) $context->destroy('StyleBlocks');
$context->register('StyleBlocks', $style_blocks);
return $html;
}

View File

@ -14,6 +14,9 @@ set_time_limit(0);
// Where is SimpleTest located?
$simpletest_location = '/path/to/simpletest/';
// Where is CSSTidy located?
$csstidy_location = '/path/to/csstidy/';
// How many times should profiling scripts iterate over the function? More runs
// means more accurate results, but they'll take longer to perform.
$GLOBALS['HTMLPurifierTest']['Runs'] = 2;

View File

@ -0,0 +1,112 @@
<?php
require_once 'HTMLPurifier/Filter/ExtractStyleBlocks.php';
/**
* @todo Assimilate CSSTidy into our library
*/
class HTMLPurifier_Filter_ExtractStyleBlocksTest extends HTMLPurifier_Harness
{

    /**
     * Usual use case: the filter is added to a full HTMLPurifier run and
     * the cleaned blocks are read back from the purifier's context.
     */
    public function test_tokenizeHTML_extractStyleBlocks() {
        $purifier = new HTMLPurifier($this->config);
        $filter = new HTMLPurifier_Filter_ExtractStyleBlocks();
        $purifier->addFilter($filter);
        $output = $purifier->purify('<style type="text/css">.foo {text-align:center;bogus:remove-me;}</style>Test<style>* {font-size:12pt;}</style>');
        $this->assertIdentical($output, 'Test');
        $expected_blocks = array(
            ".foo {\ntext-align:center;\n}",
            "* {\nfont-size:12pt;\n}"
        );
        $this->assertIdentical($purifier->context->get('StyleBlocks'), $expected_blocks);
    }

    /**
     * Runs preFilter() with cleaning disabled and checks both the
     * remaining HTML and the registered 'StyleBlocks'.
     * @param $html   Input HTML
     * @param $expect Expected HTML after filtering; true means "same as input"
     * @param $styles Expected contents of the 'StyleBlocks' context variable
     */
    public function assertExtractStyleBlocks($html, $expect = true, $styles = array()) {
        if ($expect === true) {
            $expect = $html;
        }
        // passing false disables cleaning, so blocks come back verbatim
        $filter = new HTMLPurifier_Filter_ExtractStyleBlocks(false);
        $filtered = $filter->preFilter($html, $this->config, $this->context);
        $this->assertIdentical($filtered, $expect);
        $this->assertIdentical($this->context->get('StyleBlocks'), $styles);
    }

    /** HTML without any style block is left untouched. */
    public function test_extractStyleBlocks_preserve() {
        $this->assertExtractStyleBlocks('Foobar');
    }

    /** Input consisting only of a style block leaves empty HTML. */
    public function test_extractStyleBlocks_allStyle() {
        $this->assertExtractStyleBlocks('<style>foo</style>', '', array('foo'));
    }

    /** Several blocks are collected in document order. */
    public function test_extractStyleBlocks_multipleBlocks() {
        $html = "<style>1</style><style>2</style>NOP<style>4</style>";
        $this->assertExtractStyleBlocks($html, "NOP", array('1', '2', '4'));
    }

    /** Attributes on the style tag do not prevent extraction. */
    public function test_extractStyleBlocks_blockWithAttributes() {
        $html = '<style type="text/css">css</style>';
        $this->assertExtractStyleBlocks($html, '', array('css'));
    }

    /** A tag that merely starts with "style" is not extracted. */
    public function test_extractStyleBlocks_styleWithPadding() {
        $html = "Alas<styled>Awesome</styled>\n<style>foo</style> Trendy!";
        $remaining = "Alas<styled>Awesome</styled>\n Trendy!";
        $this->assertExtractStyleBlocks($html, $remaining, array('foo'));
    }

    /**
     * Runs cleanCSS() on a default-constructed filter and compares the result.
     * @param $input  CSS to clean
     * @param $expect Expected cleaned CSS; true means "same as input"
     */
    public function assertCleanCSS($input, $expect = true) {
        if ($expect === true) {
            $expect = $input;
        }
        $filter = new HTMLPurifier_Filter_ExtractStyleBlocks();
        $cleaned = $filter->cleanCSS($input, $this->config, $this->context);
        $this->assertIdentical($cleaned, $expect);
    }

    public function test_cleanCSS_malformed() {
        $this->assertCleanCSS('</style>', '');
    }

    public function test_cleanCSS_selector() {
        $css = "a .foo #id div.cl#foo {\nfont-weight:700;\n}";
        $this->assertCleanCSS($css);
    }

    public function test_cleanCSS_angledBrackets() {
        $input = ".class {\nfont-family:'</style>';\n}";
        $escaped = ".class {\nfont-family:'\\3C /style\\3E ';\n}";
        $this->assertCleanCSS($input, $escaped);
    }

    public function test_cleanCSS_angledBrackets2() {
        // CSSTidy's behavior in this case is wrong, and should be fixed
        //$this->assertCleanCSS(
        //    "span[title=\"</style>\"] {\nfont-size:12pt;\n}",
        //    "span[title=\"\\3C /style\\3E \"] {\nfont-size:12pt;\n}"
        //);
    }

    public function test_cleanCSS_bogus() {
        $this->assertCleanCSS("div {bogus:tree;}", "div {\n}");
    }

    public function test_cleanCSS_escapeCodes() {
        $css = ".class {\nfont-family:'\\3C /style\\3E ';\n}";
        $this->assertCleanCSS($css);
    }

    /** With character escaping disabled, <, > survive cleaning unchanged. */
    public function test_cleanCSS_noEscapeCodes() {
        $input = ".class {\nfont-family:'</style>';\n}";
        $filter = new HTMLPurifier_Filter_ExtractStyleBlocks(null, true);
        $cleaned = $filter->cleanCSS($input, $this->config, $this->context);
        $this->assertIdentical($cleaned, $input);
    }

}

View File

@ -31,48 +31,6 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
$this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
}
// HTMLPurifier_Lexer->extractStyleBlocks() --------------------------------
function assertExtractStyleBlocks($html, $expect = true, $styles = array()) {
$lexer = HTMLPurifier_Lexer::create($this->config);
if ($expect === true) $expect = $html;
$result = $lexer->extractStyleBlocks($html, $this->config, $this->context);
$this->assertIdentical($result, $expect);
$this->assertIdentical($this->context->get('StyleBlocks'), $styles);
}
function test_extractStyleBlocks_preserve() {
$this->assertExtractStyleBlocks('Foobar');
}
function test_extractStyleBlocks_allStyle() {
$this->assertExtractStyleBlocks('<style>foo</style>', '', array('foo'));
}
function test_extractStyleBlocks_multipleBlocks() {
$this->assertExtractStyleBlocks(
"<style>1</style><style>2</style>NOP<style>4</style>",
"NOP",
array('1', '2', '4')
);
}
function test_extractStyleBlocks_blockWithAttributes() {
$this->assertExtractStyleBlocks(
'<style type="text/css">css</style>',
'',
array('css')
);
}
function test_extractStyleBlocks_styleWithPadding() {
$this->assertExtractStyleBlocks(
"Alas<styled>Awesome</styled>\n<style>foo</style> Trendy!",
"Alas<styled>Awesome</styled>\n Trendy!",
array('foo')
);
}
// HTMLPurifier_Lexer->parseData() -----------------------------------------
function assertParseData($input, $expect = true) {
@ -553,17 +511,6 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
);
}
function test_tokenizeHTML_extractStyleBlocks() {
$this->config->set('HTML', 'ExtractStyleBlocks', true);
$this->assertTokenization(
'<style type="text/css">.foo {text-align:center;}</style>Test',
array(
new HTMLPurifier_Token_Text('Test')
)
);
$this->assertIdentical($this->context->get('StyleBlocks'), array('.foo {text-align:center;}'));
}
/*
function test_tokenizeHTML_() {

View File

@ -16,6 +16,7 @@ $GLOBALS['HTMLPurifierTest'] = array();
$GLOBALS['HTMLPurifierTest']['PEAR'] = false; // do PEAR tests
$GLOBALS['HTMLPurifierTest']['PH5P'] = version_compare(PHP_VERSION, "5", ">=") && class_exists('DOMDocument');
$simpletest_location = 'simpletest/'; // reasonable guess
$csstidy_location = false;
// load SimpleTest
if (file_exists('../conf/test-settings.php')) include '../conf/test-settings.php';
@ -24,6 +25,11 @@ require_once $simpletest_location . 'unit_tester.php';
require_once $simpletest_location . 'reporter.php';
require_once $simpletest_location . 'mock_objects.php';
if ($csstidy_location !== false) {
require_once $csstidy_location . 'class.csstidy.php';
require_once $csstidy_location . 'class.csstidy_print.php';
}
error_reporting(E_ALL | E_STRICT); // after SimpleTest is loaded, turn on compile time errors
// load Debugger

View File

@ -75,6 +75,7 @@ $test_files[] = 'HTMLPurifier/ErrorCollectorTest.php';
$test_files[] = 'HTMLPurifier/EncoderTest.php';
$test_files[] = 'HTMLPurifier/EntityLookupTest.php';
$test_files[] = 'HTMLPurifier/EntityParserTest.php';
$test_files[] = 'HTMLPurifier/Filter/ExtractStyleBlocksTest.php';
$test_files[] = 'HTMLPurifier/GeneratorTest.php';
$test_files[] = 'HTMLPurifier/HTMLDefinitionTest.php';
$test_files[] = 'HTMLPurifier/HTMLModuleManagerTest.php';