0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-09-19 10:45:18 +00:00

[1.3.0] Huge upgrade, (X)HTML Strict now supported

+ Transparently handles inline elements in block context (blockquote)
! Added GET method to demo for easier validation, added 50kb max input size
! New directive %HTML.BlockWrapper, for block-ifying inline elements
! New directive %HTML.Parent, allows you to only allow inline content
- Added missing type to ChildDef_Chameleon
. ChildDef_Required guards against empty tags
. Lookup table HTMLDefinition->info_flow_elements added
. Added peace-of-mind variable initialization to Strategy_FixNesting

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@560 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2006-11-23 03:23:35 +00:00
parent d8673539ab
commit b1b3377b9c
12 changed files with 289 additions and 32 deletions

9
NEWS
View File

@ -11,6 +11,15 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
1.3.0, unknown release date
(major feature release)
! (X)HTML Strict now supported
+ Transparently handles inline elements in block context (blockquote)
! Added GET method to demo for easier validation, added 50kb max input size
! New directive %HTML.BlockWrapper, for block-ifying inline elements
! New directive %HTML.Parent, allows you to only allow inline content
- Added missing type to ChildDef_Chameleon
. ChildDef_Required guards against empty tags
. Lookup table HTMLDefinition->info_flow_elements added
. Added peace-of-mind variable initialization to Strategy_FixNesting
1.2.1, unknown release date
(bugfix/minor feature release, may be dropped if 1.2.0 is stable)

View File

@ -1,11 +1,30 @@
<?php
header('Content-type:text/html;charset=UTF-8');
// using _REQUEST because we accept GET and POST requests
?><!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
$content = empty($_REQUEST['xml']) ? 'text/html' : 'application/xhtml+xml';
header("Content-type:$content;charset=UTF-8");
// prevent PHP versions with shorttags from barfing
echo '<?xml version="1.0" encoding="UTF-8" ?>
';
/**
 * Picks the HTTP method the demo form should be submitted with.
 *
 * @return string 'post' when $_REQUEST['post'] is set (for example when the
 *                page is requested as demo.php?post), otherwise 'get'.
 */
function getFormMethod() {
    if (isset($_REQUEST['post'])) {
        return 'post';
    }
    return 'get';
}
if (empty($_REQUEST['strict'])) {
?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
<?php
} else {
?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<?php
}
?>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
<head>
<title>HTMLPurifier Live Demo</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
@ -14,15 +33,21 @@ header('Content-type:text/html;charset=UTF-8');
<h1>HTMLPurifier Live Demo</h1>
<?php
set_include_path('../../library' . PATH_SEPARATOR . get_include_path());
require_once 'HTMLPurifier.php';
require_once '../../library/HTMLPurifier.auto.php';
if (!empty($_POST['html'])) {
if (!empty($_REQUEST['html'])) { // start result
$html = get_magic_quotes_gpc() ? stripslashes($_POST['html']) : $_POST['html'];
if (strlen($_REQUEST['html']) > 50000) {
?>
<p>Request exceeds maximum allowed text size of 50kb.</p>
<?php
} else { // start main processing
$html = get_magic_quotes_gpc() ? stripslashes($_REQUEST['html']) : $_REQUEST['html'];
$config = HTMLPurifier_Config::createDefault();
$config->set('Core', 'TidyFormat', !empty($_POST['tidy']));
$config->set('Core', 'TidyFormat', !empty($_REQUEST['tidy']));
$config->set('HTML', 'Strict', !empty($_REQUEST['strict']));
$purifier = new HTMLPurifier($config);
$pure_html = $purifier->purify($html);
@ -43,7 +68,17 @@ echo htmlspecialchars($pure_html, ENT_COMPAT, 'UTF-8');
?></pre>
<?php
if (getFormMethod() == 'post') { // start POST validation notice
?>
<p>If you would like to validate the code with
<a href="http://validator.w3.org/#validate-by-input">W3C's
validator</a>, copy and paste the <em>entire</em> demo page's source.</p>
<?php
} // end POST validation notice
} // end main processing
// end result
} else {
?>
@ -54,12 +89,13 @@ will filter it.</p>
}
?>
<form name="filter" action="demo.php<?php
if (isset($_GET['profile']) || isset($_GET['XDEBUG_PROFILE'])) {
echo '?XDEBUG_PROFILE=1';
} ?>" method="post">
<form id="filter" action="demo.php<?php
echo '?' . getFormMethod();
if (isset($_REQUEST['profile']) || isset($_REQUEST['XDEBUG_PROFILE'])) {
echo '&amp;XDEBUG_PROFILE=1';
} ?>" method="<?php echo getFormMethod(); ?>">
<fieldset>
<legend>HTML</legend>
<legend>HTML Purifier Input (<?php echo getFormMethod(); ?>)</legend>
<textarea name="html" cols="60" rows="15"><?php
if (isset($html)) {
@ -67,13 +103,31 @@ if (isset($html)) {
HTMLPurifier_Encoder::cleanUTF8($html), ENT_COMPAT, 'UTF-8');
}
?></textarea>
<?php if (getFormMethod() == 'get') { ?>
<p><strong>Warning:</strong> GET request method can only hold
approximately 2000 characters. If you need to test anything
larger than that, try the <a href="demo.php?post">POST form</a>.</p>
<?php } ?>
<div>Nicely format output with Tidy? <input type="checkbox" value="1"
name="tidy"<?php if (!empty($_POST['tidy'])) echo ' checked="checked"'; ?> /></div>
name="tidy"<?php if (!empty($_REQUEST['tidy'])) echo ' checked="checked"'; ?> /></div>
<div>XHTML 1.0 Strict output? <input type="checkbox" value="1"
name="strict"<?php if (!empty($_REQUEST['strict'])) echo ' checked="checked"'; ?> /></div>
<div>Serve as application/xhtml+xml? (not for IE) <input type="checkbox" value="1"
name="xml"<?php if (!empty($_REQUEST['xml'])) echo ' checked="checked"'; ?> /></div>
<div>
<input type="submit" value="Submit" name="submit" class="button" />
</div>
</fieldset>
</form>
<p>Return to <a href="http://hp.jpsband.org/">HTMLPurifier's home page</a>.</p>
<p>Return to <a href="http://hp.jpsband.org/">HTMLPurifier's home page</a>.
Try the form in <a href="demo.php?get">GET</a> and <a href="demo.php?post">POST</a> request
flavors (GET is easy to validate, but POST allows larger inputs).</p>
<?php if(getFormMethod() == 'get') { ?>
<p>
<a href="http://validator.w3.org/check?uri=referer"><img
src="http://www.w3.org/Icons/valid-xhtml10"
alt="Valid XHTML 1.0 Transitional" height="31" width="88" style="border:0;" /></a>
</p>
<?php } ?>
</body>
</html>

View File

@ -7,11 +7,11 @@ to HTML Purifier, though, so let's take a look:
== Major incompatibilities ==
BLOCKQUOTE changes from 'flow' to 'block'
[done] BLOCKQUOTE changes from 'flow' to 'block'
behavior: inline inner contents should not be nuked, paragraph as necessary
U, S, STRIKE cut
[partially-done] U, S, STRIKE cut
behavior: replace with appropriate inline span + CSS
ADDRESS from potpourri to Inline (removes p tags)
[partially-done] ADDRESS from potpourri to Inline (removes p tags) (lower importance)
behavior: p tags silently dropped or replaced with something (<br>)
== Things we can loosen up ==
@ -38,5 +38,6 @@ A tag's attribute 'target' (for selecting frames) cut
OL/LI tag's attribute 'start' (for renumbering lists) cut
behavior: no substitute, just delete
Attribute 'name' deprecated in favor of 'id'
behavior: create proper AttrTransform
behavior: not allowed in first place, but create proper AttrTransform
PRE tag allows SUB/SUP? (strict dtd comment vs syntax, loose disallows)
behavior: disallow as usual

View File

@ -23,6 +23,8 @@ class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef
*/
var $block;
var $type = 'chameleon';
/**
* @param $inline List of elements to allow when inline.
* @param $block List of elements to allow when block.

View File

@ -20,7 +20,10 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
$elements = explode('|', $elements);
}
$elements = array_flip($elements);
foreach ($elements as $i => $x) $elements[$i] = true;
foreach ($elements as $i => $x) {
$elements[$i] = true;
if (empty($i)) unset($elements[$i]);
}
$this->elements = $elements;
$this->gen = new HTMLPurifier_Generator();
}

View File

@ -0,0 +1,70 @@
<?php
require_once 'HTMLPurifier/ChildDef/Required.php';
/**
 * Takes the contents of blockquote when in strict and reformats for validation.
 *
 * From XHTML 1.0 Transitional to Strict, there is a notable change where
 * blockquote stops accepting flow content and only accepts block-level
 * children. Rather than throwing inline content away, this definition
 * accepts every flow element and then wraps each top-level run of inline
 * content in the configured block wrapper element
 * (HTMLDefinition->info_block_wrapper).
 */
class HTMLPurifier_ChildDef_StrictBlockquote
    extends HTMLPurifier_ChildDef_Required
{

    /** An empty blockquote is acceptable. */
    var $allow_empty = true;

    /** Type identifier of this child definition. */
    var $type = 'strictblockquote';

    /** Whether the element lookup has been copied from the definition yet. */
    var $init = false;

    /**
     * Intentionally empty: the allowed elements are not passed in but are
     * taken from the HTML definition on the first validateChildren() call.
     */
    function HTMLPurifier_ChildDef_StrictBlockquote() {}

    /**
     * Validates the children and wraps top-level inline runs in the block
     * wrapper element.
     *
     * @param $tokens_of_children Array of tokens that are the children of
     *        the blockquote.
     * @param $config Configuration object; supplies the HTML definition.
     * @param $context Context object, handed through to the parent class.
     * @return Array of tokens. Every return path yields an array: an empty
     *         one when the parent validation rejects the children outright.
     */
    function validateChildren($tokens_of_children, $config, &$context) {

        $def = $config->getHTMLDefinition();

        // Lazily fill the lookup on first use: every flow element
        // (block and inline alike) plus character data is let through.
        if (!$this->init) {
            $this->elements = $def->info_flow_elements;
            $this->elements['#PCDATA'] = true;
            $this->init = true;
        }

        $validated = parent::validateChildren(
            $tokens_of_children, $config, $context
        );
        if ($validated === false) {
            return array();
        }
        if ($validated === true) {
            // parent found nothing to change, work on the input as-is
            $validated = $tokens_of_children;
        }

        $wrapper_name = $def->info_block_wrapper;
        $wrap_open    = new HTMLPurifier_Token_Start($wrapper_name);
        $wrap_close   = new HTMLPurifier_Token_End($wrapper_name);

        $wrapping = false;   // true while inside a wrapper we opened
        $nesting  = 0;       // number of currently open child elements
        $output   = array();

        // assuming that there are no comment tokens
        foreach ($validated as $token) {
            // wrappers are only opened or closed between top-level children
            if (!$nesting) {
                if (!$wrapping) {
                    // top-level text or inline element: start a wrapper
                    if ($token->type == 'text' ||
                        $def->info[$token->name]->type == 'inline') {
                        $wrapping = true;
                        $output[] = $wrap_open;
                    }
                } else {
                    // a block element begins: the inline run has ended
                    if (($token->type == 'start' || $token->type == 'empty') &&
                        $def->info[$token->name]->type == 'block') {
                        $output[] = $wrap_close;
                        $wrapping = false;
                    }
                }
            }

            $output[] = $token;

            if ($token->type == 'start') {
                $nesting++;
            } elseif ($token->type == 'end') {
                $nesting--;
            }
        }

        // close a wrapper left open by trailing inline content
        if ($wrapping) {
            $output[] = $wrap_close;
        }

        return $output;
    }

}
?>

View File

@ -23,6 +23,7 @@ require_once 'HTMLPurifier/ChildDef.php';
require_once 'HTMLPurifier/ChildDef/Required.php';
require_once 'HTMLPurifier/ChildDef/Optional.php';
require_once 'HTMLPurifier/ChildDef/Table.php';
require_once 'HTMLPurifier/ChildDef/StrictBlockquote.php';
require_once 'HTMLPurifier/Generator.php';
require_once 'HTMLPurifier/Token.php';
require_once 'HTMLPurifier/TagTransform.php';
@ -45,6 +46,23 @@ HTMLPurifier_ConfigSchema::define(
'Determines whether or not to use Transitional (loose) or Strict rulesets.'
);
HTMLPurifier_ConfigSchema::define(
'HTML', 'BlockWrapper', 'p', 'string',
'String name of element to wrap inline elements that are inside a block '.
'context. This only occurs in the children of blockquote in strict mode. '.
'Example: by default value, <code>&lt;blockquote&gt;Foo&lt;/blockquote&gt;</code> '.
'would become <code>&lt;blockquote&gt;&lt;p&gt;Foo&lt;/p&gt;&lt;/blockquote&gt;</code>. The '.
'<code>&lt;p&gt;</code> tags can be replaced '.
'with whatever you desire, as long as it is a block level element.'
);
HTMLPurifier_ConfigSchema::define(
'HTML', 'Parent', 'div', 'string',
'String name of element that HTML fragment passed to library will be '.
'inserted in. An interesting variation would be using span as the '.
'parent element, meaning that only inline tags would be allowed.'
);
/**
* Defines the purified HTML type with large amounts of objects.
*
@ -79,11 +97,17 @@ class HTMLPurifier_HTMLDefinition
/**
* String name of parent element HTML will be going into.
* @todo Allow this to be overloaded by user config
* @public
*/
var $info_parent = 'div';
/**
* String name of element used to wrap inline elements in block context
* @note This is rarely used except for BLOCKQUOTEs in strict mode
* @public
*/
var $info_block_wrapper = 'p';
/**
* Associative array of deprecated tag name to HTMLPurifier_TagTransform
* @public
@ -102,6 +126,11 @@ class HTMLPurifier_HTMLDefinition
*/
var $info_attr_transform_post = array();
/**
* Lookup table of flow elements
*/
var $info_flow_elements = array();
/**
* Initializes the definition, the meat of the class.
*/
@ -164,11 +193,9 @@ class HTMLPurifier_HTMLDefinition
$e_phrase_basic = 'em | strong | dfn | code | q | samp | kbd | var'.
' | cite | abbr | acronym';
$e_phrase = "$e_phrase_basic | $e_phrase_extra";
$e_inline_forms = ''; // humor the dtd
$e_misc_inline = 'ins | del';
$e_misc = "$e_misc_inline";
$e_inline = "a | $e_special | $e_fontstyle | $e_phrase".
" | $e_inline_forms";
$e_inline = "a | $e_special | $e_fontstyle | $e_phrase";
// pseudo-property we created for convenience, see later on
$e__inline = "#PCDATA | $e_inline | $e_misc_inline";
// note the casing
@ -181,11 +208,10 @@ class HTMLPurifier_HTMLDefinition
$e__flow = "#PCDATA | $e_block | $e_inline | $e_misc";
$e_Flow = new HTMLPurifier_ChildDef_Optional($e__flow);
$e_a_content = new HTMLPurifier_ChildDef_Optional("#PCDATA".
" | $e_special | $e_fontstyle | $e_phrase | $e_inline_forms".
" | $e_misc_inline");
" | $e_special | $e_fontstyle | $e_phrase | $e_misc_inline");
$e_pre_content = new HTMLPurifier_ChildDef_Optional("#PCDATA | a".
" | $e_special_basic | $e_fontstyle_basic | $e_phrase_basic".
" | $e_inline_forms | $e_misc_inline");
" | $e_misc_inline");
$e_form_content = new HTMLPurifier_ChildDef_Optional('');//unused
$e_form_button_content = new HTMLPurifier_ChildDef_Optional('');//unused
@ -198,7 +224,7 @@ class HTMLPurifier_HTMLDefinition
$this->info['div']->child = $e_Flow;
if ($config->get('HTML', 'Strict')) {
$this->info['blockquote']->child = $e_Block;
$this->info['blockquote']->child = new HTMLPurifier_ChildDef_StrictBlockquote();
} else {
$this->info['blockquote']->child = $e_Flow;
}
@ -276,7 +302,7 @@ class HTMLPurifier_HTMLDefinition
// reuses $e_Inline and $e_Block
foreach ($e_Inline->elements as $name => $bool) {
if ($name == '#PCDATA' || $name == '') continue;
if ($name == '#PCDATA') continue;
$this->info[$name]->type = 'inline';
}
@ -284,6 +310,10 @@ class HTMLPurifier_HTMLDefinition
$this->info[$name]->type = 'block';
}
foreach ($e_Flow->elements as $name => $bool) {
$this->info_flow_elements[$name] = true;
}
//////////////////////////////////////////////////////////////////////
// info[]->excludes : defines elements that aren't allowed in here
@ -447,6 +477,28 @@ class HTMLPurifier_HTMLDefinition
}
}
//////////////////////////////////////////////////////////////////////
// info_block_wrapper : wraps inline elements in block context
$block_wrapper = $config->get('HTML', 'BlockWrapper');
if (isset($e_Block->elements[$block_wrapper])) {
$this->info_block_wrapper = $block_wrapper;
} else {
trigger_error('Cannot use non-block element as block wrapper.',
E_USER_ERROR);
}
//////////////////////////////////////////////////////////////////////
// info_parent : parent element of the HTML fragment
$parent = $config->get('HTML', 'Parent');
if (isset($this->info[$parent])) {
$this->info_parent = $parent;
} else {
trigger_error('Cannot use unrecognized element as parent.',
E_USER_ERROR);
}
}
function setAttrForTableElements($attr, $def) {

View File

@ -141,6 +141,7 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
if ($excluded) {
// there is an exclusion, remove the entire node
$result = false;
$excludes = array(); // not used, but good to initialize anyway
} else {
// DEFINITION CALL
$def = $definition->info[$tokens[$i]->name];

View File

@ -0,0 +1,50 @@
<?php
require_once 'HTMLPurifier/ChildDefHarness.php';
require_once 'HTMLPurifier/ChildDef/StrictBlockquote.php';
/**
 * Exercises HTMLPurifier_ChildDef_StrictBlockquote through the ChildDef
 * harness: content that is left alone, content that gets wrapped, and the
 * HTML.BlockWrapper configuration directive.
 */
class HTMLPurifier_ChildDef_StrictBlockquoteTest
    extends HTMLPurifier_ChildDefHarness
{

    function test() {

        $this->obj = new HTMLPurifier_ChildDef_StrictBlockquote();

        // inputs asserted with the harness' default expectation
        $plain_inputs = array(
            '',
            '<p>Valid</p>',
            '<div>Still valid</div>',
        );
        foreach ($plain_inputs as $html) {
            $this->assertResult($html);
        }

        // input => expected output pairs, checked in this order
        $line_breaks = 'He said<br />perhaps<br />we should <b>nuke</b> them.';
        $rewrites = array(
            // bare text is wrapped
            array('Needs wrap', '<p>Needs wrap</p>'),
            // leading text is wrapped, the following block is left alone
            array('Wrap<p>Do not wrap</p>', '<p>Wrap</p><p>Do not wrap</p>'),
            // trailing inline element is wrapped
            array('<p>Do not</p><b>Wrap</b>', '<p>Do not</p><p><b>Wrap</b></p>'),
            // li tags are dropped, their text joins the wrapped run
            array(
                '<li>Not allowed</li>Paragraph.<p>Hmm.</p>',
                '<p>Not allowedParagraph.</p><p>Hmm.</p>'
            ),
            // a mixed inline run ends up in a single wrapper
            array($line_breaks, "<p>$line_breaks</p>"),
            // unknown tags are dropped before wrapping
            array(
                '<foo>Bar</foo><bas /><b>People</b>Conniving.<p>Fools!</p>',
                '<p>Bar<b>People</b>Conniving.</p><p>Fools!</p>'
            ),
        );
        foreach ($rewrites as $pair) {
            $this->assertResult($pair[0], $pair[1]);
        }

        // a different block element can be configured as the wrapper
        $this->assertResult(
            'Needs wrap',
            '<div>Needs wrap</div>',
            array('HTML.BlockWrapper' => 'div')
        );

        // an unusable wrapper name still yields p and raises exactly one error
        $this->assertResult(
            'Needs wrap',
            '<p>Needs wrap</p>',
            array('HTML.BlockWrapper' => 'dav')
        );
        $this->assertError('Cannot use non-block element as block wrapper.');
        $this->assertNoErrors();

    }

}
?>

View File

@ -83,6 +83,20 @@ class HTMLPurifier_Strategy_FixNestingTest extends HTMLPurifier_StrategyHarness
'<a><span></span></a>'
);
// test inline parent
$this->assertResult(
'<b>Bold</b>', true, array('HTML.Parent' => 'span')
);
$this->assertResult(
'<div>Reject</div>', 'Reject', array('HTML.Parent' => 'span')
);
$this->assertResult(
'<div>Accept</div>', true, array('HTML.Parent' => 'script')
);
$this->assertError('Cannot use unrecognized element as parent.');
$this->assertNoErrors();
}
}

View File

@ -31,7 +31,7 @@ class HTMLPurifier_Test extends UnitTestCase
$this->assertPurification(
'<blockquote>Illegal contents</blockquote>',
'<blockquote></blockquote>'
'<blockquote><p>Illegal contents</p></blockquote>'
);
}

View File

@ -49,6 +49,7 @@ $test_files[] = 'ChildDef/OptionalTest.php';
$test_files[] = 'ChildDef/ChameleonTest.php';
$test_files[] = 'ChildDef/CustomTest.php';
$test_files[] = 'ChildDef/TableTest.php';
$test_files[] = 'ChildDef/StrictBlockquoteTest.php';
$test_files[] = 'GeneratorTest.php';
$test_files[] = 'EntityLookupTest.php';
$test_files[] = 'Strategy/RemoveForeignElementsTest.php';