0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-01-03 13:21:51 +00:00

Massively refactored Definition, moved MakeWellFormed HTML specific code out.

Add table functionality for nesting, don't know how I missed that. It's still broken though.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@135 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2006-07-31 00:15:01 +00:00
parent 2b5589c884
commit 9c6ae16764
3 changed files with 91 additions and 64 deletions

View File

@ -33,30 +33,6 @@ class HTMLPurifier_Definition
var $info = array();
// used solely by HTMLPurifier_Strategy_MakeWellFormed
var $info_closes_p = array(
// these are all block elements: blocks aren't allowed in P
'address' => true,
'blockquote' => true,
'dd' => true,
'dir' => true,
'div' => true,
'dl' => true,
'dt' => true,
'h1' => true,
'h2' => true,
'h3' => true,
'h4' => true,
'h5' => true,
'h6' => true,
'hr' => true,
'ol' => true,
'p' => true,
'pre' => true,
'table' => true,
'ul' => true
);
// used solely by HTMLPurifier_Strategy_ValidateAttributes
var $info_global_attr = array();
@ -75,7 +51,23 @@ class HTMLPurifier_Definition
function HTMLPurifier_Definition() {}
function setup() {
// emulates the structure of the DTD
// these are condensed, however, with bad stuff taken out
// screening process was done by hand
// The code makes certain assumptions about the structure of this
// definition for optimization reasons:
//
// FixNesting - There will never be a need for cascading removal
// of tags, usually triggered by a node requiring the
// existence of another node that may be deleted.
//////////////////////////////////////////////////////////////////////
// info[] : initializes the definition objects
// if you attempt to define rules later on for a tag that is not in this
// array, PHP will silently create a stdClass instead of an ElementDef
$allowed_tags =
array(
@ -84,28 +76,23 @@ class HTMLPurifier_Definition
'q', 'sub', 'tt', 'sup', 'i', 'b', 'big', 'small', 'u', 's',
'strike', 'bdo', 'span', 'dt', 'p', 'h1', 'h2', 'h3', 'h4',
'h5', 'h6', 'ol', 'ul', 'dl', 'address', 'img', 'br', 'hr',
'pre', 'a'
'pre', 'a', 'table', 'caption', 'thead', 'tfoot', 'tbody',
'colgroup', 'col', 'td', 'th', 'tr'
);
foreach ($allowed_tags as $tag) {
$this->info[$tag] = new HTMLPurifier_ElementDef();
}
//////////////////////////////////////////////////////////////////////
// info[]->child : defines allowed children for elements
// entities: variable names are prefixed with e_, and _ replaces the . of the DTD entity name
// we don't use an array because that complicates interpolation
// strings are used instead of arrays because if you use arrays,
// you have to do some hideous manipulation with array_merge()
// these are condensed, remember, with bad stuff taken out
// transforms: font, menu, dir, center
// DON'T MONKEY AROUND WITH THIS unless you know what you are doing
// and also know the assumptions the code makes about what this
// contains for optimization purposes (see fixNesting)
// child info
$e_special_extra = 'img';
$e_special_basic = 'br | span | bdo';
$e_special = "$e_special_basic | $e_special_extra";
@ -140,7 +127,7 @@ class HTMLPurifier_Definition
$this->info['ins']->child =
$this->info['del']->child =
$this->info['blockquote']->child =
$this->info['blockquote']->child=
$this->info['dd']->child =
$this->info['li']->child =
$this->info['div']->child = $e_Flow;
@ -162,7 +149,7 @@ class HTMLPurifier_Definition
$this->info['i']->child =
$this->info['b']->child =
$this->info['big']->child =
$this->info['small']->child =
$this->info['small']->child=
$this->info['u']->child =
$this->info['s']->child =
$this->info['strike']->child =
@ -177,10 +164,12 @@ class HTMLPurifier_Definition
$this->info['h5']->child =
$this->info['h6']->child = $e_Inline;
// the only three required definitions, besides custom table code
$this->info['ol']->child =
$this->info['ul']->child = new HTMLPurifier_ChildDef_Required('li');
$this->info['dl']->child = new HTMLPurifier_ChildDef_Required('dt|dd');
$this->info['address']->child =
new HTMLPurifier_ChildDef_Optional("#PCDATA | p | $e_inline".
" | $e_misc_inline");
@ -193,7 +182,23 @@ class HTMLPurifier_Definition
$this->info['a']->child = $e_a_content;
// attribute info
$this->info['table']->child = new HTMLPurifier_ChildDef(
'(caption?, (col*|colgroup*), thead?, tfoot?, (tbody+|tr+))');
// not a real entity, watch the double underscore
$e__row = new HTMLPurifier_ChildDef_Required('tr');
$this->info['thead']->child = $e__row;
$this->info['tfoot']->child = $e__row;
$this->info['tbody']->child = $e__row;
$this->info['colgroup']->child = new HTMLPurifier_ChildDef_Optional('col');
$this->info['col']->child = new HTMLPurifier_ChildDef_Empty();
$this->info['tr']->child = new HTMLPurifier_ChildDef_Required('th | td');
$this->info['th']->child = $e_Flow;
$this->info['td']->child = $e_Flow;
//////////////////////////////////////////////////////////////////////
// info[]->attr : defines allowed attributes for elements
// this doesn't include REQUIRED declarations, those are handled
// by the transform classes
@ -205,6 +210,39 @@ class HTMLPurifier_Definition
'dir' => new HTMLPurifier_AttrDef_Enum(array('ltr','rtl'), false),
);
//////////////////////////////////////////////////////////////////////
// UNIMP : info_tag_transform : transformations of tags
// font -> span / attributes: size color face
// css: font-size color font-family
// menu -> ul
// dir -> ul
// center -> div / css: text-align: center;
//////////////////////////////////////////////////////////////////////
// info[]->auto_close : tags that automatically close another
// these are all block elements: blocks aren't allowed in P
$this->info['p']->auto_close = array_flip(array(
'address', 'blockquote', 'dd', 'dir', 'div', 'dl', 'dt',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'ol', 'p', 'pre',
'table', 'ul'
));
$this->info['li']->auto_close = array('li' => true);
// we need TABLE and heading mismatch code
// we may need to make this more flexible for heading mismatch,
// or we can just create another info
//////////////////////////////////////////////////////////////////////
// UNIMP : info[]->attr_transform : attribute transformations in elements
//////////////////////////////////////////////////////////////////////
// UNIMP : info_attr_transform : global attribute transform (for xml:lang)
// this might have bad implications for performance
}
}
@ -212,8 +250,9 @@ class HTMLPurifier_Definition
class HTMLPurifier_ElementDef
{
var $child;
var $attr = array();
var $auto_close = array();
var $child;
}

View File

@ -59,35 +59,19 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// if there's a parent, check for special case
if (!empty($current_nesting)) {
$current_parent = array_pop($current_nesting);
// this ought to be moved to definition
$parent = array_pop($current_nesting);
$parent_name = $parent->name;
$parent_info = $this->definition->info[$parent_name];
// check if we're closing a P tag
if ($current_parent->name == 'p' &&
isset($this->definition->info_closes_p[$token->name])
) {
$result[] = new HTMLPurifier_Token_End('p');
if (isset($parent_info->auto_close[$token->name])) {
$result[] = new HTMLPurifier_Token_End($parent_name);
$result[] = $token;
$current_nesting[] = $token;
continue;
}
// check if we're closing a LI tag
if ($current_parent->name == 'li' &&
$token->name == 'li'
) {
$result[] = new HTMLPurifier_Token_End('li');
$result[] = $token;
$current_nesting[] = $token;
continue;
}
// this is more TIDY stuff
// we should also get some TABLE related code
// mismatched h#
$current_nesting[] = $current_parent; // undo the pop
$current_nesting[] = $parent; // undo the pop
}
$result[] = $token;

View File

@ -37,6 +37,10 @@ class HTMLPurifier_Strategy_FixNestingTest
$inputs[4] = '<ul>Illegal text<li>Legal item</li></ul>';
$expect[4] = '<ul><li>Legal item</li></ul>';
// test custom table definition
$inputs[5] = '<table><tr><td>Cell 1</td></tr></table>';
$expect[5] = '<table><tr><td>Cell 1</td></tr></table>';
$this->assertStrategyWorks($strategy, $inputs, $expect);
}