0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-12-22 16:31:53 +00:00

Make the definition format much more logical. Begin migrating specification docs to their respective classes.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@133 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2006-07-30 19:11:18 +00:00
parent 70bd80e66a
commit 558c49a92d
6 changed files with 86 additions and 93 deletions

View File

@ -1,9 +1,7 @@
HTML Purifier Specification HTML Purifier
by Edward Z. Yang
== Introduction ==
There are a number of ad hoc HTML filtering solutions out there on the web
(some examples including HTML_Safe, kses and SafeHtmlChecker.class.php) that
claim to filter HTML properly, preventing malicious JavaScript and layout
@ -56,29 +54,6 @@ HTML tags. Things like blog comments are, in all likelihood, most appropriately
written in an extremely restrictive set of markup that doesn't require
all this functionality (or not written in HTML at all).
== STAGE 1 - parsing ==
Status: A (see source, mainly internals and UTF-8)
The Lexer (currently we have three choices) handles parsing into Tokens.
Here are the mappings for Lexer_PEARSax3
* Start(name, attributes) is openHandler
* End(name) is closeHandler
* Empty(name, attributes) is openHandler (is in array of empties)
* Data(parse(text)) is dataHandler
* Comment(text) is escapeHandler (has leading -)
* Data(text) is escapeHandler (has leading [, CDATA)
Ignorable/not being implemented (although we probably want to output them raw):
* ProcessingInstructions(text) is piHandler
* JavaOrASPInstructions(text) is jaspHandler
== STAGE 2 - remove foreign elements ==
Status: A- (transformations need to be implemented)

View File

@ -34,6 +34,7 @@ class HTMLPurifier_Definition
'table' => true, 'table' => true,
'ul' => true 'ul' => true
); );
var $info_global_attr = array();
function instance() { function instance() {
static $instance = null; static $instance = null;
@ -49,6 +50,20 @@ class HTMLPurifier_Definition
function setup() { function setup() {
// emulates the structure of the DTD // emulates the structure of the DTD
$allowed_tags =
array(
'ins', 'del', 'blockquote', 'dd', 'li', 'div', 'em', 'strong',
'dfn', 'code', 'samp', 'kbd', 'var', 'cite', 'abbr', 'acronym',
'q', 'sub', 'tt', 'sup', 'i', 'b', 'big', 'small', 'u', 's',
'strike', 'bdo', 'span', 'dt', 'p', 'h1', 'h2', 'h3', 'h4',
'h5', 'h6', 'ol', 'ul', 'dl', 'address', 'img', 'br', 'hr',
'pre', 'a'
);
foreach ($allowed_tags as $tag) {
$this->info[$tag] = new HTMLPurifier_ElementDef();
}
// entities: prefixed with e_ and _ replaces . // entities: prefixed with e_ and _ replaces .
// we don't use an array because that complicates interpolation // we don't use an array because that complicates interpolation
// strings are used instead of arrays because if you use arrays, // strings are used instead of arrays because if you use arrays,
@ -96,73 +111,67 @@ class HTMLPurifier_Definition
$e_form_content = new HTMLPurifier_ChildDef_Optional(''); //unused $e_form_content = new HTMLPurifier_ChildDef_Optional(''); //unused
$e_form_button_content = new HTMLPurifier_ChildDef_Optional(''); // unused $e_form_button_content = new HTMLPurifier_ChildDef_Optional(''); // unused
$this->info['child'] = array(); $this->info['ins']->child =
$this->info['del']->child =
$this->info['blockquote']->child =
$this->info['dd']->child =
$this->info['li']->child =
$this->info['div']->child = $e_Flow;
$this->info['child']['ins'] = $this->info['em']->child =
$this->info['child']['del'] = $this->info['strong']->child =
$this->info['child']['blockquote'] = $this->info['dfn']->child =
$this->info['child']['dd'] = $this->info['code']->child =
$this->info['child']['li'] = $this->info['samp']->child =
$this->info['child']['div'] = $e_Flow; $this->info['kbd']->child =
$this->info['var']->child =
$this->info['cite']->child =
$this->info['abbr']->child =
$this->info['acronym']->child =
$this->info['q']->child =
$this->info['sub']->child =
$this->info['tt']->child =
$this->info['sup']->child =
$this->info['i']->child =
$this->info['b']->child =
$this->info['big']->child =
$this->info['small']->child =
$this->info['u']->child =
$this->info['s']->child =
$this->info['strike']->child =
$this->info['bdo']->child =
$this->info['span']->child =
$this->info['dt']->child =
$this->info['p']->child =
$this->info['h1']->child =
$this->info['h2']->child =
$this->info['h3']->child =
$this->info['h4']->child =
$this->info['h5']->child =
$this->info['h6']->child = $e_Inline;
$this->info['child']['em'] = $this->info['ol']->child =
$this->info['child']['strong'] = $this->info['ul']->child = new HTMLPurifier_ChildDef_Required('li');
$this->info['child']['dfn'] =
$this->info['child']['code'] =
$this->info['child']['samp'] =
$this->info['child']['kbd'] =
$this->info['child']['var'] =
$this->info['child']['code'] =
$this->info['child']['samp'] =
$this->info['child']['kbd'] =
$this->info['child']['var'] =
$this->info['child']['cite'] =
$this->info['child']['abbr'] =
$this->info['child']['acronym'] =
$this->info['child']['q'] =
$this->info['child']['sub'] =
$this->info['child']['tt'] =
$this->info['child']['sup'] =
$this->info['child']['i'] =
$this->info['child']['b'] =
$this->info['child']['big'] =
$this->info['child']['small'] =
$this->info['child']['u'] =
$this->info['child']['s'] =
$this->info['child']['strike'] =
$this->info['child']['bdo'] =
$this->info['child']['span'] =
$this->info['child']['dt'] =
$this->info['child']['p'] =
$this->info['child']['h1'] =
$this->info['child']['h2'] =
$this->info['child']['h3'] =
$this->info['child']['h4'] =
$this->info['child']['h5'] =
$this->info['child']['h6'] = $e_Inline;
$this->info['child']['ol'] = $this->info['dl']->child = new HTMLPurifier_ChildDef_Required('dt|dd');
$this->info['child']['ul'] = new HTMLPurifier_ChildDef_Required('li'); $this->info['address']->child =
$this->info['child']['dl'] = new HTMLPurifier_ChildDef_Required('dt|dd');
$this->info['child']['address'] =
new HTMLPurifier_ChildDef_Optional("#PCDATA | p | $e_inline". new HTMLPurifier_ChildDef_Optional("#PCDATA | p | $e_inline".
" | $e_misc_inline"); " | $e_misc_inline");
$this->info['child']['img'] = $this->info['img']->child =
$this->info['child']['br'] = $this->info['br']->child =
$this->info['child']['hr'] = new HTMLPurifier_ChildDef_Empty(); $this->info['hr']->child = new HTMLPurifier_ChildDef_Empty();
$this->info['child']['pre'] = $e_pre_content; $this->info['pre']->child = $e_pre_content;
$this->info['child']['a'] = $e_a_content; $this->info['a']->child = $e_a_content;
// attribute info // attribute info
// this doesn't include REQUIRED declarations, those are handled // this doesn't include REQUIRED declarations, those are handled
// by the transform classes // by the transform classes
// attrs, included in almost every single one except for a few // attrs, included in almost every single one except for a few
$this->info['attr']['*'] = array( $this->info_global_attr = array(
// core attrs // core attrs
'id' => new HTMLPurifier_AttrDef_ID(), 'id' => new HTMLPurifier_AttrDef_ID(),
// i18n // i18n
@ -176,13 +185,8 @@ class HTMLPurifier_Definition
class HTMLPurifier_ElementDef class HTMLPurifier_ElementDef
{ {
var $child_def; var $child;
var $attr_def = array(); var $attr = array();
function HTMLPurifier_ElementDef($child_def, $attr_def = array()) {
$this->child_def = $child_def;
$this->attr_def = $attr_def;
}
} }

View File

@ -38,8 +38,11 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
// $i is index of start token // $i is index of start token
// $j is index of end token // $j is index of end token
// DEFINITION CALL
$child_def = $this->definition->info[$tokens[$i]->name]->child;
// have DTD child def validate children // have DTD child def validate children
$child_def = $this->definition->info['child'][$tokens[$i]->name];
$result = $child_def->validateChildren($child_tokens); $result = $child_def->validateChildren($child_tokens);
// process result // process result

View File

@ -23,7 +23,9 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
$result[] = $token; $result[] = $token;
continue; continue;
} }
$info = $this->definition->info['child'][$token->name]; // assumption but valid
// DEFINITION CALL
$info = $this->definition->info[$token->name]->child;
// test if it claims to be a start tag but is empty // test if it claims to be a start tag but is empty
if ($info->type == 'empty' && if ($info->type == 'empty' &&

View File

@ -4,6 +4,13 @@ require_once 'HTMLPurifier/Strategy.php';
require_once 'HTMLPurifier/Definition.php'; require_once 'HTMLPurifier/Definition.php';
require_once 'HTMLPurifier/Generator.php'; require_once 'HTMLPurifier/Generator.php';
/**
* Removes all unrecognized tags from the list of tokens.
*
* This strategy iterates through all the tokens and removes unrecognized
* tokens.
*/
class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
{ {
@ -19,7 +26,8 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
$result = array(); $result = array();
foreach($tokens as $token) { foreach($tokens as $token) {
if (!empty( $token->is_tag )) { if (!empty( $token->is_tag )) {
if (!isset($this->definition->info['child'][$token->name])) { // DEFINITION CALL
if (!isset($this->definition->info[$token->name])) {
// invalid tag, generate HTML and insert in // invalid tag, generate HTML and insert in
$token = new HTMLPurifier_Token_Text( $token = new HTMLPurifier_Token_Text(
$this->generator->generateFromToken($token) $this->generator->generateFromToken($token)

View File

@ -15,13 +15,14 @@ class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
function execute($tokens) { function execute($tokens) {
$accumulator = new HTMLPurifier_IDAccumulator(); $accumulator = new HTMLPurifier_IDAccumulator();
$d_defs = $this->definition->info['attr']['*']; $d_defs = $this->definition->info_global_attr;
foreach ($tokens as $key => $token) { foreach ($tokens as $key => $token) {
if ($token->type !== 'start' && $token->type !== 'end') continue; if ($token->type !== 'start' && $token->type !== 'end') continue;
$name = $token->name;
// DEFINITION CALL
$defs = $this->definition->info[$token->name]->attr;
$attr = $token->attributes; $attr = $token->attributes;
$defs = isset($this->definition->info['attr'][$name]) ?
$this->definition->attr[$name] : array();
$changed = false; $changed = false;
foreach ($attr as $attr_key => $value) { foreach ($attr as $attr_key => $value) {
if ( isset($defs[$attr_key]) ) { if ( isset($defs[$attr_key]) ) {