0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2024-12-22 08:21:52 +00:00

Make the definition format much more logical. Begin migrating specification docs to their respective classes.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@133 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2006-07-30 19:11:18 +00:00
parent 70bd80e66a
commit 558c49a92d
6 changed files with 86 additions and 93 deletions

View File

@ -1,9 +1,7 @@
HTML Purifier Specification
HTML Purifier
by Edward Z. Yang
== Introduction ==
There are a number of ad hoc HTML filtering solutions out there on the web
(some examples including HTML_Safe, kses and SafeHtmlChecker.class.php) that
claim to filter HTML properly, preventing malicious JavaScript and layout
@ -56,29 +54,6 @@ HTML tags. Things like blog comments are, in all likelihood, most appropriately
written in an extremely restrictive set of markup that doesn't require
all this functionality (or not written in HTML at all).
== STAGE 1 - parsing ==
Status: A (see source, mainly internals and UTF-8)
The Lexer (currently we have three choices) handles parsing into Tokens.
Here are the mappings for Lexer_PEARSax3
* Start(name, attributes) is openHandler
* End(name) is closeHandler
* Empty(name, attributes) is openHandler (is in array of empties)
* Data(parse(text)) is dataHandler
* Comment(text) is escapeHandler (has leading -)
* Data(text) is escapeHandler (has leading [, CDATA)
Ignorable/not being implemented (although we probably want to output them raw):
* ProcessingInstructions(text) is piHandler
* JavaOrASPInstructions(text) is jaspHandler
== STAGE 2 - remove foreign elements ==
Status: A- (transformations need to be implemented)

View File

@ -34,6 +34,7 @@ class HTMLPurifier_Definition
'table' => true,
'ul' => true
);
var $info_global_attr = array();
function instance() {
static $instance = null;
@ -49,6 +50,20 @@ class HTMLPurifier_Definition
function setup() {
// emulates the structure of the DTD
$allowed_tags =
array(
'ins', 'del', 'blockquote', 'dd', 'li', 'div', 'em', 'strong',
'dfn', 'code', 'samp', 'kbd', 'var', 'cite', 'abbr', 'acronym',
'q', 'sub', 'tt', 'sup', 'i', 'b', 'big', 'small', 'u', 's',
'strike', 'bdo', 'span', 'dt', 'p', 'h1', 'h2', 'h3', 'h4',
'h5', 'h6', 'ol', 'ul', 'dl', 'address', 'img', 'br', 'hr',
'pre', 'a'
);
foreach ($allowed_tags as $tag) {
$this->info[$tag] = new HTMLPurifier_ElementDef();
}
// entities: prefixed with e_ and _ replaces .
// we don't use an array because that complicates interpolation
// strings are used instead of arrays because if you use arrays,
@ -96,73 +111,67 @@ class HTMLPurifier_Definition
$e_form_content = new HTMLPurifier_ChildDef_Optional(''); //unused
$e_form_button_content = new HTMLPurifier_ChildDef_Optional(''); // unused
$this->info['child'] = array();
$this->info['ins']->child =
$this->info['del']->child =
$this->info['blockquote']->child =
$this->info['dd']->child =
$this->info['li']->child =
$this->info['div']->child = $e_Flow;
$this->info['child']['ins'] =
$this->info['child']['del'] =
$this->info['child']['blockquote'] =
$this->info['child']['dd'] =
$this->info['child']['li'] =
$this->info['child']['div'] = $e_Flow;
$this->info['em']->child =
$this->info['strong']->child =
$this->info['dfn']->child =
$this->info['code']->child =
$this->info['samp']->child =
$this->info['kbd']->child =
$this->info['var']->child =
$this->info['cite']->child =
$this->info['abbr']->child =
$this->info['acronym']->child =
$this->info['q']->child =
$this->info['sub']->child =
$this->info['tt']->child =
$this->info['sup']->child =
$this->info['i']->child =
$this->info['b']->child =
$this->info['big']->child =
$this->info['small']->child =
$this->info['u']->child =
$this->info['s']->child =
$this->info['strike']->child =
$this->info['bdo']->child =
$this->info['span']->child =
$this->info['dt']->child =
$this->info['p']->child =
$this->info['h1']->child =
$this->info['h2']->child =
$this->info['h3']->child =
$this->info['h4']->child =
$this->info['h5']->child =
$this->info['h6']->child = $e_Inline;
$this->info['child']['em'] =
$this->info['child']['strong'] =
$this->info['child']['dfn'] =
$this->info['child']['code'] =
$this->info['child']['samp'] =
$this->info['child']['kbd'] =
$this->info['child']['var'] =
$this->info['child']['code'] =
$this->info['child']['samp'] =
$this->info['child']['kbd'] =
$this->info['child']['var'] =
$this->info['child']['cite'] =
$this->info['child']['abbr'] =
$this->info['child']['acronym'] =
$this->info['child']['q'] =
$this->info['child']['sub'] =
$this->info['child']['tt'] =
$this->info['child']['sup'] =
$this->info['child']['i'] =
$this->info['child']['b'] =
$this->info['child']['big'] =
$this->info['child']['small'] =
$this->info['child']['u'] =
$this->info['child']['s'] =
$this->info['child']['strike'] =
$this->info['child']['bdo'] =
$this->info['child']['span'] =
$this->info['child']['dt'] =
$this->info['child']['p'] =
$this->info['child']['h1'] =
$this->info['child']['h2'] =
$this->info['child']['h3'] =
$this->info['child']['h4'] =
$this->info['child']['h5'] =
$this->info['child']['h6'] = $e_Inline;
$this->info['ol']->child =
$this->info['ul']->child = new HTMLPurifier_ChildDef_Required('li');
$this->info['child']['ol'] =
$this->info['child']['ul'] = new HTMLPurifier_ChildDef_Required('li');
$this->info['child']['dl'] = new HTMLPurifier_ChildDef_Required('dt|dd');
$this->info['child']['address'] =
$this->info['dl']->child = new HTMLPurifier_ChildDef_Required('dt|dd');
$this->info['address']->child =
new HTMLPurifier_ChildDef_Optional("#PCDATA | p | $e_inline".
" | $e_misc_inline");
$this->info['child']['img'] =
$this->info['child']['br'] =
$this->info['child']['hr'] = new HTMLPurifier_ChildDef_Empty();
$this->info['img']->child =
$this->info['br']->child =
$this->info['hr']->child = new HTMLPurifier_ChildDef_Empty();
$this->info['child']['pre'] = $e_pre_content;
$this->info['pre']->child = $e_pre_content;
$this->info['child']['a'] = $e_a_content;
$this->info['a']->child = $e_a_content;
// attribute info
// this doesn't include REQUIRED declarations, those are handled
// by the transform classes
// attrs, included in almost every single one except for a few
$this->info['attr']['*'] = array(
$this->info_global_attr = array(
// core attrs
'id' => new HTMLPurifier_AttrDef_ID(),
// i18n
@ -176,13 +185,8 @@ class HTMLPurifier_Definition
class HTMLPurifier_ElementDef
{
var $child_def;
var $attr_def = array();
function HTMLPurifier_ElementDef($child_def, $attr_def = array()) {
$this->child_def = $child_def;
$this->attr_def = $attr_def;
}
var $child;
var $attr = array();
}

View File

@ -38,8 +38,11 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
// $i is index of start token
// $j is index of end token
// DEFINITION CALL
$child_def = $this->definition->info[$tokens[$i]->name]->child;
// have DTD child def validate children
$child_def = $this->definition->info['child'][$tokens[$i]->name];
$result = $child_def->validateChildren($child_tokens);
// process result

View File

@ -23,7 +23,9 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
$result[] = $token;
continue;
}
$info = $this->definition->info['child'][$token->name]; // assumption but valid
// DEFINITION CALL
$info = $this->definition->info[$token->name]->child;
// test if it claims to be a start tag but is empty
if ($info->type == 'empty' &&

View File

@ -4,6 +4,13 @@ require_once 'HTMLPurifier/Strategy.php';
require_once 'HTMLPurifier/Definition.php';
require_once 'HTMLPurifier/Generator.php';
/**
* Converts unrecognized tags into plain text tokens.
*
* This strategy iterates through all the tokens and replaces every tag
* whose name is not present in the definition with a text token holding
* the HTML generated from that tag.
*/
class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
{
@ -19,7 +26,8 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
$result = array();
foreach($tokens as $token) {
if (!empty( $token->is_tag )) {
if (!isset($this->definition->info['child'][$token->name])) {
// DEFINITION CALL
if (!isset($this->definition->info[$token->name])) {
// invalid tag, generate HTML and insert in
$token = new HTMLPurifier_Token_Text(
$this->generator->generateFromToken($token)

View File

@ -15,13 +15,14 @@ class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
function execute($tokens) {
$accumulator = new HTMLPurifier_IDAccumulator();
$d_defs = $this->definition->info['attr']['*'];
$d_defs = $this->definition->info_global_attr;
foreach ($tokens as $key => $token) {
if ($token->type !== 'start' && $token->type !== 'end') continue;
$name = $token->name;
// DEFINITION CALL
$defs = $this->definition->info[$token->name]->attr;
$attr = $token->attributes;
$defs = isset($this->definition->info['attr'][$name]) ?
$this->definition->attr[$name] : array();
$changed = false;
foreach ($attr as $attr_key => $value) {
if ( isset($defs[$attr_key]) ) {