mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2024-12-23 00:41:52 +00:00
Make the definition format much more logical. Begin migrating specification docs to their respective classes.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@133 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
parent
70bd80e66a
commit
558c49a92d
@ -1,9 +1,7 @@
|
|||||||
|
|
||||||
HTML Purifier Specification
|
HTML Purifier
|
||||||
by Edward Z. Yang
|
by Edward Z. Yang
|
||||||
|
|
||||||
== Introduction ==
|
|
||||||
|
|
||||||
There are a number of ad hoc HTML filtering solutions out there on the web
|
There are a number of ad hoc HTML filtering solutions out there on the web
|
||||||
(some examples include HTML_Safe, kses and SafeHtmlChecker.class.php) that
|
(some examples include HTML_Safe, kses and SafeHtmlChecker.class.php) that
|
||||||
claim to filter HTML properly, preventing malicious JavaScript and layout
|
claim to filter HTML properly, preventing malicious JavaScript and layout
|
||||||
@ -56,29 +54,6 @@ HTML tags. Things like blog comments are, in all likelihood, most appropriately
|
|||||||
written in an extremely restrictive set of markup that doesn't require
|
written in an extremely restrictive set of markup that doesn't require
|
||||||
all this functionality (or not written in HTML at all).
|
all this functionality (or not written in HTML at all).
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
== STAGE 1 - parsing ==
|
|
||||||
|
|
||||||
Status: A (see source, mainly internals and UTF-8)
|
|
||||||
|
|
||||||
The Lexer (currently we have three choices) handles parsing into Tokens.
|
|
||||||
|
|
||||||
Here are the mappings for Lexer_PEARSax3
|
|
||||||
|
|
||||||
* Start(name, attributes) is openHandler
|
|
||||||
* End(name) is closeHandler
|
|
||||||
* Empty(name, attributes) is openHandler (is in array of empties)
|
|
||||||
* Data(parse(text)) is dataHandler
|
|
||||||
* Comment(text) is escapeHandler (has leading -)
|
|
||||||
* Data(text) is escapeHandler (has leading [, CDATA)
|
|
||||||
|
|
||||||
Ignorable/not being implemented (although we probably want to output them raw):
|
|
||||||
* ProcessingInstructions(text) is piHandler
|
|
||||||
* JavaOrASPInstructions(text) is jaspHandler
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
== STAGE 2 - remove foreign elements ==
|
== STAGE 2 - remove foreign elements ==
|
||||||
|
|
||||||
Status: A- (transformations need to be implemented)
|
Status: A- (transformations need to be implemented)
|
||||||
|
@ -34,6 +34,7 @@ class HTMLPurifier_Definition
|
|||||||
'table' => true,
|
'table' => true,
|
||||||
'ul' => true
|
'ul' => true
|
||||||
);
|
);
|
||||||
|
var $info_global_attr = array();
|
||||||
|
|
||||||
function instance() {
|
function instance() {
|
||||||
static $instance = null;
|
static $instance = null;
|
||||||
@ -49,6 +50,20 @@ class HTMLPurifier_Definition
|
|||||||
function setup() {
|
function setup() {
|
||||||
// emulates the structure of the DTD
|
// emulates the structure of the DTD
|
||||||
|
|
||||||
|
$allowed_tags =
|
||||||
|
array(
|
||||||
|
'ins', 'del', 'blockquote', 'dd', 'li', 'div', 'em', 'strong',
|
||||||
|
'dfn', 'code', 'samp', 'kbd', 'var', 'cite', 'abbr', 'acronym',
|
||||||
|
'q', 'sub', 'tt', 'sup', 'i', 'b', 'big', 'small', 'u', 's',
|
||||||
|
'strike', 'bdo', 'span', 'dt', 'p', 'h1', 'h2', 'h3', 'h4',
|
||||||
|
'h5', 'h6', 'ol', 'ul', 'dl', 'address', 'img', 'br', 'hr',
|
||||||
|
'pre', 'a'
|
||||||
|
);
|
||||||
|
|
||||||
|
foreach ($allowed_tags as $tag) {
|
||||||
|
$this->info[$tag] = new HTMLPurifier_ElementDef();
|
||||||
|
}
|
||||||
|
|
||||||
// entities: variable names are prefixed with e_, and _ replaces the . of the DTD entity name
|
// entities: variable names are prefixed with e_, and _ replaces the . of the DTD entity name
|
||||||
// we don't use an array because that complicates interpolation
|
// we don't use an array because that complicates interpolation
|
||||||
// strings are used instead of arrays because if you use arrays,
|
// strings are used instead of arrays because if you use arrays,
|
||||||
@ -96,73 +111,67 @@ class HTMLPurifier_Definition
|
|||||||
$e_form_content = new HTMLPurifier_ChildDef_Optional(''); //unused
|
$e_form_content = new HTMLPurifier_ChildDef_Optional(''); //unused
|
||||||
$e_form_button_content = new HTMLPurifier_ChildDef_Optional(''); // unused
|
$e_form_button_content = new HTMLPurifier_ChildDef_Optional(''); // unused
|
||||||
|
|
||||||
$this->info['child'] = array();
|
$this->info['ins']->child =
|
||||||
|
$this->info['del']->child =
|
||||||
|
$this->info['blockquote']->child =
|
||||||
|
$this->info['dd']->child =
|
||||||
|
$this->info['li']->child =
|
||||||
|
$this->info['div']->child = $e_Flow;
|
||||||
|
|
||||||
$this->info['child']['ins'] =
|
$this->info['em']->child =
|
||||||
$this->info['child']['del'] =
|
$this->info['strong']->child =
|
||||||
$this->info['child']['blockquote'] =
|
$this->info['dfn']->child =
|
||||||
$this->info['child']['dd'] =
|
$this->info['code']->child =
|
||||||
$this->info['child']['li'] =
|
$this->info['samp']->child =
|
||||||
$this->info['child']['div'] = $e_Flow;
|
$this->info['kbd']->child =
|
||||||
|
$this->info['var']->child =
|
||||||
|
$this->info['cite']->child =
|
||||||
|
$this->info['abbr']->child =
|
||||||
|
$this->info['acronym']->child =
|
||||||
|
$this->info['q']->child =
|
||||||
|
$this->info['sub']->child =
|
||||||
|
$this->info['tt']->child =
|
||||||
|
$this->info['sup']->child =
|
||||||
|
$this->info['i']->child =
|
||||||
|
$this->info['b']->child =
|
||||||
|
$this->info['big']->child =
|
||||||
|
$this->info['small']->child =
|
||||||
|
$this->info['u']->child =
|
||||||
|
$this->info['s']->child =
|
||||||
|
$this->info['strike']->child =
|
||||||
|
$this->info['bdo']->child =
|
||||||
|
$this->info['span']->child =
|
||||||
|
$this->info['dt']->child =
|
||||||
|
$this->info['p']->child =
|
||||||
|
$this->info['h1']->child =
|
||||||
|
$this->info['h2']->child =
|
||||||
|
$this->info['h3']->child =
|
||||||
|
$this->info['h4']->child =
|
||||||
|
$this->info['h5']->child =
|
||||||
|
$this->info['h6']->child = $e_Inline;
|
||||||
|
|
||||||
$this->info['child']['em'] =
|
$this->info['ol']->child =
|
||||||
$this->info['child']['strong'] =
|
$this->info['ul']->child = new HTMLPurifier_ChildDef_Required('li');
|
||||||
$this->info['child']['dfn'] =
|
|
||||||
$this->info['child']['code'] =
|
|
||||||
$this->info['child']['samp'] =
|
|
||||||
$this->info['child']['kbd'] =
|
|
||||||
$this->info['child']['var'] =
|
|
||||||
$this->info['child']['code'] =
|
|
||||||
$this->info['child']['samp'] =
|
|
||||||
$this->info['child']['kbd'] =
|
|
||||||
$this->info['child']['var'] =
|
|
||||||
$this->info['child']['cite'] =
|
|
||||||
$this->info['child']['abbr'] =
|
|
||||||
$this->info['child']['acronym'] =
|
|
||||||
$this->info['child']['q'] =
|
|
||||||
$this->info['child']['sub'] =
|
|
||||||
$this->info['child']['tt'] =
|
|
||||||
$this->info['child']['sup'] =
|
|
||||||
$this->info['child']['i'] =
|
|
||||||
$this->info['child']['b'] =
|
|
||||||
$this->info['child']['big'] =
|
|
||||||
$this->info['child']['small'] =
|
|
||||||
$this->info['child']['u'] =
|
|
||||||
$this->info['child']['s'] =
|
|
||||||
$this->info['child']['strike'] =
|
|
||||||
$this->info['child']['bdo'] =
|
|
||||||
$this->info['child']['span'] =
|
|
||||||
$this->info['child']['dt'] =
|
|
||||||
$this->info['child']['p'] =
|
|
||||||
$this->info['child']['h1'] =
|
|
||||||
$this->info['child']['h2'] =
|
|
||||||
$this->info['child']['h3'] =
|
|
||||||
$this->info['child']['h4'] =
|
|
||||||
$this->info['child']['h5'] =
|
|
||||||
$this->info['child']['h6'] = $e_Inline;
|
|
||||||
|
|
||||||
$this->info['child']['ol'] =
|
$this->info['dl']->child = new HTMLPurifier_ChildDef_Required('dt|dd');
|
||||||
$this->info['child']['ul'] = new HTMLPurifier_ChildDef_Required('li');
|
$this->info['address']->child =
|
||||||
|
|
||||||
$this->info['child']['dl'] = new HTMLPurifier_ChildDef_Required('dt|dd');
|
|
||||||
$this->info['child']['address'] =
|
|
||||||
new HTMLPurifier_ChildDef_Optional("#PCDATA | p | $e_inline".
|
new HTMLPurifier_ChildDef_Optional("#PCDATA | p | $e_inline".
|
||||||
" | $e_misc_inline");
|
" | $e_misc_inline");
|
||||||
|
|
||||||
$this->info['child']['img'] =
|
$this->info['img']->child =
|
||||||
$this->info['child']['br'] =
|
$this->info['br']->child =
|
||||||
$this->info['child']['hr'] = new HTMLPurifier_ChildDef_Empty();
|
$this->info['hr']->child = new HTMLPurifier_ChildDef_Empty();
|
||||||
|
|
||||||
$this->info['child']['pre'] = $e_pre_content;
|
$this->info['pre']->child = $e_pre_content;
|
||||||
|
|
||||||
$this->info['child']['a'] = $e_a_content;
|
$this->info['a']->child = $e_a_content;
|
||||||
|
|
||||||
// attribute info
|
// attribute info
|
||||||
// this doesn't include REQUIRED declarations, those are handled
|
// this doesn't include REQUIRED declarations, those are handled
|
||||||
// by the transform classes
|
// by the transform classes
|
||||||
|
|
||||||
// attrs, included in almost every single one except for a few
|
// attrs, included in almost every single one except for a few
|
||||||
$this->info['attr']['*'] = array(
|
$this->info_global_attr = array(
|
||||||
// core attrs
|
// core attrs
|
||||||
'id' => new HTMLPurifier_AttrDef_ID(),
|
'id' => new HTMLPurifier_AttrDef_ID(),
|
||||||
// i18n
|
// i18n
|
||||||
@ -176,13 +185,8 @@ class HTMLPurifier_Definition
|
|||||||
class HTMLPurifier_ElementDef
|
class HTMLPurifier_ElementDef
|
||||||
{
|
{
|
||||||
|
|
||||||
var $child_def;
|
var $child;
|
||||||
var $attr_def = array();
|
var $attr = array();
|
||||||
|
|
||||||
function HTMLPurifier_ElementDef($child_def, $attr_def = array()) {
|
|
||||||
$this->child_def = $child_def;
|
|
||||||
$this->attr_def = $attr_def;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -38,8 +38,11 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
|
|||||||
// $i is index of start token
|
// $i is index of start token
|
||||||
// $j is index of end token
|
// $j is index of end token
|
||||||
|
|
||||||
|
|
||||||
|
// DEFINITION CALL
|
||||||
|
$child_def = $this->definition->info[$tokens[$i]->name]->child;
|
||||||
|
|
||||||
// have DTD child def validate children
|
// have DTD child def validate children
|
||||||
$child_def = $this->definition->info['child'][$tokens[$i]->name];
|
|
||||||
$result = $child_def->validateChildren($child_tokens);
|
$result = $child_def->validateChildren($child_tokens);
|
||||||
|
|
||||||
// process result
|
// process result
|
||||||
|
@ -23,7 +23,9 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
|
|||||||
$result[] = $token;
|
$result[] = $token;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
$info = $this->definition->info['child'][$token->name]; // assumption but valid
|
|
||||||
|
// DEFINITION CALL
|
||||||
|
$info = $this->definition->info[$token->name]->child;
|
||||||
|
|
||||||
// test if it claims to be a start tag but is empty
|
// test if it claims to be a start tag but is empty
|
||||||
if ($info->type == 'empty' &&
|
if ($info->type == 'empty' &&
|
||||||
|
@ -4,6 +4,13 @@ require_once 'HTMLPurifier/Strategy.php';
|
|||||||
require_once 'HTMLPurifier/Definition.php';
|
require_once 'HTMLPurifier/Definition.php';
|
||||||
require_once 'HTMLPurifier/Generator.php';
|
require_once 'HTMLPurifier/Generator.php';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Removes all unrecognized tags from the list of tokens.
|
||||||
|
*
|
||||||
|
* This strategy iterates through all the tokens and removes unrecognized
|
||||||
|
* tokens.
|
||||||
|
*/
|
||||||
|
|
||||||
class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
|
class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
|
||||||
{
|
{
|
||||||
|
|
||||||
@ -19,7 +26,8 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
|
|||||||
$result = array();
|
$result = array();
|
||||||
foreach($tokens as $token) {
|
foreach($tokens as $token) {
|
||||||
if (!empty( $token->is_tag )) {
|
if (!empty( $token->is_tag )) {
|
||||||
if (!isset($this->definition->info['child'][$token->name])) {
|
// DEFINITION CALL
|
||||||
|
if (!isset($this->definition->info[$token->name])) {
|
||||||
// invalid tag: regenerate its HTML and insert that as a text token
|
// invalid tag: regenerate its HTML and insert that as a text token
|
||||||
$token = new HTMLPurifier_Token_Text(
|
$token = new HTMLPurifier_Token_Text(
|
||||||
$this->generator->generateFromToken($token)
|
$this->generator->generateFromToken($token)
|
||||||
|
@ -15,13 +15,14 @@ class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
|
|||||||
|
|
||||||
function execute($tokens) {
|
function execute($tokens) {
|
||||||
$accumulator = new HTMLPurifier_IDAccumulator();
|
$accumulator = new HTMLPurifier_IDAccumulator();
|
||||||
$d_defs = $this->definition->info['attr']['*'];
|
$d_defs = $this->definition->info_global_attr;
|
||||||
foreach ($tokens as $key => $token) {
|
foreach ($tokens as $key => $token) {
|
||||||
if ($token->type !== 'start' && $token->type !== 'end') continue;
|
if ($token->type !== 'start' && $token->type !== 'end') continue;
|
||||||
$name = $token->name;
|
|
||||||
|
// DEFINITION CALL
|
||||||
|
$defs = $this->definition->info[$token->name]->attr;
|
||||||
|
|
||||||
$attr = $token->attributes;
|
$attr = $token->attributes;
|
||||||
$defs = isset($this->definition->info['attr'][$name]) ?
|
|
||||||
$this->definition->attr[$name] : array();
|
|
||||||
$changed = false;
|
$changed = false;
|
||||||
foreach ($attr as $attr_key => $value) {
|
foreach ($attr as $attr_key => $value) {
|
||||||
if ( isset($defs[$attr_key]) ) {
|
if ( isset($defs[$attr_key]) ) {
|
||||||
|
Loading…
Reference in New Issue
Block a user