mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-01-23 05:41:53 +00:00
83f735ea7e
Fix bug in HTML_Generator that resulted in attribute-less empty elements to have extra spaces in them. Add whitespace designation to MF_Text. git-svn-id: http://htmlpurifier.org/svnroot/html_purifier/trunk@44 48356398-32a2-884e-a903-53898d9a118a
452 lines
16 KiB
PHP
452 lines
16 KiB
PHP
<?php
|
|
|
|
class PureHTMLDefinition
|
|
{
|
|
|
|
var $generator;
|
|
var $info = array();
|
|
var $info_closes_p = array(
|
|
// these are all block elements: blocks aren't allowed in P
|
|
'address' => true,
|
|
'blockquote' => true,
|
|
'dd' => true,
|
|
'dir' => true,
|
|
'div' => true,
|
|
'dl' => true,
|
|
'dt' => true,
|
|
'h1' => true,
|
|
'h2' => true,
|
|
'h3' => true,
|
|
'h4' => true,
|
|
'h5' => true,
|
|
'h6' => true,
|
|
'hr' => true,
|
|
'ol' => true,
|
|
'p' => true,
|
|
'pre' => true,
|
|
'table' => true,
|
|
'ul' => true
|
|
);
|
|
|
|
function PureHTMLDefinition() {
|
|
$this->generator = new HTML_Generator();
|
|
}
|
|
|
|
function loadData() {
|
|
// emulates the structure of the DTD
|
|
|
|
// entities: prefixed with e_ and _ replaces .
|
|
// we don't use an array because that complicates interpolation
|
|
// strings are used instead of arrays because if you use arrays,
|
|
// you have to do some hideous manipulation with array_merge()
|
|
|
|
// these are condensed, remember, with bad stuff taken out
|
|
|
|
// transforms: font, menu, dir, center
|
|
|
|
$e_special_extra = 'img';
|
|
$e_special_basic = 'br | span | bdo';
|
|
$e_special = "$e_special_basic | $e_special_extra";
|
|
$e_fontstyle_extra = 'big | small';
|
|
$e_fontstyle_basic = 'tt | i | b | u | s | strike';
|
|
$e_fontstyle = "$e_fontstyle_basic | $e_fontstyle_extra";
|
|
$e_phrase_extra = 'sub | sup';
|
|
$e_phrase_basic = 'em | strong | dfn | code | q | samp | kbd | var'.
|
|
' | cite | abbr | acronym';
|
|
$e_phrase = "$e_phrase_basic | $e_phrase_extra";
|
|
$e_inline_forms = ''; // humor the dtd
|
|
$e_misc_inline = 'ins | del';
|
|
$e_misc = "$e_misc_inline";
|
|
$e_inline = "a | $e_special | $e_fontstyle | $e_phrase".
|
|
" | $e_inline_forms";
|
|
// note the casing
|
|
$e_Inline = new HTMLDTD_ChildDef_Optional("#PCDATA | $e_inline".
|
|
" | $e_misc_inline");
|
|
$e_heading = 'h1|h2|h3|h4|h5|h6';
|
|
$e_lists = 'ul | ol | dl';
|
|
$e_blocktext = 'pre | hr | blockquote | address';
|
|
$e_block = "p | $e_heading | div | $e_lists | $e_blocktext | table";
|
|
$e_Flow = new HTMLDTD_ChildDef_Optional("#PCDATA | $e_block".
|
|
" | $e_inline | $e_misc");
|
|
$e_a_content = new HTMLDTD_ChildDef_Optional("#PCDATA | $e_special".
|
|
" | $e_fontstyle | $e_phrase | $e_inline_forms | $e_misc_inline");
|
|
$e_pre_content = new HTMLDTD_ChildDef_Optional("#PCDATA | a".
|
|
" | $e_special_basic | $e_fontstyle_basic | $e_phrase_basic".
|
|
" | $e_inline_forms | $e_misc_inline");
|
|
$e_form_content = new HTMLDTD_ChildDef_Optional(''); //unused
|
|
$e_form_button_content = new HTMLDTD_ChildDef_Optional(''); // unused
|
|
|
|
$this->info['ins'] =
|
|
$this->info['del'] =
|
|
$this->info['blockquote'] =
|
|
$this->info['dd'] =
|
|
$this->info['li'] =
|
|
$this->info['div'] = new HTMLDTD_Element($e_Flow);
|
|
|
|
$this->info['em'] =
|
|
$this->info['strong'] =
|
|
$this->info['dfn'] =
|
|
$this->info['code'] =
|
|
$this->info['samp'] =
|
|
$this->info['kbd'] =
|
|
$this->info['var'] =
|
|
$this->info['code'] =
|
|
$this->info['samp'] =
|
|
$this->info['kbd'] =
|
|
$this->info['var'] =
|
|
$this->info['cite'] =
|
|
$this->info['abbr'] =
|
|
$this->info['acronym'] =
|
|
$this->info['q'] =
|
|
$this->info['sub'] =
|
|
$this->info['tt'] =
|
|
$this->info['sup'] =
|
|
$this->info['i'] =
|
|
$this->info['b'] =
|
|
$this->info['big'] =
|
|
$this->info['small'] =
|
|
$this->info['u'] =
|
|
$this->info['s'] =
|
|
$this->info['strike'] =
|
|
$this->info['bdo'] =
|
|
$this->info['span'] =
|
|
$this->info['dt'] =
|
|
$this->info['p'] =
|
|
$this->info['h1'] =
|
|
$this->info['h2'] =
|
|
$this->info['h3'] =
|
|
$this->info['h4'] =
|
|
$this->info['h5'] =
|
|
$this->info['h6'] = new HTMLDTD_Element($e_Inline);
|
|
|
|
$this->info['ol'] =
|
|
$this->info['ul'] =
|
|
new HTMLDTD_Element(
|
|
new HTMLDTD_ChildDef_Required('li')
|
|
);
|
|
|
|
$this->info['dl'] =
|
|
new HTMLDTD_Element(
|
|
new HTMLDTD_ChildDef_Optional('dt|dd')
|
|
);
|
|
$this->info['address'] =
|
|
new HTMLDTD_Element(
|
|
new HTMLDTD_ChildDef_Optional("#PCDATA | p | $e_inline".
|
|
" | $e_misc_inline")
|
|
);
|
|
|
|
$this->info['img'] =
|
|
$this->info['br'] =
|
|
$this->info['hr'] = new HTMLDTD_Element(new HTMLDTD_ChildDef_Empty());
|
|
|
|
$this->info['pre'] = new HTMLDTD_Element($e_pre_content);
|
|
|
|
$this->info['a'] = new HTMLDTD_Element($e_a_content);
|
|
|
|
}
|
|
|
|
function purifyTokens($tokens) {
|
|
if (empty($this->info)) $this->loadData();
|
|
$tokens = $this->removeForeignElements($tokens);
|
|
$tokens = $this->makeWellFormed($tokens);
|
|
$tokens = $this->fixNesting($tokens);
|
|
$tokens = $this->validateAttributes($tokens);
|
|
return $tokens;
|
|
}
|
|
|
|
function removeForeignElements($tokens) {
|
|
if (empty($this->info)) $this->loadData();
|
|
$result = array();
|
|
foreach($tokens as $token) {
|
|
if (is_subclass_of($token, 'MF_Tag')) {
|
|
if (!isset($this->info[$token->name])) {
|
|
// invalid tag, generate HTML and insert in
|
|
$token = new MF_Text($this->generator->generateFromToken($token));
|
|
}
|
|
} elseif (is_a($token, 'MF_Comment')) {
|
|
// strip comments
|
|
continue;
|
|
} elseif (is_a($token, 'MF_Text')) {
|
|
} else {
|
|
continue;
|
|
}
|
|
$result[] = $token;
|
|
}
|
|
return $result;
|
|
}
|
|
|
|
function makeWellFormed($tokens) {
|
|
if (empty($this->info)) $this->loadData();
|
|
$result = array();
|
|
$current_nesting = array();
|
|
foreach ($tokens as $token) {
|
|
if (!is_subclass_of($token, 'MF_Tag')) {
|
|
$result[] = $token;
|
|
continue;
|
|
}
|
|
$info = $this->info[$token->name]; // assumption but valid
|
|
|
|
// test if it claims to be a start tag but is empty
|
|
if (is_a($info->child_def, 'HTMLDTD_ChildDef_Empty') &&
|
|
is_a($token, 'MF_StartTag') ) {
|
|
|
|
$result[] = new MF_EmptyTag($token->name, $token->attributes);
|
|
continue;
|
|
}
|
|
|
|
// test if it claims to be empty but really is a start tag
|
|
if (!is_a($info->child_def, 'HTMLDTD_ChildDef_Empty') &&
|
|
is_a($token, 'MF_EmptyTag') ) {
|
|
|
|
$result[] = new MF_StartTag($token->name, $token->attributes);
|
|
$result[] = new MF_EndTag($token->name);
|
|
|
|
continue;
|
|
}
|
|
|
|
// automatically insert empty tags
|
|
if (is_a($token, 'MF_EmptyTag')) {
|
|
$result[] = $token;
|
|
continue;
|
|
}
|
|
|
|
// we give start tags precedence, so automatically accept unless...
|
|
// it's one of those special cases
|
|
if (is_a($token, 'MF_StartTag')) {
|
|
|
|
// if there's a parent, check for special case
|
|
if (!empty($current_nesting)) {
|
|
$current_parent = array_pop($current_nesting);
|
|
|
|
// check if we're closing a P tag
|
|
if ($current_parent->name == 'p' &&
|
|
isset($this->info_closes_p[$token->name])
|
|
) {
|
|
$result[] = new MF_EndTag('p');
|
|
$result[] = $token;
|
|
$current_nesting[] = $token;
|
|
continue;
|
|
}
|
|
|
|
// check if we're closing a LI tag
|
|
if ($current_parent->name == 'li' &&
|
|
$token->name == 'li'
|
|
) {
|
|
$result[] = new MF_EndTag('li');
|
|
$result[] = $token;
|
|
$current_nesting[] = $token;
|
|
continue;
|
|
}
|
|
|
|
// this is more TIDY stuff
|
|
// we should also get some TABLE related code
|
|
// mismatched h#
|
|
|
|
$current_nesting[] = $current_parent; // undo the pop
|
|
}
|
|
|
|
$result[] = $token;
|
|
$current_nesting[] = $token;
|
|
continue;
|
|
}
|
|
|
|
// sanity check
|
|
if (!is_a($token, 'MF_EndTag')) continue;
|
|
|
|
// okay, we're dealing with a closing tag
|
|
|
|
// make sure that we have something open
|
|
if (empty($current_nesting)) {
|
|
$result[] = new MF_Text($this->generator->generateFromToken($token));
|
|
continue;
|
|
}
|
|
|
|
// first, check for the simplest case: everything closes neatly
|
|
|
|
// current_nesting is modified
|
|
$current_parent = array_pop($current_nesting);
|
|
if ($current_parent->name == $token->name) {
|
|
$result[] = $token;
|
|
continue;
|
|
}
|
|
|
|
// undo the array_pop
|
|
$current_nesting[] = $current_parent;
|
|
|
|
// okay, so we're trying to close the wrong tag
|
|
|
|
// scroll back the entire nest, trying to find our tag
|
|
// feature could be to specify how far you'd like to go
|
|
$size = count($current_nesting);
|
|
// -2 because -1 is the last element, but we already checked that
|
|
$skipped_tags = false;
|
|
for ($i = $size - 2; $i >= 0; $i--) {
|
|
if ($current_nesting[$i]->name == $token->name) {
|
|
// current nesting is modified
|
|
$skipped_tags = array_splice($current_nesting, $i);
|
|
break;
|
|
}
|
|
}
|
|
|
|
// we still didn't find the tag, so translate to text
|
|
if ($skipped_tags === false) {
|
|
$result[] = new MF_Text($this->generator->generateFromToken($token));
|
|
continue;
|
|
}
|
|
|
|
// okay, we found it, close all the skipped tags
|
|
// note that skipped tags contains the element we need closed
|
|
$size = count($skipped_tags);
|
|
for ($i = $size - 1; $i >= 0; $i--) {
|
|
$result[] = new MF_EndTag($skipped_tags[$i]->name);
|
|
}
|
|
|
|
// done!
|
|
|
|
}
|
|
|
|
// we're at the end now, fix all still unclosed tags
|
|
|
|
if (!empty($current_nesting)) {
|
|
$size = count($current_nesting);
|
|
for ($i = $size - 1; $i >= 0; $i--) {
|
|
$result[] = new MF_EndTag($current_nesting[$i]->name);
|
|
}
|
|
}
|
|
|
|
return $result;
|
|
}
|
|
|
|
function fixNesting($tokens) {
|
|
if (empty($this->info)) $this->loadData();
|
|
|
|
}
|
|
|
|
function validateAttributes($tokens) {
|
|
if (empty($this->info)) $this->loadData();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
class HTMLDTD_Element
|
|
{
|
|
|
|
var $child_def;
|
|
var $attr_def = array();
|
|
|
|
function HTMLDTD_Element($child_def, $attr_def = array()) {
|
|
$this->child_def = $child_def;
|
|
$this->attr_def = $attr_def;
|
|
}
|
|
|
|
}
|
|
|
|
// HTMLDTD_ChildDef and inheritance have three types of output:
|
|
// true = leave nodes as is
|
|
// false = delete parent node and all children
|
|
// array(...) = replace children nodes with these
|
|
class HTMLDTD_ChildDef
|
|
{
|
|
var $dtd_regex;
|
|
function HTMLDTD_ChildDef($dtd_regex) {
|
|
$this->dtd_regex = $dtd_regex;
|
|
}
|
|
function validateChildren($tokens_of_children) {}
|
|
}
|
|
class HTMLDTD_ChildDef_Simple extends HTMLDTD_ChildDef
|
|
{
|
|
var $elements = array();
|
|
function HTMLDTD_ChildDef_Simple($elements) {
|
|
if (is_string($elements)) {
|
|
$elements = str_replace(' ', '', $elements);
|
|
$elements = explode('|', $elements);
|
|
}
|
|
$elements = array_flip($elements);
|
|
foreach ($elements as $i => $x) $elements[$i] = true;
|
|
$this->elements = $elements;
|
|
$this->gen = new HTML_Generator();
|
|
}
|
|
}
|
|
class HTMLDTD_ChildDef_Required extends HTMLDTD_ChildDef_Simple
|
|
{
|
|
function validateChildren($tokens_of_children) {
|
|
// if there are no tokens, delete parent node
|
|
if (empty($tokens_of_children)) return false;
|
|
|
|
// the new set of children
|
|
$result = array();
|
|
|
|
// current depth into the nest
|
|
$nesting = 0;
|
|
|
|
// whether or not we're deleting a node
|
|
$is_deleting = false;
|
|
|
|
// whether or not parsed character data is allowed
|
|
// this controls whether or not we silently drop a tag
|
|
// or generate escaped HTML from it
|
|
$pcdata_allowed = isset($this->elements['#PCDATA']);
|
|
|
|
// a little sanity check to make sure it's not ALL whitespace
|
|
$all_whitespace = true;
|
|
|
|
foreach ($tokens_of_children as $token) {
|
|
if (!empty($token->is_whitespace)) {
|
|
$result[] = $token;
|
|
continue;
|
|
}
|
|
$all_whitespace = false; // phew, we're not talking about whitespace
|
|
|
|
$is_child = ($nesting == 0);
|
|
|
|
if (is_a($token, 'MF_StartTag')) {
|
|
$nesting++;
|
|
} elseif (is_a($token, 'MF_EndTag')) {
|
|
$nesting--;
|
|
}
|
|
|
|
if ($is_child) {
|
|
$is_deleting = false;
|
|
if (!isset($this->elements[$token->name])) {
|
|
$is_deleting = true;
|
|
if ($pcdata_allowed) {
|
|
$result[] = new MF_Text($this->gen->generateFromToken($token));
|
|
}
|
|
continue;
|
|
}
|
|
}
|
|
if (!$is_deleting) {
|
|
$result[] = $token;
|
|
} elseif ($pcdata_allowed) {
|
|
$result[] = new MF_Text($this->gen->generateFromToken($token));
|
|
} else {
|
|
// drop silently
|
|
}
|
|
}
|
|
if (empty($result)) return false;
|
|
if ($all_whitespace) return false;
|
|
if ($tokens_of_children == $result) return true;
|
|
return $result;
|
|
}
|
|
}
|
|
class HTMLDTD_ChildDef_Optional extends HTMLDTD_ChildDef_Simple
|
|
{
|
|
function validateChildren($tokens_of_children) {
|
|
|
|
}
|
|
}
|
|
class HTMLDTD_ChildDef_Empty extends HTMLDTD_ChildDef
|
|
{
|
|
function HTMLDTD_ChildDef_Empty() {}
|
|
}
|
|
|
|
class HTMLDTD_AttrDef
|
|
{
|
|
var $def;
|
|
function HTMLDTD_AttrDef($def) {
|
|
$this->def = $def;
|
|
}
|
|
}
|
|
|
|
?>
|