true, 'blockquote' => true, 'dd' => true, 'dir' => true, 'div' => true, 'dl' => true, 'dt' => true, 'h1' => true, 'h2' => true, 'h3' => true, 'h4' => true, 'h5' => true, 'h6' => true, 'hr' => true, 'ol' => true, 'p' => true, 'pre' => true, 'table' => true, 'ul' => true ); function PureHTMLDefinition() { $this->generator = new HTML_Generator(); } function loadData() { // emulates the structure of the DTD // entities: prefixed with e_ and _ replaces . // we don't use an array because that complicates interpolation // strings are used instead of arrays because if you use arrays, // you have to do some hideous manipulation with array_merge() // these are condensed, remember, with bad stuff taken out // transforms: font, menu, dir, center // DON'T MONKEY AROUND THIS unless you know what you are doing // and also know the assumptions the code makes about what this // contains for optimization purposes (see fixNesting) $e_special_extra = 'img'; $e_special_basic = 'br | span | bdo'; $e_special = "$e_special_basic | $e_special_extra"; $e_fontstyle_extra = 'big | small'; $e_fontstyle_basic = 'tt | i | b | u | s | strike'; $e_fontstyle = "$e_fontstyle_basic | $e_fontstyle_extra"; $e_phrase_extra = 'sub | sup'; $e_phrase_basic = 'em | strong | dfn | code | q | samp | kbd | var'. ' | cite | abbr | acronym'; $e_phrase = "$e_phrase_basic | $e_phrase_extra"; $e_inline_forms = ''; // humor the dtd $e_misc_inline = 'ins | del'; $e_misc = "$e_misc_inline"; $e_inline = "a | $e_special | $e_fontstyle | $e_phrase". " | $e_inline_forms"; // note the casing $e_Inline = new HTMLDTD_ChildDef_Optional("#PCDATA | $e_inline". " | $e_misc_inline"); $e_heading = 'h1|h2|h3|h4|h5|h6'; $e_lists = 'ul | ol | dl'; $e_blocktext = 'pre | hr | blockquote | address'; $e_block = "p | $e_heading | div | $e_lists | $e_blocktext | table"; $e_Flow = new HTMLDTD_ChildDef_Optional("#PCDATA | $e_block". " | $e_inline | $e_misc"); $e_a_content = new HTMLDTD_ChildDef_Optional("#PCDATA | $e_special". " | $e_fontstyle | $e_phrase | $e_inline_forms | $e_misc_inline"); $e_pre_content = new HTMLDTD_ChildDef_Optional("#PCDATA | a". " | $e_special_basic | $e_fontstyle_basic | $e_phrase_basic". " | $e_inline_forms | $e_misc_inline"); $e_form_content = new HTMLDTD_ChildDef_Optional(''); //unused $e_form_button_content = new HTMLDTD_ChildDef_Optional(''); // unused $this->info['ins'] = $this->info['del'] = $this->info['blockquote'] = $this->info['dd'] = $this->info['li'] = $this->info['div'] = new HTMLDTD_Element($e_Flow); $this->info['em'] = $this->info['strong'] = $this->info['dfn'] = $this->info['code'] = $this->info['samp'] = $this->info['kbd'] = $this->info['var'] = $this->info['code'] = $this->info['samp'] = $this->info['kbd'] = $this->info['var'] = $this->info['cite'] = $this->info['abbr'] = $this->info['acronym'] = $this->info['q'] = $this->info['sub'] = $this->info['tt'] = $this->info['sup'] = $this->info['i'] = $this->info['b'] = $this->info['big'] = $this->info['small'] = $this->info['u'] = $this->info['s'] = $this->info['strike'] = $this->info['bdo'] = $this->info['span'] = $this->info['dt'] = $this->info['p'] = $this->info['h1'] = $this->info['h2'] = $this->info['h3'] = $this->info['h4'] = $this->info['h5'] = $this->info['h6'] = new HTMLDTD_Element($e_Inline); $this->info['ol'] = $this->info['ul'] = new HTMLDTD_Element( new HTMLDTD_ChildDef_Required('li') ); $this->info['dl'] = new HTMLDTD_Element( new HTMLDTD_ChildDef_Required('dt|dd') ); $this->info['address'] = new HTMLDTD_Element( new HTMLDTD_ChildDef_Optional("#PCDATA | p | $e_inline". " | $e_misc_inline") ); $this->info['img'] = $this->info['br'] = $this->info['hr'] = new HTMLDTD_Element(new HTMLDTD_ChildDef_Empty()); $this->info['pre'] = new HTMLDTD_Element($e_pre_content); $this->info['a'] = new HTMLDTD_Element($e_a_content); } function purifyTokens($tokens) { if (empty($this->info)) $this->loadData(); $tokens = $this->removeForeignElements($tokens); $tokens = $this->makeWellFormed($tokens); $tokens = $this->fixNesting($tokens); $tokens = $this->validateAttributes($tokens); return $tokens; } function removeForeignElements($tokens) { if (empty($this->info)) $this->loadData(); $result = array(); foreach($tokens as $token) { if (!empty( $token->is_tag )) { if (!isset($this->info[$token->name])) { // invalid tag, generate HTML and insert in $token = new HTMLPurifier_Token_Text( $this->generator->generateFromToken($token) ); } } elseif ($token->type == 'comment') { // strip comments continue; } elseif ($token->type == 'text') { } else { continue; } $result[] = $token; } return $result; } function makeWellFormed($tokens) { if (empty($this->info)) $this->loadData(); $result = array(); $current_nesting = array(); foreach ($tokens as $token) { if (empty( $token->is_tag )) { $result[] = $token; continue; } $info = $this->info[$token->name]; // assumption but valid // test if it claims to be a start tag but is empty if ($info->child_def->type == 'empty' && $token->type == 'start' ) { $result[] = new HTMLPurifier_Token_Empty($token->name, $token->attributes); continue; } // test if it claims to be empty but really is a start tag if ($info->child_def->type != 'empty' && $token->type == 'empty' ) { $result[] = new HTMLPurifier_Token_Start($token->name, $token->attributes); $result[] = new HTMLPurifier_Token_End($token->name); continue; } // automatically insert empty tags if ($token->type == 'empty') { $result[] = $token; continue; } // we give start tags precedence, so automatically accept unless... // it's one of those special cases if ($token->type == 'start') { // if there's a parent, check for special case if (!empty($current_nesting)) { $current_parent = array_pop($current_nesting); // check if we're closing a P tag if ($current_parent->name == 'p' && isset($this->info_closes_p[$token->name]) ) { $result[] = new HTMLPurifier_Token_End('p'); $result[] = $token; $current_nesting[] = $token; continue; } // check if we're closing a LI tag if ($current_parent->name == 'li' && $token->name == 'li' ) { $result[] = new HTMLPurifier_Token_End('li'); $result[] = $token; $current_nesting[] = $token; continue; } // this is more TIDY stuff // we should also get some TABLE related code // mismatched h# $current_nesting[] = $current_parent; // undo the pop } $result[] = $token; $current_nesting[] = $token; continue; } // sanity check if ($token->type != 'end') continue; // okay, we're dealing with a closing tag // make sure that we have something open if (empty($current_nesting)) { $result[] = new HTMLPurifier_Token_Text( $this->generator->generateFromToken($token) ); continue; } // first, check for the simplest case: everything closes neatly // current_nesting is modified $current_parent = array_pop($current_nesting); if ($current_parent->name == $token->name) { $result[] = $token; continue; } // undo the array_pop $current_nesting[] = $current_parent; // okay, so we're trying to close the wrong tag // scroll back the entire nest, trying to find our tag // feature could be to specify how far you'd like to go $size = count($current_nesting); // -2 because -1 is the last element, but we already checked that $skipped_tags = false; for ($i = $size - 2; $i >= 0; $i--) { if ($current_nesting[$i]->name == $token->name) { // current nesting is modified $skipped_tags = array_splice($current_nesting, $i); break; } } // we still didn't find the tag, so translate to text if ($skipped_tags === false) { $result[] = new HTMLPurifier_Token_Text( $this->generator->generateFromToken($token) ); continue; } // okay, we found it, close all the skipped tags // note that skipped tags contains the element we need closed $size = count($skipped_tags); for ($i = $size - 1; $i >= 0; $i--) { $result[] = new HTMLPurifier_Token_End($skipped_tags[$i]->name); } // done! } // we're at the end now, fix all still unclosed tags if (!empty($current_nesting)) { $size = count($current_nesting); for ($i = $size - 1; $i >= 0; $i--) { $result[] = new HTMLPurifier_Token_End($current_nesting[$i]->name); } } return $result; } function fixNesting($tokens) { if (empty($this->info)) $this->loadData(); // insert implicit "parent" node, will be removed at end array_unshift($tokens, new HTMLPurifier_Token_Start('div')); $tokens[] = new HTMLPurifier_Token_End('div'); for ($i = 0, $size = count($tokens) ; $i < $size; ) { $child_tokens = array(); // scroll to the end of this node, and report number for ($j = $i, $depth = 0; ; $j++) { if ($tokens[$j]->type == 'start') { $depth++; // skip token assignment on first iteration if ($depth == 1) continue; } elseif ($tokens[$j]->type == 'end') { $depth--; // skip token assignment on last iteration if ($depth == 0) break; } $child_tokens[] = $tokens[$j]; } // $i is index of start token // $j is index of end token // have DTD child def validate children $element_def = $this->info[$tokens[$i]->name]; $result = $element_def->child_def->validateChildren($child_tokens); // process result if ($result === true) { // leave the nodes as is } elseif($result === false) { // WARNING WARNING WARNING!!! // While for the original DTD, there will never be // cascading removal, more complex ones may have such // a problem. // If you modify the info array such that an element // that requires children may contain a child that requires // children, you need to also scroll back and re-check that // elements parent node $length = $j - $i + 1; // remove entire node array_splice($tokens, $i, $length); // change size $size -= $length; // ensure that we scroll to the next node $i--; } else { $length = $j - $i - 1; // replace node with $result array_splice($tokens, $i + 1, $length, $result); // change size $size -= $length; $size += count($result); } // scroll to next node $i++; while ($i < $size and $tokens[$i]->type != 'start') $i++; } // remove implicit divs array_shift($tokens); array_pop($tokens); return $tokens; } function validateAttributes($tokens) { if (empty($this->info)) $this->loadData(); } } class HTMLDTD_Element { var $child_def; var $attr_def = array(); function HTMLDTD_Element($child_def, $attr_def = array()) { $this->child_def = $child_def; $this->attr_def = $attr_def; } } // HTMLDTD_ChildDef and inheritance have three types of output: // true = leave nodes as is // false = delete parent node and all children // array(...) = replace children nodes with these // this is the hardest one to implement. We'll use fancy regexp tricks // right now, we only expect it to return TRUE or FALSE (it won't attempt // to fix the tree) // we may end up writing custom code for each HTML case // in order to make it self correcting class HTMLDTD_ChildDef { var $type = 'custom'; var $dtd_regex; var $_pcre_regex; function HTMLDTD_ChildDef($dtd_regex) { $this->dtd_regex = $dtd_regex; $this->_compileRegex(); } function _compileRegex() { $raw = str_replace(' ', '', $this->dtd_regex); if ($raw{0} != '(') { $raw = "($raw)"; } $reg = str_replace(',', ',?', $raw); $reg = preg_replace('/([#a-zA-Z0-9_.-]+)/', '(,?\\0)', $reg); $this->_pcre_regex = $reg; } function validateChildren($tokens_of_children) { $list_of_children = ''; $nesting = 0; // depth into the nest foreach ($tokens_of_children as $token) { if (!empty($token->is_whitespace)) continue; $is_child = ($nesting == 0); // direct if ($token->type == 'start') { $nesting++; } elseif ($token->type == 'end') { $nesting--; } if ($is_child) { $list_of_children .= $token->name . ','; } } $list_of_children = rtrim($list_of_children, ','); $okay = preg_match( '/^'.$this->_pcre_regex.'$/', $list_of_children ); return (bool) $okay; } } class HTMLDTD_ChildDef_Simple extends HTMLDTD_ChildDef { var $elements = array(); function HTMLDTD_ChildDef_Simple($elements) { if (is_string($elements)) { $elements = str_replace(' ', '', $elements); $elements = explode('|', $elements); } $elements = array_flip($elements); foreach ($elements as $i => $x) $elements[$i] = true; $this->elements = $elements; $this->gen = new HTML_Generator(); } function validateChildren() { trigger_error('Cannot call abstract function!', E_USER_ERROR); } } class HTMLDTD_ChildDef_Required extends HTMLDTD_ChildDef_Simple { var $type = 'required'; function validateChildren($tokens_of_children) { // if there are no tokens, delete parent node if (empty($tokens_of_children)) return false; // the new set of children $result = array(); // current depth into the nest $nesting = 0; // whether or not we're deleting a node $is_deleting = false; // whether or not parsed character data is allowed // this controls whether or not we silently drop a tag // or generate escaped HTML from it $pcdata_allowed = isset($this->elements['#PCDATA']); // a little sanity check to make sure it's not ALL whitespace $all_whitespace = true; foreach ($tokens_of_children as $token) { if (!empty($token->is_whitespace)) { $result[] = $token; continue; } $all_whitespace = false; // phew, we're not talking about whitespace $is_child = ($nesting == 0); if ($token->type == 'start') { $nesting++; } elseif ($token->type == 'end') { $nesting--; } if ($is_child) { $is_deleting = false; if (!isset($this->elements[$token->name])) { $is_deleting = true; if ($pcdata_allowed) { $result[] = new HTMLPurifier_Token_Text( $this->gen->generateFromToken($token) ); } continue; } } if (!$is_deleting) { $result[] = $token; } elseif ($pcdata_allowed) { $result[] = new HTMLPurifier_Token_Text( $this->gen->generateFromToken( $token ) ); } else { // drop silently } } if (empty($result)) return false; if ($all_whitespace) return false; if ($tokens_of_children == $result) return true; return $result; } } // only altered behavior is that it returns an empty array // instead of a false (to delete the node) class HTMLDTD_ChildDef_Optional extends HTMLDTD_ChildDef_Required { var $type = 'optional'; function validateChildren($tokens_of_children) { $result = parent::validateChildren($tokens_of_children); if ($result === false) return array(); return $result; } } // placeholder class HTMLDTD_ChildDef_Empty extends HTMLDTD_ChildDef { var $type = 'empty'; function HTMLDTD_ChildDef_Empty() {} function validateChildren() { return false; } } class HTMLDTD_AttrDef { var $def; function HTMLDTD_AttrDef($def) { $this->def = $def; } } ?>