generator = new HTML_Generator();
    }

    /**
     * Fills $this->info with one HTMLDTD_Element per allowed element name.
     *
     * The content models are written as DTD-like strings ("a | b | c") and
     * wrapped in HTMLDTD_ChildDef_* objects. Nothing is returned; the method
     * only writes to $this->info.
     */
    function loadData() {
        // emulates the structure of the DTD

        // entities: prefixed with e_ and _ replaces .
        // we don't use an array because that complicates interpolation
        // strings are used instead of arrays because if you use arrays,
        // you have to do some hideous manipulation with array_merge()

        // these are condensed, remember, with bad stuff taken out

        // transforms: font, menu, dir, center

        $e_special_extra = 'img';
        $e_special_basic = 'br | span | bdo';
        $e_special = "$e_special_basic | $e_special_extra";

        $e_fontstyle_extra = 'big | small';
        $e_fontstyle_basic = 'tt | i | b | u | s | strike';
        $e_fontstyle = "$e_fontstyle_basic | $e_fontstyle_extra";

        $e_phrase_extra = 'sub | sup';
        $e_phrase_basic = 'em | strong | dfn | code | q | samp | kbd | var'.
          ' | cite | abbr | acronym';
        $e_phrase = "$e_phrase_basic | $e_phrase_extra";

        $e_inline_forms = ''; // humor the dtd

        $e_misc_inline = 'ins | del';
        $e_misc = "$e_misc_inline";

        $e_inline = "a | $e_special | $e_fontstyle | $e_phrase".
          " | $e_inline_forms";

        // note the casing
        $e_Inline = new HTMLDTD_ChildDef_Optional("#PCDATA | $e_inline".
          " | $e_misc_inline");

        $e_heading = 'h1|h2|h3|h4|h5|h6';
        $e_lists = 'ul | ol | dl';
        $e_blocktext = 'pre | hr | blockquote | address';
        $e_block = "p | $e_heading | div | $e_lists | $e_blocktext | table";

        $e_Flow = new HTMLDTD_ChildDef_Optional("#PCDATA | $e_block".
          " | $e_inline | $e_misc");

        $e_a_content = new HTMLDTD_ChildDef_Optional("#PCDATA | $e_special".
          " | $e_fontstyle | $e_phrase | $e_inline_forms | $e_misc_inline");

        $e_pre_content = new HTMLDTD_ChildDef_Optional("#PCDATA | a".
          " | $e_special_basic | $e_fontstyle_basic | $e_phrase_basic".
          " | $e_inline_forms | $e_misc_inline");

        $e_form_content = new HTMLDTD_ChildDef_Optional(''); //unused
        $e_form_button_content = new HTMLDTD_ChildDef_Optional(''); // unused

        // Elements whose content model is Flow. 'li' must be registered
        // here: ul/ol below require 'li' children, and any tag missing from
        // $this->info is converted to text by removeForeignElements().
        $this->info['ins'] =
        $this->info['del'] =
        $this->info['blockquote'] =
        $this->info['dd'] =
        $this->info['li'] =
        $this->info['div'] = new HTMLDTD_Element($e_Flow);

        // Elements whose content model is Inline.
        $this->info['em'] =
        $this->info['strong'] =
        $this->info['dfn'] =
        $this->info['code'] =
        $this->info['samp'] =
        $this->info['kbd'] =
        $this->info['var'] =
        $this->info['cite'] =
        $this->info['abbr'] =
        $this->info['acronym'] =
        $this->info['q'] =
        $this->info['sub'] =
        $this->info['tt'] =
        $this->info['sup'] =
        $this->info['i'] =
        $this->info['b'] =
        $this->info['big'] =
        $this->info['small'] =
        $this->info['u'] =
        $this->info['s'] =
        $this->info['strike'] =
        $this->info['bdo'] =
        $this->info['span'] =
        $this->info['dt'] =
        $this->info['p'] =
        $this->info['h1'] =
        $this->info['h2'] =
        $this->info['h3'] =
        $this->info['h4'] =
        $this->info['h5'] =
        $this->info['h6'] = new HTMLDTD_Element($e_Inline);

        $this->info['ol'] =
        $this->info['ul'] =
          new HTMLDTD_Element(
            new HTMLDTD_ChildDef_Required('li')
          );

        $this->info['dl'] =
          new HTMLDTD_Element(
            new HTMLDTD_ChildDef_Optional('dt|dd')
          );

        $this->info['address'] =
          new HTMLDTD_Element(
            new HTMLDTD_ChildDef_Optional("#PCDATA | p | $e_inline".
              " | $e_misc_inline")
          );

        // Elements that may not have any content.
        $this->info['img'] =
        $this->info['br'] =
        $this->info['hr'] = new HTMLDTD_Element(new HTMLDTD_ChildDef_Empty());

        $this->info['pre'] = new HTMLDTD_Element($e_pre_content);

        $this->info['a'] = new HTMLDTD_Element($e_a_content);
    }

    /**
     * Runs the complete clean-up pipeline over a token list.
     *
     * Stages, in order: removeForeignElements(), makeWellFormed(),
     * fixNesting(), validateAttributes(). Each stage receives the output of
     * the previous one.
     *
     * @param array $tokens List of token objects.
     * @return array The token list after all stages have run.
     */
    function purifyTokens($tokens) {
        if (empty($this->info)) $this->loadData();
        $tokens = $this->removeForeignElements($tokens);
        $tokens = $this->makeWellFormed($tokens);
        $tokens = $this->fixNesting($tokens);
        $tokens = $this->validateAttributes($tokens);
        return $tokens;
    }

    /**
     * Neutralises tags that are not listed in $this->info.
     *
     * - Tag tokens (subclasses of MF_Tag) with an unknown name are replaced
     *   by an MF_Text token holding the generator's output for that tag.
     * - MF_Comment tokens are dropped.
     * - MF_Text tokens are kept as they are.
     * - Tokens of any other kind are dropped.
     *
     * @param array $tokens List of token objects.
     * @return array Filtered list of tokens.
     */
    function removeForeignElements($tokens) {
        if (empty($this->info)) $this->loadData();
        $result = array();
        foreach($tokens as $token) {
            if (is_subclass_of($token, 'MF_Tag')) {
                if (!isset($this->info[$token->name])) {
                    // invalid tag, generate HTML and insert in
                    $token = new MF_Text($this->generator->generateFromToken($token));
                }
            } elseif (is_a($token, 'MF_Comment')) {
                // strip comments
                continue;
            } elseif (is_a($token, 'MF_Text')) {
                // plain text passes through untouched
            } else {
                // unrecognised token type: discard
                continue;
            }
            $result[] = $token;
        }
        return $result;
    }

    /**
     * Balances start and end tags so that every opened element is closed.
     *
     * Expects every tag token's name to be a key of $this->info, which is
     * the case after removeForeignElements() has run.
     *
     * Rules applied, in order, to each tag token:
     * - a start tag for an element with an Empty content model becomes an
     *   MF_EmptyTag;
     * - an empty tag for an element that is not Empty becomes a start tag
     *   immediately followed by an end tag;
     * - remaining empty tags are copied through;
     * - start tags are copied through and pushed on the open-element stack;
     * - an end tag with nothing open, or matching nothing on the stack, is
     *   converted to text via the generator;
     * - an end tag matching the innermost open element closes it;
     * - an end tag matching an outer open element closes every element
     *   opened since, innermost first, and then that element.
     * Elements still open at the end of input are closed, innermost first.
     *
     * @param array $tokens List of token objects.
     * @return array Well-formed list of tokens.
     */
    function makeWellFormed($tokens) {
        if (empty($this->info)) $this->loadData();
        $result = array();
        $current_nesting = array();
        foreach ($tokens as $token) {
            if (!is_subclass_of($token, 'MF_Tag')) {
                $result[] = $token;
                continue;
            }
            $info = $this->info[$token->name]; // assumption but valid

            // test if it claims to be a start tag but is empty
            if (is_a($info->child_def, 'HTMLDTD_ChildDef_Empty') &&
                is_a($token, 'MF_StartTag')
            ) {
                $result[] = new MF_EmptyTag($token->name,
                                            $token->attributes);
                continue;
            }

            // test if it claims to be empty but really is a start tag
            if (!is_a($info->child_def, 'HTMLDTD_ChildDef_Empty') &&
                is_a($token, 'MF_EmptyTag')
            ) {
                $result[] = new MF_StartTag($token->name,
                                            $token->attributes);
                $result[] = new MF_EndTag($token->name);
                continue;
            }

            // automatically insert empty tags
            if (is_a($token, 'MF_EmptyTag')) {
                $result[] = $token;
                continue;
            }

            // we give start tags precedence, so automatically accept
            if (is_a($token, 'MF_StartTag')) {
                $result[] = $token;
                $current_nesting[] = $token;
                continue;
            }

            // sanity check
            if (!is_a($token, 'MF_EndTag')) continue;

            // okay, we're dealing with a closing tag

            // make sure that we have something open
            if (empty($current_nesting)) {
                $result[] = new MF_Text($this->generator->generateFromToken($token));
                continue;
            }

            // first, check for the simplest case: everything closes neatly

            // current_nesting is modified
            $current_parent = array_pop($current_nesting);
            if ($current_parent->name == $token->name) {
                $result[] = $token;
                continue;
            }

            // undo the array_pop
            $current_nesting[] = $current_parent;

            // okay, so we're trying to close the wrong tag

            // scroll back the entire nest, trying to find our tag
            // feature could be to specify how far you'd like to go
            $size = count($current_nesting);
            // -2 because -1 is the last element, but we already checked that
            $skipped_tags = false;
            for ($i = $size - 2; $i >= 0; $i--) {
                if ($current_nesting[$i]->name == $token->name) {
                    // current nesting is modified
                    $skipped_tags = array_splice($current_nesting, $i);
                    break;
                }
            }

            // we still didn't find the tag, so translate to text
            if ($skipped_tags === false) {
                $result[] = new MF_Text($this->generator->generateFromToken($token));
                continue;
            }

            // okay, we found it, close all the skipped tags
            // note that skipped tags contains the element we need closed
            $size = count($skipped_tags);
            for ($i = $size - 1; $i >= 0; $i--) {
                $result[] = new MF_EndTag($skipped_tags[$i]->name);
            }

            // done!
        }

        // we're at the end now, fix all still unclosed tags

        if (!empty($current_nesting)) {
            $size = count($current_nesting);
            for ($i = $size - 1; $i >= 0; $i--) {
                $result[] =
                    new MF_EndTag($current_nesting[$i]->name);
            }
        }

        return $result;
    }

    /**
     * Placeholder for the nesting-correction stage; not implemented yet.
     *
     * The tokens are handed back unchanged so that purifyTokens() keeps a
     * usable token list instead of receiving NULL from this stage.
     *
     * @param array $tokens List of token objects.
     * @return array The same tokens, unmodified.
     */
    function fixNesting($tokens) {
        if (empty($this->info)) $this->loadData();
        return $tokens;
    }

    /**
     * Placeholder for the attribute-validation stage; not implemented yet.
     *
     * The tokens are handed back unchanged so that purifyTokens() returns a
     * usable token list instead of NULL.
     *
     * @param array $tokens List of token objects.
     * @return array The same tokens, unmodified.
     */
    function validateAttributes($tokens) {
        if (empty($this->info)) $this->loadData();
        return $tokens;
    }

}

/**
 * Definition of a single element: its child definition and its attribute
 * definitions.
 */
class HTMLDTD_Element
{

    // child definition object (an HTMLDTD_ChildDef or subclass)
    var $child_def;
    // attribute definitions; defaults to an empty array
    var $attr_def = array();

    function HTMLDTD_Element($child_def, $attr_def = array()) {
        $this->child_def = $child_def;
        $this->attr_def  = $attr_def;
    }

}

/**
 * Base class for child definitions; stores the value given to the
 * constructor in $dtd_regex.
 */
class HTMLDTD_ChildDef
{
    var $dtd_regex;
    function HTMLDTD_ChildDef($dtd_regex) {
        $this->dtd_regex = $dtd_regex;
    }
    // not implemented: empty body, returns nothing
    function validateChildren($tokens_of_children) {}
}

/**
 * Child definition that stores the value given to the constructor in
 * $elements. Callers in this file pass a '|'-separated string of names.
 */
class HTMLDTD_ChildDef_Simple extends HTMLDTD_ChildDef
{
    var $elements = array();
    function HTMLDTD_ChildDef_Simple($elements) {
        $this->elements = $elements;
    }
}

/**
 * Simple child definition used for ul/ol, which are given 'li'.
 */
class HTMLDTD_ChildDef_Required extends HTMLDTD_ChildDef_Simple
{
    // not implemented: empty body, returns nothing
    function validateChildren($tokens_of_children) {
    }
}

/**
 * Simple child definition used for the Flow/Inline style content models.
 */
class HTMLDTD_ChildDef_Optional extends HTMLDTD_ChildDef_Simple
{
    // not implemented: empty body, returns nothing
    function validateChildren($tokens_of_children) {
    }
}

/**
 * Marker child definition for elements without content; makeWellFormed()
 * tests for this class with is_a().
 */
class HTMLDTD_ChildDef_Empty extends HTMLDTD_ChildDef
{
    function HTMLDTD_ChildDef_Empty() {}
}

/**
 * Holder for an attribute definition; stores the value given to the
 * constructor in $def.
 */
class HTMLDTD_AttrDef
{
    var $def;
    function HTMLDTD_AttrDef($def) {
        $this->def = $def;
    }
}

?>