diff --git a/HTML_Lexer.php b/HTML_Lexer.php index cad7d4cf..6ae7c491 100644 --- a/HTML_Lexer.php +++ b/HTML_Lexer.php @@ -75,7 +75,7 @@ class HTML_Lexer if (!$inside_tag && $position_next_lt !== false) { // We are not inside tag and there still is another tag to parse - $array[] = new MF_Text(substr($string, $cursor, $position_next_lt - $cursor)); + $array[] = new MF_Text(html_entity_decode(substr($string, $cursor, $position_next_lt - $cursor))); $cursor = $position_next_lt + 1; $inside_tag = true; continue; @@ -84,7 +84,7 @@ class HTML_Lexer // If we're already at the end, break if ($cursor === strlen($string)) break; // Create Text of rest of string - $array[] = new MF_Text(substr($string, $cursor)); + $array[] = new MF_Text(html_entity_decode(substr($string, $cursor))); break; } elseif ($inside_tag && $position_next_gt !== false) { // We are in tag and it is well formed @@ -144,7 +144,7 @@ class HTML_Lexer $inside_tag = false; continue; } else { - $array[] = new MF_Text('<' . substr($string, $cursor)); + $array[] = new MF_Text('<' . 
html_entity_decode(substr($string, $cursor))); break; } break; @@ -234,7 +234,7 @@ class HTML_Lexer $value = substr($string, $position_next_quote + 1, $position_end_quote - $position_next_quote - 1); if ($key) { - $array[$key] = $value; + $array[$key] = html_entity_decode($value); } $cursor = $position_end_quote + 1; } else { @@ -268,6 +268,7 @@ class HTML_Lexer_Sax extends HTML_Lexer $parser->set_element_handler('openHandler','closeHandler'); $parser->set_data_handler('dataHandler'); $parser->set_escape_handler('escapeHandler'); + $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1); $parser->parse($html); return $this->tokens; } diff --git a/PureHTMLDefinition.php b/PureHTMLDefinition.php new file mode 100644 index 00000000..0ecc201f --- /dev/null +++ b/PureHTMLDefinition.php @@ -0,0 +1,169 @@ +info['ins'] = + $this->info['del'] = + $this->info['blockquote'] = + $this->info['dd'] = + $this->info['div'] = array($entity['Flow']); + + $this->info['em'] = + $this->info['strong'] = + $this->info['dfn'] = + $this->info['code'] = + $this->info['samp'] = + $this->info['kbd'] = + $this->info['var'] = + $this->info['code'] = + $this->info['samp'] = + $this->info['kbd'] = + $this->info['var'] = + $this->info['cite'] = + $this->info['abbr'] = + $this->info['acronym'] = + $this->info['q'] = + $this->info['sub'] = + $this->info['tt'] = + $this->info['sup'] = + $this->info['i'] = + $this->info['b'] = + $this->info['big'] = + $this->info['small'] = + $this->info['u'] = + $this->info['s'] = + $this->info['strike'] = + $this->info['bdo'] = + $this->info['span'] = + $this->info['dt'] = + $this->info['p'] = + $this->info['h1'] = + $this->info['h2'] = + $this->info['h3'] = + $this->info['h4'] = + $this->info['h5'] = + $this->info['h6'] = array($entity['Inline']); + + $this->info['ol'] = + $this->info['ul'] = array(array('li'),array(),'+'); + // the plus requires at least one child. 
I don't know what the
+        // empty array is for though
+
+        $this->info['dl'] = array(array('dt','dd'));
+        $this->info['address'] =
+            array(
+                array_merge(
+                    array('#PCDATA', 'p'),
+                    $entity['inline'],
+                    $entity['misc.inline']));
+
+        $this->info['img'] =
+        $this->info['br'] =
+        $this->info['hr'] = 'EMPTY';
+
+        $this->info['pre'] = array($entity['pre.content']);
+
+        $this->info['a'] = array($entity['a.content']);
+    }
+    // Runs every purification stage in order on $tokens and returns the resulting token list.
+    function purifyTokens($tokens) {
+        if (empty($this->info)) $this->loadData();
+        $tokens = $this->removeForeignElements($tokens);
+        $tokens = $this->makeWellFormed($tokens);
+        $tokens = $this->fixNesting($tokens);
+        $tokens = $this->validateAttributes($tokens);
+        return $tokens;
+    }
+    // Returns a re-indexed copy of $tokens holding only text and tags whose name has an entry in $this->info.
+    function removeForeignElements($tokens) {
+        if (empty($this->info)) $this->loadData(); // lazily build the element table
+        $result = array();
+        foreach($tokens as $token) {
+            if (is_subclass_of($token, 'MF_Tag')) {
+                if (!isset($this->info[$token->name])) continue; // unknown element: drop the tag token
+            } elseif (is_a($token, 'MF_Comment')) {
+                // strip comments
+                continue;
+            } elseif (is_a($token, 'MF_Text')) { // text is always kept
+            } else {
+                continue; // any other token type is dropped
+            }
+            $result[] = $token;
+        }
+        return $result;
+    }
+    // Stage stub, no checks implemented yet; must hand $tokens back or purifyTokens() ends up with null.
+    function makeWellFormed($tokens) {
+        if (empty($this->info)) $this->loadData();
+        return $tokens; // pass-through until the stage is written
+    }
+    // Stage stub, no checks implemented yet; must hand $tokens back or purifyTokens() ends up with null.
+    function fixNesting($tokens) {
+        if (empty($this->info)) $this->loadData();
+        return $tokens; // pass-through until the stage is written
+    }
+    // Stage stub, no checks implemented yet; must hand $tokens back or purifyTokens() ends up with null.
+    function validateAttributes($tokens) {
+        if (empty($this->info)) $this->loadData();
+        return $tokens; // pass-through until the stage is written
+    }
+
+}
+
+?>
\ No newline at end of file
diff --git a/tester.php b/tester.php
index 2b219341..f4aa2848 100644
--- a/tester.php
+++ b/tester.php
@@ -7,6 +7,7 @@ require_once 'XML/HTMLSax3.php'; // optional PEAR class
 require_once 'HTML_Purifier.php';
 require_once 'HTML_Lexer.php';
 require_once 'MarkupFragment.php';
+require_once 'PureHTMLDefinition.php';
 
 $test = new GroupTest('HTML_Purifier');
 
@@ -14,6 +15,7 @@ chdir('tests/');
 $test->addTestFile('HTML_Purifier.php');
 $test->addTestFile('HTML_Lexer.php');
 //$test->addTestFile('MarkupFragment.php');
+$test->addTestFile('PureHTMLDefinition.php');
chdir('../');
 
 $test->run(new HtmlReporter());
diff --git a/tests/HTML_Lexer.php b/tests/HTML_Lexer.php
index 9263bd3e..eb9f68f9 100644
--- a/tests/HTML_Lexer.php
+++ b/tests/HTML_Lexer.php
@@ -107,7 +107,12 @@ class TestCase_HTML_Lexer extends UnitTestCase
 
         $input[9] = '<b>';
         $expect[9] = array(
-            new MF_Text('<b>')
+            new MF_Text('')
+        );
+        $sax_expect[9] = array(
+             new MF_Text('<')
+            ,new MF_Text('b')
+            ,new MF_Text('>')
         );
         // however, we may want to change both styles
         // into parsed: ''.  SAX has an option for this
diff --git a/tests/PureHTMLDefinition.php b/tests/PureHTMLDefinition.php
new file mode 100644
index 00000000..f325c889
--- /dev/null
+++ b/tests/PureHTMLDefinition.php
@@ -0,0 +1,54 @@
+UnitTestCase();
+        $this->def = new PureHTMLDefinition();
+        $this->def->loadData();
+    }
+
+    // removeForeignElements() must keep text and tags the definition knows,
+    // and drop tags that have no entry in the definition.
+    function test_removeForeignElements() {
+
+        $inputs = array();
+        $expect = array();
+
+        // empty token list stays empty
+        $inputs[0] = array();
+        $expect[0] = $inputs[0];
+
+        // only known elements and text: nothing may be removed
+        $inputs[1] = array(
+            new MF_Text('This is ')
+            ,new MF_StartTag('b', array())
+            ,new MF_Text('bold')
+            ,new MF_EndTag('b')
+            ,new MF_Text(' text')
+        );
+        $expect[1] = $inputs[1];
+
+        // a tag without an entry in the definition is dropped, while the
+        // text it wrapped is kept
+        $inputs[2] = array(
+            new MF_StartTag('blink', array())
+            ,new MF_Text('Blinky')
+            ,new MF_EndTag('blink')
+        );
+        $expect[2] = array(
+            new MF_Text('Blinky')
+        );
+
+        foreach ($inputs as $i => $input) {
+            $result = $this->def->removeForeignElements($input);
+            $this->assertEqual($result, $expect[$i]);
+            paintIf($result, $result != $expect[$i]); // dump the result only when it differs
+        }
+
+    }
+
+}
+
+?>
\ No newline at end of file