diff --git a/HTML_Lexer.php b/HTML_Lexer.php
index cad7d4cf..6ae7c491 100644
--- a/HTML_Lexer.php
+++ b/HTML_Lexer.php
@@ -75,7 +75,7 @@ class HTML_Lexer
if (!$inside_tag && $position_next_lt !== false) {
// We are not inside tag and there still is another tag to parse
- $array[] = new MF_Text(substr($string, $cursor, $position_next_lt - $cursor));
+ $array[] = new MF_Text(html_entity_decode(substr($string, $cursor, $position_next_lt - $cursor)));
$cursor = $position_next_lt + 1;
$inside_tag = true;
continue;
@@ -84,7 +84,7 @@ class HTML_Lexer
// If we're already at the end, break
if ($cursor === strlen($string)) break;
// Create Text of rest of string
- $array[] = new MF_Text(substr($string, $cursor));
+ $array[] = new MF_Text(html_entity_decode(substr($string, $cursor)));
break;
} elseif ($inside_tag && $position_next_gt !== false) {
// We are in tag and it is well formed
@@ -144,7 +144,7 @@ class HTML_Lexer
$inside_tag = false;
continue;
} else {
- $array[] = new MF_Text('<' . substr($string, $cursor));
+ $array[] = new MF_Text('<' . html_entity_decode(substr($string, $cursor)));
break;
}
break;
@@ -234,7 +234,7 @@ class HTML_Lexer
$value = substr($string, $position_next_quote + 1,
$position_end_quote - $position_next_quote - 1);
if ($key) {
- $array[$key] = $value;
+ $array[$key] = html_entity_decode($value);
}
$cursor = $position_end_quote + 1;
} else {
@@ -268,6 +268,7 @@ class HTML_Lexer_Sax extends HTML_Lexer
$parser->set_element_handler('openHandler','closeHandler');
$parser->set_data_handler('dataHandler');
$parser->set_escape_handler('escapeHandler');
+ $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
$parser->parse($html);
return $this->tokens;
}
diff --git a/PureHTMLDefinition.php b/PureHTMLDefinition.php
new file mode 100644
index 00000000..0ecc201f
--- /dev/null
+++ b/PureHTMLDefinition.php
@@ -0,0 +1,169 @@
+info['ins'] =
+ $this->info['del'] =
+ $this->info['blockquote'] =
+ $this->info['dd'] =
+ $this->info['div'] = array($entity['Flow']);
+
+ $this->info['em'] =
+ $this->info['strong'] =
+ $this->info['dfn'] =
+ $this->info['code'] =
+ $this->info['samp'] =
+ $this->info['kbd'] =
+ $this->info['var'] =
+ $this->info['code'] =
+ $this->info['samp'] =
+ $this->info['kbd'] =
+ $this->info['var'] =
+ $this->info['cite'] =
+ $this->info['abbr'] =
+ $this->info['acronym'] =
+ $this->info['q'] =
+ $this->info['sub'] =
+ $this->info['tt'] =
+ $this->info['sup'] =
+ $this->info['i'] =
+ $this->info['b'] =
+ $this->info['big'] =
+ $this->info['small'] =
+ $this->info['u'] =
+ $this->info['s'] =
+ $this->info['strike'] =
+ $this->info['bdo'] =
+ $this->info['span'] =
+ $this->info['dt'] =
+ $this->info['p'] =
+ $this->info['h1'] =
+ $this->info['h2'] =
+ $this->info['h3'] =
+ $this->info['h4'] =
+ $this->info['h5'] =
+ $this->info['h6'] = array($entity['Inline']);
+
+ $this->info['ol'] =
+ $this->info['ul'] = array(array('li'),array(),'+');
+ // the plus requires at least one child. I don't know what the
+ // empty array is for though
+
+ $this->info['dl'] = array(array('dt','dd'));
+ $this->info['address'] =
+ array(
+ array_merge(
+ array('#PCDATA', 'p'),
+ $entity['inline'],
+ $entity['misc.inline']));
+
+ $this->info['img'] =
+ $this->info['br'] =
+ $this->info['hr'] = 'EMPTY';
+
+ $this->info['pre'] = array($entity['pre.content']);
+
+ $this->info['a'] = array($entity['a.content']);
+ }
+
+ function purifyTokens($tokens) { // feeds the token list through every pass below, in order
+     if (empty($this->info)) {
+         $this->loadData(); // populate $this->info on first use
+     }
+     $passes = array('removeForeignElements', 'makeWellFormed', 'fixNesting', 'validateAttributes');
+     foreach ($passes as $pass) $tokens = $this->$pass($tokens); // each pass receives the previous pass's return value
+     return $tokens;
+ }
+
+ function removeForeignElements($tokens) { // returns a new list holding only the tokens that pass the checks below
+     if (empty($this->info)) {
+         $this->loadData(); // populate $this->info on first use
+     }
+     $kept = array();
+     foreach ($tokens as $item) {
+         if (is_subclass_of($item, 'MF_Tag')) {
+             // tag tokens survive only when their name is a key of $this->info
+             $keep = isset($this->info[$item->name]);
+         } else {
+             // comments are stripped; of everything else only MF_Text is retained
+             $keep = !is_a($item, 'MF_Comment') && is_a($item, 'MF_Text');
+         }
+         if ($keep) $kept[] = $item;
+     }
+     return $kept;
+ }
+
+ function makeWellFormed($tokens) { // stub: currently only makes sure $this->info is loaded
+     if (empty($this->info)) { $this->loadData(); }
+     // NOTE(review): no return statement yet, so purifyTokens() receives null from this pass
+ }
+
+ function fixNesting($tokens) { // stub: currently only makes sure $this->info is loaded
+     if (empty($this->info)) { $this->loadData(); }
+     // NOTE(review): no return statement yet, so purifyTokens() receives null from this pass
+ }
+
+ function validateAttributes($tokens) { // stub: currently only makes sure $this->info is loaded
+     if (empty($this->info)) { $this->loadData(); }
+     // NOTE(review): no return statement yet, so purifyTokens() ends up returning null
+ }
+
+}
+
+?>
\ No newline at end of file
diff --git a/tester.php b/tester.php
index 2b219341..f4aa2848 100644
--- a/tester.php
+++ b/tester.php
@@ -7,6 +7,7 @@ require_once 'XML/HTMLSax3.php'; // optional PEAR class
require_once 'HTML_Purifier.php';
require_once 'HTML_Lexer.php';
require_once 'MarkupFragment.php';
+require_once 'PureHTMLDefinition.php';
$test = new GroupTest('HTML_Purifier');
@@ -14,6 +15,7 @@ chdir('tests/');
$test->addTestFile('HTML_Purifier.php');
$test->addTestFile('HTML_Lexer.php');
//$test->addTestFile('MarkupFragment.php');
+$test->addTestFile('PureHTMLDefinition.php');
chdir('../');
$test->run(new HtmlReporter());
diff --git a/tests/HTML_Lexer.php b/tests/HTML_Lexer.php
index 9263bd3e..eb9f68f9 100644
--- a/tests/HTML_Lexer.php
+++ b/tests/HTML_Lexer.php
@@ -107,7 +107,12 @@ class TestCase_HTML_Lexer extends UnitTestCase
$input[9] = '<b>';
$expect[9] = array(
- new MF_Text('<b>')
+ new MF_Text('')
+ );
+ $sax_expect[9] = array(
+ new MF_Text('<')
+ ,new MF_Text('b')
+ ,new MF_Text('>')
);
// however, we may want to change both styles
// into parsed: ''. SAX has an option for this
diff --git a/tests/PureHTMLDefinition.php b/tests/PureHTMLDefinition.php
new file mode 100644
index 00000000..f325c889
--- /dev/null
+++ b/tests/PureHTMLDefinition.php
@@ -0,0 +1,41 @@
+UnitTestCase();
+ $this->def = new PureHTMLDefinition();
+ $this->def->loadData();
+ }
+
+ function test_removeForeignElements() {
+
+     // case 0: an empty token list must come back empty
+     $empty = array();
+
+     // case 1: text wrapped around a b element must come back untouched
+     $bold = array(
+         new MF_Text('This is '),
+         new MF_StartTag('b', array()),
+         new MF_Text('bold'),
+         new MF_EndTag('b'),
+         new MF_Text(' text')
+     );
+
+     $inputs = array($empty, $bold);
+     $expect = $inputs; // every case expects its input back verbatim
+
+     for ($i = 0; $i < count($inputs); $i++) {
+         $result = $this->def->removeForeignElements($inputs[$i]);
+         $this->assertEqual($result, $expect[$i]);
+         paintIf($result, $result != $expect[$i]); // presumably prints $result when the comparison fails - confirm against paintIf's definition
+     }
+
+ }
+
+}
+
+?>
\ No newline at end of file