mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-01-08 23:11:52 +00:00
Turn on entity parsing for the Lexers. Add PureHTMLDefinition and define removeForeignElements.
git-svn-id: http://htmlpurifier.org/svnroot/html_purifier/trunk@31 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
parent
e7f5b1674d
commit
c4b23cc775
@ -75,7 +75,7 @@ class HTML_Lexer
|
|||||||
|
|
||||||
if (!$inside_tag && $position_next_lt !== false) {
|
if (!$inside_tag && $position_next_lt !== false) {
|
||||||
// We are not inside tag and there still is another tag to parse
|
// We are not inside tag and there still is another tag to parse
|
||||||
$array[] = new MF_Text(substr($string, $cursor, $position_next_lt - $cursor));
|
$array[] = new MF_Text(html_entity_decode(substr($string, $cursor, $position_next_lt - $cursor)));
|
||||||
$cursor = $position_next_lt + 1;
|
$cursor = $position_next_lt + 1;
|
||||||
$inside_tag = true;
|
$inside_tag = true;
|
||||||
continue;
|
continue;
|
||||||
@ -84,7 +84,7 @@ class HTML_Lexer
|
|||||||
// If we're already at the end, break
|
// If we're already at the end, break
|
||||||
if ($cursor === strlen($string)) break;
|
if ($cursor === strlen($string)) break;
|
||||||
// Create Text of rest of string
|
// Create Text of rest of string
|
||||||
$array[] = new MF_Text(substr($string, $cursor));
|
$array[] = new MF_Text(html_entity_decode(substr($string, $cursor)));
|
||||||
break;
|
break;
|
||||||
} elseif ($inside_tag && $position_next_gt !== false) {
|
} elseif ($inside_tag && $position_next_gt !== false) {
|
||||||
// We are in tag and it is well formed
|
// We are in tag and it is well formed
|
||||||
@ -144,7 +144,7 @@ class HTML_Lexer
|
|||||||
$inside_tag = false;
|
$inside_tag = false;
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
$array[] = new MF_Text('<' . substr($string, $cursor));
|
$array[] = new MF_Text('<' . html_entity_decode(substr($string, $cursor)));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@ -234,7 +234,7 @@ class HTML_Lexer
|
|||||||
$value = substr($string, $position_next_quote + 1,
|
$value = substr($string, $position_next_quote + 1,
|
||||||
$position_end_quote - $position_next_quote - 1);
|
$position_end_quote - $position_next_quote - 1);
|
||||||
if ($key) {
|
if ($key) {
|
||||||
$array[$key] = $value;
|
$array[$key] = html_entity_decode($value);
|
||||||
}
|
}
|
||||||
$cursor = $position_end_quote + 1;
|
$cursor = $position_end_quote + 1;
|
||||||
} else {
|
} else {
|
||||||
@ -268,6 +268,7 @@ class HTML_Lexer_Sax extends HTML_Lexer
|
|||||||
$parser->set_element_handler('openHandler','closeHandler');
|
$parser->set_element_handler('openHandler','closeHandler');
|
||||||
$parser->set_data_handler('dataHandler');
|
$parser->set_data_handler('dataHandler');
|
||||||
$parser->set_escape_handler('escapeHandler');
|
$parser->set_escape_handler('escapeHandler');
|
||||||
|
$parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
|
||||||
$parser->parse($html);
|
$parser->parse($html);
|
||||||
return $this->tokens;
|
return $this->tokens;
|
||||||
}
|
}
|
||||||
|
169
PureHTMLDefinition.php
Normal file
169
PureHTMLDefinition.php
Normal file
@ -0,0 +1,169 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
class PureHTMLDefinition
{
    
    /**
     * Content-model table, keyed by lowercase element name.
     *
     * Each value is either the string 'EMPTY' (the element takes no
     * children) or an array of the form:
     *
     *   array(
     *       0 => array of allowed child element names ('#PCDATA' = text),
     *       1 => array of rejected child element names (optional),
     *       2 => '+' when at least one child is required (optional)
     *   )
     *
     * The table is empty until loadData() has been called.
     */
    var $info = array();
    
    /**
     * PHP 4 style constructor; intentionally does nothing. The table is
     * populated on demand by loadData().
     */
    function PureHTMLDefinition() {
        
    }
    
    /**
     * Populates $this->info with the supported elements and their
     * content models.
     *
     * The $entity groups emulate the parameter entities of the DTD
     * (the element set looks like XHTML 1.0 Transitional minus forms,
     * objects and scripting -- NOTE(review): confirm the intended DTD).
     */
    function loadData() {
        // emulates the structure of the DTD
        
        $entity = array();
        
        $entity['special.extra'] = array('img');
        $entity['special.basic'] = array('br','bdo','span');
        $entity['special'] = array_merge($entity['special.basic'],
                                         $entity['special.extra']);
        
        $entity['fontstyle.extra'] = array('big','small');
        $entity['fontstyle.basic'] = array('tt','i','b','u','s','strike');
        $entity['fontstyle'] = array_merge($entity['fontstyle.extra'],
                                           $entity['fontstyle.basic']);
        
        $entity['phrase.extra'] = array('sub','sup');
        $entity['phrase.basic'] = array('em','strong','dfn','code','samp','kbd',
                                        'var','cite','abbr','acronym','q');
        $entity['phrase'] = array_merge($entity['phrase.extra'],
                                        $entity['phrase.basic']);
        
        $entity['misc.inline'] = array('ins','del');
        $entity['misc'] = $entity['misc.inline'];
        
        $entity['inline'] = array_merge(array('a'), $entity['special'],
                                $entity['fontstyle'], $entity['phrase']);
        
        $entity['heading'] = array('h1','h2','h3','h4','h5','h6');
        $entity['lists'] = array('ul','ol', 'dl');
        $entity['blocktext'] = array('pre','hr','blockquote','address');
        
        // NOTE: 'table' is allowed as a block-level child here, but no
        // content model is registered for it (or its rows/cells) below,
        // so removeForeignElements() currently strips table tags.
        $entity['block'] = array_merge(array('p','div','table'),
                $entity['heading'],$entity['lists'], $entity['blocktext']);
        
        // %Inline; is text plus every inline-level element. This used to
        // be built from 'special' only, which left out 'a' and all of the
        // fontstyle/phrase elements (so <b> was not permitted inside <p>).
        $entity['Inline'] = array_merge(array('#PCDATA'), $entity['inline'],
                                        $entity['misc.inline']);
        $entity['Flow'] = array_merge(array('#PCDATA'), $entity['block'],
                                      $entity['inline'], $entity['misc']);
        // like Inline, but without 'a' itself (no nested links)
        $entity['a.content'] = array_merge(array('#PCDATA'), $entity['special'],
                $entity['fontstyle'], $entity['phrase'], $entity['misc.inline']);
        
        // only the "basic" groups are allowed inside <pre>
        $entity['pre.content'] = array_merge(array('#PCDATA', 'a'),
                $entity['special.basic'], $entity['fontstyle.basic'],
                $entity['phrase.basic'], $entity['misc.inline']);
        
        // elements that accept block-level and inline-level children.
        // 'li' is registered here: 'ul' and 'ol' require it as a child,
        // so it must be a known element or it would be stripped.
        $this->info['ins'] =
        $this->info['del'] =
        $this->info['blockquote'] =
        $this->info['dd'] =
        $this->info['li'] =
        $this->info['div'] = array($entity['Flow']);
        
        // elements that accept inline-level children only
        $this->info['em'] =
        $this->info['strong'] =
        $this->info['dfn'] =
        $this->info['code'] =
        $this->info['samp'] =
        $this->info['kbd'] =
        $this->info['var'] =
        $this->info['cite'] =
        $this->info['abbr'] =
        $this->info['acronym'] =
        $this->info['q'] =
        $this->info['sub'] =
        $this->info['tt'] =
        $this->info['sup'] =
        $this->info['i'] =
        $this->info['b'] =
        $this->info['big'] =
        $this->info['small'] =
        $this->info['u'] =
        $this->info['s'] =
        $this->info['strike'] =
        $this->info['bdo'] =
        $this->info['span'] =
        $this->info['dt'] =
        $this->info['p'] =
        $this->info['h1'] =
        $this->info['h2'] =
        $this->info['h3'] =
        $this->info['h4'] =
        $this->info['h5'] =
        $this->info['h6'] = array($entity['Inline']);
        
        // the plus requires at least one child; the empty array is the
        // (currently empty) list of rejected child elements
        $this->info['ol'] =
        $this->info['ul'] = array(array('li'),array(),'+');
        
        // a definition list likewise needs at least one dt or dd
        $this->info['dl'] = array(array('dt','dd'),array(),'+');
        
        $this->info['address'] =
            array(
                array_merge(
                    array('#PCDATA', 'p'),
                    $entity['inline'],
                    $entity['misc.inline']));
        
        // elements that never have children
        $this->info['img'] =
        $this->info['br'] =
        $this->info['hr'] = 'EMPTY';
        
        $this->info['pre'] = array($entity['pre.content']);
        
        $this->info['a'] = array($entity['a.content']);
    }
    
    /**
     * Runs the full purification pipeline over a token list.
     *
     * @param $tokens array of token objects
     * @return array of token objects after every stage has been applied
     */
    function purifyTokens($tokens) {
        if (empty($this->info)) $this->loadData();
        $tokens = $this->removeForeignElements($tokens);
        $tokens = $this->makeWellFormed($tokens);
        $tokens = $this->fixNesting($tokens);
        $tokens = $this->validateAttributes($tokens);
        return $tokens;
    }
    
    /**
     * Drops every token that is not text or a tag for a known element.
     *
     * - tokens whose class extends MF_Tag are kept only when their name
     *   has an entry in $this->info;
     * - MF_Comment tokens are always dropped;
     * - MF_Text tokens are always kept;
     * - anything else is dropped.
     *
     * Note that is_subclass_of() is false for an object whose class is
     * exactly MF_Tag, so such a token would not be treated as a tag.
     *
     * @param $tokens array of token objects
     * @return array of the surviving tokens, reindexed from zero
     */
    function removeForeignElements($tokens) {
        if (empty($this->info)) $this->loadData();
        $result = array();
        foreach($tokens as $token) {
            if (is_subclass_of($token, 'MF_Tag')) {
                // unknown element: strip the tag itself
                if (!isset($this->info[$token->name])) continue;
            } elseif (is_a($token, 'MF_Comment')) {
                // strip comments
                continue;
            } elseif (is_a($token, 'MF_Text')) {
                // text always passes through
            } else {
                // unrecognised token type
                continue;
            }
            $result[] = $token;
        }
        return $result;
    }
    
    /**
     * Not implemented yet. Returns the tokens unchanged so that
     * purifyTokens() keeps passing a token list down the pipeline
     * instead of null.
     *
     * @param $tokens array of token objects
     * @return array the same tokens
     */
    function makeWellFormed($tokens) {
        if (empty($this->info)) $this->loadData();
        return $tokens;
    }
    
    /**
     * Not implemented yet. Returns the tokens unchanged so that
     * purifyTokens() keeps passing a token list down the pipeline
     * instead of null.
     *
     * @param $tokens array of token objects
     * @return array the same tokens
     */
    function fixNesting($tokens) {
        if (empty($this->info)) $this->loadData();
        return $tokens;
    }
    
    /**
     * Not implemented yet. Returns the tokens unchanged so that
     * purifyTokens() returns a token list instead of null.
     *
     * @param $tokens array of token objects
     * @return array the same tokens
     */
    function validateAttributes($tokens) {
        if (empty($this->info)) $this->loadData();
        return $tokens;
    }
    
}
|
||||||
|
|
||||||
|
?>
|
@ -7,6 +7,7 @@ require_once 'XML/HTMLSax3.php'; // optional PEAR class
|
|||||||
require_once 'HTML_Purifier.php';
|
require_once 'HTML_Purifier.php';
|
||||||
require_once 'HTML_Lexer.php';
|
require_once 'HTML_Lexer.php';
|
||||||
require_once 'MarkupFragment.php';
|
require_once 'MarkupFragment.php';
|
||||||
|
require_once 'PureHTMLDefinition.php';
|
||||||
|
|
||||||
$test = new GroupTest('HTML_Purifier');
|
$test = new GroupTest('HTML_Purifier');
|
||||||
|
|
||||||
@ -14,6 +15,7 @@ chdir('tests/');
|
|||||||
$test->addTestFile('HTML_Purifier.php');
|
$test->addTestFile('HTML_Purifier.php');
|
||||||
$test->addTestFile('HTML_Lexer.php');
|
$test->addTestFile('HTML_Lexer.php');
|
||||||
//$test->addTestFile('MarkupFragment.php');
|
//$test->addTestFile('MarkupFragment.php');
|
||||||
|
$test->addTestFile('PureHTMLDefinition.php');
|
||||||
chdir('../');
|
chdir('../');
|
||||||
|
|
||||||
$test->run(new HtmlReporter());
|
$test->run(new HtmlReporter());
|
||||||
|
@ -107,7 +107,12 @@ class TestCase_HTML_Lexer extends UnitTestCase
|
|||||||
|
|
||||||
$input[9] = '<b>';
|
$input[9] = '<b>';
|
||||||
$expect[9] = array(
|
$expect[9] = array(
|
||||||
new MF_Text('<b>')
|
new MF_Text('<b>')
|
||||||
|
);
|
||||||
|
$sax_expect[9] = array(
|
||||||
|
new MF_Text('<')
|
||||||
|
,new MF_Text('b')
|
||||||
|
,new MF_Text('>')
|
||||||
);
|
);
|
||||||
// however, we may want to change both styles
|
// however, we may want to change both styles
|
||||||
// into parsed: '<b>'. SAX has an option for this
|
// into parsed: '<b>'. SAX has an option for this
|
||||||
|
41
tests/PureHTMLDefinition.php
Normal file
41
tests/PureHTMLDefinition.php
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
class UnitTest_PureHTMLDefinition extends UnitTestCase
{
    
    // definition object under test, shared by every test method
    var $def;
    
    /**
     * PHP 4 style constructor: chains to the UnitTestCase constructor
     * and prepares a definition with its data already loaded.
     */
    function UnitTest_PureHTMLDefinition() {
        $this->UnitTestCase();
        $definition = new PureHTMLDefinition();
        $this->def = $definition;
        $this->def->loadData();
    }
    
    /**
     * Feeds each fixture through removeForeignElements() and compares
     * the outcome with the expected token list at the same index.
     */
    function test_removeForeignElements() {
        
        $fixtures = array();
        $wanted = array();
        
        // fixture 0: no tokens at all
        $fixtures[0] = array();
        $wanted[0] = $fixtures[0];
        
        // fixture 1: text around a <b> element, expected back as given
        $fixtures[1] = array(
            new MF_Text('This is '),
            new MF_StartTag('b', array()),
            new MF_Text('bold'),
            new MF_EndTag('b'),
            new MF_Text(' text')
        );
        $wanted[1] = $fixtures[1];
        
        foreach ($fixtures as $index => $tokens) {
            $actual = $this->def->removeForeignElements($tokens);
            $this->assertEqual($actual, $wanted[$index]);
            // paint the actual result only when it differs
            paintIf($actual, $actual != $wanted[$index]);
        }
        
    }
    
}
|
||||||
|
|
||||||
|
?>
|
Loading…
Reference in New Issue
Block a user