0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-01-03 05:11:52 +00:00

Main: implemented regexp-style validation for complicated child definitions

Also:
* Updated spec with some extra comments
* Trigger error if HTMLDTD_ChildDef_Simple has validateChildren called
* Factor out definition assertion in test class

git-svn-id: http://htmlpurifier.org/svnroot/html_purifier/trunk@50 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2006-06-05 00:50:27 +00:00
parent f5486bbbae
commit 4935c69904
3 changed files with 109 additions and 11 deletions

View File

@ -352,13 +352,58 @@ class HTMLDTD_Element
// true = leave nodes as is
// false = delete parent node and all children
// array(...) = replace children nodes with these
// this is the hardest one to implement. We'll use fancy regexp tricks
// right now, we only expect it to return TRUE or FALSE (it won't attempt
// to fix the tree)
// we may end up writing custom code for each HTML case
// in order to make it self correcting
/**
 * Validates the direct children of an element against a DTD content model
 * by compiling the model into a PCRE pattern.
 *
 * Validation is yes/no only: validateChildren() never attempts to repair
 * the child list.
 */
class HTMLDTD_ChildDef
{
    /** Content model as written in the DTD, e.g. "(caption?, (tbody+|tr+))". */
    var $dtd_regex;

    /** PCRE pattern body (no delimiters, no anchors) built from $dtd_regex. */
    var $_pcre_regex;

    /**
     * @param string $dtd_regex DTD content model; whitespace is ignored and
     *                          the outer parentheses are optional.
     */
    function __construct($dtd_regex) {
        $this->dtd_regex = $dtd_regex;
        $this->_compileRegex();
    }

    /**
     * Class-named constructor, kept so that PHP 4 and any code calling it
     * by name keep working. It simply delegates to __construct().
     *
     * @param string $dtd_regex see __construct()
     */
    function HTMLDTD_ChildDef($dtd_regex) {
        $this->__construct($dtd_regex);
    }

    /**
     * Translates $dtd_regex into $_pcre_regex.
     *
     * Every element name becomes the group "(?:name,)", i.e. the name plus a
     * mandatory trailing comma. validateChildren() emits the child list in
     * the same "name," form, so a name can only match one whole child. With
     * optional separators, adjacent short names could spell a longer one
     * (child "sub" was accepted by "(s|u|b)*").
     */
    function _compileRegex() {
        $raw = str_replace(' ', '', $this->dtd_regex);
        // strncmp() is safe on an empty string, unlike a direct offset read
        if (strncmp($raw, '(', 1) !== 0) {
            $raw = "($raw)";
        }
        // The DTD sequence operator carries no meaning once every name ends
        // in its own comma. Park it as a space (all spaces were stripped
        // above) so neighbouring names stay separate while being wrapped.
        $reg = str_replace(',', ' ', $raw);
        $reg = preg_replace('/[#a-zA-Z0-9_.-]+/', '(?:\\0,)', $reg);
        $reg = str_replace(' ', '', $reg);
        // At this point the only dots left belong to element names; escape
        // them so they do not act as the PCRE wildcard.
        $reg = str_replace('.', '\\.', $reg);
        $this->_pcre_regex = $reg;
    }

    /**
     * Checks whether the direct children in a token stream satisfy the
     * content model.
     *
     * Tokens with a non-empty is_whitespace property are skipped. A token is
     * a direct child when it is seen at nesting depth zero; MF_StartTag
     * tokens increase the depth and MF_EndTag tokens decrease it, so deeper
     * descendants are ignored.
     *
     * @param array $tokens_of_children tokens found inside the parent element
     * @return bool true when the sequence of direct children is allowed
     */
    function validateChildren($tokens_of_children) {
        $list_of_children = '';
        $nesting = 0; // depth into the nest
        foreach ($tokens_of_children as $token) {
            if (!empty($token->is_whitespace)) continue;
            // decide before adjusting the depth: the start tag of a direct
            // child is itself seen at depth zero
            $is_child = ($nesting == 0);
            if (is_a($token, 'MF_StartTag')) {
                $nesting++;
            } elseif (is_a($token, 'MF_EndTag')) {
                $nesting--;
            }
            if ($is_child) {
                // trailing comma is deliberate, see _compileRegex()
                $list_of_children .= $token->name . ',';
            }
        }
        $okay = preg_match(
            '/^' . $this->_pcre_regex . '$/',
            $list_of_children
        );
        return (bool) $okay;
    }
}
class HTMLDTD_ChildDef_Simple extends HTMLDTD_ChildDef
{
@ -373,6 +418,9 @@ class HTMLDTD_ChildDef_Simple extends HTMLDTD_ChildDef
$this->elements = $elements;
$this->gen = new HTML_Generator();
}
/**
 * Placeholder for subclasses: this class is used as an abstract base, so
 * calling validateChildren() on it directly is a programming error.
 *
 * The parameter exists so the signature stays compatible with the parent
 * method it overrides (an incompatible override is fatal on PHP 8). It is
 * optional so calls without arguments keep working.
 *
 * @param array|null $tokens_of_children ignored
 */
function validateChildren($tokens_of_children = null) {
    trigger_error('Cannot call abstract function!', E_USER_ERROR);
}
}
class HTMLDTD_ChildDef_Required extends HTMLDTD_ChildDef_Simple
{

View File

@ -153,6 +153,8 @@ So... here's the interesting code:
--
// # This actually does the validation
// Validate the order of the children
if (!$was_error && count($dtd_children)) {
$children_list = implode(',', $children);
@ -166,6 +168,8 @@ if (!$was_error && count($dtd_children)) {
--
// # This figures out the PcreRegex
//$ch is a string of the allowed children
$children = preg_split('/([^#a-zA-Z0-9_.-]+)/', $ch, -1, PREG_SPLIT_NO_EMPTY);
// check for parsed character data special case
@ -222,7 +226,8 @@ form
must not contain other form elements.
Normative exclusions straight from the horse's mouth. These are SGML style,
not XML style, so we need to modify the ruleset slightly.
not XML style, so we need to modify the ruleset slightly. However, the DTD
may have done this for us already.
--

View File

@ -190,8 +190,61 @@ class Test_PureHTMLDefinition extends UnitTestCase
class Test_HTMLDTD_ChildDef extends UnitTestCase
{
/**
 * Runs each input through $def->validateChildren() and compares the result
 * with the expectation stored under the same key.
 *
 * Boolean expectations are compared with assertIdentical(); anything else
 * is compared with assertEqual() and additionally handed to paintIf()
 * together with a flag saying whether the values differ.
 *
 * @param array $inputs token lists, keyed by case number
 * @param array $expect expected results, keyed like $inputs
 * @param object $def   definition under test
 */
function assertSeries($inputs, $expect, $def) {
    foreach ($inputs as $case => $tokens) {
        $actual = $def->validateChildren($tokens);
        $wanted = $expect[$case];
        if (!is_bool($wanted)) {
            $this->assertEqual($wanted, $actual);
            paintIf($actual, $actual != $wanted);
            continue;
        }
        $this->assertIdentical($wanted, $actual);
    }
}
/**
 * Exercises HTMLDTD_ChildDef with the content model of the table element.
 */
function test_complex() {
    // the table definition
    $def = new HTMLDTD_ChildDef(
        '(caption?, (col*|colgroup*), thead?, tfoot?, (tbody+|tr+))');

    $inputs = array(
        // no children at all
        0 => array(),
        // we really don't care what's inside, because if it turns out
        // this tr is illegal, we'll end up re-evaluating the parent node
        // anyway.
        1 => array(
            new MF_StartTag('tr'), new MF_EndTag('tr'),
        ),
        // one of each section, in document order
        2 => array(
            new MF_StartTag('caption'), new MF_EndTag('caption'),
            new MF_StartTag('col'), new MF_EndTag('col'),
            new MF_StartTag('thead'), new MF_EndTag('thead'),
            new MF_StartTag('tfoot'), new MF_EndTag('tfoot'),
            new MF_StartTag('tbody'), new MF_EndTag('tbody'),
        ),
        // repeated col elements followed by a row
        3 => array(
            new MF_StartTag('col'), new MF_EndTag('col'),
            new MF_StartTag('col'), new MF_EndTag('col'),
            new MF_StartTag('col'), new MF_EndTag('col'),
            new MF_StartTag('tr'), new MF_EndTag('tr'),
        ),
    );

    $expect = array(
        0 => false,
        1 => true,
        2 => true,
        3 => true,
    );

    $this->assertSeries($inputs, $expect, $def);
}
function test_simple() {
// simple is actually an abstract class
// but we're unit testing some of the conv. functions it gives
$def = new HTMLDTD_ChildDef_Simple('foobar | bang |gizmo');
$this->assertEqual($def->elements,
array(
@ -274,15 +327,7 @@ class Test_HTMLDTD_ChildDef extends UnitTestCase
);
$expect[5] = false;
foreach ($inputs as $i => $input) {
$result = $def->validateChildren($input);
if (is_bool($expect[$i])) {
$this->assertIdentical($expect[$i], $result);
} else {
$this->assertEqual($expect[$i], $result);
paintIf($result, $result != $expect[$i]);
}
}
$this->assertSeries($inputs, $expect, $def);
}