From 4935c69904650ae8f1552bbb34fca21263e44428 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Mon, 5 Jun 2006 00:50:27 +0000 Subject: [PATCH] Main: implemented regexp-style validation for complicated child definitions Also: * Updated spec with some extra comments * Trigger error if HTMLDTD_ChildDef_Simple has validateChildren called * Factor out definition assertion in test class git-svn-id: http://htmlpurifier.org/svnroot/html_purifier/trunk@50 48356398-32a2-884e-a903-53898d9a118a --- PureHTMLDefinition.php | 50 +++++++++++++++++++++++++++- docs/spec.txt | 7 +++- tests/PureHTMLDefinition.php | 63 ++++++++++++++++++++++++++++++------ 3 files changed, 109 insertions(+), 11 deletions(-) diff --git a/PureHTMLDefinition.php b/PureHTMLDefinition.php index 83f3aaac..f6627f53 100644 --- a/PureHTMLDefinition.php +++ b/PureHTMLDefinition.php @@ -352,13 +352,58 @@ class HTMLDTD_Element // true = leave nodes as is // false = delete parent node and all children // array(...) = replace children nodes with these + +// this is the hardest one to implement. We'll use fancy regexp tricks +// right now, we only expect it to return TRUE or FALSE (it won't attempt +// to fix the tree) + +// we may end up writing custom code for each HTML case +// in order to make it self correcting class HTMLDTD_ChildDef { var $dtd_regex; + var $_pcre_regex; function HTMLDTD_ChildDef($dtd_regex) { $this->dtd_regex = $dtd_regex; + $this->_compileRegex(); + } + function _compileRegex() { + $raw = str_replace(' ', '', $this->dtd_regex); + if ($raw{0} != '(') { + $raw = "($raw)"; + } + $reg = str_replace(',', ',?', $raw); + $reg = preg_replace('/([#a-zA-Z0-9_.-]+)/', '(,?\\0)', $reg); + $this->_pcre_regex = $reg; + } + function validateChildren($tokens_of_children) { + $list_of_children = ''; + $nesting = 0; // depth into the nest + foreach ($tokens_of_children as $token) { + if (!empty($token->is_whitespace)) continue; + + $is_child = ($nesting == 0); // direct + + if (is_a($token, 'MF_StartTag')) { + $nesting++; + } elseif (is_a($token, 'MF_EndTag')) { + $nesting--; + } + + if ($is_child) { + $list_of_children .= $token->name . ','; + } + } + $list_of_children = rtrim($list_of_children, ','); + + $okay = + preg_match( + '/^'.$this->_pcre_regex.'$/', + $list_of_children + ); + + return (bool) $okay; } - function validateChildren($tokens_of_children) {} } class HTMLDTD_ChildDef_Simple extends HTMLDTD_ChildDef { @@ -373,6 +418,9 @@ class HTMLDTD_ChildDef_Simple extends HTMLDTD_ChildDef $this->elements = $elements; $this->gen = new HTML_Generator(); } + function validateChildren() { + trigger_error('Cannot call abstract function!', E_USER_ERROR); + } } class HTMLDTD_ChildDef_Required extends HTMLDTD_ChildDef_Simple { diff --git a/docs/spec.txt b/docs/spec.txt index e5faa574..59d52ea1 100644 --- a/docs/spec.txt +++ b/docs/spec.txt @@ -153,6 +153,8 @@ So... here's the interesting code: -- +// # This actually does the validation + // Validate the order of the children if (!$was_error && count($dtd_children)) { $children_list = implode(',', $children); @@ -166,6 +168,8 @@ if (!$was_error && count($dtd_children)) { -- +// # This figures out the PcreRegex + //$ch is a string of the allowed childs $children = preg_split('/([^#a-zA-Z0-9_.-]+)/', $ch, -1, PREG_SPLIT_NO_EMPTY); // check for parsed character data special case @@ -222,7 +226,8 @@ form must not contain other form elements. Normative exclusions straight from the horses mouth. These are SGML style, -not XML style, so we need to modify the ruleset slightly. +not XML style, so we need to modify the ruleset slightly. However, the DTD +may have done this for us already. -- diff --git a/tests/PureHTMLDefinition.php b/tests/PureHTMLDefinition.php index ffa5bddf..aaffe26c 100644 --- a/tests/PureHTMLDefinition.php +++ b/tests/PureHTMLDefinition.php @@ -190,8 +190,61 @@ class Test_PureHTMLDefinition extends UnitTestCase class Test_HTMLDTD_ChildDef extends UnitTestCase { + function assertSeries($inputs, $expect, $def) { + foreach ($inputs as $i => $input) { + $result = $def->validateChildren($input); + if (is_bool($expect[$i])) { + $this->assertIdentical($expect[$i], $result); + } else { + $this->assertEqual($expect[$i], $result); + paintIf($result, $result != $expect[$i]); + } + } + } + + function test_complex() { + + // the table definition + $def = new HTMLDTD_ChildDef( + '(caption?, (col*|colgroup*), thead?, tfoot?, (tbody+|tr+))'); + + $inputs[0] = array(); + $expect[0] = false; + + // we really don't care what's inside, because if it turns out + // this tr is illegal, we'll end up re-evaluating the parent node + // anyway. + $inputs[1] = array( + new MF_StartTag('tr') ,new MF_EndTag('tr') + ); + $expect[1] = true; + + $inputs[2] = array( + new MF_StartTag('caption') ,new MF_EndTag('caption') + ,new MF_StartTag('col') ,new MF_EndTag('col') + ,new MF_StartTag('thead') ,new MF_EndTag('thead') + ,new MF_StartTag('tfoot') ,new MF_EndTag('tfoot') + ,new MF_StartTag('tbody') ,new MF_EndTag('tbody') + ); + $expect[2] = true; + + $inputs[3] = array( + new MF_StartTag('col') ,new MF_EndTag('col') + ,new MF_StartTag('col') ,new MF_EndTag('col') + ,new MF_StartTag('col') ,new MF_EndTag('col') + ,new MF_StartTag('tr') ,new MF_EndTag('tr') + ); + $expect[3] = true; + + $this->assertSeries($inputs, $expect, $def); + + } + function test_simple() { + // simple is actually an abstract class + // but we're unit testing some of the conv. functions it gives + $def = new HTMLDTD_ChildDef_Simple('foobar | bang |gizmo'); $this->assertEqual($def->elements, array( @@ -274,15 +327,7 @@ class Test_HTMLDTD_ChildDef extends UnitTestCase ); $expect[5] = false; - foreach ($inputs as $i => $input) { - $result = $def->validateChildren($input); - if (is_bool($expect[$i])) { - $this->assertIdentical($expect[$i], $result); - } else { - $this->assertEqual($expect[$i], $result); - paintIf($result, $result != $expect[$i]); - } - } + $this->assertSeries($inputs, $expect, $def); }