0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-01-09 07:21:54 +00:00

Rename MarkupFragment.php to Token.php, change internal class names and rewire the classes. We also started adding more dependence on the Lexer and Generator in unrelated tests.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@63 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2006-07-21 11:27:54 +00:00
parent 8bde230c99
commit 23dba8b55e
9 changed files with 323 additions and 372 deletions

View File

@ -12,18 +12,18 @@ class HTML_Generator
} }
function generateFromToken($token) { function generateFromToken($token) {
if (is_a($token, 'MF_StartTag')) { if ($token->type == 'start') {
$attr = $this->generateAttributes($token->attributes); $attr = $this->generateAttributes($token->attributes);
return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>'; return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';
} elseif (is_a($token, 'MF_EndTag')) { } elseif ($token->type == 'end') {
return '</' . $token->name . '>'; return '</' . $token->name . '>';
} elseif (is_a($token, 'MF_EmptyTag')) { } elseif ($token->type == 'empty') {
$attr = $this->generateAttributes($token->attributes); $attr = $this->generateAttributes($token->attributes);
return '<' . $token->name . ($attr ? ' ' : '') . $attr . ' />'; return '<' . $token->name . ($attr ? ' ' : '') . $attr . ' />';
} elseif (is_a($token, 'MF_Text')) { } elseif ($token->type == 'text') {
return htmlentities($token->data, ENT_COMPAT, 'UTF-8'); return htmlentities($token->data, ENT_COMPAT, 'UTF-8');
} else { } else {

View File

@ -121,7 +121,7 @@ class HTML_Lexer
if (!$inside_tag && $position_next_lt !== false) { if (!$inside_tag && $position_next_lt !== false) {
// We are not inside tag and there still is another tag to parse // We are not inside tag and there still is another tag to parse
$array[] = new $array[] = new
MF_Text( HTMLPurifier_Token_Text(
html_entity_decode( html_entity_decode(
substr( substr(
$string, $cursor, $position_next_lt - $cursor $string, $cursor, $position_next_lt - $cursor
@ -138,7 +138,7 @@ class HTML_Lexer
if ($cursor === strlen($string)) break; if ($cursor === strlen($string)) break;
// Create Text of rest of string // Create Text of rest of string
$array[] = new $array[] = new
MF_Text( HTMLPurifier_Token_Text(
html_entity_decode( html_entity_decode(
substr( substr(
$string, $cursor $string, $cursor
@ -158,7 +158,7 @@ class HTML_Lexer
substr($segment,strlen($segment)-2,2) == '--' substr($segment,strlen($segment)-2,2) == '--'
) { ) {
$array[] = new $array[] = new
MF_Comment( HTMLPurifier_Token_Comment(
substr( substr(
$segment, 3, strlen($segment) - 5 $segment, 3, strlen($segment) - 5
) )
@ -172,7 +172,7 @@ class HTML_Lexer
$is_end_tag = (strpos($segment,'/') === 0); $is_end_tag = (strpos($segment,'/') === 0);
if ($is_end_tag) { if ($is_end_tag) {
$type = substr($segment, 1); $type = substr($segment, 1);
$array[] = new MF_EndTag($type); $array[] = new HTMLPurifier_Token_End($type);
$inside_tag = false; $inside_tag = false;
$cursor = $position_next_gt + 1; $cursor = $position_next_gt + 1;
continue; continue;
@ -191,9 +191,9 @@ class HTML_Lexer
$position_first_space = $this->nextWhiteSpace($segment); $position_first_space = $this->nextWhiteSpace($segment);
if ($position_first_space === false) { if ($position_first_space === false) {
if ($is_self_closing) { if ($is_self_closing) {
$array[] = new MF_EmptyTag($segment); $array[] = new HTMLPurifier_Token_Empty($segment);
} else { } else {
$array[] = new MF_StartTag($segment, array()); $array[] = new HTMLPurifier_Token_Start($segment);
} }
$inside_tag = false; $inside_tag = false;
$cursor = $position_next_gt + 1; $cursor = $position_next_gt + 1;
@ -210,16 +210,16 @@ class HTML_Lexer
); );
$attributes = $this->tokenizeAttributeString($attribute_string); $attributes = $this->tokenizeAttributeString($attribute_string);
if ($is_self_closing) { if ($is_self_closing) {
$array[] = new MF_EmptyTag($type, $attributes); $array[] = new HTMLPurifier_Token_Empty($type, $attributes);
} else { } else {
$array[] = new MF_StartTag($type, $attributes); $array[] = new HTMLPurifier_Token_Start($type, $attributes);
} }
$cursor = $position_next_gt + 1; $cursor = $position_next_gt + 1;
$inside_tag = false; $inside_tag = false;
continue; continue;
} else { } else {
$array[] = new $array[] = new
MF_Text( HTMLPurifier_Token_Text(
'<' . '<' .
html_entity_decode( html_entity_decode(
substr($string, $cursor), substr($string, $cursor),
@ -362,9 +362,9 @@ class HTML_Lexer_Sax extends HTML_Lexer
function openHandler(&$parser, $name, $attrs, $closed) { function openHandler(&$parser, $name, $attrs, $closed) {
if ($closed) { if ($closed) {
$this->tokens[] = new MF_EmptyTag($name, $attrs); $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
} else { } else {
$this->tokens[] = new MF_StartTag($name, $attrs); $this->tokens[] = new HTMLPurifier_Token_Start($name, $attrs);
} }
return true; return true;
} }
@ -373,21 +373,21 @@ class HTML_Lexer_Sax extends HTML_Lexer
// HTMLSax3 seems to always send empty tags an extra close tag // HTMLSax3 seems to always send empty tags an extra close tag
// check and ignore if you see it: // check and ignore if you see it:
// [TESTME] to make sure it doesn't overreach // [TESTME] to make sure it doesn't overreach
if (is_a($this->tokens[count($this->tokens)-1], 'MF_EmptyTag')) { if ($this->tokens[count($this->tokens)-1]->type == 'empty') {
return true; return true;
} }
$this->tokens[] = new MF_EndTag($name); $this->tokens[] = new HTMLPurifier_Token_End($name);
return true; return true;
} }
function dataHandler(&$parser, $data) { function dataHandler(&$parser, $data) {
$this->tokens[] = new MF_Text($data); $this->tokens[] = new HTMLPurifier_Token_Text($data);
return true; return true;
} }
function escapeHandler(&$parser, $data) { function escapeHandler(&$parser, $data) {
if (strpos($data, '-') === 0) { if (strpos($data, '-') === 0) {
$this->tokens[] = new MF_Comment($data); $this->tokens[] = new HTMLPurifier_Token_Comment($data);
} }
return true; return true;
} }

View File

@ -1,65 +0,0 @@
<?php
// MF = Markup Fragment
// all objects here are immutable
/**
 * Root of the markup-fragment hierarchy; carries no state of its own.
 */
class MF {}

/**
 * Abstract base for tag fragments. Stores the tag name in lowercase.
 */
class MF_Tag extends MF // abstract
{
    // Tag name, lowercased by the constructor.
    var $name;

    /**
     * PHP 5+ constructor. Forwards to the class-named constructor, which
     * PHP 8 no longer invokes automatically on `new`.
     *
     * @param string $name Tag name in any letter case.
     */
    function __construct($name) {
        $this->MF_Tag($name);
    }

    /**
     * Class-named (PHP 4 style) constructor, kept because subclasses call
     * it explicitly to initialise the name.
     *
     * @param string $name Tag name in any letter case.
     */
    function MF_Tag($name) {
        $this->name = strtolower($name); // for some reason, the SAX parser
                                         // uses uppercase. Investigate?
    }
}

/**
 * Abstract base for tag fragments that carry an attribute array.
 */
class MF_TagWithAttributes extends MF_Tag // abstract
{
    // Attributes exactly as passed to the constructor.
    var $attributes = array();

    /**
     * PHP 5+ constructor; forwards to the class-named constructor.
     *
     * @param string $name       Tag name in any letter case.
     * @param array  $attributes Attribute array, stored unchanged.
     */
    function __construct($name, $attributes = array()) {
        $this->MF_TagWithAttributes($name, $attributes);
    }

    /**
     * Class-named (PHP 4 style) constructor.
     *
     * @param string $name       Tag name in any letter case.
     * @param array  $attributes Attribute array, stored unchanged.
     */
    function MF_TagWithAttributes($name, $attributes = array()) {
        $this->MF_Tag($name);
        $this->attributes = $attributes;
    }
}

/**
 * Opening tag fragment.
 */
class MF_StartTag extends MF_TagWithAttributes
{
    var $type = 'start';
}

/**
 * Self-closing tag fragment.
 */
class MF_EmptyTag extends MF_TagWithAttributes
{
    var $type = 'empty';
}

/**
 * Closing tag fragment; has a name but no attributes.
 */
class MF_EndTag extends MF_Tag
{
    var $type = 'end';
}

/**
 * Character-data fragment.
 */
class MF_Text extends MF
{
    var $name = '#PCDATA';
    var $type = 'text';
    // Text content exactly as passed to the constructor.
    var $data;
    // True when $data consists only of spaces, tabs, CR and LF (or is empty).
    var $is_whitespace = false;

    /**
     * PHP 5+ constructor; forwards to the class-named constructor.
     *
     * @param string $data Text content.
     */
    function __construct($data) {
        $this->MF_Text($data);
    }

    /**
     * Class-named (PHP 4 style) constructor. Stores the text and records
     * whether it is whitespace-only.
     *
     * @param string $data Text content.
     */
    function MF_Text($data) {
        $this->data = $data;
        if (trim($data, " \n\r\t") === '') $this->is_whitespace = true;
    }

    /**
     * Returns a new text fragment holding this fragment's data followed by
     * the other fragment's data; neither operand is modified.
     *
     * @param MF_Text $mf_text Fragment whose data is appended.
     * @return MF_Text
     */
    function append($mf_text) {
        return new MF_Text($this->data . $mf_text->data);
    }
}

/**
 * Comment fragment.
 */
class MF_Comment extends MF
{
    // Comment content exactly as passed to the constructor.
    var $data;
    var $type = 'comment';

    /**
     * PHP 5+ constructor; forwards to the class-named constructor.
     *
     * @param string $data Comment content.
     */
    function __construct($data) {
        $this->MF_Comment($data);
    }

    /**
     * Class-named (PHP 4 style) constructor.
     *
     * @param string $data Comment content.
     */
    function MF_Comment($data) {
        $this->data = $data;
    }
}
?>

View File

@ -162,10 +162,10 @@ class PureHTMLDefinition
if (empty($this->info)) $this->loadData(); if (empty($this->info)) $this->loadData();
$result = array(); $result = array();
foreach($tokens as $token) { foreach($tokens as $token) {
if (is_subclass_of($token, 'MF_Tag')) { if (!empty( $token->is_tag )) {
if (!isset($this->info[$token->name])) { if (!isset($this->info[$token->name])) {
// invalid tag, generate HTML and insert in // invalid tag, generate HTML and insert in
$token = new MF_Text( $token = new HTMLPurifier_Token_Text(
$this->generator->generateFromToken($token) $this->generator->generateFromToken($token)
); );
} }
@ -186,7 +186,7 @@ class PureHTMLDefinition
$result = array(); $result = array();
$current_nesting = array(); $current_nesting = array();
foreach ($tokens as $token) { foreach ($tokens as $token) {
if (!is_subclass_of($token, 'MF_Tag')) { if (empty( $token->is_tag )) {
$result[] = $token; $result[] = $token;
continue; continue;
} }
@ -196,7 +196,8 @@ class PureHTMLDefinition
if ($info->child_def->type == 'empty' && if ($info->child_def->type == 'empty' &&
$token->type == 'start' ) { $token->type == 'start' ) {
$result[] = new MF_EmptyTag($token->name, $token->attributes); $result[] = new HTMLPurifier_Token_Empty($token->name,
$token->attributes);
continue; continue;
} }
@ -204,8 +205,9 @@ class PureHTMLDefinition
if ($info->child_def->type != 'empty' && if ($info->child_def->type != 'empty' &&
$token->type == 'empty' ) { $token->type == 'empty' ) {
$result[] = new MF_StartTag($token->name, $token->attributes); $result[] = new HTMLPurifier_Token_Start($token->name,
$result[] = new MF_EndTag($token->name); $token->attributes);
$result[] = new HTMLPurifier_Token_End($token->name);
continue; continue;
} }
@ -228,7 +230,7 @@ class PureHTMLDefinition
if ($current_parent->name == 'p' && if ($current_parent->name == 'p' &&
isset($this->info_closes_p[$token->name]) isset($this->info_closes_p[$token->name])
) { ) {
$result[] = new MF_EndTag('p'); $result[] = new HTMLPurifier_Token_End('p');
$result[] = $token; $result[] = $token;
$current_nesting[] = $token; $current_nesting[] = $token;
continue; continue;
@ -238,7 +240,7 @@ class PureHTMLDefinition
if ($current_parent->name == 'li' && if ($current_parent->name == 'li' &&
$token->name == 'li' $token->name == 'li'
) { ) {
$result[] = new MF_EndTag('li'); $result[] = new HTMLPurifier_Token_End('li');
$result[] = $token; $result[] = $token;
$current_nesting[] = $token; $current_nesting[] = $token;
continue; continue;
@ -263,7 +265,7 @@ class PureHTMLDefinition
// make sure that we have something open // make sure that we have something open
if (empty($current_nesting)) { if (empty($current_nesting)) {
$result[] = new MF_Text( $result[] = new HTMLPurifier_Token_Text(
$this->generator->generateFromToken($token) $this->generator->generateFromToken($token)
); );
continue; continue;
@ -298,7 +300,7 @@ class PureHTMLDefinition
// we still didn't find the tag, so translate to text // we still didn't find the tag, so translate to text
if ($skipped_tags === false) { if ($skipped_tags === false) {
$result[] = new MF_Text( $result[] = new HTMLPurifier_Token_Text(
$this->generator->generateFromToken($token) $this->generator->generateFromToken($token)
); );
continue; continue;
@ -308,7 +310,7 @@ class PureHTMLDefinition
// note that skipped tags contains the element we need closed // note that skipped tags contains the element we need closed
$size = count($skipped_tags); $size = count($skipped_tags);
for ($i = $size - 1; $i >= 0; $i--) { for ($i = $size - 1; $i >= 0; $i--) {
$result[] = new MF_EndTag($skipped_tags[$i]->name); $result[] = new HTMLPurifier_Token_End($skipped_tags[$i]->name);
} }
// done! // done!
@ -320,7 +322,8 @@ class PureHTMLDefinition
if (!empty($current_nesting)) { if (!empty($current_nesting)) {
$size = count($current_nesting); $size = count($current_nesting);
for ($i = $size - 1; $i >= 0; $i--) { for ($i = $size - 1; $i >= 0; $i--) {
$result[] = new MF_EndTag($current_nesting[$i]->name); $result[] =
new HTMLPurifier_Token_End($current_nesting[$i]->name);
} }
} }
@ -331,8 +334,8 @@ class PureHTMLDefinition
if (empty($this->info)) $this->loadData(); if (empty($this->info)) $this->loadData();
// insert implicit "parent" node, will be removed at end // insert implicit "parent" node, will be removed at end
array_unshift($tokens, new MF_StartTag('div')); array_unshift($tokens, new HTMLPurifier_Token_Start('div'));
$tokens[] = new MF_EndTag('div'); $tokens[] = new HTMLPurifier_Token_End('div');
for ($i = 0, $size = count($tokens) ; $i < $size; ) { for ($i = 0, $size = count($tokens) ; $i < $size; ) {
@ -553,7 +556,7 @@ class HTMLDTD_ChildDef_Required extends HTMLDTD_ChildDef_Simple
if (!isset($this->elements[$token->name])) { if (!isset($this->elements[$token->name])) {
$is_deleting = true; $is_deleting = true;
if ($pcdata_allowed) { if ($pcdata_allowed) {
$result[] = new MF_Text( $result[] = new HTMLPurifier_Token_Text(
$this->gen->generateFromToken($token) $this->gen->generateFromToken($token)
); );
} }
@ -563,7 +566,10 @@ class HTMLDTD_ChildDef_Required extends HTMLDTD_ChildDef_Simple
if (!$is_deleting) { if (!$is_deleting) {
$result[] = $token; $result[] = $token;
} elseif ($pcdata_allowed) { } elseif ($pcdata_allowed) {
$result[] = new MF_Text($this->gen->generateFromToken($token)); $result[] =
new HTMLPurifier_Token_Text(
$this->gen->generateFromToken( $token )
);
} else { } else {
// drop silently // drop silently
} }

66
Token.php Normal file
View File

@ -0,0 +1,66 @@
<?php
// all objects here are immutable
/**
 * Root of the token hierarchy; carries no state of its own.
 */
class HTMLPurifier_Token {} // abstract

/**
 * Abstract base for tag tokens. Stores the tag name in lowercase.
 */
class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
{
    // Set only on tag tokens; text and comment tokens do not declare it.
    var $is_tag = true;
    // Tag name, lowercased by the constructor.
    var $name;

    /**
     * PHP 5+ constructor. Forwards to the class-named constructor, which
     * PHP 8 no longer invokes automatically on `new`.
     *
     * @param string $name Tag name in any letter case.
     */
    function __construct($name) {
        $this->HTMLPurifier_Token_Tag($name);
    }

    /**
     * Class-named (PHP 4 style) constructor, kept because subclasses call
     * it explicitly to initialise the name.
     *
     * @param string $name Tag name in any letter case.
     */
    function HTMLPurifier_Token_Tag($name) {
        $this->name = strtolower($name); // for some reason, the SAX parser
                                         // uses uppercase. Investigate?
    }
}

/**
 * A rich tag has attributes. Abstract base for start and empty tags.
 */
class HTMLPurifier_Token_RichTag extends HTMLPurifier_Token_Tag // abstract
{
    // Attributes exactly as passed to the constructor.
    var $attributes = array();

    /**
     * PHP 5+ constructor; forwards to the class-named constructor.
     *
     * @param string $name       Tag name in any letter case.
     * @param array  $attributes Attribute array, stored unchanged.
     */
    function __construct($name, $attributes = array()) {
        $this->HTMLPurifier_Token_RichTag($name, $attributes);
    }

    /**
     * Class-named (PHP 4 style) constructor.
     *
     * @param string $name       Tag name in any letter case.
     * @param array  $attributes Attribute array, stored unchanged.
     */
    function HTMLPurifier_Token_RichTag($name, $attributes = array()) {
        $this->HTMLPurifier_Token_Tag($name);
        $this->attributes = $attributes;
    }
}

/**
 * Opening tag token.
 */
class HTMLPurifier_Token_Start extends HTMLPurifier_Token_RichTag
{
    var $type = 'start';
}

/**
 * Self-closing tag token.
 */
class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_RichTag
{
    var $type = 'empty';
}

/**
 * Closing tag token; has a name but no attributes.
 */
class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
{
    var $type = 'end';
}

/**
 * Character-data token.
 */
class HTMLPurifier_Token_Text extends HTMLPurifier_Token
{
    var $name = '#PCDATA';
    var $type = 'text';
    // Text content exactly as passed to the constructor.
    var $data;
    // True when $data consists only of spaces, tabs, CR and LF (or is empty).
    var $is_whitespace = false;

    /**
     * PHP 5+ constructor; forwards to the class-named constructor.
     *
     * @param string $data Text content.
     */
    function __construct($data) {
        $this->HTMLPurifier_Token_Text($data);
    }

    /**
     * Class-named (PHP 4 style) constructor. Stores the text and records
     * whether it is whitespace-only.
     *
     * @param string $data Text content.
     */
    function HTMLPurifier_Token_Text($data) {
        $this->data = $data;
        if (trim($data, " \n\r\t") === '') $this->is_whitespace = true;
    }

    /**
     * Returns a new text token holding this token's data followed by the
     * other token's data; neither operand is modified.
     *
     * @param HTMLPurifier_Token_Text $text Token whose data is appended.
     * @return HTMLPurifier_Token_Text
     */
    function append($text) {
        return new HTMLPurifier_Token_Text($this->data . $text->data);
    }
}

/**
 * Comment token.
 */
class HTMLPurifier_Token_Comment extends HTMLPurifier_Token
{
    // Comment content exactly as passed to the constructor.
    var $data;
    var $type = 'comment';

    /**
     * PHP 5+ constructor; forwards to the class-named constructor.
     *
     * @param string $data Comment content.
     */
    function __construct($data) {
        $this->HTMLPurifier_Token_Comment($data);
    }

    /**
     * Class-named (PHP 4 style) constructor.
     *
     * @param string $data Comment content.
     */
    function HTMLPurifier_Token_Comment($data) {
        $this->data = $data;
    }
}
?>

View File

@ -6,7 +6,7 @@ require_once 'XML/HTMLSax3.php'; // optional PEAR class
require_once 'HTML_Purifier.php'; require_once 'HTML_Purifier.php';
require_once 'HTML_Lexer.php'; require_once 'HTML_Lexer.php';
require_once 'MarkupFragment.php'; require_once 'Token.php';
require_once 'PureHTMLDefinition.php'; require_once 'PureHTMLDefinition.php';
require_once 'HTML_Generator.php'; require_once 'HTML_Generator.php';
@ -15,7 +15,7 @@ $test = new GroupTest('HTML_Purifier');
chdir('tests/'); chdir('tests/');
$test->addTestFile('HTML_Purifier.php'); $test->addTestFile('HTML_Purifier.php');
$test->addTestFile('HTML_Lexer.php'); $test->addTestFile('HTML_Lexer.php');
//$test->addTestFile('MarkupFragment.php'); //$test->addTestFile('Token.php');
$test->addTestFile('PureHTMLDefinition.php'); $test->addTestFile('PureHTMLDefinition.php');
$test->addTestFile('HTML_Generator.php'); $test->addTestFile('HTML_Generator.php');
chdir('../'); chdir('../');

View File

@ -15,22 +15,26 @@ class Test_HTML_Generator extends UnitTestCase
$inputs = array(); $inputs = array();
$expect = array(); $expect = array();
$inputs[0] = new MF_Text('Foobar.<>'); $inputs[0] = new HTMLPurifier_Token_Text('Foobar.<>');
$expect[0] = 'Foobar.&lt;&gt;'; $expect[0] = 'Foobar.&lt;&gt;';
$inputs[1] = new MF_StartTag('a', array('href' => 'dyn?a=foo&b=bar')); $inputs[1] = new HTMLPurifier_Token_Start('a',
array('href' => 'dyn?a=foo&b=bar')
);
$expect[1] = '<a href="dyn?a=foo&amp;b=bar">'; $expect[1] = '<a href="dyn?a=foo&amp;b=bar">';
$inputs[2] = new MF_EndTag('b'); $inputs[2] = new HTMLPurifier_Token_End('b');
$expect[2] = '</b>'; $expect[2] = '</b>';
$inputs[3] = new MF_EmptyTag('br', array('style' => 'font-family:"Courier New";')); $inputs[3] = new HTMLPurifier_Token_Empty('br',
array('style' => 'font-family:"Courier New";')
);
$expect[3] = '<br style="font-family:&quot;Courier New&quot;;" />'; $expect[3] = '<br style="font-family:&quot;Courier New&quot;;" />';
$inputs[4] = new MF_StartTag('asdf'); $inputs[4] = new HTMLPurifier_Token_Start('asdf');
$expect[4] = '<asdf>'; $expect[4] = '<asdf>';
$inputs[5] = new MF_EmptyTag('br'); $inputs[5] = new HTMLPurifier_Token_Empty('br');
$expect[5] = '<br />'; $expect[5] = '<br />';
foreach ($inputs as $i => $input) { foreach ($inputs as $i => $input) {
@ -69,9 +73,9 @@ class Test_HTML_Generator extends UnitTestCase
function test_generateFromTokens() { function test_generateFromTokens() {
$tokens = array( $tokens = array(
new MF_StartTag('b'), new HTMLPurifier_Token_Start('b'),
new MF_Text('Foobar!'), new HTMLPurifier_Token_Text('Foobar!'),
new MF_EndTag('b') new HTMLPurifier_Token_End('b')
); );
$expect = '<b>Foobar!</b>'; $expect = '<b>Foobar!</b>';
$this->assertEqual($expect, $this->gen->generateFromTokens($tokens)); $this->assertEqual($expect, $this->gen->generateFromTokens($tokens));

View File

@ -46,83 +46,83 @@ class Test_HTML_Lexer extends UnitTestCase
$input[1] = 'This is regular text.'; $input[1] = 'This is regular text.';
$expect[1] = array( $expect[1] = array(
new MF_Text('This is regular text.') new HTMLPurifier_Token_Text('This is regular text.')
); );
$input[2] = 'This is <b>bold</b> text'; $input[2] = 'This is <b>bold</b> text';
$expect[2] = array( $expect[2] = array(
new MF_Text('This is ') new HTMLPurifier_Token_Text('This is ')
,new MF_StartTag('b', array()) ,new HTMLPurifier_Token_Start('b', array())
,new MF_Text('bold') ,new HTMLPurifier_Token_Text('bold')
,new MF_EndTag('b') ,new HTMLPurifier_Token_End('b')
,new MF_Text(' text') ,new HTMLPurifier_Token_Text(' text')
); );
$input[3] = '<DIV>Totally rad dude. <b>asdf</b></div>'; $input[3] = '<DIV>Totally rad dude. <b>asdf</b></div>';
$expect[3] = array( $expect[3] = array(
new MF_StartTag('DIV', array()) new HTMLPurifier_Token_Start('DIV', array())
,new MF_Text('Totally rad dude. ') ,new HTMLPurifier_Token_Text('Totally rad dude. ')
,new MF_StartTag('b', array()) ,new HTMLPurifier_Token_Start('b', array())
,new MF_Text('asdf') ,new HTMLPurifier_Token_Text('asdf')
,new MF_EndTag('b') ,new HTMLPurifier_Token_End('b')
,new MF_EndTag('div') ,new HTMLPurifier_Token_End('div')
); );
$input[4] = '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>'; $input[4] = '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>';
$expect[4] = array( $expect[4] = array(
new MF_StartTag('asdf') new HTMLPurifier_Token_Start('asdf')
,new MF_EndTag('asdf') ,new HTMLPurifier_Token_End('asdf')
,new MF_StartTag('d') ,new HTMLPurifier_Token_Start('d')
,new MF_EndTag('d') ,new HTMLPurifier_Token_End('d')
,new MF_StartTag('poOloka') ,new HTMLPurifier_Token_Start('poOloka')
,new MF_StartTag('poolasdf') ,new HTMLPurifier_Token_Start('poolasdf')
,new MF_StartTag('ds') ,new HTMLPurifier_Token_Start('ds')
,new MF_EndTag('asdf') ,new HTMLPurifier_Token_End('asdf')
,new MF_EndTag('ASDF') ,new HTMLPurifier_Token_End('ASDF')
); );
$input[5] = '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>'; $input[5] = '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>';
$expect[5] = array( $expect[5] = array(
new MF_StartTag('a',array('href'=>'foobar.php','title'=>'foo!')) new HTMLPurifier_Token_Start('a',array('href'=>'foobar.php','title'=>'foo!'))
,new MF_Text('Link to ') ,new HTMLPurifier_Token_Text('Link to ')
,new MF_StartTag('b',array('id'=>'asdf')) ,new HTMLPurifier_Token_Start('b',array('id'=>'asdf'))
,new MF_Text('foobar') ,new HTMLPurifier_Token_Text('foobar')
,new MF_EndTag('b') ,new HTMLPurifier_Token_End('b')
,new MF_EndTag('a') ,new HTMLPurifier_Token_End('a')
); );
$input[6] = '<br />'; $input[6] = '<br />';
$expect[6] = array( $expect[6] = array(
new MF_EmptyTag('br') new HTMLPurifier_Token_Empty('br')
); );
// [INVALID] [RECOVERABLE] // [INVALID] [RECOVERABLE]
$input[7] = '<!-- Comment --> <!-- not so well formed --->'; $input[7] = '<!-- Comment --> <!-- not so well formed --->';
$expect[7] = array( $expect[7] = array(
new MF_Comment(' Comment ') new HTMLPurifier_Token_Comment(' Comment ')
,new MF_Text(' ') ,new HTMLPurifier_Token_Text(' ')
,new MF_Comment(' not so well formed -') ,new HTMLPurifier_Token_Comment(' not so well formed -')
); );
$sax_expect[7] = false; // we need to figure out proper comment output $sax_expect[7] = false; // we need to figure out proper comment output
// [INVALID] // [INVALID]
$input[8] = '<a href=""'; $input[8] = '<a href=""';
$expect[8] = array( $expect[8] = array(
new MF_Text('<a href=""') new HTMLPurifier_Token_Text('<a href=""')
); );
// SAX parses it into a tag // SAX parses it into a tag
$sax_expect[8] = array( $sax_expect[8] = array(
new MF_StartTag('a', array('href'=>'')) new HTMLPurifier_Token_Start('a', array('href'=>''))
); );
$input[9] = '&lt;b&gt;'; $input[9] = '&lt;b&gt;';
$expect[9] = array( $expect[9] = array(
new MF_Text('<b>') new HTMLPurifier_Token_Text('<b>')
); );
$sax_expect[9] = array( $sax_expect[9] = array(
new MF_Text('<') new HTMLPurifier_Token_Text('<')
,new MF_Text('b') ,new HTMLPurifier_Token_Text('b')
,new MF_Text('>') ,new HTMLPurifier_Token_Text('>')
); );
// note that SAX can clump text nodes together. We won't be // note that SAX can clump text nodes together. We won't be
// too picky though // too picky though
@ -130,16 +130,16 @@ class Test_HTML_Lexer extends UnitTestCase
// [INVALID] // [INVALID]
$input[10] = '<a "=>'; $input[10] = '<a "=>';
$expect[10] = array( $expect[10] = array(
new MF_StartTag('a', array('"' => '')) new HTMLPurifier_Token_Start('a', array('"' => ''))
); );
// [INVALID] [RECOVERABLE] // [INVALID] [RECOVERABLE]
$input[11] = '"'; $input[11] = '"';
$expect[11] = array( new MF_Text('"') ); $expect[11] = array( new HTMLPurifier_Token_Text('"') );
// compare with this valid one: // compare with this valid one:
$input[12] = '&quot;'; $input[12] = '&quot;';
$expect[12] = array( new MF_Text('"') ); $expect[12] = array( new HTMLPurifier_Token_Text('"') );
$sax_expect[12] = false; $sax_expect[12] = false;
// SAX chokes on this? We do have entity parsing on, so it should work! // SAX chokes on this? We do have entity parsing on, so it should work!

View File

@ -3,14 +3,25 @@
class Test_HTMLDTD_ChildDef extends UnitTestCase class Test_HTMLDTD_ChildDef extends UnitTestCase
{ {
var $lex;
var $gen;
function Test_HTMLDTD_ChildDef() {
$this->lex = new HTML_Lexer();
$this->gen = new HTML_Generator();
parent::UnitTestCase();
}
function assertSeries($inputs, $expect, $def) { function assertSeries($inputs, $expect, $def) {
foreach ($inputs as $i => $input) { foreach ($inputs as $i => $input) {
$result = $def->validateChildren($input); $tokens = $this->lex->tokenizeHTML($input);
$result = $def->validateChildren($tokens);
if (is_bool($expect[$i])) { if (is_bool($expect[$i])) {
$this->assertIdentical($expect[$i], $result); $this->assertIdentical($expect[$i], $result);
} else { } else {
$this->assertEqual($expect[$i], $result); $result_html = $this->gen->generateFromTokens($result);
paintIf($result, $result != $expect[$i]); $this->assertEqual($expect[$i], $result_html);
paintIf($result_html, $result_html != $expect[$i]);
} }
} }
} }
@ -21,32 +32,20 @@ class Test_HTMLDTD_ChildDef extends UnitTestCase
$def = new HTMLDTD_ChildDef( $def = new HTMLDTD_ChildDef(
'(caption?, (col*|colgroup*), thead?, tfoot?, (tbody+|tr+))'); '(caption?, (col*|colgroup*), thead?, tfoot?, (tbody+|tr+))');
$inputs[0] = array(); $inputs[0] = '';
$expect[0] = false; $expect[0] = false;
// we really don't care what's inside, because if it turns out // we really don't care what's inside, because if it turns out
// this tr is illegal, we'll end up re-evaluating the parent node // this tr is illegal, we'll end up re-evaluating the parent node
// anyway. // anyway.
$inputs[1] = array( $inputs[1] = '<tr></tr>';
new MF_StartTag('tr') ,new MF_EndTag('tr')
);
$expect[1] = true; $expect[1] = true;
$inputs[2] = array( $inputs[2] = '<caption></caption><col></col><thead></thead>' .
new MF_StartTag('caption') ,new MF_EndTag('caption') '<tfoot></tfoot><tbody></tbody>';
,new MF_StartTag('col') ,new MF_EndTag('col')
,new MF_StartTag('thead') ,new MF_EndTag('thead')
,new MF_StartTag('tfoot') ,new MF_EndTag('tfoot')
,new MF_StartTag('tbody') ,new MF_EndTag('tbody')
);
$expect[2] = true; $expect[2] = true;
$inputs[3] = array( $inputs[3] = '<col></col><col></col><col></col><tr></tr>';
new MF_StartTag('col') ,new MF_EndTag('col')
,new MF_StartTag('col') ,new MF_EndTag('col')
,new MF_StartTag('col') ,new MF_EndTag('col')
,new MF_StartTag('tr') ,new MF_EndTag('tr')
);
$expect[3] = true; $expect[3] = true;
$this->assertSeries($inputs, $expect, $def); $this->assertSeries($inputs, $expect, $def);
@ -81,63 +80,22 @@ class Test_HTMLDTD_ChildDef extends UnitTestCase
$inputs[0] = array(); $inputs[0] = array();
$expect[0] = false; $expect[0] = false;
$inputs[1] = array( $inputs[1] = '<dt>Term</dt>Text in an illegal location'.
new MF_StartTag('dt') '<dd>Definition</dd><b>Illegal tag</b>';
,new MF_Text('Term')
,new MF_EndTag('dt')
,new MF_Text('Text in an illegal location') $expect[1] = '<dt>Term</dt><dd>Definition</dd>';
,new MF_StartTag('dd') $inputs[2] = 'How do you do!';
,new MF_Text('Definition')
,new MF_EndTag('dd')
,new MF_StartTag('b') // test tag removal too
,new MF_EndTag('b')
);
$expect[1] = array(
new MF_StartTag('dt')
,new MF_Text('Term')
,new MF_EndTag('dt')
,new MF_StartTag('dd')
,new MF_Text('Definition')
,new MF_EndTag('dd')
);
$inputs[2] = array(new MF_Text('How do you do!'));
$expect[2] = false; $expect[2] = false;
// whitespace shouldn't trigger it // whitespace shouldn't trigger it
$inputs[3] = array( $inputs[3] = "\n<dd>Definition</dd> ";
new MF_Text("\n")
,new MF_StartTag('dd')
,new MF_Text('Definition')
,new MF_EndTag('dd')
,new MF_Text(' ')
);
$expect[3] = true; $expect[3] = true;
$inputs[4] = array( $inputs[4] ='<dd>Definition</dd> <b></b> ';
new MF_StartTag('dd') $expect[4] = '<dd>Definition</dd> ';
,new MF_Text('Definition')
,new MF_EndTag('dd') $inputs[5] = "\t ";
,new MF_Text(' ')
,new MF_StartTag('b')
,new MF_EndTag('b')
,new MF_Text(' ')
);
$expect[4] = array(
new MF_StartTag('dd')
,new MF_Text('Definition')
,new MF_EndTag('dd')
,new MF_Text(' ')
,new MF_Text(' ')
);
$inputs[5] = array(
new MF_Text(' ')
,new MF_Text("\t")
);
$expect[5] = false; $expect[5] = false;
$this->assertSeries($inputs, $expect, $def); $this->assertSeries($inputs, $expect, $def);
@ -146,41 +104,23 @@ class Test_HTMLDTD_ChildDef extends UnitTestCase
function test_required_pcdata_allowed() { function test_required_pcdata_allowed() {
$def = new HTMLDTD_ChildDef_Required('#PCDATA | b'); $def = new HTMLDTD_ChildDef_Required('#PCDATA | b');
$input = array(
new MF_StartTag('b') $inputs[0] = '<b>Bold text</b><img />';
,new MF_Text('Bold text') $expect[0] = '<b>Bold text</b>&lt;img /&gt;';
,new MF_EndTag('b')
,new MF_EmptyTag('img') // illegal tag $this->assertSeries($inputs, $expect, $def);
);
$expect = array(
new MF_StartTag('b')
,new MF_Text('Bold text')
,new MF_EndTag('b')
,new MF_Text('<img />')
);
$this->assertEqual($expect, $def->validateChildren($input));
} }
function test_optional() { function test_optional() {
$def = new HTMLDTD_ChildDef_Optional('b | i'); $def = new HTMLDTD_ChildDef_Optional('b | i');
$input = array(
new MF_StartTag('b')
,new MF_Text('Bold text')
,new MF_EndTag('b')
,new MF_EmptyTag('img') // illegal tag
);
$expect = array(
new MF_StartTag('b')
,new MF_Text('Bold text')
,new MF_EndTag('b')
);
$this->assertEqual($expect, $def->validateChildren($input));
$input = array( $inputs[0] = '<b>Bold text</b><img />';
new MF_Text('Not allowed text') $expect[0] = '<b>Bold text</b>';
);
$expect = array(); $inputs[1] = 'Not allowed text';
$this->assertEqual($expect, $def->validateChildren($input)); $expect[1] = '';
$this->assertSeries($inputs, $expect, $def);
} }
} }
@ -188,13 +128,13 @@ class Test_HTMLDTD_ChildDef extends UnitTestCase
class Test_PureHTMLDefinition extends UnitTestCase class Test_PureHTMLDefinition extends UnitTestCase
{ {
var $def, $lexer; var $def, $lex;
function Test_PureHTMLDefinition() { function Test_PureHTMLDefinition() {
$this->UnitTestCase(); $this->UnitTestCase();
$this->def = new PureHTMLDefinition(); $this->def = new PureHTMLDefinition();
$this->def->loadData(); $this->def->loadData();
$this->lexer = new HTML_Lexer(); $this->lex = new HTML_Lexer();
} }
function test_removeForeignElements() { function test_removeForeignElements() {
@ -206,35 +146,35 @@ class Test_PureHTMLDefinition extends UnitTestCase
$expect[0] = $inputs[0]; $expect[0] = $inputs[0];
$inputs[1] = array( $inputs[1] = array(
new MF_Text('This is ') new HTMLPurifier_Token_Text('This is ')
,new MF_StartTag('b', array()) ,new HTMLPurifier_Token_Start('b', array())
,new MF_Text('bold') ,new HTMLPurifier_Token_Text('bold')
,new MF_EndTag('b') ,new HTMLPurifier_Token_End('b')
,new MF_Text(' text') ,new HTMLPurifier_Token_Text(' text')
); );
$expect[1] = $inputs[1]; $expect[1] = $inputs[1];
$inputs[2] = array( $inputs[2] = array(
new MF_StartTag('asdf') new HTMLPurifier_Token_Start('asdf')
,new MF_EndTag('asdf') ,new HTMLPurifier_Token_End('asdf')
,new MF_StartTag('d', array('href' => 'bang!')) ,new HTMLPurifier_Token_Start('d', array('href' => 'bang!'))
,new MF_EndTag('d') ,new HTMLPurifier_Token_End('d')
,new MF_StartTag('pooloka') ,new HTMLPurifier_Token_Start('pooloka')
,new MF_StartTag('poolasdf') ,new HTMLPurifier_Token_Start('poolasdf')
,new MF_StartTag('ds', array('moogle' => '&')) ,new HTMLPurifier_Token_Start('ds', array('moogle' => '&'))
,new MF_EndTag('asdf') ,new HTMLPurifier_Token_End('asdf')
,new MF_EndTag('asdf') ,new HTMLPurifier_Token_End('asdf')
); );
$expect[2] = array( $expect[2] = array(
new MF_Text('<asdf>') new HTMLPurifier_Token_Text('<asdf>')
,new MF_Text('</asdf>') ,new HTMLPurifier_Token_Text('</asdf>')
,new MF_Text('<d href="bang!">') ,new HTMLPurifier_Token_Text('<d href="bang!">')
,new MF_Text('</d>') ,new HTMLPurifier_Token_Text('</d>')
,new MF_Text('<pooloka>') ,new HTMLPurifier_Token_Text('<pooloka>')
,new MF_Text('<poolasdf>') ,new HTMLPurifier_Token_Text('<poolasdf>')
,new MF_Text('<ds moogle="&amp;">') ,new HTMLPurifier_Token_Text('<ds moogle="&amp;">')
,new MF_Text('</asdf>') ,new HTMLPurifier_Token_Text('</asdf>')
,new MF_Text('</asdf>') ,new HTMLPurifier_Token_Text('</asdf>')
); );
foreach ($inputs as $i => $input) { foreach ($inputs as $i => $input) {
@ -254,113 +194,113 @@ class Test_PureHTMLDefinition extends UnitTestCase
$expect[0] = $inputs[0]; $expect[0] = $inputs[0];
$inputs[1] = array( $inputs[1] = array(
new MF_Text('This is ') new HTMLPurifier_Token_Text('This is ')
,new MF_StartTag('b') ,new HTMLPurifier_Token_Start('b')
,new MF_Text('bold') ,new HTMLPurifier_Token_Text('bold')
,new MF_EndTag('b') ,new HTMLPurifier_Token_End('b')
,new MF_Text(' text') ,new HTMLPurifier_Token_Text(' text')
,new MF_EmptyTag('br') ,new HTMLPurifier_Token_Empty('br')
); );
$expect[1] = $inputs[1]; $expect[1] = $inputs[1];
$inputs[2] = array( $inputs[2] = array(
new MF_StartTag('b') new HTMLPurifier_Token_Start('b')
,new MF_Text('Unclosed tag, gasp!') ,new HTMLPurifier_Token_Text('Unclosed tag, gasp!')
); );
$expect[2] = array( $expect[2] = array(
new MF_StartTag('b') new HTMLPurifier_Token_Start('b')
,new MF_Text('Unclosed tag, gasp!') ,new HTMLPurifier_Token_Text('Unclosed tag, gasp!')
,new MF_EndTag('b') ,new HTMLPurifier_Token_End('b')
); );
$inputs[3] = array( $inputs[3] = array(
new MF_StartTag('b') new HTMLPurifier_Token_Start('b')
,new MF_StartTag('i') ,new HTMLPurifier_Token_Start('i')
,new MF_Text('The b is closed, but the i is not') ,new HTMLPurifier_Token_Text('The b is closed, but the i is not')
,new MF_EndTag('b') ,new HTMLPurifier_Token_End('b')
); );
$expect[3] = array( $expect[3] = array(
new MF_StartTag('b') new HTMLPurifier_Token_Start('b')
,new MF_StartTag('i') ,new HTMLPurifier_Token_Start('i')
,new MF_Text('The b is closed, but the i is not') ,new HTMLPurifier_Token_Text('The b is closed, but the i is not')
,new MF_EndTag('i') ,new HTMLPurifier_Token_End('i')
,new MF_EndTag('b') ,new HTMLPurifier_Token_End('b')
); );
$inputs[4] = array( $inputs[4] = array(
new MF_Text('Hey, recycle unused end tags!') new HTMLPurifier_Token_Text('Hey, recycle unused end tags!')
,new MF_EndTag('b') ,new HTMLPurifier_Token_End('b')
); );
$expect[4] = array( $expect[4] = array(
new MF_Text('Hey, recycle unused end tags!') new HTMLPurifier_Token_Text('Hey, recycle unused end tags!')
,new MF_Text('</b>') ,new HTMLPurifier_Token_Text('</b>')
); );
$inputs[5] = array(new MF_StartTag('br', array('style' => 'clear:both;'))); $inputs[5] = array(new HTMLPurifier_Token_Start('br', array('style' => 'clear:both;')));
$expect[5] = array(new MF_EmptyTag('br', array('style' => 'clear:both;'))); $expect[5] = array(new HTMLPurifier_Token_Empty('br', array('style' => 'clear:both;')));
$inputs[6] = array(new MF_EmptyTag('div', array('style' => 'clear:both;'))); $inputs[6] = array(new HTMLPurifier_Token_Empty('div', array('style' => 'clear:both;')));
$expect[6] = array( $expect[6] = array(
new MF_StartTag('div', array('style' => 'clear:both;')) new HTMLPurifier_Token_Start('div', array('style' => 'clear:both;'))
,new MF_EndTag('div') ,new HTMLPurifier_Token_End('div')
); );
// test automatic paragraph closing // test automatic paragraph closing
$inputs[7] = array( $inputs[7] = array(
new MF_StartTag('p') new HTMLPurifier_Token_Start('p')
,new MF_Text('Paragraph 1') ,new HTMLPurifier_Token_Text('Paragraph 1')
,new MF_StartTag('p') ,new HTMLPurifier_Token_Start('p')
,new MF_Text('Paragraph 2') ,new HTMLPurifier_Token_Text('Paragraph 2')
); );
$expect[7] = array( $expect[7] = array(
new MF_StartTag('p') new HTMLPurifier_Token_Start('p')
,new MF_Text('Paragraph 1') ,new HTMLPurifier_Token_Text('Paragraph 1')
,new MF_EndTag('p') ,new HTMLPurifier_Token_End('p')
,new MF_StartTag('p') ,new HTMLPurifier_Token_Start('p')
,new MF_Text('Paragraph 2') ,new HTMLPurifier_Token_Text('Paragraph 2')
,new MF_EndTag('p') ,new HTMLPurifier_Token_End('p')
); );
$inputs[8] = array( $inputs[8] = array(
new MF_StartTag('div') new HTMLPurifier_Token_Start('div')
,new MF_StartTag('p') ,new HTMLPurifier_Token_Start('p')
,new MF_Text('Paragraph 1 in a div') ,new HTMLPurifier_Token_Text('Paragraph 1 in a div')
,new MF_EndTag('div') ,new HTMLPurifier_Token_End('div')
); );
$expect[8] = array( $expect[8] = array(
new MF_StartTag('div') new HTMLPurifier_Token_Start('div')
,new MF_StartTag('p') ,new HTMLPurifier_Token_Start('p')
,new MF_Text('Paragraph 1 in a div') ,new HTMLPurifier_Token_Text('Paragraph 1 in a div')
,new MF_EndTag('p') ,new HTMLPurifier_Token_End('p')
,new MF_EndTag('div') ,new HTMLPurifier_Token_End('div')
); );
// automatic list closing // automatic list closing
$inputs[9] = array( $inputs[9] = array(
new MF_StartTag('ol') new HTMLPurifier_Token_Start('ol')
,new MF_StartTag('li') ,new HTMLPurifier_Token_Start('li')
,new MF_Text('Item 1') ,new HTMLPurifier_Token_Text('Item 1')
,new MF_StartTag('li') ,new HTMLPurifier_Token_Start('li')
,new MF_Text('Item 2') ,new HTMLPurifier_Token_Text('Item 2')
,new MF_EndTag('ol') ,new HTMLPurifier_Token_End('ol')
); );
$expect[9] = array( $expect[9] = array(
new MF_StartTag('ol') new HTMLPurifier_Token_Start('ol')
,new MF_StartTag('li') ,new HTMLPurifier_Token_Start('li')
,new MF_Text('Item 1') ,new HTMLPurifier_Token_Text('Item 1')
,new MF_EndTag('li') ,new HTMLPurifier_Token_End('li')
,new MF_StartTag('li') ,new HTMLPurifier_Token_Start('li')
,new MF_Text('Item 2') ,new HTMLPurifier_Token_Text('Item 2')
,new MF_EndTag('li') ,new HTMLPurifier_Token_End('li')
,new MF_EndTag('ol') ,new HTMLPurifier_Token_End('ol')
); );
foreach ($inputs as $i => $input) { foreach ($inputs as $i => $input) {
@ -379,62 +319,62 @@ class Test_PureHTMLDefinition extends UnitTestCase
// legal inline nesting // legal inline nesting
$inputs[0] = array( $inputs[0] = array(
new MF_StartTag('b'), new HTMLPurifier_Token_Start('b'),
new MF_Text('Bold text'), new HTMLPurifier_Token_Text('Bold text'),
new MF_EndTag('b'), new HTMLPurifier_Token_End('b'),
); );
$expect[0] = $inputs[0]; $expect[0] = $inputs[0];
// legal inline and block // legal inline and block
// as the parent element is considered FLOW // as the parent element is considered FLOW
$inputs[1] = array( $inputs[1] = array(
new MF_StartTag('a', array('href' => 'http://www.example.com/')), new HTMLPurifier_Token_Start('a', array('href' => 'http://www.example.com/')),
new MF_Text('Linky'), new HTMLPurifier_Token_Text('Linky'),
new MF_EndTag('a'), new HTMLPurifier_Token_End('a'),
new MF_StartTag('div'), new HTMLPurifier_Token_Start('div'),
new MF_Text('Block element'), new HTMLPurifier_Token_Text('Block element'),
new MF_EndTag('div'), new HTMLPurifier_Token_End('div'),
); );
$expect[1] = $inputs[1]; $expect[1] = $inputs[1];
// illegal block in inline, element -> text // illegal block in inline, element -> text
$inputs[2] = array( $inputs[2] = array(
new MF_StartTag('b'), new HTMLPurifier_Token_Start('b'),
new MF_StartTag('div'), new HTMLPurifier_Token_Start('div'),
new MF_Text('Illegal Div'), new HTMLPurifier_Token_Text('Illegal Div'),
new MF_EndTag('div'), new HTMLPurifier_Token_End('div'),
new MF_EndTag('b'), new HTMLPurifier_Token_End('b'),
); );
$expect[2] = array( $expect[2] = array(
new MF_StartTag('b'), new HTMLPurifier_Token_Start('b'),
new MF_Text('<div>'), new HTMLPurifier_Token_Text('<div>'),
new MF_Text('Illegal Div'), new HTMLPurifier_Token_Text('Illegal Div'),
new MF_Text('</div>'), new HTMLPurifier_Token_Text('</div>'),
new MF_EndTag('b'), new HTMLPurifier_Token_End('b'),
); );
// test of empty set that's required, resulting in removal of node // test of empty set that's required, resulting in removal of node
$inputs[3] = array( $inputs[3] = array(
new MF_StartTag('ul'), new HTMLPurifier_Token_Start('ul'),
new MF_EndTag('ul') new HTMLPurifier_Token_End('ul')
); );
$expect[3] = array(); $expect[3] = array();
// test illegal text which gets removed // test illegal text which gets removed
$inputs[4] = array( $inputs[4] = array(
new MF_StartTag('ul'), new HTMLPurifier_Token_Start('ul'),
new MF_Text('Illegal Text'), new HTMLPurifier_Token_Text('Illegal Text'),
new MF_StartTag('li'), new HTMLPurifier_Token_Start('li'),
new MF_Text('Legal item'), new HTMLPurifier_Token_Text('Legal item'),
new MF_EndTag('li'), new HTMLPurifier_Token_End('li'),
new MF_EndTag('ul') new HTMLPurifier_Token_End('ul')
); );
$expect[4] = array( $expect[4] = array(
new MF_StartTag('ul'), new HTMLPurifier_Token_Start('ul'),
new MF_StartTag('li'), new HTMLPurifier_Token_Start('li'),
new MF_Text('Legal item'), new HTMLPurifier_Token_Text('Legal item'),
new MF_EndTag('li'), new HTMLPurifier_Token_End('li'),
new MF_EndTag('ul') new HTMLPurifier_Token_End('ul')
); );
foreach ($inputs as $i => $input) { foreach ($inputs as $i => $input) {