0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-01-05 06:01:52 +00:00

[2.0.1] Implement error messages for MakeWellFormed. Armor AutoParagraph generated p start tags from these tag closing errors. Fix another auto-paragraphing edge-case. Create common Strategy error harness.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1242 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2007-06-26 15:07:07 +00:00
parent 3d02a2a7d4
commit 3c734b4c72
10 changed files with 162 additions and 38 deletions

View File

@ -27,6 +27,12 @@ HTMLPurifier_ConfigSchema::define(
class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
{ {
/**
 * Creates the paragraph start token this injector inserts.
 *
 * The token is marked in its armor lookup under
 * 'MakeWellFormed_TagClosedError'; MakeWellFormed checks that key before
 * reporting a "Tag closed by element end" notice, so paragraphs opened
 * here are exempt from that notice.
 *
 * @return HTMLPurifier_Token_Start armored start token for a p element
 */
function _pStart() {
    $start = new HTMLPurifier_Token_Start('p');
    $start->armor['MakeWellFormed_TagClosedError'] = true;
    return $start;
}
function handleText(&$token) { function handleText(&$token) {
$text = $token->data; $text = $token->data;
if (empty($this->currentNesting)) { if (empty($this->currentNesting)) {
@ -42,7 +48,7 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
// case 3: we're in an element that allows paragraphs // case 3: we're in an element that allows paragraphs
if (strpos($text, "\n\n") !== false) { if (strpos($text, "\n\n") !== false) {
// case 3.1: this text node has a double-newline // case 3.1: this text node has a double-newline
$token = array(new HTMLPurifier_Token_Start('p')); $token = array($this->_pStart());
$this->_splitText($text, $token); $this->_splitText($text, $token);
} else { } else {
$ok = false; $ok = false;
@ -66,7 +72,7 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
if ($ok) { if ($ok) {
// case 3.2: this text node is next to another node // case 3.2: this text node is next to another node
// that will start a paragraph // that will start a paragraph
$token = array(new HTMLPurifier_Token_Start('p'), $token); $token = array($this->_pStart(), $token);
} }
} }
} }
@ -87,7 +93,7 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
// not adjacent, we can abort early // not adjacent, we can abort early
// add lead paragraph tag if our token is inline // add lead paragraph tag if our token is inline
if ($this->_isInline($token)) { if ($this->_isInline($token)) {
$token = array(new HTMLPurifier_Token_Start('p'), $token); $token = array($this->_pStart(), $token);
} }
return; return;
} }
@ -112,7 +118,7 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
if ($j <= 0) break; if ($j <= 0) break;
} }
if ($ok) { if ($ok) {
$token = array(new HTMLPurifier_Token_Start('p'), $token); $token = array($this->_pStart(), $token);
} }
} }
return; return;
@ -122,7 +128,7 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
if (!$this->_isInline($token)) return; if (!$this->_isInline($token)) return;
// append a paragraph tag before the token // append a paragraph tag before the token
$token = array(new HTMLPurifier_Token_Start('p'), $token); $token = array($this->_pStart(), $token);
} }
/** /**
@ -142,11 +148,15 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
// remove empty paragraphs // remove empty paragraphs
$paragraphs = array(); $paragraphs = array();
$needs_start = false; $needs_start = false;
$first = true; $needs_end = false;
foreach ($raw_paragraphs as $par) {
for ($i = 0, $c = count($raw_paragraphs); $i < $c; $i++) {
$par = $raw_paragraphs[$i];
if (trim($par) !== '') { if (trim($par) !== '') {
$paragraphs[] = $par; $paragraphs[] = $par;
} elseif (empty($result) && $first) { continue;
}
if ($i == 0 && empty($result)) {
// The empty result indicates that the AutoParagraph // The empty result indicates that the AutoParagraph
// injector did not add any start paragraph tokens. // injector did not add any start paragraph tokens.
// The fact that the first paragraph is empty indicates // The fact that the first paragraph is empty indicates
@ -161,8 +171,13 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
// next start paragraph tag will be handled by the // next start paragraph tag will be handled by the
// next run-around the injector // next run-around the injector
$needs_start = true; $needs_start = true;
} elseif ($i + 1 == $c) {
// a double-paragraph at the end indicates that
// there is an overriding need to start a new paragraph
// for the next section. This has no effect until
// we've processed all of the other paragraphs though
$needs_end = true;
} }
$first = false;
} }
// check if there are no "real" paragraphs to be processed // check if there are no "real" paragraphs to be processed
@ -173,13 +188,13 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
// add a start tag if an end tag was added while processing // add a start tag if an end tag was added while processing
// the raw paragraphs (that happens if there's a leading double // the raw paragraphs (that happens if there's a leading double
// newline) // newline)
if ($needs_start) $result[] = new HTMLPurifier_Token_Start('p'); if ($needs_start) $result[] = $this->_pStart();
// append the paragraphs onto the result // append the paragraphs onto the result
foreach ($paragraphs as $par) { foreach ($paragraphs as $par) {
$result[] = new HTMLPurifier_Token_Text($par); $result[] = new HTMLPurifier_Token_Text($par);
$result[] = new HTMLPurifier_Token_End('p'); $result[] = new HTMLPurifier_Token_End('p');
$result[] = new HTMLPurifier_Token_Start('p'); $result[] = $this->_pStart();
} }
// remove trailing start token, if one is needed, it will // remove trailing start token, if one is needed, it will
@ -190,13 +205,13 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
// end paragraph tag should be removed. It should be removed // end paragraph tag should be removed. It should be removed
// unless the next non-whitespace token is a paragraph // unless the next non-whitespace token is a paragraph
// or a block element. // or a block element.
$remove_paragraph_end = true; $remove_paragraph_end = true;
if (!$needs_end) {
// Start of the checks one after the current token's index // Start of the checks one after the current token's index
for ($i = $this->inputIndex + 1; isset($this->inputTokens[$i]); $i++) { for ($i = $this->inputIndex + 1; isset($this->inputTokens[$i]); $i++) {
if ($this->inputTokens[$i]->type == 'start' || $this->inputTokens[$i]->type == 'empty') { if ($this->inputTokens[$i]->type == 'start' || $this->inputTokens[$i]->type == 'empty') {
$remove_paragraph_end = $this->_isInline($this->inputTokens[$i]); $remove_paragraph_end = $this->_isInline($this->inputTokens[$i]);
break;
} }
// check if we can abort early (whitespace means we carry-on!) // check if we can abort early (whitespace means we carry-on!)
if ($this->inputTokens[$i]->type == 'text' && !$this->inputTokens[$i]->is_whitespace) break; if ($this->inputTokens[$i]->type == 'text' && !$this->inputTokens[$i]->is_whitespace) break;
@ -204,6 +219,9 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
// so we don't have to worry about them // so we don't have to worry about them
if ($this->inputTokens[$i]->type == 'end') break; if ($this->inputTokens[$i]->type == 'end') break;
} }
} else {
$remove_paragraph_end = false;
}
// check the outside to determine whether or not the // check the outside to determine whether or not the
// end paragraph tag should be removed // end paragraph tag should be removed

View File

@ -16,14 +16,20 @@ $messages = array(
'Lexer: Missing attribute key' => 'Attribute declaration has no key', 'Lexer: Missing attribute key' => 'Attribute declaration has no key',
'Lexer: Missing end quote' => 'Attribute declaration has no end quote', 'Lexer: Missing end quote' => 'Attribute declaration has no end quote',
'Strategy_RemoveForeignElements: Tag transform' => '$1 element transformed into $CurrentToken.Serialized', 'Strategy_RemoveForeignElements: Tag transform' => '<$1> element transformed into $CurrentToken.Serialized',
'Strategy_RemoveForeignElements: Missing required attribute' => '$1 element missing required attribute $2', 'Strategy_RemoveForeignElements: Missing required attribute' => '<$1> element missing required attribute $2',
'Strategy_RemoveForeignElements: Foreign element to text' => 'Unrecognized $1 element converted to text', 'Strategy_RemoveForeignElements: Foreign element to text' => 'Unrecognized $CurrentToken.Serialized tag converted to text',
'Strategy_RemoveForeignElements: Foreign element removed' => 'Unrecognized $1 element removed', 'Strategy_RemoveForeignElements: Foreign element removed' => 'Unrecognized $CurrentToken.Serialized tag removed',
'Strategy_RemoveForeignElements: Comment removed' => 'Comment containing "$1" removed', 'Strategy_RemoveForeignElements: Comment removed' => 'Comment containing "$1" removed',
'Strategy_RemoveForeignElements: Script removed' => 'Inline scripting removed', 'Strategy_RemoveForeignElements: Script removed' => 'Inline scripting removed',
'Strategy_RemoveForeignElements: Token removed to end' => 'Tags and text starting from $1 element where removed to end', 'Strategy_RemoveForeignElements: Token removed to end' => 'Tags and text starting from $1 element where removed to end',
'Strategy_MakeWellFormed: Unnecessary end tag removed' => 'Unnecessary </$1> tag removed',
'Strategy_MakeWellFormed: Unnecessary end tag to text' => 'Unnecessary </$1> tag converted to text',
'Strategy_MakeWellFormed: Stray end tag removed' => 'Stray </$1> tag removed',
'Strategy_MakeWellFormed: Stray end tag to text' => 'Stray </$1> tag converted to text',
'Strategy_MakeWellFormed: Tag closed by element end' => '<$1> tag closed by end of $CurrentToken.Serialized',
'Strategy_MakeWellFormed: Tag closed by document end' => '<$1> tag closed by end of document',
); );

View File

@ -56,6 +56,8 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
$escape_invalid_tags = $config->get('Core', 'EscapeInvalidTags'); $escape_invalid_tags = $config->get('Core', 'EscapeInvalidTags');
$generator = new HTMLPurifier_Generator(); $generator = new HTMLPurifier_Generator();
$e =& $context->get('ErrorCollector', true);
// -- begin INJECTOR -- // -- begin INJECTOR --
$this->injectors = array(); $this->injectors = array();
@ -90,6 +92,9 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// -- end INJECTOR -- // -- end INJECTOR --
$token = false;
$context->register('CurrentToken', $token);
for ($this->inputIndex = 0; isset($tokens[$this->inputIndex]); $this->inputIndex++) { for ($this->inputIndex = 0; isset($tokens[$this->inputIndex]); $this->inputIndex++) {
// if all goes well, this token will be passed through unharmed // if all goes well, this token will be passed through unharmed
@ -177,9 +182,12 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// make sure that we have something open // make sure that we have something open
if (empty($this->currentNesting)) { if (empty($this->currentNesting)) {
if ($escape_invalid_tags) { if ($escape_invalid_tags) {
if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag to text', $token->name);
$result[] = new HTMLPurifier_Token_Text( $result[] = new HTMLPurifier_Token_Text(
$generator->generateFromToken($token, $config, $context) $generator->generateFromToken($token, $config, $context)
); );
} elseif ($e) {
$e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag removed', $token->name);
} }
continue; continue;
} }
@ -215,6 +223,9 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
$result[] = new HTMLPurifier_Token_Text( $result[] = new HTMLPurifier_Token_Text(
$generator->generateFromToken($token, $config, $context) $generator->generateFromToken($token, $config, $context)
); );
if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag to text', $token->name);
} elseif ($e) {
$e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag removed', $token->name);
} }
continue; continue;
} }
@ -222,10 +233,15 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// okay, we found it, close all the skipped tags // okay, we found it, close all the skipped tags
// note that skipped tags contains the element we need closed // note that skipped tags contains the element we need closed
$size = count($skipped_tags); $size = count($skipped_tags);
for ($i = $size - 1; $i >= 0; $i--) { for ($i = $size - 1; $i > 0; $i--) {
if ($e && !isset($skipped_tags[$i]->armor['MakeWellFormed_TagClosedError'])) {
$e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', $skipped_tags[$i]->name);
}
$result[] = new HTMLPurifier_Token_End($skipped_tags[$i]->name); $result[] = new HTMLPurifier_Token_End($skipped_tags[$i]->name);
} }
$result[] = new HTMLPurifier_Token_End($skipped_tags[$i]->name);
} }
// we're at the end now, fix all still unclosed tags // we're at the end now, fix all still unclosed tags
@ -234,6 +250,9 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
if (!empty($this->currentNesting)) { if (!empty($this->currentNesting)) {
$size = count($this->currentNesting); $size = count($this->currentNesting);
for ($i = $size - 1; $i >= 0; $i--) { for ($i = $size - 1; $i >= 0; $i--) {
// Armor belongs to the unclosed token being closed here, so it must be
// looked up in currentNesting (the array this loop walks). $skipped_tags
// is only populated by the end-tag handling above and does not line up
// with these indices; it may not even be set when no end tag was seen.
if ($e && !isset($this->currentNesting[$i]->armor['MakeWellFormed_TagClosedError'])) {
    $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', $this->currentNesting[$i]->name);
}
$result[] = $result[] =
new HTMLPurifier_Token_End($this->currentNesting[$i]->name); new HTMLPurifier_Token_End($this->currentNesting[$i]->name);
} }
@ -242,6 +261,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
$context->destroy('CurrentNesting'); $context->destroy('CurrentNesting');
$context->destroy('InputTokens'); $context->destroy('InputTokens');
$context->destroy('InputIndex'); $context->destroy('InputIndex');
$context->destroy('CurrentToken');
unset($this->outputTokens, $this->injectors, $this->currentInjector, unset($this->outputTokens, $this->injectors, $this->currentInjector,
$this->currentNesting, $this->inputTokens, $this->inputIndex); $this->currentNesting, $this->inputTokens, $this->inputIndex);

View File

@ -115,7 +115,7 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
} elseif ($escape_invalid_tags) { } elseif ($escape_invalid_tags) {
// invalid tag, generate HTML representation and insert in // invalid tag, generate HTML representation and insert in
if ($e) $e->send(E_WARNING, 'Strategy_RemoveForeignElements: Foreign element to text', $token->name); if ($e) $e->send(E_WARNING, 'Strategy_RemoveForeignElements: Foreign element to text');
$token = new HTMLPurifier_Token_Text( $token = new HTMLPurifier_Token_Text(
$generator->generateFromToken($token, $config, $context) $generator->generateFromToken($token, $config, $context)
); );
@ -132,7 +132,7 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
} }
if ($e) $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Script removed'); if ($e) $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Script removed');
} else { } else {
if ($e) $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Foreign element removed', $token->name); if ($e) $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Foreign element removed');
} }
continue; continue;
} }

View File

@ -15,7 +15,8 @@ class HTMLPurifier_Token {
/** /**
* Lookup array of processing that this token is exempt from. * Lookup array of processing that this token is exempt from.
* Currently, the only valid value is "ValidateAttributes". * Currently, valid values are "ValidateAttributes" and
* "MakeWellFormed_TagClosedError"
*/ */
var $armor = array(); var $armor = array();

View File

@ -230,6 +230,13 @@ Par1
</p><div>Par2</div>' </p><div>Par2</div>'
); );
$this->assertResult(
'Par1
<b>Par2</b>',
'<p>Par1</p><p><b>Par2</b></p>'
);
} }
function testInlineRootNode() { function testInlineRootNode() {

View File

@ -0,0 +1,20 @@
<?php
require_once 'HTMLPurifier/ErrorsHarness.php';
/**
 * Shared error-collection harness for strategy tests. A subclass supplies
 * the strategy through getStrategy(); invoke() lexes the input and runs
 * that strategy over the resulting tokens.
 */
class HTMLPurifier_Strategy_ErrorsHarness extends HTMLPurifier_ErrorsHarness
{

    /**
     * Returns the strategy under test. This base version has an empty body
     * and returns nothing; it needs to be defined by each subclass.
     */
    function getStrategy() {}

    /**
     * Tokenizes the input with DirectLex and executes the strategy on it,
     * using this harness's config and context.
     * @param $input HTML string to feed through the strategy
     */
    function invoke($input) {
        // obtain the strategy first, exactly as before, then lex and run
        $subject = $this->getStrategy();
        $directLex = new HTMLPurifier_Lexer_DirectLex();
        $stream = $directLex->tokenizeHTML($input, $this->config, $this->context);
        $subject->execute($stream, $this->config, $this->context);
    }

}
?>

View File

@ -0,0 +1,52 @@
<?php
require_once 'HTMLPurifier/Strategy/ErrorsHarness.php';
require_once 'HTMLPurifier/Strategy/MakeWellFormed.php';
/*
'Strategy_MakeWellFormed: Tag closed by element end' => '',
'Strategy_MakeWellFormed: Tag closed by document end' => '',
*/
/**
 * Checks the error messages emitted by the MakeWellFormed strategy. Each
 * test registers the expected error first and then invokes the strategy
 * on a small HTML fragment.
 */
class HTMLPurifier_Strategy_MakeWellFormed_ErrorsTest extends HTMLPurifier_Strategy_ErrorsHarness
{

    /** Supplies the strategy under test to the harness. */
    function getStrategy() {
        $strategy = new HTMLPurifier_Strategy_MakeWellFormed();
        return $strategy;
    }

    /** An end tag with nothing open is dropped. */
    function testUnnecessaryEndTagRemoved() {
        $message = 'Strategy_MakeWellFormed: Unnecessary end tag removed';
        $this->expectErrorCollection(E_WARNING, $message, 'b');
        $this->invoke('</b>');
    }

    /** With Core.EscapeInvalidTags on, that end tag becomes text instead. */
    function testUnnecessaryEndTagToText() {
        $message = 'Strategy_MakeWellFormed: Unnecessary end tag to text';
        $this->config->set('Core', 'EscapeInvalidTags', true);
        $this->expectErrorCollection(E_WARNING, $message, 'b');
        $this->invoke('</b>');
    }

    /** An end tag matching no open element is dropped. */
    function testStrayEndTagRemoved() {
        $message = 'Strategy_MakeWellFormed: Stray end tag removed';
        $this->expectErrorCollection(E_WARNING, $message, 'b');
        $this->invoke('<i></b></i>');
    }

    /** With Core.EscapeInvalidTags on, the stray end tag becomes text. */
    function testStrayEndTagToText() {
        $message = 'Strategy_MakeWellFormed: Stray end tag to text';
        $this->config->set('Core', 'EscapeInvalidTags', true);
        $this->expectErrorCollection(E_WARNING, $message, 'b');
        $this->invoke('<i></b></i>');
    }

    /** An open b is closed when the enclosing i ends. */
    function testTagClosedByElementEnd() {
        $message = 'Strategy_MakeWellFormed: Tag closed by element end';
        $this->expectErrorCollection(E_NOTICE, $message, 'b');
        $this->invoke('<i><b>Foobar</i>');
    }

    /** An open b is closed when the input runs out. */
    function testTagClosedByDocumentEnd() {
        $message = 'Strategy_MakeWellFormed: Tag closed by document end';
        $this->expectErrorCollection(E_NOTICE, $message, 'b');
        $this->invoke('<b>Foobar');
    }

}
?>

View File

@ -1,9 +1,9 @@
<?php <?php
require_once 'HTMLPurifier/ErrorsHarness.php'; require_once 'HTMLPurifier/Strategy/ErrorsHarness.php';
require_once 'HTMLPurifier/Strategy/RemoveForeignElements.php'; require_once 'HTMLPurifier/Strategy/RemoveForeignElements.php';
class HTMLPurifier_Strategy_RemoveForeignElements_ErrorsTest extends HTMLPurifier_ErrorsHarness class HTMLPurifier_Strategy_RemoveForeignElements_ErrorsTest extends HTMLPurifier_Strategy_ErrorsHarness
{ {
function setup() { function setup() {
@ -11,11 +11,8 @@ class HTMLPurifier_Strategy_RemoveForeignElements_ErrorsTest extends HTMLPurifie
$this->config->set('HTML', 'TidyLevel', 'heavy'); $this->config->set('HTML', 'TidyLevel', 'heavy');
} }
function invoke($input) { function getStrategy() {
$strategy = new HTMLPurifier_Strategy_RemoveForeignElements(); return new HTMLPurifier_Strategy_RemoveForeignElements();
$lexer = new HTMLPurifier_Lexer_DirectLex();
$tokens = $lexer->tokenizeHTML($input, $this->config, $this->context);
$strategy->execute($tokens, $this->config, $this->context);
} }
function testTagTransform() { function testTagTransform() {
@ -31,12 +28,14 @@ class HTMLPurifier_Strategy_RemoveForeignElements_ErrorsTest extends HTMLPurifie
} }
function testForeignElementToText() { function testForeignElementToText() {
// uses $CurrentToken.Serialized
$this->config->set('Core', 'EscapeInvalidTags', true); $this->config->set('Core', 'EscapeInvalidTags', true);
$this->expectErrorCollection(E_WARNING, 'Strategy_RemoveForeignElements: Foreign element to text', 'cannot-possibly-exist-element'); $this->expectErrorCollection(E_WARNING, 'Strategy_RemoveForeignElements: Foreign element to text', 'cannot-possibly-exist-element');
$this->invoke('<cannot-possibly-exist-element>'); $this->invoke('<cannot-possibly-exist-element>');
} }
function testForeignElementRemoved() { function testForeignElementRemoved() {
// uses $CurrentToken.Serialized
$this->expectErrorCollection(E_ERROR, 'Strategy_RemoveForeignElements: Foreign element removed', 'cannot-possibly-exist-element'); $this->expectErrorCollection(E_ERROR, 'Strategy_RemoveForeignElements: Foreign element removed', 'cannot-possibly-exist-element');
$this->invoke('<cannot-possibly-exist-element>'); $this->invoke('<cannot-possibly-exist-element>');
} }

View File

@ -94,6 +94,7 @@ $test_files[] = 'HTMLPurifier/Strategy/CompositeTest.php';
$test_files[] = 'HTMLPurifier/Strategy/CoreTest.php'; $test_files[] = 'HTMLPurifier/Strategy/CoreTest.php';
$test_files[] = 'HTMLPurifier/Strategy/FixNestingTest.php'; $test_files[] = 'HTMLPurifier/Strategy/FixNestingTest.php';
$test_files[] = 'HTMLPurifier/Strategy/MakeWellFormedTest.php'; $test_files[] = 'HTMLPurifier/Strategy/MakeWellFormedTest.php';
$test_files[] = 'HTMLPurifier/Strategy/MakeWellFormed_ErrorsTest.php';
$test_files[] = 'HTMLPurifier/Strategy/RemoveForeignElementsTest.php'; $test_files[] = 'HTMLPurifier/Strategy/RemoveForeignElementsTest.php';
$test_files[] = 'HTMLPurifier/Strategy/RemoveForeignElements_ErrorsTest.php'; $test_files[] = 'HTMLPurifier/Strategy/RemoveForeignElements_ErrorsTest.php';
$test_files[] = 'HTMLPurifier/Strategy/ValidateAttributesTest.php'; $test_files[] = 'HTMLPurifier/Strategy/ValidateAttributesTest.php';