diff --git a/library/HTMLPurifier/Injector/AutoParagraph.php b/library/HTMLPurifier/Injector/AutoParagraph.php index e2b17ba2..919785ae 100644 --- a/library/HTMLPurifier/Injector/AutoParagraph.php +++ b/library/HTMLPurifier/Injector/AutoParagraph.php @@ -27,6 +27,12 @@ HTMLPurifier_ConfigSchema::define( class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector { + function _pStart() { + $par = new HTMLPurifier_Token_Start('p'); + $par->armor['MakeWellFormed_TagClosedError'] = true; + return $par; + } + function handleText(&$token) { $text = $token->data; if (empty($this->currentNesting)) { @@ -42,7 +48,7 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector // case 3: we're in an element that allows paragraphs if (strpos($text, "\n\n") !== false) { // case 3.1: this text node has a double-newline - $token = array(new HTMLPurifier_Token_Start('p')); + $token = array($this->_pStart()); $this->_splitText($text, $token); } else { $ok = false; @@ -66,7 +72,7 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector if ($ok) { // case 3.2: this text node is next to another node // that will start a paragraph - $token = array(new HTMLPurifier_Token_Start('p'), $token); + $token = array($this->_pStart(), $token); } } } @@ -87,7 +93,7 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector // not adjacent, we can abort early // add lead paragraph tag if our token is inline if ($this->_isInline($token)) { - $token = array(new HTMLPurifier_Token_Start('p'), $token); + $token = array($this->_pStart(), $token); } return; } @@ -112,7 +118,7 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector if ($j <= 0) break; } if ($ok) { - $token = array(new HTMLPurifier_Token_Start('p'), $token); + $token = array($this->_pStart(), $token); } } return; @@ -122,7 +128,7 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector if (!$this->_isInline($token)) return; // append a paragraph tag before the token - $token = array(new HTMLPurifier_Token_Start('p'), $token); + $token = array($this->_pStart(), $token); } /** @@ -142,11 +148,15 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector // remove empty paragraphs $paragraphs = array(); $needs_start = false; - $first = true; - foreach ($raw_paragraphs as $par) { + $needs_end = false; + + for ($i = 0, $c = count($raw_paragraphs); $i < $c; $i++) { + $par = $raw_paragraphs[$i]; if (trim($par) !== '') { - $paragraphs[] = $par; - } elseif (empty($result) && $first) { + $paragraphs[] = $par; + continue; + } + if ($i == 0 && empty($result)) { // The empty result indicates that the AutoParagraph // injector did not add any start paragraph tokens. // The fact that the first paragraph is empty indicates @@ -161,8 +171,13 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector // next start paragraph tag will be handled by the // next run-around the injector $needs_start = true; + } elseif ($i + 1 == $c) { + // a double-paragraph at the end indicates that + // there is an overriding need to start a new paragraph + // for the next section. This has no effect until + // we've processed all of the other paragraphs though + $needs_end = true; } - $first = false; } // check if there are no "real" paragraphs to be processed @@ -173,13 +188,13 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector // add a start tag if an end tag was added while processing // the raw paragraphs (that happens if there's a leading double // newline) - if ($needs_start) $result[] = new HTMLPurifier_Token_Start('p'); + if ($needs_start) $result[] = $this->_pStart(); // append the paragraphs onto the result foreach ($paragraphs as $par) { $result[] = new HTMLPurifier_Token_Text($par); $result[] = new HTMLPurifier_Token_End('p'); - $result[] = new HTMLPurifier_Token_Start('p'); + $result[] = $this->_pStart(); } // remove trailing start token, if one is needed, it will @@ -190,19 +205,22 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector // end paragraph tag should be removed. It should be removed // unless the next non-whitespace token is a paragraph // or a block element. - $remove_paragraph_end = true; - // Start of the checks one after the current token's index - for ($i = $this->inputIndex + 1; isset($this->inputTokens[$i]); $i++) { - if ($this->inputTokens[$i]->type == 'start' || $this->inputTokens[$i]->type == 'empty') { - $remove_paragraph_end = $this->_isInline($this->inputTokens[$i]); - break; + + if (!$needs_end) { + // Start of the checks one after the current token's index + for ($i = $this->inputIndex + 1; isset($this->inputTokens[$i]); $i++) { + if ($this->inputTokens[$i]->type == 'start' || $this->inputTokens[$i]->type == 'empty') { + $remove_paragraph_end = $this->_isInline($this->inputTokens[$i]); + } + // check if we can abort early (whitespace means we carry-on!) + if ($this->inputTokens[$i]->type == 'text' && !$this->inputTokens[$i]->is_whitespace) break; + // end tags will automatically be handled by MakeWellFormed, + // so we don't have to worry about them + if ($this->inputTokens[$i]->type == 'end') break; } - // check if we can abort early (whitespace means we carry-on!) - if ($this->inputTokens[$i]->type == 'text' && !$this->inputTokens[$i]->is_whitespace) break; - // end tags will automatically be handled by MakeWellFormed, - // so we don't have to worry about them - if ($this->inputTokens[$i]->type == 'end') break; + } else { + $remove_paragraph_end = false; } // check the outside to determine whether or not the diff --git a/library/HTMLPurifier/Language/messages/en.php b/library/HTMLPurifier/Language/messages/en.php index 6634556a..fa4736c6 100644 --- a/library/HTMLPurifier/Language/messages/en.php +++ b/library/HTMLPurifier/Language/messages/en.php @@ -16,14 +16,20 @@ $messages = array( 'Lexer: Missing attribute key' => 'Attribute declaration has no key', 'Lexer: Missing end quote' => 'Attribute declaration has no end quote', -'Strategy_RemoveForeignElements: Tag transform' => '$1 element transformed into $CurrentToken.Serialized', -'Strategy_RemoveForeignElements: Missing required attribute' => '$1 element missing required attribute $2', -'Strategy_RemoveForeignElements: Foreign element to text' => 'Unrecognized $1 element converted to text', -'Strategy_RemoveForeignElements: Foreign element removed' => 'Unrecognized $1 element removed', +'Strategy_RemoveForeignElements: Tag transform' => '<$1> element transformed into $CurrentToken.Serialized', +'Strategy_RemoveForeignElements: Missing required attribute' => '<$1> element missing required attribute $2', +'Strategy_RemoveForeignElements: Foreign element to text' => 'Unrecognized $CurrentToken.Serialized tag converted to text', +'Strategy_RemoveForeignElements: Foreign element removed' => 'Unrecognized $CurrentToken.Serialized tag removed', 'Strategy_RemoveForeignElements: Comment removed' => 'Comment containing "$1" removed', 'Strategy_RemoveForeignElements: Script removed' => 'Inline scripting removed', 'Strategy_RemoveForeignElements: Token removed to end' => 'Tags and text starting from $1 element where removed to end', +'Strategy_MakeWellFormed: Unnecessary end tag removed' => 'Unnecessary tag removed', +'Strategy_MakeWellFormed: Unnecessary end tag to text' => 'Unnecessary tag converted to text', +'Strategy_MakeWellFormed: Stray end tag removed' => 'Stray tag removed', +'Strategy_MakeWellFormed: Stray end tag to text' => 'Stray tag converted to text', +'Strategy_MakeWellFormed: Tag closed by element end' => '<$1> tag closed by end of $CurrentToken.Serialized', +'Strategy_MakeWellFormed: Tag closed by document end' => '<$1> tag closed by end of document', ); diff --git a/library/HTMLPurifier/Strategy/MakeWellFormed.php b/library/HTMLPurifier/Strategy/MakeWellFormed.php index 1783e722..834d6b10 100644 --- a/library/HTMLPurifier/Strategy/MakeWellFormed.php +++ b/library/HTMLPurifier/Strategy/MakeWellFormed.php @@ -56,6 +56,8 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy $escape_invalid_tags = $config->get('Core', 'EscapeInvalidTags'); $generator = new HTMLPurifier_Generator(); + $e =& $context->get('ErrorCollector', true); + // -- begin INJECTOR -- $this->injectors = array(); @@ -90,6 +92,9 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy // -- end INJECTOR -- + $token = false; + $context->register('CurrentToken', $token); + for ($this->inputIndex = 0; isset($tokens[$this->inputIndex]); $this->inputIndex++) { // if all goes well, this token will be passed through unharmed @@ -177,9 +182,12 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy // make sure that we have something open if (empty($this->currentNesting)) { if ($escape_invalid_tags) { + if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag to text', $token->name); $result[] = new HTMLPurifier_Token_Text( $generator->generateFromToken($token, $config, $context) ); + } elseif ($e) { + $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag removed', $token->name); } continue; } @@ -215,6 +223,9 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy $result[] = new HTMLPurifier_Token_Text( $generator->generateFromToken($token, $config, $context) ); + if ($e) $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag to text', $token->name); + } elseif ($e) { + $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag removed', $token->name); } continue; } @@ -222,10 +233,15 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy // okay, we found it, close all the skipped tags // note that skipped tags contains the element we need closed $size = count($skipped_tags); - for ($i = $size - 1; $i >= 0; $i--) { + for ($i = $size - 1; $i > 0; $i--) { + if ($e && !isset($skipped_tags[$i]->armor['MakeWellFormed_TagClosedError'])) { + $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', $skipped_tags[$i]->name); + } $result[] = new HTMLPurifier_Token_End($skipped_tags[$i]->name); } + $result[] = new HTMLPurifier_Token_End($skipped_tags[$i]->name); + } // we're at the end now, fix all still unclosed tags @@ -234,6 +250,9 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy if (!empty($this->currentNesting)) { $size = count($this->currentNesting); for ($i = $size - 1; $i >= 0; $i--) { + if ($e && !isset($skipped_tags[$i]->armor['MakeWellFormed_TagClosedError'])) { + $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', $this->currentNesting[$i]->name); + } $result[] = new HTMLPurifier_Token_End($this->currentNesting[$i]->name); } @@ -242,6 +261,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy $context->destroy('CurrentNesting'); $context->destroy('InputTokens'); $context->destroy('InputIndex'); + $context->destroy('CurrentToken'); unset($this->outputTokens, $this->injectors, $this->currentInjector, $this->currentNesting, $this->inputTokens, $this->inputIndex); diff --git a/library/HTMLPurifier/Strategy/RemoveForeignElements.php b/library/HTMLPurifier/Strategy/RemoveForeignElements.php index f8a03aec..16ffbbba 100644 --- a/library/HTMLPurifier/Strategy/RemoveForeignElements.php +++ b/library/HTMLPurifier/Strategy/RemoveForeignElements.php @@ -115,7 +115,7 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy } elseif ($escape_invalid_tags) { // invalid tag, generate HTML representation and insert in - if ($e) $e->send(E_WARNING, 'Strategy_RemoveForeignElements: Foreign element to text', $token->name); + if ($e) $e->send(E_WARNING, 'Strategy_RemoveForeignElements: Foreign element to text'); $token = new HTMLPurifier_Token_Text( $generator->generateFromToken($token, $config, $context) ); @@ -132,7 +132,7 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy } if ($e) $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Script removed'); } else { - if ($e) $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Foreign element removed', $token->name); + if ($e) $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Foreign element removed'); } continue; } diff --git a/library/HTMLPurifier/Token.php b/library/HTMLPurifier/Token.php index 10aff570..42dd16f1 100644 --- a/library/HTMLPurifier/Token.php +++ b/library/HTMLPurifier/Token.php @@ -15,7 +15,8 @@ class HTMLPurifier_Token { /** * Lookup array of processing that this token is exempt from. - * Currently, the only valid value is "ValidateAttributes". + * Currently, valid values are "ValidateAttributes" and + * "MakeWellFormed_TagClosedError" */ var $armor = array(); diff --git a/tests/HTMLPurifier/Injector/AutoParagraphTest.php b/tests/HTMLPurifier/Injector/AutoParagraphTest.php index 0c64d54f..04963789 100644 --- a/tests/HTMLPurifier/Injector/AutoParagraphTest.php +++ b/tests/HTMLPurifier/Injector/AutoParagraphTest.php @@ -230,6 +230,13 @@ Par1

Par2
' ); + $this->assertResult( +'Par1 + +Par2', +'

Par1

Par2

' + ); + } function testInlineRootNode() { diff --git a/tests/HTMLPurifier/Strategy/ErrorsHarness.php b/tests/HTMLPurifier/Strategy/ErrorsHarness.php new file mode 100644 index 00000000..34a32b99 --- /dev/null +++ b/tests/HTMLPurifier/Strategy/ErrorsHarness.php @@ -0,0 +1,20 @@ +getStrategy(); + $lexer = new HTMLPurifier_Lexer_DirectLex(); + $tokens = $lexer->tokenizeHTML($input, $this->config, $this->context); + $strategy->execute($tokens, $this->config, $this->context); + } + +} + +?> \ No newline at end of file diff --git a/tests/HTMLPurifier/Strategy/MakeWellFormed_ErrorsTest.php b/tests/HTMLPurifier/Strategy/MakeWellFormed_ErrorsTest.php new file mode 100644 index 00000000..baeff527 --- /dev/null +++ b/tests/HTMLPurifier/Strategy/MakeWellFormed_ErrorsTest.php @@ -0,0 +1,52 @@ + '', +'Strategy_MakeWellFormed: Tag closed by document end' => '', +*/ + +class HTMLPurifier_Strategy_MakeWellFormed_ErrorsTest extends HTMLPurifier_Strategy_ErrorsHarness +{ + + function getStrategy() { + return new HTMLPurifier_Strategy_MakeWellFormed(); + } + + function testUnnecessaryEndTagRemoved() { + $this->expectErrorCollection(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag removed', 'b'); + $this->invoke(''); + } + + function testUnnecessaryEndTagToText() { + $this->config->set('Core', 'EscapeInvalidTags', true); + $this->expectErrorCollection(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag to text', 'b'); + $this->invoke(''); + } + + function testStrayEndTagRemoved() { + $this->expectErrorCollection(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag removed', 'b'); + $this->invoke(''); + } + + function testStrayEndTagToText() { + $this->config->set('Core', 'EscapeInvalidTags', true); + $this->expectErrorCollection(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag to text', 'b'); + $this->invoke(''); + } + + function testTagClosedByElementEnd() { + $this->expectErrorCollection(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', 'b'); + $this->invoke('Foobar'); + } + + function testTagClosedByDocumentEnd() { + $this->expectErrorCollection(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', 'b'); + $this->invoke('Foobar'); + } + +} + +?> \ No newline at end of file diff --git a/tests/HTMLPurifier/Strategy/RemoveForeignElements_ErrorsTest.php b/tests/HTMLPurifier/Strategy/RemoveForeignElements_ErrorsTest.php index fa7dd91b..78988cb3 100644 --- a/tests/HTMLPurifier/Strategy/RemoveForeignElements_ErrorsTest.php +++ b/tests/HTMLPurifier/Strategy/RemoveForeignElements_ErrorsTest.php @@ -1,9 +1,9 @@ config->set('HTML', 'TidyLevel', 'heavy'); } - function invoke($input) { - $strategy = new HTMLPurifier_Strategy_RemoveForeignElements(); - $lexer = new HTMLPurifier_Lexer_DirectLex(); - $tokens = $lexer->tokenizeHTML($input, $this->config, $this->context); - $strategy->execute($tokens, $this->config, $this->context); + function getStrategy() { + return new HTMLPurifier_Strategy_RemoveForeignElements(); } function testTagTransform() { @@ -31,12 +28,14 @@ class HTMLPurifier_Strategy_RemoveForeignElements_ErrorsTest extends HTMLPurifie } function testForeignElementToText() { + // uses $CurrentToken.Serialized $this->config->set('Core', 'EscapeInvalidTags', true); $this->expectErrorCollection(E_WARNING, 'Strategy_RemoveForeignElements: Foreign element to text', 'cannot-possibly-exist-element'); $this->invoke(''); } function testForeignElementRemoved() { + // uses $CurrentToken.Serialized $this->expectErrorCollection(E_ERROR, 'Strategy_RemoveForeignElements: Foreign element removed', 'cannot-possibly-exist-element'); $this->invoke(''); } diff --git a/tests/test_files.php b/tests/test_files.php index 4cc365ed..865e53c7 100644 --- a/tests/test_files.php +++ b/tests/test_files.php @@ -94,6 +94,7 @@ $test_files[] = 'HTMLPurifier/Strategy/CompositeTest.php'; $test_files[] = 'HTMLPurifier/Strategy/CoreTest.php'; $test_files[] = 'HTMLPurifier/Strategy/FixNestingTest.php'; $test_files[] = 'HTMLPurifier/Strategy/MakeWellFormedTest.php'; +$test_files[] = 'HTMLPurifier/Strategy/MakeWellFormed_ErrorsTest.php'; $test_files[] = 'HTMLPurifier/Strategy/RemoveForeignElementsTest.php'; $test_files[] = 'HTMLPurifier/Strategy/RemoveForeignElements_ErrorsTest.php'; $test_files[] = 'HTMLPurifier/Strategy/ValidateAttributesTest.php';