mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2024-11-09 15:28:40 +00:00
Implement %AutoFormat.RemoveEmpty, end to start ref, and injector rewind.
Injector rewind: Injectors can now use the method rewind() in order to move the input index backwards, so that they can reprocess tokens (other injectors are not affected by a rewind). This functionality was necessary to implement nested node removals in %AutoFormat.RemoveEmpty. End to start ref: To facilitate rewinding, HTMLPurifier_Token_End now maintains a reference called $start to the starting token for their node. %AutoFormat.RemoveEmpty removes empty nodes. Lots of people have requested it, so here is a partially effective implementation. Because it is implemented as an Injector, it's not possible for it to handle newly introduced empty nodes by later validators, specifically auto-closing and child validation. The Injector is only meant to be used on HTML-ish languages. Signed-off-by: Edward Z. Yang <edwardzyang@thewritingpot.com>
This commit is contained in:
parent
fd384129bf
commit
700d5bcbfc
4
NEWS
4
NEWS
@ -18,8 +18,12 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
||||
! Proper support for name attribute. It is now allowed and equivalent to the id
|
||||
attribute in a and img tags, and is only converted to id when %HTML.TidyLevel
|
||||
is heavy (for all doctypes).
|
||||
! %AutoFormat.RemoveEmpty to remove some empty tags from documents. Please don't
|
||||
use on hand-written HTML.
|
||||
. Strategy_MakeWellFormed now operates in-place, saving memory and allowing
|
||||
for more interesting filter-backtracking
|
||||
. New HTMLPurifier_Injector->rewind() functionality, allows injectors to rewind
|
||||
index to reprocess tokens.
|
||||
|
||||
3.1.1, released 2008-06-19
|
||||
# %URI.Munge now, by default, does not munge resources (for example, <img src="">)
|
||||
|
@ -165,6 +165,7 @@ require 'HTMLPurifier/HTMLModule/Tidy/XHTML.php';
|
||||
require 'HTMLPurifier/Injector/AutoParagraph.php';
|
||||
require 'HTMLPurifier/Injector/Linkify.php';
|
||||
require 'HTMLPurifier/Injector/PurifierLinkify.php';
|
||||
require 'HTMLPurifier/Injector/RemoveEmpty.php';
|
||||
require 'HTMLPurifier/Injector/SafeObject.php';
|
||||
require 'HTMLPurifier/Lexer/DOMLex.php';
|
||||
require 'HTMLPurifier/Lexer/DirectLex.php';
|
||||
|
@ -159,6 +159,7 @@ require_once $__dir . '/HTMLPurifier/HTMLModule/Tidy/XHTML.php';
|
||||
require_once $__dir . '/HTMLPurifier/Injector/AutoParagraph.php';
|
||||
require_once $__dir . '/HTMLPurifier/Injector/Linkify.php';
|
||||
require_once $__dir . '/HTMLPurifier/Injector/PurifierLinkify.php';
|
||||
require_once $__dir . '/HTMLPurifier/Injector/RemoveEmpty.php';
|
||||
require_once $__dir . '/HTMLPurifier/Injector/SafeObject.php';
|
||||
require_once $__dir . '/HTMLPurifier/Lexer/DOMLex.php';
|
||||
require_once $__dir . '/HTMLPurifier/Lexer/DirectLex.php';
|
||||
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,44 @@
|
||||
AutoFormat.RemoveEmpty
|
||||
TYPE: bool
|
||||
VERSION: 3.1.2
|
||||
DEFAULT: false
|
||||
--DESCRIPTION--
|
||||
<p>
|
||||
When enabled, HTML Purifier will attempt to remove empty elements that
|
||||
contribute no semantic information to the document. The following types
|
||||
of nodes will be removed:
|
||||
</p>
|
||||
<ul><li>
|
||||
Tags with no attributes and no content, and that are not empty
|
||||
elements (remove <code>&lt;a&gt;&lt;/a&gt;</code> but not
|
||||
<code>&lt;br /&gt;</code>), and
|
||||
</li>
|
||||
<li>
|
||||
Tags with no content, except for:<ul>
|
||||
<li>The <code>colgroup</code> element, or</li>
|
||||
<li>
|
||||
Elements with the <code>id</code> or <code>name</code> attribute,
|
||||
when those attributes are permitted on those elements.
|
||||
</li>
|
||||
</ul></li>
|
||||
</ul>
|
||||
<p>
|
||||
Please be very careful when using this functionality; while it may not
|
||||
seem that empty elements contain useful information, they can alter the
|
||||
layout of a document given appropriate styling. This directive is most
|
||||
useful when you are processing machine-generated HTML; please avoid using
|
||||
it on regular user HTML.
|
||||
</p>
|
||||
<p>
|
||||
Elements that contain only whitespace will be treated as empty. Non-breaking
|
||||
spaces, however, do not count as whitespace.
|
||||
</p>
|
||||
<p>
|
||||
This algorithm is not perfect; you may still notice some empty tags,
|
||||
particularly if a node had elements, but those elements were later removed
|
||||
because they were not permitted in that context, or tags that, after
|
||||
being auto-closed by another tag, were empty. This is for safety reasons
|
||||
to prevent clever code from breaking validation. The general rule of thumb:
|
||||
if a tag looked empty on the way in, it will get removed; if HTML Purifier
|
||||
made it empty, it will stay.
|
||||
</p>
|
@ -54,6 +54,30 @@ abstract class HTMLPurifier_Injector
|
||||
*/
|
||||
public $needed = array();
|
||||
|
||||
/**
|
||||
* Index of inputTokens to rewind to.
|
||||
*/
|
||||
protected $rewind = false;
|
||||
|
||||
/**
|
||||
* Rewind to a spot to re-perform processing. This is useful if you
|
||||
* deleted a node, and now need to see if this change affected any
|
||||
* earlier nodes. Rewinding does not affect other injectors, and can
|
||||
* result in infinite loops if not used carefully.
|
||||
*/
|
||||
public function rewind($index) {
    // Only records the request; the stored value is handed out (and
    // cleared) by getRewind().
    $this->rewind = $index;
}
|
||||
|
||||
/**
|
||||
* Retrieves rewind, and then unsets it.
|
||||
*/
|
||||
public function getRewind() {
    // Take a copy of the pending rewind index (false when none was requested)
    $pending = $this->rewind;
    // Reset, so the same rewind request is only reported once
    $this->rewind = false;
    return $pending;
}
|
||||
|
||||
/**
|
||||
* Prepares the injector by giving it the config and context objects:
|
||||
* this allows references to important variables to be made within
|
||||
|
40
library/HTMLPurifier/Injector/RemoveEmpty.php
Normal file
40
library/HTMLPurifier/Injector/RemoveEmpty.php
Normal file
@ -0,0 +1,40 @@
|
||||
<?php
|
||||
|
||||
/**
 * Injector that removes empty elements from the token stream.
 *
 * A start tag is considered empty when the next token that is not
 * whitespace-only text is its own end tag (or when nothing follows it at
 * all). colgroup elements, and elements that still carry an id or name
 * attribute after attribute validation, are left alone.
 */
class HTMLPurifier_Injector_RemoveEmpty extends HTMLPurifier_Injector
{

    /**
     * Context and configuration objects captured in prepare(); they are
     * passed to the attribute validator in handleElement().
     */
    private $context, $config;

    /**
     * HTMLPurifier_AttrValidator instance created in prepare(). Declared
     * explicitly so that it is not created as an implicit dynamic property.
     */
    private $attrValidator;

    /**
     * Stores the config and context objects and creates the attribute
     * validator used by handleElement().
     */
    public function prepare($config, $context) {
        parent::prepare($config, $context);
        $this->config = $config;
        $this->context = $context;
        $this->attrValidator = new HTMLPurifier_AttrValidator();
    }

    /**
     * Inspects a start tag and, if the element is empty, replaces $token
     * with the number of tokens to delete and requests a rewind.
     *
     * @param $token Current token, passed by reference. Left untouched
     *               unless the element is removed, in which case it is
     *               overwritten with an integer token count.
     */
    public function handleElement(&$token) {
        if (!$token instanceof HTMLPurifier_Token_Start) return;

        // Look ahead, skipping whitespace-only text, for the first
        // significant token after this start tag. $next stays false when
        // the start tag is the very last token.
        $next = false;
        for ($i = $this->inputIndex + 1, $c = count($this->inputTokens); $i < $c; $i++) {
            $next = $this->inputTokens[$i];
            if ($next instanceof HTMLPurifier_Token_Text && $next->is_whitespace) continue;
            break;
        }

        if (!$next || ($next instanceof HTMLPurifier_Token_End && $next->name == $token->name)) {
            if ($token->name == 'colgroup') return;

            // Validate attributes now, so that only id/name attributes that
            // survive validation keep the element alive.
            $this->attrValidator->validateToken($token, $this->config, $this->context);
            // NOTE(review): presumably marks the token as already validated
            // so a later pass does not validate it again; confirm.
            $token->armor['ValidateAttributes'] = true;
            if (isset($token->attr['id']) || isset($token->attr['name'])) return;

            // An integer token means "delete this many tokens starting at
            // the current index": the start tag, any whitespace, and the
            // end tag found at $i.
            $token = $i - $this->inputIndex + 1;

            // Walk backwards over whitespace-only text to find how far to
            // rewind, so that an enclosing element which has just become
            // empty is examined again.
            for ($b = $this->inputIndex - 1; $b > 0; $b--) {
                $prev = $this->inputTokens[$b];
                if ($prev instanceof HTMLPurifier_Token_Text && $prev->is_whitespace) continue;
                break;
            }
            // This is safe because we removed the token that triggered this.
            $this->rewind($b - 1);
            return;
        }
    }

}
|
@ -100,7 +100,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
|
||||
// injector handler code; duplicated for performance reasons
|
||||
foreach ($this->injectors as $i => $injector) {
|
||||
if (!$injector->skip) $injector->handleText($token);
|
||||
if (is_array($token)) {
|
||||
if (is_array($token) || is_int($token)) {
|
||||
$this->currentInjector = $i;
|
||||
break;
|
||||
}
|
||||
@ -144,7 +144,9 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
|
||||
if ($e) $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', $parent);
|
||||
// insert parent end tag before this tag;
|
||||
// end tag isn't processed, but this tag is processed again
|
||||
$this->insertBefore(new HTMLPurifier_Token_End($parent->name));
|
||||
$new_token = new HTMLPurifier_Token_End($parent->name);
|
||||
$new_token->start = $parent;
|
||||
$this->insertBefore($new_token);
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -157,7 +159,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
|
||||
if ($ok) {
|
||||
foreach ($this->injectors as $i => $injector) {
|
||||
if (!$injector->skip) $injector->handleElement($token);
|
||||
if (is_array($token)) {
|
||||
if (is_array($token) || is_int($token)) {
|
||||
$this->currentInjector = $i;
|
||||
break;
|
||||
}
|
||||
@ -189,6 +191,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
|
||||
// first, check for the simplest case: everything closes neatly
|
||||
$current_parent = array_pop($this->currentNesting);
|
||||
if ($current_parent->name == $token->name) {
|
||||
$token->start = $current_parent;
|
||||
foreach ($this->injectors as $i => $injector) {
|
||||
$injector->notifyEnd($token);
|
||||
}
|
||||
@ -236,6 +239,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
|
||||
$e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', $skipped_tags[$i]);
|
||||
}
|
||||
$new_token = new HTMLPurifier_Token_End($skipped_tags[$i]->name);
|
||||
$new_token->start = $skipped_tags[$i];
|
||||
$this->insertAfter($new_token);
|
||||
//printTokens($tokens, $this->inputIndex);
|
||||
//var_dump($this->currentNesting);
|
||||
@ -261,6 +265,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
|
||||
}
|
||||
// instead of splice, since we know this is the end
|
||||
$tokens[] = $new_token = new HTMLPurifier_Token_End($this->currentNesting[$i]->name);
|
||||
$new_token->start = $this->currentNesting[$i];
|
||||
foreach ($this->injectors as $injector) {
|
||||
$injector->notifyEnd($new_token);
|
||||
}
|
||||
@ -313,34 +318,59 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
|
||||
* If $token is false, the current token is deleted.
|
||||
*/
|
||||
protected function processToken($token, $config, $context) {
|
||||
if (is_array($token)) {
|
||||
if (is_array($token) || is_int($token)) {
|
||||
// the original token was overloaded by an injector, time
|
||||
// to do some fancy acrobatics
|
||||
|
||||
// $this->inputIndex is decremented so that the entire set gets
|
||||
// re-processed
|
||||
array_splice($this->inputTokens, $this->inputIndex--, 1, $token);
|
||||
|
||||
// adjust the injector skips based on the array substitution
|
||||
if (is_array($token)) {
|
||||
array_splice($this->inputTokens, $this->inputIndex, 1, $token);
|
||||
} else {
|
||||
array_splice($this->inputTokens, $this->inputIndex, $token, array());
|
||||
}
|
||||
if ($this->injectors) {
|
||||
$offset = count($token);
|
||||
for ($i = 0; $i <= $this->currentInjector; $i++) {
|
||||
// because of the skip back, we need to add one more
|
||||
// for uninitialized injectors. I'm not exactly
|
||||
// sure why this is the case, but I think it has to
|
||||
// do with the fact that we're decrementing skips
|
||||
// before re-checking text
|
||||
if (!$this->injectors[$i]->skip) $this->injectors[$i]->skip++;
|
||||
$this->injectors[$i]->skip += $offset;
|
||||
$rewind = $this->injectors[$this->currentInjector]->getRewind();
|
||||
if ($rewind < 0) $rewind = 0;
|
||||
if ($rewind !== false) {
|
||||
$offset = $this->inputIndex - $rewind;
|
||||
if ($this->injectors) {
|
||||
foreach ($this->injectors as $i => $injector) {
|
||||
if ($i == $this->currentInjector) {
|
||||
$injector->skip = 0;
|
||||
} else {
|
||||
$injector->skip += $offset;
|
||||
}
|
||||
}
|
||||
}
|
||||
for ($this->inputIndex--; $this->inputIndex >= $rewind; $this->inputIndex--) {
|
||||
$prev = $this->inputTokens[$this->inputIndex];
|
||||
if ($prev instanceof HTMLPurifier_Token_Start) array_pop($this->currentNesting);
|
||||
elseif ($prev instanceof HTMLPurifier_Token_End) $this->currentNesting[] = $prev->start;
|
||||
}
|
||||
$this->inputIndex++;
|
||||
} else {
|
||||
// adjust the injector skips based on the array substitution
|
||||
$offset = is_array($token) ? count($token) : 0;
|
||||
for ($i = 0; $i <= $this->currentInjector; $i++) {
|
||||
// because of the skip back, we need to add one more
|
||||
// for uninitialized injectors. I'm not exactly
|
||||
// sure why this is the case, but I think it has to
|
||||
// do with the fact that we're decrementing skips
|
||||
// before re-checking text
|
||||
if (!$this->injectors[$i]->skip) $this->injectors[$i]->skip++;
|
||||
$this->injectors[$i]->skip += $offset;
|
||||
}
|
||||
}
|
||||
}
|
||||
// ensure that we reprocess these tokens with the other injectors
|
||||
--$this->inputIndex;
|
||||
|
||||
} elseif ($token) {
|
||||
// regular case
|
||||
$this->swap($token);
|
||||
if ($token instanceof HTMLPurifier_Token_Start) {
|
||||
$this->currentNesting[] = $token;
|
||||
} elseif ($token instanceof HTMLPurifier_Token_End) {
|
||||
array_pop($this->currentNesting); // not actually used
|
||||
// not actually used
|
||||
$token->start = array_pop($this->currentNesting);
|
||||
}
|
||||
} else {
|
||||
$this->remove();
|
||||
|
@ -9,5 +9,9 @@
|
||||
*/
|
||||
class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
{
    /**
     * The start token that opened the node this end token closes.
     * Assigned by MakeWellFormed; other code should treat it as read-only.
     */
    public $start;
}
|
||||
|
58
tests/HTMLPurifier/Injector/RemoveEmptyTest.php
Normal file
58
tests/HTMLPurifier/Injector/RemoveEmptyTest.php
Normal file
@ -0,0 +1,58 @@
|
||||
<?php
|
||||
|
||||
/**
 * Tests for the %AutoFormat.RemoveEmpty injector. Every case runs with the
 * directive switched on by setup().
 */
class HTMLPurifier_Injector_RemoveEmptyTest extends HTMLPurifier_InjectorHarness
{

    public function setup() {
        parent::setup();
        $this->config->set('AutoFormat', 'RemoveEmpty', true);
    }

    // -- elements with real content are kept ------------------------------

    public function testPreserve() {
        $this->assertResult('<b>asdf</b>');
    }

    // -- simple removals --------------------------------------------------

    public function testRemove() {
        $this->assertResult('<b></b>', '');
    }

    public function testRemoveWithSpace() {
        $this->assertResult('<b> </b>', '');
    }

    public function testRemoveWithAttr() {
        $this->assertResult('<b class="asdf"></b>', '');
    }

    public function testRemoveIdAndName() {
        // Attr.EnableID is not set in this case, unlike the preserve cases below
        $this->assertResult('<a id="asdf" name="asdf"></a>', '');
    }

    // -- documented exceptions --------------------------------------------

    public function testPreserveColgroup() {
        $this->assertResult('<colgroup></colgroup>');
    }

    public function testPreserveId() {
        $this->config->set('Attr', 'EnableID', true);
        $this->assertResult('<a id="asdf"></a>');
    }

    public function testPreserveName() {
        $this->config->set('Attr', 'EnableID', true);
        $this->assertResult('<a name="asdf"></a>');
    }

    // -- nested empties ---------------------------------------------------

    public function testRemoveNested() {
        $this->assertResult('<b><i></i></b>', '');
    }

    public function testRemoveNested2() {
        $this->assertResult('<b><i><u></u></i></b>', '');
    }

    public function testRemoveNested3() {
        $this->assertResult('<b> <i> <u> </u> </i> </b>', '');
    }

}
|
||||
|
@ -10,6 +10,7 @@ class HTMLPurifier_Injector_SafeObjectTest extends HTMLPurifier_InjectorHarness
|
||||
|
||||
function setup() {
    parent::setup();
    // AutoFormat has no SafeObject directive, so the injector is registered
    // through the Custom list instead
    $safeObject = new HTMLPurifier_Injector_SafeObject();
    $this->config->set('AutoFormat', 'Custom', array($safeObject));
    $this->config->set('HTML', 'Trusted', true);
}
|
||||
|
@ -8,14 +8,19 @@ class HTMLPurifier_Strategy_MakeWellFormed_InjectorTest extends HTMLPurifier_Str
|
||||
$this->obj = new HTMLPurifier_Strategy_MakeWellFormed();
|
||||
$this->config->set('AutoFormat', 'AutoParagraph', true);
|
||||
$this->config->set('AutoFormat', 'Linkify', true);
|
||||
$this->config->set('AutoFormat', 'RemoveEmpty', true);
|
||||
generate_mock_once('HTMLPurifier_Injector');
|
||||
}
|
||||
|
||||
function testEndNotification() {
|
||||
$mock = new HTMLPurifier_InjectorMock();
|
||||
$mock->skip = false;
|
||||
$mock->expectAt(0, 'notifyEnd', array(new HTMLPurifier_Token_End('b')));
|
||||
$mock->expectAt(1, 'notifyEnd', array(new HTMLPurifier_Token_End('i')));
|
||||
$b = new HTMLPurifier_Token_End('b');
|
||||
$b->start = new HTMLPurifier_Token_Start('b');
|
||||
$mock->expectAt(0, 'notifyEnd', array($b));
|
||||
$i = new HTMLPurifier_Token_End('i');
|
||||
$i->start = new HTMLPurifier_Token_Start('i');
|
||||
$mock->expectAt(1, 'notifyEnd', array($i));
|
||||
$mock->expectCallCount('notifyEnd', 2);
|
||||
$this->config->set('AutoFormat', 'AutoParagraph', false);
|
||||
$this->config->set('AutoFormat', 'Linkify', false);
|
||||
@ -92,4 +97,20 @@ class HTMLPurifier_Strategy_MakeWellFormed_InjectorTest extends HTMLPurifier_Str
|
||||
);
|
||||
}
|
||||
|
||||
function testEmptyAndParagraph() {
    // Degenerate input; the point is mainly that empty-element removal and
    // auto-paragraphing can run together without erroring out.
    $input    = "<p>asdf\n\nasdf<b></b></p>\n\n<p></p><i></i>";
    $expected = "<p>asdf</p><p>asdf</p>";
    $this->assertResult($input, $expected);
}
|
||||
|
||||
function testRewindAndParagraph() {
    // An empty inline element inside a paragraph, surrounded by plain text
    // separated by blank lines.
    $input    = "bar\n\n<p><i></i>\n\n</p>\n\nfoo";
    $expected = "<p>bar</p><p>foo</p>";
    $this->assertResult($input, $expected);
}
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user