0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-01-08 23:11:52 +00:00

[2.0.1] Officially add experimental auto-paragraphing and linkification functionality. Rename %Core.DefinitionCache to %Cache.DefinitionImpl. Have AutoParagraph handle even more edge cases. Fix MakeWellFormed bug.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1223 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2007-06-24 20:29:50 +00:00
parent 5f0663cad7
commit b15cbbb42a
8 changed files with 297 additions and 69 deletions

4
NEWS
View File

@ -12,6 +12,9 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
2.0.1, unknown release date 2.0.1, unknown release date
! Tag auto-closing now based on a ChildDef heuristic rather than a ! Tag auto-closing now based on a ChildDef heuristic rather than a
manually set auto_close array; some behavior may change manually set auto_close array; some behavior may change
! Experimental AutoFormat functionality added: auto-paragraph and
linkify your HTML input by setting %AutoFormat.AutoParagraph and
%AutoFormat.Linkify to true
- Clean up special case code for <script> tags - Clean up special case code for <script> tags
- Reorder includes for DefinitionCache decorators, fixes a possible - Reorder includes for DefinitionCache decorators, fixes a possible
missing class error missing class error
@ -26,6 +29,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
. HTMLDefinition printer updated with some of the new attributes . HTMLDefinition printer updated with some of the new attributes
. DefinitionCache keys reordered to reflect precedence: version number, . DefinitionCache keys reordered to reflect precedence: version number,
hash, then revision number hash, then revision number
. %Core.DefinitionCache renamed to %Cache.DefinitionImpl
2.0.0, released 2007-06-20 2.0.0, released 2007-06-20
# Completely refactored HTMLModuleManager, decentralizing safety # Completely refactored HTMLModuleManager, decentralizing safety

View File

@ -67,6 +67,7 @@ class HTMLPurifier_ConfigSchema {
$this->defineNamespace('URI', 'Features regarding Uniform Resource Identifiers.'); $this->defineNamespace('URI', 'Features regarding Uniform Resource Identifiers.');
$this->defineNamespace('HTML', 'Configuration regarding allowed HTML.'); $this->defineNamespace('HTML', 'Configuration regarding allowed HTML.');
$this->defineNamespace('CSS', 'Configuration regarding allowed CSS.'); $this->defineNamespace('CSS', 'Configuration regarding allowed CSS.');
$this->defineNamespace('AutoFormat', 'Configuration regarding auto-formatting functionality such as auto-paragraphing or linkification.');
$this->defineNamespace('Output', 'Configuration relating to the generation of (X)HTML.'); $this->defineNamespace('Output', 'Configuration relating to the generation of (X)HTML.');
$this->defineNamespace('Cache', 'Configuration for DefinitionCache and related subclasses.'); $this->defineNamespace('Cache', 'Configuration for DefinitionCache and related subclasses.');
$this->defineNamespace('Test', 'Developer testing configuration for our unit tests.'); $this->defineNamespace('Test', 'Developer testing configuration for our unit tests.');

View File

@ -3,7 +3,7 @@
require_once 'HTMLPurifier/DefinitionCache.php'; require_once 'HTMLPurifier/DefinitionCache.php';
HTMLPurifier_ConfigSchema::define( HTMLPurifier_ConfigSchema::define(
'Core', 'DefinitionCache', 'Serializer', 'string/null', ' 'Cache', 'DefinitionImpl', 'Serializer', 'string/null', '
This directive defines which method to use when caching definitions, This directive defines which method to use when caching definitions,
the complex data-type that makes HTML Purifier tick. Set to null the complex data-type that makes HTML Purifier tick. Set to null
to disable caching (not recommended, as you will see a definite to disable caching (not recommended, as you will see a definite
@ -11,7 +11,12 @@ performance degradation). This directive has been available since 2.0.0.
'); ');
HTMLPurifier_ConfigSchema::defineAllowedValues( HTMLPurifier_ConfigSchema::defineAllowedValues(
'Core', 'DefinitionCache', array('Serializer') 'Cache', 'DefinitionImpl', array('Serializer')
);
HTMLPurifier_ConfigSchema::defineAlias(
'Core', 'DefinitionCache',
'Cache', 'DefinitionImpl'
); );
@ -54,7 +59,7 @@ class HTMLPurifier_DefinitionCacheFactory
function &create($type, $config) { function &create($type, $config) {
// only one implementation as for right now, $config will // only one implementation as for right now, $config will
// be used to determine implementation // be used to determine implementation
$method = $config->get('Core', 'DefinitionCache'); $method = $config->get('Cache', 'DefinitionImpl');
if ($method === null) { if ($method === null) {
$null = new HTMLPurifier_DefinitionCache_Null($type); $null = new HTMLPurifier_DefinitionCache_Null($type);
return $null; return $null;

View File

@ -2,6 +2,24 @@
require_once 'HTMLPurifier/Injector.php'; require_once 'HTMLPurifier/Injector.php';
HTMLPurifier_ConfigSchema::define(
'AutoFormat', 'AutoParagraph', false, 'bool', '
<p>
This directive turns on auto-paragraphing, where double newlines are
converted in to paragraphs whenever possible. Auto-paragraphing
applies when:
</p>
<ul>
<li>There are inline elements or text in the root node</li>
<li>There are inline elements or text with double newlines or
block elements in nodes that allow paragraph tags</li>
<li>There are double newlines in paragraph tags</li>
</ul>
<p>
This directive has been available since 2.0.1.
</p>
');
/** /**
* Injector that auto paragraphs text in the root node based on * Injector that auto paragraphs text in the root node based on
* double-spacing. * double-spacing.
@ -11,27 +29,94 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
function handleText(&$token) { function handleText(&$token) {
$text = $token->data; $text = $token->data;
// $token is the focus: if processing is needed, it gets
// turned into an array of tokens that will replace the
// original token
if (empty($this->currentNesting)) { if (empty($this->currentNesting)) {
if (!$this->allowsElement('p')) return; if (!$this->allowsElement('p')) return;
// we're in root node, and the root node allows paragraphs // case 1: we're in root node (and it allows paragraphs)
// start a paragraph since we just hit some text
$token = array(new HTMLPurifier_Token_Start('p')); $token = array(new HTMLPurifier_Token_Start('p'));
$this->_splitText($text, $token); $this->_splitText($text, $token);
} elseif ($this->currentNesting[count($this->currentNesting)-1]->name == 'p') { } elseif ($this->currentNesting[count($this->currentNesting)-1]->name == 'p') {
// we're not in root node but we're in a paragraph, so don't // case 2: we're in a paragraph
// add a paragraph start tag but still perform processing
$token = array(); $token = array();
$this->_splitText($text, $token); $this->_splitText($text, $token);
} elseif ($this->allowsElement('p')) {
// case 3: we're in an element that allows paragraphs
if (strpos($text, PHP_EOL . PHP_EOL) !== false) {
// case 3.1: this text node has a double-newline
$token = array(new HTMLPurifier_Token_Start('p'));
$this->_splitText($text, $token);
} else {
$ok = false;
// test if up-coming tokens are either block or have
// a double newline in them
for ($i = $this->inputIndex + 1; isset($this->inputTokens[$i]); $i++) {
if ($this->inputTokens[$i]->type == 'start'){
if (!$this->_isInline($this->inputTokens[$i])) {
$ok = true;
}
break;
}
if ($this->inputTokens[$i]->type == 'end') break;
if ($this->inputTokens[$i]->type == 'text') {
if (strpos($this->inputTokens[$i]->data, PHP_EOL . PHP_EOL) !== false) {
$ok = true;
}
if (!$this->inputTokens[$i]->is_whitespace) break;
}
}
if ($ok) {
// case 3.2: this text node is next to another node
// that will start a paragraph
$token = array(new HTMLPurifier_Token_Start('p'), $token);
}
} }
} }
}
function handleStart(&$token) { function handleStart(&$token) {
// check if we're inside a tag already, if so, don't add // check if we're inside a tag already
// paragraph tags if (!empty($this->currentNesting)) {
if (!empty($this->currentNesting)) return; if ($this->allowsElement('p')) {
// special case: we're in an element that allows paragraphs
// this token is already paragraph, abort
if ($token->name == 'p') return;
// check if this token is adjacent to the parent
if ($this->inputTokens[$this->inputIndex - 1]->type != 'start') {
// not adjacent, we can abort early
// add lead paragraph tag if our token is inline
if ($this->_isInline($token)) {
$token = array(new HTMLPurifier_Token_Start('p'), $token);
}
return;
}
// this token is the first child of the element that allows
// paragraph. We have to peek ahead and see whether or not
// there is anything inside that suggests that a paragraph
// will be needed
$ok = false;
// maintain a mini-nesting counter, this lets us bail out
// early if possible
$j = 2; // current nesting, is two due to parent and this start
for ($i = $this->inputIndex + 1; isset($this->inputTokens[$i]); $i++) {
if ($this->inputTokens[$i]->type == 'start') $j++;
if ($this->inputTokens[$i]->type == 'end') $j--;
if ($this->inputTokens[$i]->type == 'text') {
if (strpos($this->inputTokens[$i]->data, PHP_EOL . PHP_EOL) !== false) {
$ok = true;
break;
}
}
if ($j <= 0) break;
}
if ($ok) {
$token = array(new HTMLPurifier_Token_Start('p'), $token);
}
}
return;
}
// check if the start tag counts as a "block" element // check if the start tag counts as a "block" element
if (!$this->_isInline($token)) return; if (!$this->_isInline($token)) return;
@ -56,55 +141,79 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
// remove empty paragraphs // remove empty paragraphs
$paragraphs = array(); $paragraphs = array();
$needs_start = false;
$first = true;
foreach ($raw_paragraphs as $par) { foreach ($raw_paragraphs as $par) {
if (trim($par) !== '') $paragraphs[] = $par; if (trim($par) !== '') {
$paragraphs[] = $par;
} elseif (empty($result) && $first) {
// The empty result indicates that the AutoParagraph
// injector did not add any start paragraph tokens.
// The fact that the first paragraph is empty indicates
// that there was a double-newline at the start of the
// data.
// Combined together, this means that we are in a paragraph,
// and the newline means we should start a new one.
$result[] = new HTMLPurifier_Token_End('p');
// However, the start token should only be added if
// there is more processing to be done (i.e. there are
// real paragraphs in here). If there are none, the
// next start paragraph tag will be handled by the
// next run-around the injector
$needs_start = true;
}
$first = false;
} }
// check if there are no "real" paragraphs to be processed // check if there are no "real" paragraphs to be processed
if (empty($paragraphs) && count($raw_paragraphs) > 1) { if (empty($paragraphs)) {
$result[] = new HTMLPurifier_Token_End('p');
return; return;
} }
// add a start tag if an end tag was added while processing
// the raw paragraphs (that happens if there's a leading double
// newline)
if ($needs_start) $result[] = new HTMLPurifier_Token_Start('p');
// append the paragraphs onto the result // append the paragraphs onto the result
foreach ($paragraphs as $par) { foreach ($paragraphs as $par) {
$result[] = new HTMLPurifier_Token_Text($par); $result[] = new HTMLPurifier_Token_Text($par);
$result[] = new HTMLPurifier_Token_End('p'); $result[] = new HTMLPurifier_Token_End('p');
$result[] = new HTMLPurifier_Token_Start('p'); $result[] = new HTMLPurifier_Token_Start('p');
} }
array_pop($result); // remove trailing start token
// remove trailing start token, if one is needed, it will
// be handled the next time this injector is called
array_pop($result);
// check the outside to determine whether or not the
// end paragraph tag should be removed. It should be removed
// unless the next non-whitespace token is a paragraph
// or a block element.
$remove_paragraph_end = true;
// Start of the checks one after the current token's index
for ($i = $this->inputIndex + 1; isset($this->inputTokens[$i]); $i++) {
if ($this->inputTokens[$i]->type == 'start' || $this->inputTokens[$i]->type == 'empty') {
$remove_paragraph_end = $this->_isInline($this->inputTokens[$i]);
break;
}
// check if we can abort early (whitespace means we carry-on!)
if ($this->inputTokens[$i]->type == 'text' && !$this->inputTokens[$i]->is_whitespace) break;
// end tags will automatically be handled by MakeWellFormed,
// so we don't have to worry about them
if ($this->inputTokens[$i]->type == 'end') break;
}
// check the outside to determine whether or not the // check the outside to determine whether or not the
// end paragraph tag should be removed // end paragraph tag should be removed
if ($this->_removeParagraphEnd()) { if ($remove_paragraph_end) {
array_pop($result); array_pop($result);
} }
} }
/**
* Returns boolean whether or not to remove the paragraph end tag
* that was automatically added. The paragraph end tag should be
* removed unless the next token is a paragraph or block element.
* @private
*/
function _removeParagraphEnd() {
$tokens =& $this->inputTokens;
$remove_paragraph_end = true;
// Start of the checks one after the current token's index
for ($i = $this->inputIndex + 1; isset($tokens[$i]); $i++) {
if ($tokens[$i]->type == 'start' || $tokens[$i]->type == 'empty') {
$remove_paragraph_end = $this->_isInline($tokens[$i]);
break;
}
// check if we can abort early (whitespace means we carry-on!)
if ($tokens[$i]->type == 'text' && !$tokens[$i]->is_whitespace) break;
if ($tokens[$i]->type == 'end') break; // nonsensical
}
return $remove_paragraph_end;
}
/** /**
* Returns true if passed token is inline (and, ergo, allowed in * Returns true if passed token is inline (and, ergo, allowed in
* paragraph tags) * paragraph tags)

View File

@ -2,6 +2,14 @@
require_once 'HTMLPurifier/Injector.php'; require_once 'HTMLPurifier/Injector.php';
HTMLPurifier_ConfigSchema::define(
'AutoFormat', 'Linkify', false, 'bool', '
<p>
This directive turns on linkification, auto-linking http, ftp and
https URLs. This directive has been available since 2.0.1.
</p>
');
/** /**
* Injector that converts http, https and ftp text URLs to actual links. * Injector that converts http, https and ftp text URLs to actual links.
*/ */

View File

@ -7,26 +7,6 @@ require_once 'HTMLPurifier/Generator.php';
require_once 'HTMLPurifier/Injector/AutoParagraph.php'; require_once 'HTMLPurifier/Injector/AutoParagraph.php';
require_once 'HTMLPurifier/Injector/Linkify.php'; require_once 'HTMLPurifier/Injector/Linkify.php';
HTMLPurifier_ConfigSchema::define(
'Core', 'AutoParagraph', false, 'bool', '
<p>
This directive will cause HTML Purifier to automatically paragraph text
in the document fragment root based on two newlines and block tags.
This directive has been available since 2.0.1.
</p>
'
);
HTMLPurifier_ConfigSchema::define(
'Core', 'AutoLinkify', false, 'bool', '
<p>
This directive will cause HTML Purifier to automatically linkify
text that looks like URLs. This directive has been available since
2.0.1.
</p>
'
);
/** /**
* Takes tokens makes them well-formed (balance end tags, etc.) * Takes tokens makes them well-formed (balance end tags, etc.)
*/ */
@ -70,11 +50,11 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// we need a generic way of adding injectors, and also its own // we need a generic way of adding injectors, and also its own
// configuration namespace // configuration namespace
if ($config->get('Core', 'AutoParagraph')) { if ($config->get('AutoFormat', 'AutoParagraph')) {
$this->injectors[] = new HTMLPurifier_Injector_AutoParagraph(); $this->injectors[] = new HTMLPurifier_Injector_AutoParagraph();
} }
if ($config->get('Core', 'AutoLinkify')) { if ($config->get('AutoFormat', 'Linkify')) {
$this->injectors[] = new HTMLPurifier_Injector_Linkify(); $this->injectors[] = new HTMLPurifier_Injector_Linkify();
} }
@ -163,7 +143,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// injector handler code; duplicated for performance reasons // injector handler code; duplicated for performance reasons
foreach ($this->injectors as $i => $x) { foreach ($this->injectors as $i => $x) {
if (!$x->skip[$i]) $x->handleStart($token, $config, $context); if (!$x->skip) $x->handleStart($token, $config, $context);
if (is_array($token)) { if (is_array($token)) {
$this->currentInjector = $i; $this->currentInjector = $i;
break; break;

View File

@ -278,8 +278,8 @@ class HTMLPurifier_ConfigTest extends UnitTestCase
} }
function test_getDefinition() { function test_getDefinition() {
CS::defineNamespace('Core', 'Core stuff'); CS::defineNamespace('Cache', 'Cache stuff');
CS::define('Core', 'DefinitionCache', null, 'string/null', 'Cache?'); CS::define('Cache', 'DefinitionImpl', null, 'string/null', 'Cache?');
CS::defineNamespace('Crust', 'Krusty Krabs'); CS::defineNamespace('Crust', 'Krusty Krabs');
$config = HTMLPurifier_Config::createDefault(); $config = HTMLPurifier_Config::createDefault();
$this->expectError("Definition of Crust type not supported"); $this->expectError("Definition of Crust type not supported");

View File

@ -76,7 +76,7 @@ class HTMLPurifier_Strategy_MakeWellFormedTest extends HTMLPurifier_StrategyHarn
} }
function testAutoParagraph() { function testAutoParagraph() {
$this->config = array('Core.AutoParagraph' => true); $this->config = array('AutoFormat.AutoParagraph' => true);
$this->assertResult( $this->assertResult(
'Foobar', 'Foobar',
@ -175,19 +175,140 @@ Par
'<p>Par</p>' '<p>Par</p>'
); );
$this->assertResult(
'<div>Par1
Par2</div>',
'<div><p>Par1</p><p>Par2</p></div>'
);
$this->assertResult(
'<div><b>Par1</b>
Par2</div>',
'<div><p><b>Par1</b></p><p>Par2</p></div>'
);
$this->assertResult('<div>Par1</div>');
$this->assertResult(
'<div><b>Par1</b>
<i>Par2</i></div>',
'<div><p><b>Par1</b></p><p><i>Par2</i></p></div>'
);
$this->assertResult(
'<pre><b>Par1</b>
<i>Par2</i></pre>',
true
);
$this->assertResult(
'<div><p>Foo
Bar</p></div>',
'<div><p>Foo</p><p>Bar</p></div>'
);
$this->assertResult(
'<div><p><b>Foo</b>
<i>Bar</i></p></div>',
'<div><p><b>Foo</b></p><p><i>Bar</i></p></div>'
);
$this->assertResult(
'<div><b>Foo</b></div>',
'<div><b>Foo</b></div>'
);
$this->assertResult(
'<blockquote>Par1
Par2</blockquote>',
'<blockquote><p>Par1</p><p>Par2</p></blockquote>'
);
$this->assertResult(
'<ul><li>Foo</li>
<li>Bar</li></ul>', true
);
$this->assertResult(
'<div>
Bar
</div>',
'<div><p>Bar</p></div>'
);
$this->assertResult(
'<b>Par1</b>a
Par2',
'<p><b>Par1</b>a</p><p>Par2</p>'
);
$this->assertResult(
'Par1
Par2</p>',
'<p>Par1</p><p>Par2</p>'
);
$this->assertResult(
'Par1
Par2</div>',
'<p>Par1</p><p>Par2</p>'
);
$this->assertResult(
'<div>
Par1
</div>', true
);
$this->assertResult(
'<div>Par1
<div>Par2</div></div>',
'<div><p>Par1</p><div>Par2</div></div>'
);
$this->assertResult(
'<div>Par1
<div>Par2</div></div>',
'<div><p>Par1
</p><div>Par2</div></div>'
);
$this->assertResult(
'Par1
<div>Par2</div>',
'<p>Par1
</p><div>Par2</div>'
);
$this->assertResult( $this->assertResult(
'Par 'Par
Par2', Par2',
true, true,
array('Core.AutoParagraph' => true, 'HTML.Parent' => 'span') array('AutoFormat.AutoParagraph' => true, 'HTML.Parent' => 'span')
); );
} }
function testLinkify() { function testLinkify() {
$this->config = array('Core.AutoLinkify' => true); $this->config = array('AutoFormat.Linkify' => true);
$this->assertResult( $this->assertResult(
'http://example.com', 'http://example.com',
@ -212,7 +333,7 @@ Par2',
function testMultipleInjectors() { function testMultipleInjectors() {
$this->config = array('Core.AutoParagraph' => true, 'Core.AutoLinkify' => true); $this->config = array('AutoFormat.AutoParagraph' => true, 'AutoFormat.Linkify' => true);
$this->assertResult( $this->assertResult(
'Foobar', 'Foobar',