From 5d0a9925797bc669086838591b448120697c437e Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang"
Date: Sat, 23 Jun 2007 19:39:03 +0000
Subject: [PATCH] Refactor Injector not to edit $result directly.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1212 48356398-32a2-884e-a903-53898d9a118a
---
.../HTMLPurifier/Injector/AutoParagraph.php | 130 ++++++++----------
.../HTMLPurifier/Strategy/MakeWellFormed.php | 3 +-
.../Strategy/MakeWellFormedTest.php | 25 ++++
3 files changed, 86 insertions(+), 72 deletions(-)
diff --git a/library/HTMLPurifier/Injector/AutoParagraph.php b/library/HTMLPurifier/Injector/AutoParagraph.php
index fd589f96..5b72b3ce 100644
--- a/library/HTMLPurifier/Injector/AutoParagraph.php
+++ b/library/HTMLPurifier/Injector/AutoParagraph.php
@@ -10,27 +10,21 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
{
function handleText(&$token, $config, &$context) {
- $dnl = PHP_EOL . PHP_EOL; // double-newline
$current_nesting =& $context->get('CurrentNesting');
- // paragraphing is on
+ $text = $token->data;
+ // $token is the focus: if processing is needed, it gets
+ // turned into an array of tokens that will replace the
+ // original token
if (empty($current_nesting)) {
// we're in root node, great time to start a paragraph
// since we're also dealing with a text node
- $result =& $context->get('OutputTokens');
- $result[] = new HTMLPurifier_Token_Start('p');
- $current_nesting[] = new HTMLPurifier_Token_Start('p');
- $this->_splitText($token, $config, $context);
- } else {
- // we're not in root node, so let's see whether or not
- // we're in a paragraph
-
- // losslessly access the parent element
- $parent = array_pop($current_nesting);
- $current_nesting[] = $parent;
-
- if ($parent->name === 'p') {
- $this->_splitText($token, $config, $context);
- }
+ $token = array(new HTMLPurifier_Token_Start('p'));
+ $this->_splitText($text, $token, $config, $context);
+ } elseif ($current_nesting[count($current_nesting)-1]->name == 'p') {
+ // we're not in root node but we're in a paragraph, so don't
+ // add a paragraph start tag but still perform processing
+ $token = array();
+ $this->_splitText($text, $token, $config, $context);
}
}
@@ -41,26 +35,25 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
if (!empty($current_nesting)) return;
// check if the start tag counts as a "block" element
- $definition = $config->getHTMLDefinition();
- if (isset($definition->info['p']->auto_close[$token->name])) return;
+ if (!$this->_isInline($token, $config)) return;
// append a paragraph tag before the token
$token = array(new HTMLPurifier_Token_Start('p'), $token);
}
/**
- * Sub-function for auto-paragraphing that takes a token and splits it
- * up into paragraphs unconditionally. Requires that a paragraph was
- * already started
+ * Splits up a text into paragraph tokens and appends them
+ * to the result stream that will replace the original token
+ * @param $data String text data that will be processed
+ * into paragraphs
+ * @param $result Reference to array of tokens that the
+ * tags will be appended onto
+ * @param $config Instance of HTMLPurifier_Config
+ * @param $context Instance of HTMLPurifier_Context
+ * @private
*/
- function _splitText(&$token, $config, &$context) {
- $dnl = PHP_EOL . PHP_EOL; // double-newline
- $definition = $config->getHTMLDefinition();
- $current_nesting =& $context->get('CurrentNesting');
-
- $raw_paragraphs = explode($dnl, $token->data);
-
- $token = false; // token has been completely dismantled
+ function _splitText($data, &$result, $config, &$context) {
+ $raw_paragraphs = explode(PHP_EOL . PHP_EOL, $data);
// remove empty paragraphs
$paragraphs = array();
@@ -68,67 +61,62 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
if (trim($par) !== '') $paragraphs[] = $par;
}
- $result =& $context->get('OutputTokens');
-
+ // check if there are no "real" paragraphs to be processed
if (empty($paragraphs) && count($raw_paragraphs) > 1) {
$result[] = new HTMLPurifier_Token_End('p');
- array_pop($current_nesting);
return;
}
- foreach ($paragraphs as $data) {
- $result[] = new HTMLPurifier_Token_Text($data);
+ // append the paragraphs onto the result
+ foreach ($paragraphs as $par) {
+ $result[] = new HTMLPurifier_Token_Text($par);
$result[] = new HTMLPurifier_Token_End('p');
$result[] = new HTMLPurifier_Token_Start('p');
}
array_pop($result); // remove trailing start token
- // check the outside to determine whether or not end
- // paragraph tag is needed (it's already there)
- $end_paragraph = $this->_needsEndTag(
- $context->get('InputTokens'),
- $context->get('InputIndex'),
- $definition
- );
-
- if ($end_paragraph) {
- // things are good as they stand, remove top-level parent
- // that we deferred
- array_pop($current_nesting);
- } else {
- // remove the ending tag, no nesting modifications necessary
+ // check the outside to determine whether or not the
+ // end paragraph tag should be removed
+ if ($this->_removeParagraphEnd($config, $context)) {
array_pop($result);
}
+
}
/**
- * Determines if up-coming code requires an end-paragraph tag,
- * otherwise, keep the paragraph open (don't make another one)
- * @protected
+ * Returns boolean whether or not to remove the paragraph end tag
+ * that was automatically added. The paragraph end tag should be
+ * removed unless the next token is a paragraph or block element.
+ * @param $config Instance of HTMLPurifier_Config
+ * @param $context Instance of HTMLPurifier_Context
+ * @private
*/
- function _needsEndTag($tokens, $k, $definition) {
- $end_paragraph = false;
- for ($j = $k + 1; isset($tokens[$j]); $j++) {
- if ($tokens[$j]->type == 'start' || $tokens[$j]->type == 'empty') {
- if ($tokens[$j]->name == 'p') {
- $end_paragraph = true;
- } else {
- $end_paragraph = isset($definition->info['p']->auto_close[$tokens[$j]->name]);
- }
- break;
- } elseif ($tokens[$j]->type == 'text') {
- if (!$tokens[$j]->is_whitespace) {
- $end_paragraph = false;
- break;
- }
- } elseif ($tokens[$j]->type == 'end') {
- // nonsensical case
- $end_paragraph = false;
+ function _removeParagraphEnd($config, &$context) {
+ $tokens = $context->get('InputTokens');
+ $i = $context->get('InputIndex');
+ $remove_paragraph_end = true;
+ // Start the checks one after the current token's index
+ for ($i++; isset($tokens[$i]); $i++) {
+ if ($tokens[$i]->type == 'start' || $tokens[$i]->type == 'empty') {
+ $definition = $config->getHTMLDefinition();
+ $remove_paragraph_end = $this->_isInline($tokens[$i], $config);
break;
}
+ // check if we can abort early (whitespace means we carry on!)
+ if ($tokens[$i]->type == 'text' && !$tokens[$i]->is_whitespace) break;
+ if ($tokens[$i]->type == 'end') break; // nonsensical
}
- return $end_paragraph;
+ return $remove_paragraph_end;
+ }
+
+ /**
+ * Returns true if passed token is inline (and, ergo, allowed in
+ * paragraph tags)
+ */
+ function _isInline($token, $config) {
+ $definition = $config->getHTMLDefinition();
+ return !isset($definition->info['p']->auto_close[$token->name]);
}
}
diff --git a/library/HTMLPurifier/Strategy/MakeWellFormed.php b/library/HTMLPurifier/Strategy/MakeWellFormed.php
index 677d09af..e8e605df 100644
--- a/library/HTMLPurifier/Strategy/MakeWellFormed.php
+++ b/library/HTMLPurifier/Strategy/MakeWellFormed.php
@@ -225,7 +225,8 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
$current_nesting[] = $token;
} elseif ($token->type == 'end') {
// theoretical: this isn't used because performing
- // the calculations otherwise is more efficient
+ // the calculations inline is more efficient, and
+ // end tokens currently do not cause a handler invocation
array_pop($current_nesting);
}
}
diff --git a/tests/HTMLPurifier/Strategy/MakeWellFormedTest.php b/tests/HTMLPurifier/Strategy/MakeWellFormedTest.php
index 20e3bf79..8ab6f907 100644
--- a/tests/HTMLPurifier/Strategy/MakeWellFormedTest.php
+++ b/tests/HTMLPurifier/Strategy/MakeWellFormedTest.php
@@ -92,6 +92,15 @@ Par 1 still
'
$this->assertResult(
'Par1
+Par2',
+ 'Par1
Par2
'
+ );
+
+ $this->assertResult(
+'Par1
+
+
+
Par2',
'Par1
Par2
'
);
@@ -149,6 +158,22 @@ Par3',
'Par1
'
);
+ $this->assertResult(
+'
+
+Par',
+ 'Par
'
+ );
+
+ $this->assertResult(
+'
+
+Par
+
+',
+ 'Par
'
+ );
+
}
}