From 5d0a9925797bc669086838591b448120697c437e Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <edwardzyang@thewritingpot.com>
Date: Sat, 23 Jun 2007 19:39:03 +0000
Subject: [PATCH] Refactor Injector not to edit $result directly.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1212 48356398-32a2-884e-a903-53898d9a118a
---
 .../HTMLPurifier/Injector/AutoParagraph.php   | 130 ++++++++----------
 .../HTMLPurifier/Strategy/MakeWellFormed.php  |   3 +-
 .../Strategy/MakeWellFormedTest.php           |  25 ++++
 3 files changed, 86 insertions(+), 72 deletions(-)

diff --git a/library/HTMLPurifier/Injector/AutoParagraph.php b/library/HTMLPurifier/Injector/AutoParagraph.php
index fd589f96..5b72b3ce 100644
--- a/library/HTMLPurifier/Injector/AutoParagraph.php
+++ b/library/HTMLPurifier/Injector/AutoParagraph.php
@@ -10,27 +10,21 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
 {
     
     function handleText(&$token, $config, &$context) {
-        $dnl = PHP_EOL . PHP_EOL; // double-newline
         $current_nesting =& $context->get('CurrentNesting');
-        // paragraphing is on
+        $text = $token->data;
+        // $token is the focus: if processing is needed, it gets
+        // turned into an array of tokens that will replace the
+        // original token
         if (empty($current_nesting)) {
             // we're in root node, great time to start a paragraph
             // since we're also dealing with a text node
-            $result =& $context->get('OutputTokens');
-            $result[] = new HTMLPurifier_Token_Start('p');
-            $current_nesting[] = new HTMLPurifier_Token_Start('p');
-            $this->_splitText($token, $config, $context);
-        } else {
-            // we're not in root node, so let's see whether or not
-            // we're in a paragraph
-            
-            // losslessly access the parent element
-            $parent = array_pop($current_nesting);
-            $current_nesting[] = $parent;
-            
-            if ($parent->name === 'p') {
-                $this->_splitText($token, $config, $context);
-            }
+            $token = array(new HTMLPurifier_Token_Start('p'));
+            $this->_splitText($text, $token, $config, $context);
+        } elseif ($current_nesting[count($current_nesting)-1]->name == 'p') {
+            // we're not in root node but we're in a paragraph, so don't 
+            // add a paragraph start tag but still perform processing
+            $token = array();
+            $this->_splitText($text, $token, $config, $context);
         }
     }
     
@@ -41,26 +35,25 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
         if (!empty($current_nesting)) return;
         
         // check if the start tag counts as a "block" element
-        $definition = $config->getHTMLDefinition();
-        if (isset($definition->info['p']->auto_close[$token->name])) return;
+        if (!$this->_isInline($token, $config)) return;
         
         // append a paragraph tag before the token
         $token = array(new HTMLPurifier_Token_Start('p'), $token);
     }
     
     /**
-     * Sub-function for auto-paragraphing that takes a token and splits it 
-     * up into paragraphs unconditionally. Requires that a paragraph was
-     * already started
+     * Splits up a text into paragraph tokens and appends them
+     * to the result stream that will replace the original token
+     * @param $data String text data that will be processed
+     *    into paragraphs
+     * @param $result Reference to array of tokens that the
+     *    tags will be appended onto
+     * @param $config Instance of HTMLPurifier_Config
+     * @param $context Instance of HTMLPurifier_Context
+     * @private
      */
-    function _splitText(&$token, $config, &$context) {
-        $dnl = PHP_EOL . PHP_EOL; // double-newline
-        $definition = $config->getHTMLDefinition();
-        $current_nesting =& $context->get('CurrentNesting');
-        
-        $raw_paragraphs = explode($dnl, $token->data);
-        
-        $token = false; // token has been completely dismantled
+    function _splitText($data, &$result, $config, &$context) {
+        $raw_paragraphs = explode(PHP_EOL . PHP_EOL, $data);
         
         // remove empty paragraphs
         $paragraphs = array();
@@ -68,67 +61,62 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
             if (trim($par) !== '') $paragraphs[] = $par;
         }
         
-        $result =& $context->get('OutputTokens');
-        
+        // check if there are no "real" paragraphs to be processed
         if (empty($paragraphs) && count($raw_paragraphs) > 1) {
             $result[] = new HTMLPurifier_Token_End('p');
-            array_pop($current_nesting);
             return;
         }
         
-        foreach ($paragraphs as $data) {
-            $result[] = new HTMLPurifier_Token_Text($data);
+        // append the paragraphs onto the result
+        foreach ($paragraphs as $par) {
+            $result[] = new HTMLPurifier_Token_Text($par);
             $result[] = new HTMLPurifier_Token_End('p');
             $result[] = new HTMLPurifier_Token_Start('p');
         }
         array_pop($result); // remove trailing start token
         
-        // check the outside to determine whether or not end
-        // paragraph tag is needed (it's already there)
-        $end_paragraph = $this->_needsEndTag(
-            $context->get('InputTokens'),
-            $context->get('InputIndex'),
-            $definition
-        );
-        
-        if ($end_paragraph) {
-            // things are good as they stand, remove top-level parent
-            // that we deferred
-            array_pop($current_nesting);
-        } else {
-            // remove the ending tag, no nesting modifications necessary
+        // check the outside to determine whether or not the
+        // end paragraph tag should be removed
+        if ($this->_removeParagraphEnd($config, $context)) {
             array_pop($result);
         }
         
+        
     }
     
     /**
-     * Determines if up-coming code requires an end-paragraph tag,
-     * otherwise, keep the paragraph open (don't make another one)
-     * @protected
+     * Returns boolean whether or not to remove the paragraph end tag
+     * that was automatically added. The paragraph end tag should be removed
+     * unless the next non-whitespace token is a paragraph or block element.
+     * @param $config Instance of HTMLPurifier_Config
+     * @param $context Instance of HTMLPurifier_Context
+     * @private
      */
-    function _needsEndTag($tokens, $k, $definition) {
-        $end_paragraph = false;
-        for ($j = $k + 1; isset($tokens[$j]); $j++) {
-            if ($tokens[$j]->type == 'start' || $tokens[$j]->type == 'empty') {
-                if ($tokens[$j]->name == 'p') {
-                    $end_paragraph = true;
-                } else {
-                    $end_paragraph = isset($definition->info['p']->auto_close[$tokens[$j]->name]);
-                }
-                break;
-            } elseif ($tokens[$j]->type == 'text') {
-                if (!$tokens[$j]->is_whitespace) {
-                    $end_paragraph = false;
-                    break;
-                }
-            } elseif ($tokens[$j]->type == 'end') {
-                // nonsensical case
-                $end_paragraph = false;
+    function _removeParagraphEnd($config, &$context) {
+        $tokens = $context->get('InputTokens');
+        $i = $context->get('InputIndex');
+        $remove_paragraph_end = true;
+        // Start the checks one after the current token's index
+        for ($i++; isset($tokens[$i]); $i++) {
+            if ($tokens[$i]->type == 'start' || $tokens[$i]->type == 'empty') {
+                $definition = $config->getHTMLDefinition();
+                $remove_paragraph_end = $this->_isInline($tokens[$i], $config);
                 break;
             }
+            // check if we can abort early (whitespace means we carry on!)
+            if ($tokens[$i]->type == 'text' && !$tokens[$i]->is_whitespace) break;
+            if ($tokens[$i]->type == 'end') break; // nonsensical
         }
-        return $end_paragraph;
+        return $remove_paragraph_end;
+    }
+    
+    /**
+     * Returns true if passed token is inline (and, ergo, allowed in
+     * paragraph tags)
+     */
+    function _isInline($token, $config) {
+        $definition = $config->getHTMLDefinition();
+        return !isset($definition->info['p']->auto_close[$token->name]);
     }
     
 }
diff --git a/library/HTMLPurifier/Strategy/MakeWellFormed.php b/library/HTMLPurifier/Strategy/MakeWellFormed.php
index 677d09af..e8e605df 100644
--- a/library/HTMLPurifier/Strategy/MakeWellFormed.php
+++ b/library/HTMLPurifier/Strategy/MakeWellFormed.php
@@ -225,7 +225,8 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
                 $current_nesting[] = $token;
             } elseif ($token->type == 'end') {
                 // theoretical: this isn't used because performing
-                // the calculations otherwise is more efficient
+                // the calculations inline is more efficient, and
+                // end tokens currently do not cause a handler invocation
                 array_pop($current_nesting);
             }
         }
diff --git a/tests/HTMLPurifier/Strategy/MakeWellFormedTest.php b/tests/HTMLPurifier/Strategy/MakeWellFormedTest.php
index 20e3bf79..8ab6f907 100644
--- a/tests/HTMLPurifier/Strategy/MakeWellFormedTest.php
+++ b/tests/HTMLPurifier/Strategy/MakeWellFormedTest.php
@@ -92,6 +92,15 @@ Par 1 still</p>'
         $this->assertResult(
 'Par1
 
+Par2',
+            '<p>Par1</p><p>Par2</p>'
+        );
+        
+        $this->assertResult(
+'Par1
+
+ 
+
 Par2',
             '<p>Par1</p><p>Par2</p>'
         );
@@ -149,6 +158,22 @@ Par3',
             '<p>Par<b>1</b></p>'
         );
         
+        $this->assertResult(
+'
+
+Par',
+            '<p>Par</p>'
+        );
+        
+        $this->assertResult(
+'
+
+Par
+
+',
+            '<p>Par</p>'
+        );
+        
     }
     
 }