From 37def0104b261af268d0d5a41a99902929b207ed Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <edwardzyang@thewritingpot.com>
Date: Wed, 27 Sep 2006 02:09:54 +0000
Subject: [PATCH] [1.1.2] - Documentation updated - API docs now exclude more
 files that are not classes - Fixed lack of attribute parsing in
 HTMLPurifier_Lexer_PEARSax3 - (internal) Refactored parseData() to general
 Lexer class

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@466 48356398-32a2-884e-a903-53898d9a118a
---
 Doxyfile                                   | 11 +++--
 NEWS                                       |  3 ++
 library/HTMLPurifier.php                   |  4 +-
 library/HTMLPurifier/Lexer.php             | 54 +++++++++++++++++++++
 library/HTMLPurifier/Lexer/DirectLex.php   | 56 +---------------------
 library/HTMLPurifier/Lexer/PEARSax3.php    |  8 ++++
 tests/HTMLPurifier/Lexer/DirectLexTest.php | 18 -------
 tests/HTMLPurifier/LexerTest.php           | 24 +++++++++-
 8 files changed, 99 insertions(+), 79 deletions(-)

diff --git a/Doxyfile b/Doxyfile
index 8853c756..8667cae9 100644
--- a/Doxyfile
+++ b/Doxyfile
@@ -4,7 +4,7 @@
 # Project related configuration options
 #---------------------------------------------------------------------------
 PROJECT_NAME           = HTML Purifier
-PROJECT_NUMBER         = 1.0.0
+PROJECT_NUMBER         = 1.1.1
 OUTPUT_DIRECTORY       = "C:/Documents and Settings/Edward/My Documents/My Webs/htmlpurifier/docs/doxygen"
 CREATE_SUBDIRS         = NO
 OUTPUT_LANGUAGE        = English
@@ -89,9 +89,12 @@ EXCLUDE                =
 EXCLUDE_SYMLINKS       = NO
 EXCLUDE_PATTERNS       = */tests/* \
                          */benchmarks/* \
-                         */docs/phpdoc/* \
-                         */docs/doxygen/* \
-                         */test-settings.php
+                         */docs/* \
+                         */test-settings.php \
+                         */configdoc/* \
+                         */test-settings.php \
+                         */maintenance/* \
+                         */smoketests/*
 EXAMPLE_PATH           = 
 EXAMPLE_PATTERNS       = *
 EXAMPLE_RECURSIVE      = NO
diff --git a/NEWS b/NEWS
index 151e09f0..96157e19 100644
--- a/NEWS
+++ b/NEWS
@@ -6,7 +6,10 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
 
 1.1.2, unknown projected release date
 (bugfix release, may be merged with 1.2.0 if new features precede major bugs)
+- Documentation updated
+- API docs now exclude more files that are not classes
 - Line endings standardized throughout project
+- Fixed lack of attribute parsing in HTMLPurifier_Lexer_PEARSax3
 
 1.1.1, released 2006-09-24
 - Various documentation updates
diff --git a/library/HTMLPurifier.php b/library/HTMLPurifier.php
index eeb959e5..f02bf0c2 100644
--- a/library/HTMLPurifier.php
+++ b/library/HTMLPurifier.php
@@ -3,7 +3,7 @@
 /*!
  * @mainpage
  * 
- * HTMLPurifier is an HTML filter that will take an arbitrary snippet of
+ * HTML Purifier is an HTML filter that will take an arbitrary snippet of
  * HTML and rigorously test, validate and filter it into a version that
  * is safe for output onto webpages. It achieves this by:
  * 
@@ -22,7 +22,7 @@
  */
 
 /*
-    HTMLPurifier - Standards Compliant HTML Filtering
+    HTML Purifier - Standards Compliant HTML Filtering
     Copyright (C) 2006 Edward Z. Yang
 
     This library is free software; you can redistribute it and/or
diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php
index e43c7b8d..962cb7bf 100644
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@@ -60,6 +60,60 @@ class HTMLPurifier_Lexer
         $this->_entity_parser = new HTMLPurifier_EntityParser();
     }
     
+    
+    /**
+     * Most common entity to raw value conversion table for special entities.
+     * @protected
+     */
+    var $_special_entity2str =
+            array(
+                    '&quot;' => '"',
+                    '&amp;'  => '&',
+                    '&lt;'   => '<',
+                    '&gt;'   => '>',
+                    '&#39;'  => "'",
+                    '&#039;' => "'",
+                    '&#x27;' => "'"
+            );
+    
+    /**
+     * Parses special entities into the proper characters.
+     * 
+     * This method translates escaped versions of the special characters
+     * into the correct ones.
+     * 
+     * @warning
+     * You should be able to treat the output of this function as
+     * completely parsed, but that's only because all other entities should
+     * have been handled previously in substituteNonSpecialEntities().
+     * 
+     * @param $string String character data to be parsed.
+     * @returns Parsed character data.
+     */
+    function parseData($string) {
+        
+        // following functions require at least one character
+        if ($string === '') return '';
+        
+        // subtracts amps that cannot possibly be escaped
+        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
+            ($string[strlen($string)-1] === '&' ? 1 : 0);
+        
+        if (!$num_amp) return $string; // abort if no entities
+        $num_esc_amp = substr_count($string, '&amp;');
+        $string = strtr($string, $this->_special_entity2str);
+        
+        // code duplication for sake of optimization, see above
+        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
+            ($string[strlen($string)-1] === '&' ? 1 : 0);
+        
+        if ($num_amp_2 <= $num_esc_amp) return $string;
+        
+        // hmm... now we have some uncommon entities. Use the callback.
+        $string = $this->_entity_parser->substituteSpecialEntities($string);
+        return $string;
+    }
+    
     var $_encoder;
     
     /**
diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php
index c2d0a9b0..4b9bff1e 100644
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@@ -12,64 +12,12 @@ require_once 'HTMLPurifier/Lexer.php';
  * completely eventually.
  * 
  * @todo Reread XML spec and document differences.
- * @todo Add support for CDATA sections.
- * @todo Determine correct behavior in outputting comment data. (preserve dashes?)
- * @todo Optimize main function tokenizeHTML().
- * @todo Less than sign (<) being prohibited (even as entity) in attr-values?
+ * 
+ * @todo Determine correct behavior in transforming comment data. (preserve dashes?)
  */
 class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
 {
     
-    /**
-     * Most common entity to raw value conversion table for special entities.
-     * @protected
-     */
-    var $_special_entity2str =
-            array(
-                    '&quot;' => '"',
-                    '&amp;'  => '&',
-                    '&lt;'   => '<',
-                    '&gt;'   => '>',
-                    '&#39;'  => "'",
-                    '&#039;' => "'",
-                    '&#x27;' => "'"
-            );
-    
-    /**
-     * Parses special entities into the proper characters.
-     * 
-     * This string will translate escaped versions of the special characters
-     * into the correct ones.
-     * 
-     * @warning
-     * You should be able to treat the output of this function as
-     * completely parsed, but that's only because all other entities should
-     * have been handled previously in substituteNonSpecialEntities()
-     * 
-     * @param $string String character data to be parsed.
-     * @returns Parsed character data.
-     */
-    function parseData($string) {
-        
-        // subtracts amps that cannot possibly be escaped
-        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
-            ($string[strlen($string)-1] === '&' ? 1 : 0);
-        
-        if (!$num_amp) return $string; // abort if no entities
-        $num_esc_amp = substr_count($string, '&amp;');
-        $string = strtr($string, $this->_special_entity2str);
-        
-        // code duplication for sake of optimization, see above
-        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
-            ($string[strlen($string)-1] === '&' ? 1 : 0);
-        
-        if ($num_amp_2 <= $num_esc_amp) return $string;
-        
-        // hmm... now we have some uncommon entities. Use the callback.
-        $string = $this->_entity_parser->substituteSpecialEntities($string);
-        return $string;
-    }
-    
     /**
      * Whitespace characters for str(c)spn.
      * @protected
diff --git a/library/HTMLPurifier/Lexer/PEARSax3.php b/library/HTMLPurifier/Lexer/PEARSax3.php
index d2d90a12..229b4636 100644
--- a/library/HTMLPurifier/Lexer/PEARSax3.php
+++ b/library/HTMLPurifier/Lexer/PEARSax3.php
@@ -18,6 +18,8 @@ require_once 'HTMLPurifier/Lexer.php';
  * whatever it does for poorly formed HTML is up to it.
  * 
  * @todo Generalize so that XML_HTMLSax is also supported.
+ * 
+ * @warning Entity-resolution inside attributes is broken.
  */
 
 class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
@@ -41,6 +43,8 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
         $parser->set_element_handler('openHandler','closeHandler');
         $parser->set_data_handler('dataHandler');
         $parser->set_escape_handler('escapeHandler');
+        
+        // entity parsing doesn't seem to work for attributes; see openHandler()
         $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
         
         $parser->parse($string);
@@ -53,6 +57,10 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
      * Open tag event handler, interface is defined by PEAR package.
      */
     function openHandler(&$parser, $name, $attrs, $closed) {
+        // the parser does not resolve entities in attribute values, so do it here
+        foreach ($attrs as $key => $attr) {
+            $attrs[$key] = $this->parseData($attr);
+        }
         if ($closed) {
             $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
         } else {
diff --git a/tests/HTMLPurifier/Lexer/DirectLexTest.php b/tests/HTMLPurifier/Lexer/DirectLexTest.php
index 2ad14476..de35c1d1 100644
--- a/tests/HTMLPurifier/Lexer/DirectLexTest.php
+++ b/tests/HTMLPurifier/Lexer/DirectLexTest.php
@@ -11,24 +11,6 @@ class HTMLPurifier_Lexer_DirectLexTest extends UnitTestCase
         $this->DirectLex = new HTMLPurifier_Lexer_DirectLex();
     }
     
-    function test_parseData() {
-        $HP =& $this->DirectLex;
-        
-        $this->assertIdentical('asdf', $HP->parseData('asdf'));
-        $this->assertIdentical('&', $HP->parseData('&amp;'));
-        $this->assertIdentical('"', $HP->parseData('&quot;'));
-        $this->assertIdentical("'", $HP->parseData('&#039;'));
-        $this->assertIdentical("'", $HP->parseData('&#39;'));
-        $this->assertIdentical('&&&', $HP->parseData('&amp;&amp;&amp;'));
-        $this->assertIdentical('&&', $HP->parseData('&amp;&')); // [INVALID]
-        $this->assertIdentical('Procter & Gamble',
-                $HP->parseData('Procter & Gamble')); // [INVALID]
-        
-        // This is not special, thus not converted. Test of fault tolerance,
-        // realistically speaking, this should never happen
-        $this->assertIdentical('&#x2D;', $HP->parseData('&#x2D;'));
-    }
-    
     // internals testing
     function test_parseAttributeString() {
         
diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php
index 25fff13c..1ddc8a67 100644
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@@ -38,6 +38,25 @@ class HTMLPurifier_LexerTest extends UnitTestCase
         $this->assertIdentical($extract, $result);
     }
     
+    function test_parseData() {
+        $HP =& $this->Lexer;
+        
+        $this->assertIdentical('asdf', $HP->parseData('asdf'));
+        $this->assertIdentical('&', $HP->parseData('&amp;'));
+        $this->assertIdentical('"', $HP->parseData('&quot;'));
+        $this->assertIdentical("'", $HP->parseData('&#039;'));
+        $this->assertIdentical("'", $HP->parseData('&#39;'));
+        $this->assertIdentical('&&&', $HP->parseData('&amp;&amp;&amp;'));
+        $this->assertIdentical('&&', $HP->parseData('&amp;&')); // [INVALID]
+        $this->assertIdentical('Procter & Gamble',
+                $HP->parseData('Procter & Gamble')); // [INVALID]
+        
+        // This is not special, thus not converted. Test of fault tolerance,
+        // realistically speaking, this should never happen
+        $this->assertIdentical('&#x2D;', $HP->parseData('&#x2D;'));
+    }
+    
+    
     function test_extractBody() {
         $this->assertExtractBody('<b>Bold</b>');
         $this->assertExtractBody('<html><body><b>Bold</b></body></html>', '<b>Bold</b>');
@@ -249,13 +268,16 @@ class HTMLPurifier_LexerTest extends UnitTestCase
                ,new HTMLPurifier_Token_Text('Link')
                ,new HTMLPurifier_Token_End('a')
             );
-        $sax_expect[16] = false; // PEARSax doesn't support it!
         
         // test that UTF-8 is preserved
         $char_hearts = $this->_entity_lookup->table['hearts'];
         $input[17] = $char_hearts;
         $expect[17] = array( new HTMLPurifier_Token_Text($char_hearts) );
         
+        // test that entities in attribute values are resolved
+        $input[18] = '<br test="x &lt; 6" />';
+        $expect[18] = array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) );
+        
         $default_config = HTMLPurifier_Config::createDefault();
         foreach($input as $i => $discard) {
             if (!isset($config[$i])) $config[$i] = $default_config;