[1.1.2]

- Documentation updated - API docs now exclude more files that are not classes - Fixed lack of attribute parsing in HTMLPurifier_Lexer_PEARSax3 - (internal) Refactored parseData() to general Lexer class git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@466 48356398-32a2-884e-a903-53898d9a118a
2025-01-03 05:11:52 +00:00 · 2006-09-27 02:09:54 +00:00 · 2006-09-27 02:09:54 +00:00 · 37def0104b
commit 37def0104b
parent d9bb97cc26
8 changed files with 99 additions and 79 deletions
--- a/11
+++ b/11
@ -4,7 +4,7 @@
 # Project related configuration options
 #---------------------------------------------------------------------------
 PROJECT_NAME           = HTML Purifier
-PROJECT_NUMBER         = 1.0.0
+PROJECT_NUMBER         = 1.1.1
 OUTPUT_DIRECTORY       = "C:/Documents and Settings/Edward/My Documents/My Webs/htmlpurifier/docs/doxygen"
 CREATE_SUBDIRS         = NO
 OUTPUT_LANGUAGE        = English
@ -89,9 +89,12 @@ EXCLUDE                =
 EXCLUDE_SYMLINKS       = NO
 EXCLUDE_PATTERNS       = */tests/* \
                         */benchmarks/* \
-                         */docs/phpdoc/* \
+                         */docs/* \
-                         */docs/doxygen/* \
+                         */test-settings.php \
-                         */test-settings.php
+                         */configdoc/* \
                         */test-settings.php \
                         */maintenance/* \
                         */smoketests/*
 EXAMPLE_PATH           = 
 EXAMPLE_PATTERNS       = *
 EXAMPLE_RECURSIVE      = NO
--- a/3
+++ b/3
@ -6,7 +6,10 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
 1.1.2, unknown projected release date
 (bugfix release, may be merged with 1.2.0 if new features precede major bugs)
 - Documentation updated
 - API docs now exclude more files that are not classes
 - Line endings standardized throughout project
 - Fixed lack of attribute parsing in HTMLPurifier_Lexer_PEARSax3
 1.1.1, released 2006-09-24
 - Various documentation updates
--- a/library/HTMLPurifier.php
+++ b/library/HTMLPurifier.php
@ -3,7 +3,7 @@
 /*!
 * @mainpage
 * 
- * HTMLPurifier is an HTML filter that will take an arbitrary snippet of
+ * HTML Purifier is an HTML filter that will take an arbitrary snippet of
 * HTML and rigorously test, validate and filter it into a version that
 * is safe for output onto webpages. It achieves this by:
 * 
@ -22,7 +22,7 @@
 */
 /*
-    HTMLPurifier - Standards Compliant HTML Filtering
+    HTML Purifier - Standards Compliant HTML Filtering
    Copyright (C) 2006 Edward Z. Yang
    This library is free software; you can redistribute it and/or
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@ -60,6 +60,60 @@ class HTMLPurifier_Lexer
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }
    /**
     * Most common entity to raw value conversion table for special entities.
     * 
     * Keys are escaped forms of the special characters (double quote,
     * ampersand, less-than, greater-than and apostrophe); values are the
     * literal characters they stand for. parseData() hands this table to
     * strtr() as a fast path before falling back to the entity parser.
     * @protected
     */
    var $_special_entity2str =
            array(
                    '&quot;' => '"',
                    '&amp;'  => '&',
                    '&lt;'   => '<',
                    '&gt;'   => '>',
                    // the apostrophe is listed in three numeric spellings
                    '&#39;'  => "'",
                    '&#039;' => "'",
                    '&#x27;' => "'"
            );
    /**
     * Parses special entities into the proper characters.
     * 
     * This function translates escaped versions of the special characters
     * into the correct ones.
     * 
     * @warning
     * You should be able to treat the output of this function as
     * completely parsed, but that's only because all other entities should
     * have been handled previously in substituteNonSpecialEntities()
     * 
     * @param $string String character data to be parsed.
     * @returns Parsed character data.
     */
    function parseData($string) {
        // The last-character lookups below need at least one character.
        if ($string === '') {
            return '';
        }

        // Count ampersands that could begin an entity: every '&' except
        // those followed by a space and one sitting at the very end.
        $trailing_amp = ($string[strlen($string) - 1] === '&') ? 1 : 0;
        $candidates = substr_count($string, '&')
                    - substr_count($string, '& ')
                    - $trailing_amp;
        if ($candidates === 0) {
            // nothing that could be an entity, hand the input back untouched
            return $string;
        }

        // Every '&amp;' leaves exactly one literal '&' behind once decoded,
        // so remember how many of those to expect.
        $escaped_amps = substr_count($string, '&amp;');
        $decoded = strtr($string, $this->_special_entity2str);

        // Same count on the decoded text (deliberately repeated inline
        // rather than factored out, to keep this hot path cheap).
        $trailing_amp = ($decoded[strlen($decoded) - 1] === '&') ? 1 : 0;
        $remaining = substr_count($decoded, '&')
                   - substr_count($decoded, '& ')
                   - $trailing_amp;

        if ($remaining > $escaped_amps) {
            // More ampersands survive than the '&amp;' replacements explain:
            // some less common entities are left, so use the entity parser.
            return $this->_entity_parser->substituteSpecialEntities($decoded);
        }
        return $decoded;
    }
    var $_encoder;
    /**
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@ -12,64 +12,12 @@ require_once 'HTMLPurifier/Lexer.php';
 * completely eventually.
 * 
 * @todo Reread XML spec and document differences.
- * @todo Add support for CDATA sections.
+ * 
- * @todo Determine correct behavior in outputting comment data. (preserve dashes?)
+ * @todo Determine correct behavior in transforming comment data. (preserve dashes?)
 * @todo Optimize main function tokenizeHTML().
 * @todo Less than sign (<) being prohibited (even as entity) in attr-values?
 */
 class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
 {
    /**
     * Most common entity to raw value conversion table for special entities.
     * 
     * Keys are escaped forms of the special characters (double quote,
     * ampersand, less-than, greater-than and apostrophe); values are the
     * literal characters they stand for. parseData() hands this table to
     * strtr() as a fast path before falling back to the entity parser.
     * @protected
     */
    var $_special_entity2str =
            array(
                    '&quot;' => '"',
                    '&amp;'  => '&',
                    '&lt;'   => '<',
                    '&gt;'   => '>',
                    // the apostrophe is listed in three numeric spellings
                    '&#39;'  => "'",
                    '&#039;' => "'",
                    '&#x27;' => "'"
            );
    /**
     * Parses special entities into the proper characters.
     * 
     * This function translates escaped versions of the special characters
     * into the correct ones.
     * 
     * @warning
     * You should be able to treat the output of this function as
     * completely parsed, but that's only because all other entities should
     * have been handled previously in substituteNonSpecialEntities()
     * 
     * @param $string String character data to be parsed.
     * @returns Parsed character data.
     */
    function parseData($string) {
        // The last-character checks below index $string[strlen($string)-1];
        // on an empty string that is offset -1 and raises an uninitialized
        // string offset notice, so return early.
        if ($string === '') return '';
        // subtracts amps that cannot possibly be escaped: those followed by
        // a space and one sitting at the very end of the string
        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string)-1] === '&' ? 1 : 0);
        if (!$num_amp) return $string; // abort if no entities
        // each '&amp;' leaves one literal '&' behind after the replacement
        $num_esc_amp = substr_count($string, '&amp;');
        // fast path: replace the common special entities in one pass
        $string = strtr($string, $this->_special_entity2str);
        // code duplication for sake of optimization, see above
        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string)-1] === '&' ? 1 : 0);
        // every remaining ampersand is explained by a decoded '&amp;'
        if ($num_amp_2 <= $num_esc_amp) return $string;
        // hmm... now we have some uncommon entities. Use the callback.
        $string = $this->_entity_parser->substituteSpecialEntities($string);
        return $string;
    }
    /**
     * Whitespace characters for str(c)spn.
     * @protected
--- a/library/HTMLPurifier/Lexer/PEARSax3.php
+++ b/library/HTMLPurifier/Lexer/PEARSax3.php
@ -18,6 +18,8 @@ require_once 'HTMLPurifier/Lexer.php';
 * whatever it does for poorly formed HTML is up to it.
 * 
 * @todo Generalize so that XML_HTMLSax is also supported.
 * 
 * @warning Entity-resolution inside attributes is broken.
 */
 class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
@ -41,6 +43,8 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
        $parser->set_element_handler('openHandler','closeHandler');
        $parser->set_data_handler('dataHandler');
        $parser->set_escape_handler('escapeHandler');
        // doesn't seem to work correctly for attributes
        $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
        $parser->parse($string);
@ -53,6 +57,10 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
     * Open tag event handler, interface is defined by PEAR package.
     */
    function openHandler(&$parser, $name, $attrs, $closed) {
        // entities are not resolved in attrs
        foreach ($attrs as $key => $attr) {
            $attrs[$key] = $this->parseData($attr);
        }
        if ($closed) {
            $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
        } else {
--- a/tests/HTMLPurifier/Lexer/DirectLexTest.php
+++ b/tests/HTMLPurifier/Lexer/DirectLexTest.php
@ -11,24 +11,6 @@ class HTMLPurifier_Lexer_DirectLexTest extends UnitTestCase
        $this->DirectLex = new HTMLPurifier_Lexer_DirectLex();
    }
    function test_parseData() {
        $lexer =& $this->DirectLex;
        // Each case is array(expected output, raw input); checked in order.
        $cases = array(
            array('asdf', 'asdf'),
            array('&', '&amp;'),
            array('"', '&quot;'),
            array("'", '&#039;'),
            array("'", '&#39;'),
            array('&&&', '&amp;&amp;&amp;'),
            // [INVALID] bare ampersand directly after an escaped one
            array('&&', '&amp;&'),
            // [INVALID] bare ampersand followed by a space
            array('Procter & Gamble', 'Procter & Gamble'),
            // This is not special, thus not converted. Test of fault
            // tolerance, realistically speaking, this should never happen
            array('&#x2D;', '&#x2D;')
        );
        foreach ($cases as $case) {
            $this->assertIdentical($case[0], $lexer->parseData($case[1]));
        }
    }
    // internals testing
    function test_parseAttributeString() {
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@ -38,6 +38,25 @@ class HTMLPurifier_LexerTest extends UnitTestCase
        $this->assertIdentical($extract, $result);
    }
    function test_parseData() {
        $lexer =& $this->Lexer;
        // Each case is array(expected output, raw input); checked in order.
        $cases = array(
            array('asdf', 'asdf'),
            array('&', '&amp;'),
            array('"', '&quot;'),
            array("'", '&#039;'),
            array("'", '&#39;'),
            array('&&&', '&amp;&amp;&amp;'),
            // [INVALID] bare ampersand directly after an escaped one
            array('&&', '&amp;&'),
            // [INVALID] bare ampersand followed by a space
            array('Procter & Gamble', 'Procter & Gamble'),
            // This is not special, thus not converted. Test of fault
            // tolerance, realistically speaking, this should never happen
            array('&#x2D;', '&#x2D;')
        );
        foreach ($cases as $case) {
            $this->assertIdentical($case[0], $lexer->parseData($case[1]));
        }
    }
    function test_extractBody() {
        $this->assertExtractBody('<b>Bold</b>');
        $this->assertExtractBody('<html><body><b>Bold</b></body></html>', '<b>Bold</b>');
@ -249,13 +268,16 @@ class HTMLPurifier_LexerTest extends UnitTestCase
               ,new HTMLPurifier_Token_Text('Link')
               ,new HTMLPurifier_Token_End('a')
            );
        $sax_expect[16] = false; // PEARSax doesn't support it!
        // test that UTF-8 is preserved
        $char_hearts = $this->_entity_lookup->table['hearts'];
        $input[17] = $char_hearts;
        $expect[17] = array( new HTMLPurifier_Token_Text($char_hearts) );
        // test weird characters in attributes
        $input[18] = '<br test="x &lt; 6" />';
        $expect[18] = array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) );
        $default_config = HTMLPurifier_Config::createDefault();
        foreach($input as $i => $discard) {
            if (!isset($config[$i])) $config[$i] = $default_config;