Finish documenting PEARSax3, touch up the other docs. Nuke the original lexer.txt document.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@102 48356398-32a2-884e-a903-53898d9a118a
2024-12-22 08:21:52 +00:00 · 2006-07-23 18:56:00 +00:00 · 2006-07-23 18:56:00 +00:00 · bcc2b09ac7
commit bcc2b09ac7
parent 48cf55eae4
5 changed files with 101 additions and 51 deletions
--- a/docs/lexer.txt
+++ b/docs/lexer.txt
@ -1,41 +0,0 @@
-
-Lexer
-
-The lexer parses a string of SGML-style markup and converts them into
-corresponding tokens. It doesn't check for well-formedness, although it's
-internal mechanism may make this automatic (such as the case of DOMLex).
-
-We have several implementations of the Lexer:
-
-DirectLex [4,5] - our in-house implementation
-    DirectLex has absolutely no dependencies, making it a reasonably good
-    default for PHP4.  Written with efficiency in mind, it is up to two
-    times faster than the PEAR parser.  It will support UTF-8 completely
-    eventually.
-
-PEARSax3 [4,5] - uses the PEAR package XML_HTMLSax3 to parse
-    PEAR, not suprisingly, also has a SAX parser for HTML.  I don't know
-    very much about implementation, but it's fairly well written.  However, that
-    abstraction comes at a price: performance. You need to have it installed,
-    and if the API changes, it might break our adapter. Not sure whether or not
-    it's UTF-8 aware, but it has some entity parsing trouble.
-
-DOMLex [5] - uses the PHP5 core extension DOM to parse
-    In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
-    It gives us a forgiving HTML parser, which we use to transform the HTML
-    into a DOM, and then into the tokens.  It is blazingly fast, and is the
-    default choice for PHP 5.  However, entity resolution may be troublesome,
-    though its UTF-8 is excellent.  Also, any empty elements will have empty
-    tokens associated with them, even if this is prohibited.
-
-We use tokens because creating a DOM representation would:
-
-1. Require more processing power to create,
-2. Require recursion to iterate,
-3. Must be compatible with PHP 5's DOM,
-4. Has the entire document structure (html and body not needed), and
-5. Has unknown readability improvement.
-
-What the last item means is that the functions for manipulating tokens are
-already fairly compact, and when well-commented, more abstraction may not
-be needed.
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@ -6,7 +6,7 @@ require_once 'HTMLPurifier/Token.php';
 * Forgivingly lexes HTML (SGML-style) markup into tokens.
 * 
 * The lexer parses a string of SGML-style markup and converts them into
- * corresponding tokens.  It doesn't check for well-formedness, although it's
+ * corresponding tokens.  It doesn't check for well-formedness, although its
 * internal mechanism may make this automatic (such as the case of
 * HTMLPurifier_Lexer_DOMLex).  There are several implementations to choose
 * from.
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@ -2,7 +2,25 @@

 require_once 'HTMLPurifier/Lexer.php';

-// PHP5 only!
+/**
+ * Parser that uses PHP 5's DOM extension (part of the core).
+ * 
+ * In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
+ * It gives us a forgiving HTML parser, which we use to transform the HTML
+ * into a DOM, and then into the tokens.  It is blazingly fast (for large
+ * documents, it performs twenty times faster than
+ * HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5.
+ * 
+ * @notice
+ * Any empty elements will have empty tokens associated with them, even if
+ * this is prohibited by the spec. This cannot be fixed until the spec
+ * comes into play.
+ * 
+ * @todo Determine DOM's entity parsing behavior, point to local entity files
+ *       if necessary.
+ * @todo Make div access less fragile, and refrain from preprocessing when
+ *       HTML tag and friends are already present.
+ */

 class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
 {
@ -19,6 +37,16 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
            );
    }
    
+    /**
+     * Recursive function that tokenizes a node, putting it into an accumulator.
+     * 
+     * @param $node     DOMNode to be tokenized.
+     * @param $tokens   Array-list of already tokenized tokens.
+     * @param $collect  Says whether or not start and close tags are collected,
+     *                  set to false at first recursion because it's the implicit
+     *                  DIV tag you're dealing with.
+     * @returns Tokens of node appended to previously passed tokens.
+     */
    protected function tokenizeDOM($node, $tokens = array(), $collect = false) {
        // recursive goodness!
        
@ -63,6 +91,12 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
        
    }
    
+    /**
+     * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
+     * 
+     * @param $attribute_list DOMNamedNodeMap of DOMAttr objects.
+     * @returns Associative array of attributes.
+     */
    protected function transformAttrToAssoc($attribute_list) {
        $attribute_array = array();
        // undocumented behavior
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@ -16,6 +16,7 @@ require_once 'HTMLPurifier/Lexer.php';
 * @todo Add support for CDATA sections.
 * @todo Determine correct behavior in outputting comment data. (preserve dashes?)
 * @todo Optimize main function tokenizeHTML().
+ * @todo Less than sign (<) being prohibited (even as entity) in attr-values?
 */
 class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
 {
@ -108,6 +109,10 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
    
    /**
     * Substitutes non-special entities with their parsed equivalents.
+     * 
+     * @protected
+     * @param $string String to have non-special entities parsed.
+     * @returns Parsed string.
     */
    function substituteNonSpecialEntities($string) {
        // it will try to detect missing semicolons, but don't rely on it
@ -119,6 +124,14 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
    
    /**
     * Callback function for substituteNonSpecialEntities() that does the work.
+     * 
+     * @warning Though this is public in order to let the callback happen,
+     *          calling it directly is not recommended.
+     * @param $matches  PCRE-style matches array, with 0 the entire match, and
+     *                  either index 1, 2 or 3 set with a hex value, dec value,
+     *                  or string (respectively).
+     * @returns Replacement string.
+     * @todo Implement string translations
     */
    function nonSpecialEntityCallback($matches) {
        // replaces all but big five
@ -132,14 +145,19 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
        } else {
            if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
            // translate $matches[3]
+            return '';
        }
    }
    
    /**
     * Substitutes only special entities with their parsed equivalents.
     * 
-     * We try to avoid calling this function because otherwise, it would have
-     * to be called a lot (for every parsed section).
+     * @notice We try to avoid calling this function because otherwise, it
+     * would have to be called a lot (for every parsed section).
+     * 
+     * @protected
+     * @param $string String to have special entities parsed.
+     * @returns Parsed string.
     */
    function substituteSpecialEntities($string) {
        return preg_replace_callback(
@ -151,7 +169,14 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
    /**
     * Callback function for substituteSpecialEntities() that does the work.
     * 
-     * This callback is very similar to nonSpecialEntityCallback().
+     * This callback has the same syntax as nonSpecialEntityCallback().
+     * 
+     * @warning Though this is public in order to let the callback happen,
+     *          calling it directly is not recommended.
+     * @param $matches  PCRE-style matches array, with 0 the entire match, and
+     *                  either index 1, 2 or 3 set with a hex value, dec value,
+     *                  or string (respectively).
+     * @returns Replacement string.
     */
    function specialEntityCallback($matches) {
        $entity = $matches[0];
@ -327,7 +352,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
     * Takes the inside of an HTML tag and makes an assoc array of attributes.
     * 
     * @param $string Inside of tag excluding name.
-     * @return Assoc array of attributes.
+     * @returns Assoc array of attributes.
     */
    function parseAttributeString($string) {
        $string = (string) $string; // quick typecast
--- a/library/HTMLPurifier/Lexer/PEARSax3.php
+++ b/library/HTMLPurifier/Lexer/PEARSax3.php
@ -3,14 +3,32 @@
 require_once 'XML/HTMLSax3.php'; // PEAR
 require_once 'HTMLPurifier/Lexer.php';

-// uses the PEAR class XML_HTMLSax3 to parse XML
+/**
+ * Lexer that uses the PEAR package XML_HTMLSax3 to parse HTML.
+ * 
+ * PEAR, not surprisingly, also has a SAX parser for HTML.  I don't know
+ * very much about implementation, but it's fairly well written.  However, that
+ * abstraction comes at a price: performance. You need to have it installed,
+ * and if the API changes, it might break our adapter. Not sure whether or not
+ * it's UTF-8 aware, but it has some entity parsing trouble.
+ * 
+ * Quite personally, I don't recommend using the PEAR class, and the defaults
+ * don't use it. The unit tests do perform the tests on the SAX parser too, but
+ * whatever it does for poorly formed HTML is up to it.
+ * 
+ * @todo Generalize so that XML_HTMLSax is also supported.
+ */
+
 class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
 {
    
-    var $tokens;
+    /**
+     * Internal accumulator array for SAX parsers.
+     * @protected
+     */
+    var $tokens = array();
    
    function tokenizeHTML($html) {
-        $this->tokens = array();
        $parser=& new XML_HTMLSax3();
        $parser->set_object($this);
        $parser->set_element_handler('openHandler','closeHandler');
@ -18,9 +36,14 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
        $parser->set_escape_handler('escapeHandler');
        $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
        $parser->parse($html);
-        return $this->tokens;
+        $tokens = $this->tokens;
+        $this->tokens = array();
+        return $tokens;
    }
    
+    /**
+     * Open tag event handler, interface is defined by PEAR package.
+     */
    function openHandler(&$parser, $name, $attrs, $closed) {
        if ($closed) {
            $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
@ -30,6 +53,9 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
        return true;
    }
    
+    /**
+     * Close tag event handler, interface is defined by PEAR package.
+     */
    function closeHandler(&$parser, $name) {
        // HTMLSax3 seems to always send empty tags an extra close tag
        // check and ignore if you see it:
@ -41,11 +67,17 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
        return true;
    }
    
+    /**
+     * Data event handler, interface is defined by PEAR package.
+     */
    function dataHandler(&$parser, $data) {
        $this->tokens[] = new HTMLPurifier_Token_Text($data);
        return true;
    }
    
+    /**
+     * Escaped text handler, interface is defined by PEAR package.
+     */
    function escapeHandler(&$parser, $data) {
        if (strpos($data, '-') === 0) {
            $this->tokens[] = new HTMLPurifier_Token_Comment($data);