diff --git a/Doxyfile b/Doxyfile
index 8853c756..8e53534f 100644
--- a/Doxyfile
+++ b/Doxyfile
@@ -4,7 +4,7 @@
# Project related configuration options
#---------------------------------------------------------------------------
PROJECT_NAME = HTML Purifier
-PROJECT_NUMBER = 1.0.0
+PROJECT_NUMBER = 1.1.2
OUTPUT_DIRECTORY = "C:/Documents and Settings/Edward/My Documents/My Webs/htmlpurifier/docs/doxygen"
CREATE_SUBDIRS = NO
OUTPUT_LANGUAGE = English
@@ -89,9 +89,12 @@ EXCLUDE =
EXCLUDE_SYMLINKS = NO
EXCLUDE_PATTERNS = */tests/* \
*/benchmarks/* \
- */docs/phpdoc/* \
- */docs/doxygen/* \
- */test-settings.php
+ */docs/* \
+ */test-settings.php \
+ */configdoc/* \
+ */test-settings.php \
+ */maintenance/* \
+ */smoketests/*
EXAMPLE_PATH =
EXAMPLE_PATTERNS = *
EXAMPLE_RECURSIVE = NO
diff --git a/INSTALL b/INSTALL
index b3382056..168d1026 100644
--- a/INSTALL
+++ b/INSTALL
@@ -2,145 +2,183 @@
Install
How to install HTML Purifier
-Being a library, there's no fancy GUI that will take you step-by-step through
-configuring database credentials and other mumbo-jumbo. HTML Purifier is
-designed to run "out of the box." Regardless, there are still a couple of
-things you should be mindful of.
+HTML Purifier is designed to run out of the box, so actually using the library
+is extremely easy. (Although, if you were looking for a step-by-step
+installation GUI, you've come to the wrong place!) The impatient can scroll
+down to the bottom of this INSTALL document to see the code, but you really
+should make sure a few things are properly done.
-0. Compatibility
+1. Compatibility
-HTML Purifier works in both PHP 4 and PHP 5. I have run the test suite on
-these versions:
+HTML Purifier works in both PHP 4 and PHP 5, from PHP 4.3.9 and up. It has no
+core dependencies with other libraries. (Whoopee!)
- - 4.3.9, 4.3.11
- - 4.4.0, 4.4.4
- - 5.0.0, 5.0.4
- - 5.1.0, 5.1.6
-
-And can confidently say that HTML Purifier should work in all versions
-between and afterwards. HTML Purifier definitely does not support PHP 4.2,
-and PHP 4.3 branch support may go further back than that, but I haven't tested
-any earlier versions.
-
-I have been unable to get PHP 5.0.5 working on my computer, so if someone
-wants to test that, be my guest. All tests were done on Windows XP Home,
-but operating system should not be a major factor in the library.
+Optional extensions are iconv (usually installed) and tidy (also common).
+If you use UTF-8 and don't plan on pretty-printing HTML, you can get away with
+not having either of these extensions.
-1. Including the proper files
+2. Including the library
-The library/ directory must be added to your path: HTML Purifier will not be
-able to find the necessary includes otherwise. This is as simple as:
+Simply use:
- set_include_path('/path/to/htmlpurifier/library' . PATH_SEPARATOR .
- get_include_path() );
+ require_once '/path/to/library/HTMLPurifier.auto.php';
-...replacing /path/to/htmlpurifier with the actual location of the folder. Don't
-worry, HTML Purifier is namespaced so unless you have another file named
-HTMLPurifier.php, the files won't collide with any of your includes.
+...and you're good to go. Since HTML Purifier's codebase is fairly
+large, I recommend only including HTML Purifier when you need it.
-Then, it's a simple matter of including the base file:
+If you don't like your include_path to be fiddled around with, simply set
+HTML Purifier's library/ directory to the include path yourself and then:
- require_once 'HTMLPurifier.php';
+ require_once 'HTMLPurifier.php';
-...and you're good to go. The library/ folder contains all the files you need,
-so you can get rid of most of everything else when using the library in a
-production environment.
+Only the contents in the library/ folder are necessary, so you can remove
+everything else when using HTML Purifier in a production environment.
-2. Preparing the proper environment
+3. Preparing the proper output environment
-While no configuration is necessary, you first should take precautions regarding
-the other output HTML that the filtered content will be going along with. Here
-is a (short) checklist:
+HTML Purifier is all about web-standards, so accordingly your webpages should
+be standards compliant. HTML Purifier can deal with these doctypes:
- * Have I specified XHTML 1.0 Transitional as the doctype?
- * Have I specified UTF-8 as the character encoding?
+* XHTML 1.0 Transitional (default)
+* HTML 4.01 Transitional
+
+...and these character encodings:
+
+* UTF-8 (default)
+* Any encoding iconv supports (support is crippled for i18n though)
+
+The defaults are there for a reason: they are best-practice choices that
+should not be changed lightly. For those of you in the dark, you can determine
+the doctype from this code in your HTML documents:
-To find out what these are, browse to your website and view its source code.
-You can figure out the doctype from the a declaration that looks like
-or no doctype. You can figure out the character encoding by looking for
+
+...and the character encoding from this code:
+
-I cannot stress the importance of these two bullets enough. Omitting either
-of them could have dire consequences not only for security but for plain
-old usability. You can find a more in-depth discussion of why this is needed
-in docs/security.txt, in the meantime, try to change your output so this is
-the case. If you can't, well, we might be able to accomodate you (read
-section 3).
+For legacy codebases these declarations may be missing. If that is the case,
+STOP, and read up on character encodings and doctypes (in that order). Here
+are some links:
+
+* http://www.joelonsoftware.com/articles/Unicode.html
+* http://alistapart.com/stories/doctype/
+
+You may currently be vulnerable to XSS and other security threats, and HTML
+Purifier won't be able to fix that.
-3. Configuring HTML Purifier
+4. Configuration
HTML Purifier is designed to run out-of-the-box, but occasionally HTML
-Purifier needs to be told what to do.
+Purifier needs to be told what to do. If you answered no to any of these
+questions, read on, otherwise, you can skip to the next section (or, if you're
+into configuring things just for the heck of it, skip to 4.3).
-If, for some reason, you are unable to switch to UTF-8 immediately, you can
-switch HTML Purifier's encoding. Note that the availability of encodings is
-dependent on iconv, and you'll be missing characters if the charset you
-choose doesn't have them.
+* Am I using UTF-8?
+* Am I using XHTML 1.0 Transitional?
+
+If you answered no to any of these questions, instantiate a configuration
+object and read on:
+
+ $config = HTMLPurifier_Config::createDefault();
+
+
+
+4.1. Setting a different character encoding
+
+You really shouldn't use any other encoding except UTF-8, especially if you
+plan to support multilingual websites (read section three for more details).
+However, switching to UTF-8 is not always immediately feasible, so we can
+adapt.
+
+HTML Purifier uses iconv to support other character encodings, as such,
+any encoding that iconv supports
+HTML Purifier supports with this code:
$config->set('Core', 'Encoding', /* put your encoding here */);
-An example usage for Latin-1 websites:
+An example usage for Latin-1 websites (the most common encoding for English
+websites):
$config->set('Core', 'Encoding', 'ISO-8859-1');
+Note that HTML Purifier's support for non-Unicode encodings is crippled by the
+fact that any character not supported by that encoding will be silently
+dropped, EVEN if it is ampersand escaped. This is a current limitation of
+HTML Purifier that we are NOT actively working to fix. Patches are welcome,
+but there are so many other gotchas and problems in I18N for non-Unicode
+encodings that this functionality is low priority. See
+ for a more
+detailed lowdown on the topic.
+
+
+
+4.2. Setting a different doctype
+
For those of you stuck using HTML 4.01 Transitional, you can disable
XHTML output like this:
$config->set('Core', 'XHTML', false);
-However, I strongly recommend that you use XHTML. Currently, we can only
-guarantee transitional-complaint output, future versions will also allow strict
-output. There are more configuration directives which can be read about
-here: http://hp.jpsband.org/live/configdoc/plain.html
+I recommend that you use XHTML, although not as much as I recommend UTF-8. If
+your HTML 4.01 page validates, good for you!
+
+Currently, we can only guarantee transitional-compliant output; future
+versions will also allow strict-compliant output.
-3. Using the code
+4.3. Other settings
+
+There are more configuration directives which can be read about
+here: They're a bit boring,
+but they can help out for those of you who like to exert maximum control over
+your code.
+
+
+
+5. Using the code
The interface is mind-numbingly simple:
$purifier = new HTMLPurifier();
- $clean_html = $purifier->purify($dirty_html);
+ $clean_html = $purifier->purify( $dirty_html );
-Or, if you're using the configuration object:
+...or, if you're using the configuration object:
$purifier = new HTMLPurifier($config);
- $clean_html = $purifier->purify($dirty_html);
+ $clean_html = $purifier->purify( $dirty_html );
-That's it. For more examples, check out docs/examples/. Also, SLOW gives
-advice on what to do if HTML Purifier is slowing down your application.
+That's it! For more examples, check out docs/examples/ (they aren't very
+different though). Also, SLOW gives advice on what to do if HTML Purifier
+is slowing down your application.
-4. Quick install
+6. Quick install
If your website is in UTF-8 and XHTML Transitional, use this code:
purify($dirty_html);
?>
If your website is in a different encoding or doctype, use this code:
set('Core', 'Encoding', 'ISO-8859-1'); //replace with your encoding
diff --git a/NEWS b/NEWS
index bfcb753b..d312c8ce 100644
--- a/NEWS
+++ b/NEWS
@@ -1,24 +1,37 @@
NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
-1.1.1, released 2006-09-24
-- Various documentation updates
-- Fixed parse error in configuration documentation script
-- Fixed fatal error in benchmark scripts, slightly augmented
-- As far as possible, whitespace is preserved in-between table children
-- Configuration option to optionally Tidy up output for indentation to make up
- for dropped whitespace by DOMLex (pretty-printing for the entire application
- should be done by a page-wide Tidy)
-- Sample test-settings.php file included
+= KEY ====================
+ ! Feature
+ - Bugfix
+ + Sub-comment
+ . Internal change
+==========================
+
+1.1.2, released 2006-09-30
+! Add HTMLPurifier.auto.php stub file that automatically configures paths
+- Documentation updated
+ + INSTALL document rewritten
+ + TODO added semi-lossy conversion
+ + API Doxygen docs' file exclusions updated
+ + Added notes on HTML versus XML attribute whitespace handling
+ + Noted that HTMLPurifier_ChildDef_Custom isn't being used
+ + Noted that config object's definitions are cached versions
+- Fixed lack of attribute parsing in HTMLPurifier_Lexer_PEARSax3
+- ftp:// URIs now have their typecodes checked
+- Hooked up HTMLPurifier_ChildDef_Custom's unit tests (they weren't being run)
+. Line endings standardized throughout project (svn:eol-style standardized)
+. Refactored parseData() to general Lexer class
+. Tester named "HTML Purifier" not "HTMLPurifier"
1.1.0, released 2006-09-16
+! Directive documentation generation using XSLT
+! XHTML can now be turned off, output becomes <br>
- Made URI validator more forgiving: will ignore leading and trailing
quotes, apostrophes and less than or greater than signs.
- Enforce alphanumeric namespace and directive names for configuration.
-- Directive documentation generation using XSLT
- Table child definition made more flexible, will fix up poorly ordered elements
-- XHTML generation can now be turned off, allowing things like
-- Renamed ConfigDef to ConfigSchema
+. Renamed ConfigDef to ConfigSchema
1.0.1, released 2006-09-04
- Fixed slight bug in DOMLex attribute parsing
@@ -28,17 +41,17 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
space in them. This manifested in TinyMCE.
1.0.0, released 2006-09-01
+! Shorthand CSS properties implemented: font, border, background, list-style
+! Basic color keywords translated into hexadecimal values
+! Table CSS properties implemented
+! Support for charsets other than UTF-8 (defined by iconv)
+! Malformed UTF-8 and non-SGML character detection and cleaning implemented
- Fixed broken numeric entity conversion
-- Malformed UTF-8 and non-SGML character detection and cleaning implemented
- API documentation completed
-- Shorthand CSS properties implemented: font, border, background, list-style
-- Basic color keywords translated into hexadecimal values
-- Table CSS properties implemented
-- (HTML|CSS)Definition de-singleton-ized
-- Support for charsets other than UTF-8 (defined by iconv)
+. (HTML|CSS)Definition de-singleton-ized
1.0.0beta, released 2006-08-16
-- First public release, most functionality implemented. Notable omissions are:
- . Shorthand CSS properties
- . Table CSS properties
- . Deprecated attribute transformations
+! First public release, most functionality implemented. Notable omissions are:
+ + Shorthand CSS properties
+ + Table CSS properties
+ + Deprecated attribute transformations
diff --git a/README b/README
index e318049e..78e171ad 100644
--- a/README
+++ b/README
@@ -1,13 +1,13 @@
-
-README
- All about HTMLPurifier
-HTMLPurifier is an HTML filtering solution. It uses a unique combination of
-robust whitelists and agressive parsing to ensure that not only are XSS
-attacks thwarted, but the resulting HTML is standards compliant.
-
-See INSTALL on how to use the library. See docs/ for more developer-oriented
-documentation as well as some code examples. Users of TinyMCE or FCKeditor
-may be especially interested in WYSIWYG.
-
-HTMLPurifier can be found on the web at: http://hp.jpsband.org/
+README
+ All about HTMLPurifier
+
+HTMLPurifier is an HTML filtering solution. It uses a unique combination of
+robust whitelists and aggressive parsing to ensure that not only are XSS
+attacks thwarted, but the resulting HTML is standards compliant.
+
+See INSTALL on how to use the library. See docs/ for more developer-oriented
+documentation as well as some code examples. Users of TinyMCE or FCKeditor
+may be especially interested in WYSIWYG.
+
+HTMLPurifier can be found on the web at: http://hp.jpsband.org/
diff --git a/TODO b/TODO
index 79c32c89..e6a971eb 100644
--- a/TODO
+++ b/TODO
@@ -45,6 +45,8 @@ Unknown release (on a scratch-an-itch basis)
empty-cells:show is applied to have compatibility with Internet Explorer
- Non-lossy dumb alternate character encoding transformations, achieved by
numerically encoding all non-ASCII characters
+ - Semi-lossy dumb alternate character encoding transformations, achieved by
+ encoding all characters that have string entity equivalents
Wontfix
- Non-lossy smart alternate character encoding transformations
diff --git a/configdoc/styles/plain.css b/configdoc/styles/plain.css
index 4a2d2e4b..7857dc1a 100644
--- a/configdoc/styles/plain.css
+++ b/configdoc/styles/plain.css
@@ -1,7 +1,7 @@
-table {border-collapse:collapse;}
-table td, table th {padding:0.2em;}
-
-table.constraints {margin:0 0 1em;}
-table.constraints th {text-align:left;padding-left:0.4em;}
-table.constraints td {padding-right:0.4em;}
-table.constraints td pre {margin:0;}
+table {border-collapse:collapse;}
+table td, table th {padding:0.2em;}
+
+table.constraints {margin:0 0 1em;}
+table.constraints th {text-align:left;padding-left:0.4em;}
+table.constraints td {padding-right:0.4em;}
+table.constraints td pre {margin:0;}
diff --git a/configdoc/styles/plain.xsl b/configdoc/styles/plain.xsl
index d0c60bcf..f4fdb3c2 100644
--- a/configdoc/styles/plain.xsl
+++ b/configdoc/styles/plain.xsl
@@ -1,105 +1,105 @@
-
-
-
-
-
-
-
-
-
- Configuration Documentation
-
-
-
-
-
-
-
-
-
-
-
Configuration Documentation
-
-
-
-
-
-
No configuration directives defined for this namespace.
No configuration directives defined for this namespace.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Used by:
+
+
+ ,
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Type:
+
+
+ type type-
+
+
+
+
+
+
+
Allowed values:
+
+ ,
+ ""
+
+
+
+
+
+
Default:
+
+
+
+
\ No newline at end of file
diff --git a/library/HTMLPurifier.auto.php b/library/HTMLPurifier.auto.php
new file mode 100644
index 00000000..a66fd2e2
--- /dev/null
+++ b/library/HTMLPurifier.auto.php
@@ -0,0 +1,10 @@
+
\ No newline at end of file
diff --git a/library/HTMLPurifier.php b/library/HTMLPurifier.php
index eeb959e5..f02bf0c2 100644
--- a/library/HTMLPurifier.php
+++ b/library/HTMLPurifier.php
@@ -3,7 +3,7 @@
/*!
* @mainpage
*
- * HTMLPurifier is an HTML filter that will take an arbitrary snippet of
+ * HTML Purifier is an HTML filter that will take an arbitrary snippet of
* HTML and rigorously test, validate and filter it into a version that
* is safe for output onto webpages. It achieves this by:
*
@@ -22,7 +22,7 @@
*/
/*
- HTMLPurifier - Standards Compliant HTML Filtering
+ HTML Purifier - Standards Compliant HTML Filtering
Copyright (C) 2006 Edward Z. Yang
This library is free software; you can redistribute it and/or
diff --git a/library/HTMLPurifier/AttrDef.php b/library/HTMLPurifier/AttrDef.php
index 31762ec4..3d04c752 100644
--- a/library/HTMLPurifier/AttrDef.php
+++ b/library/HTMLPurifier/AttrDef.php
@@ -48,7 +48,16 @@ class HTMLPurifier_AttrDef
*
* @note This method is not entirely standards compliant, as trim() removes
* more types of whitespace than specified in the spec. In practice,
- * this is rarely a problem.
+ * this is rarely a problem, as those extra characters usually have
+ * already been removed by HTMLPurifier_Encoder.
+ *
+ * @warning This processing is inconsistent with XML's whitespace handling
+ * as specified by section 3.3.3 and referenced XHTML 1.0 section
+ * 4.7. Compliant processing requires all line breaks normalized
+ * to "\n", so the fix is not as simple as fixing it in this
+ * function. Trim and whitespace collapsing are supposed to only
+ * occur in NMTOKENs. However, note that we are NOT necessarily
+ * parsing XML, thus, this behavior may still be correct.
*
* @public
*/
diff --git a/library/HTMLPurifier/ChildDef.php b/library/HTMLPurifier/ChildDef.php
index e6cc93f8..793ec51a 100644
--- a/library/HTMLPurifier/ChildDef.php
+++ b/library/HTMLPurifier/ChildDef.php
@@ -56,6 +56,8 @@ class HTMLPurifier_ChildDef
*
* @warning Currently this class is an all or nothing proposition, that is,
* it will only give a bool return value.
+ * @note This class is currently not used by any code, although it is unit
+ * tested.
*/
class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
{
diff --git a/library/HTMLPurifier/Config.php b/library/HTMLPurifier/Config.php
index 706dffce..9bffaab3 100644
--- a/library/HTMLPurifier/Config.php
+++ b/library/HTMLPurifier/Config.php
@@ -26,12 +26,12 @@ class HTMLPurifier_Config
var $def;
/**
- * Instance of HTMLPurifier_HTMLDefinition
+ * Cached instance of HTMLPurifier_HTMLDefinition
*/
var $html_definition;
/**
- * Instance of HTMLPurifier_CSSDefinition
+ * Cached instance of HTMLPurifier_CSSDefinition
*/
var $css_definition;
diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php
index e43c7b8d..962cb7bf 100644
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@@ -60,6 +60,60 @@ class HTMLPurifier_Lexer
$this->_entity_parser = new HTMLPurifier_EntityParser();
}
+
+ /**
+ * Most common entity to raw value conversion table for special entities.
+ * @protected
+ */
+ var $_special_entity2str =
+ array(
+ '&quot;' => '"',
+ '&amp;' => '&',
+ '&lt;' => '<',
+ '&gt;' => '>',
+ '&#39;' => "'",
+ '&#039;' => "'",
+ '&#x27;' => "'"
+ );
+
+ /**
+ * Parses special entities into the proper characters.
+ *
+ * This string will translate escaped versions of the special characters
+ * into the correct ones.
+ *
+ * @warning
+ * You should be able to treat the output of this function as
+ * completely parsed, but that's only because all other entities should
+ * have been handled previously in substituteNonSpecialEntities()
+ *
+ * @param $string String character data to be parsed.
+ * @returns Parsed character data.
+ */
+ function parseData($string) {
+
+ // following functions require at least one character
+ if ($string === '') return '';
+
+ // subtracts amps that cannot possibly be escaped
+ $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
+ ($string[strlen($string)-1] === '&' ? 1 : 0);
+
+ if (!$num_amp) return $string; // abort if no entities
+ $num_esc_amp = substr_count($string, '&amp;');
+ $string = strtr($string, $this->_special_entity2str);
+
+ // code duplication for sake of optimization, see above
+ $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
+ ($string[strlen($string)-1] === '&' ? 1 : 0);
+
+ if ($num_amp_2 <= $num_esc_amp) return $string;
+
+ // hmm... now we have some uncommon entities. Use the callback.
+ $string = $this->_entity_parser->substituteSpecialEntities($string);
+ return $string;
+ }
+
var $_encoder;
/**
diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php
index c2d0a9b0..4b9bff1e 100644
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@@ -12,64 +12,12 @@ require_once 'HTMLPurifier/Lexer.php';
* completely eventually.
*
* @todo Reread XML spec and document differences.
- * @todo Add support for CDATA sections.
- * @todo Determine correct behavior in outputting comment data. (preserve dashes?)
- * @todo Optimize main function tokenizeHTML().
- * @todo Less than sign (<) being prohibited (even as entity) in attr-values?
+ *
+ * @todo Determine correct behavior in transforming comment data. (preserve dashes?)
*/
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
{
- /**
- * Most common entity to raw value conversion table for special entities.
- * @protected
- */
- var $_special_entity2str =
- array(
- '&quot;' => '"',
- '&amp;' => '&',
- '&lt;' => '<',
- '&gt;' => '>',
- '&#39;' => "'",
- '&#039;' => "'",
- '&#x27;' => "'"
- );
-
- /**
- * Parses special entities into the proper characters.
- *
- * This string will translate escaped versions of the special characters
- * into the correct ones.
- *
- * @warning
- * You should be able to treat the output of this function as
- * completely parsed, but that's only because all other entities should
- * have been handled previously in substituteNonSpecialEntities()
- *
- * @param $string String character data to be parsed.
- * @returns Parsed character data.
- */
- function parseData($string) {
-
- // subtracts amps that cannot possibly be escaped
- $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
- ($string[strlen($string)-1] === '&' ? 1 : 0);
-
- if (!$num_amp) return $string; // abort if no entities
- $num_esc_amp = substr_count($string, '&amp;');
- $string = strtr($string, $this->_special_entity2str);
-
- // code duplication for sake of optimization, see above
- $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
- ($string[strlen($string)-1] === '&' ? 1 : 0);
-
- if ($num_amp_2 <= $num_esc_amp) return $string;
-
- // hmm... now we have some uncommon entities. Use the callback.
- $string = $this->_entity_parser->substituteSpecialEntities($string);
- return $string;
- }
-
/**
* Whitespace characters for str(c)spn.
* @protected
diff --git a/library/HTMLPurifier/Lexer/PEARSax3.php b/library/HTMLPurifier/Lexer/PEARSax3.php
index d2d90a12..229b4636 100644
--- a/library/HTMLPurifier/Lexer/PEARSax3.php
+++ b/library/HTMLPurifier/Lexer/PEARSax3.php
@@ -18,6 +18,8 @@ require_once 'HTMLPurifier/Lexer.php';
* whatever it does for poorly formed HTML is up to it.
*
* @todo Generalize so that XML_HTMLSax is also supported.
+ *
+ * @warning Entity-resolution inside attributes is broken.
*/
class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
@@ -41,6 +43,8 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
$parser->set_element_handler('openHandler','closeHandler');
$parser->set_data_handler('dataHandler');
$parser->set_escape_handler('escapeHandler');
+
+ // doesn't seem to work correctly for attributes
$parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
$parser->parse($string);
@@ -53,6 +57,10 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
* Open tag event handler, interface is defined by PEAR package.
*/
function openHandler(&$parser, $name, $attrs, $closed) {
+ // entities are not resolved in attrs
+ foreach ($attrs as $key => $attr) {
+ $attrs[$key] = $this->parseData($attr);
+ }
if ($closed) {
$this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
} else {
diff --git a/library/HTMLPurifier/URIScheme/ftp.php b/library/HTMLPurifier/URIScheme/ftp.php
index 0c97d951..c539c354 100644
--- a/library/HTMLPurifier/URIScheme/ftp.php
+++ b/library/HTMLPurifier/URIScheme/ftp.php
@@ -4,7 +4,6 @@ require_once 'HTMLPurifier/URIScheme.php';
/**
* Validates ftp (File Transfer Protocol) URIs as defined by generic RFC 1738.
- * @todo Typecode check on path
*/
class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme {
@@ -16,7 +15,27 @@ class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme {
list($userinfo, $host, $port, $path, $query) =
parent::validateComponents(
$userinfo, $host, $port, $path, $query, $config );
- // typecode check needed on path
+ $semicolon_pos = strrpos($path, ';'); // reverse
+ if ($semicolon_pos !== false) {
+ // typecode check
+ $type = substr($path, $semicolon_pos + 1); // no semicolon
+ $path = substr($path, 0, $semicolon_pos);
+ $type_ret = '';
+ if (strpos($type, '=') !== false) {
+ // figure out whether or not the declaration is correct
+ list($key, $typecode) = explode('=', $type, 2);
+ if ($key !== 'type') {
+ // invalid key, tack it back on encoded
+ $path .= '%3B' . $type;
+ } elseif ($typecode === 'a' || $typecode === 'i' || $typecode === 'd') {
+ $type_ret = ";type=$typecode";
+ }
+ } else {
+ $path .= '%3B' . $type;
+ }
+ $path = str_replace(';', '%3B', $path);
+ $path .= $type_ret;
+ }
return array($userinfo, $host, $port, $path, null);
}
diff --git a/maintenance/.htaccess b/maintenance/.htaccess
index 03688ee9..3a428827 100644
--- a/maintenance/.htaccess
+++ b/maintenance/.htaccess
@@ -1 +1 @@
-Deny from all
+Deny from all
diff --git a/phpdoc.ini b/phpdoc.ini
index c95e2153..87b110ed 100644
--- a/phpdoc.ini
+++ b/phpdoc.ini
@@ -1,100 +1,100 @@
-;; phpDocumentor parse configuration file
-;;
-;; This file is designed to cut down on repetitive typing on the command-line or web interface
-;; You can copy this file to create a number of configuration files that can be used with the
-;; command-line switch -c, as in phpdoc -c default.ini or phpdoc -c myini.ini. The web
-;; interface will automatically generate a list of .ini files that can be used.
-;;
-;; default.ini is used to generate the online manual at http://www.phpdoc.org/docs
-;;
-;; ALL .ini files must be in the user subdirectory of phpDocumentor with an extension of .ini
-;;
-;; Copyright 2002, Greg Beaver
-;;
-;; WARNING: do not change the name of any command-line parameters, phpDocumentor will ignore them
-
-[Parse Data]
-;; title of all the documentation
-;; legal values: any string
-title = HTML Purifier API Documentation
-
-;; parse files that start with a . like .bash_profile
-;; legal values: true, false
-hidden = false
-
-;; show elements marked @access private in documentation by setting this to on
-;; legal values: on, off
-parseprivate = off
-
-;; parse with javadoc-like description (first sentence is always the short description)
-;; legal values: on, off
-javadocdesc = on
-
-;; add any custom @tags separated by commas here
-;; legal values: any legal tagname separated by commas.
-;customtags = mytag1,mytag2
-
-;; This is only used by the XML:DocBook/peardoc2 converter
-defaultcategoryname = Documentation
-
-;; what is the main package?
-;; legal values: alphanumeric string plus - and _
-defaultpackagename = HTMLPurifier
-
-;; output any parsing information? set to on for cron jobs
-;; legal values: on
-;quiet = on
-
-;; parse a PEAR-style repository. Do not turn this on if your project does
-;; not have a parent directory named "pear"
-;; legal values: on/off
-;pear = on
-
-;; where should the documentation be written?
-;; legal values: a legal path
-target = docs/phpdoc
-
-;; Which files should be parsed out as special documentation files, such as README,
-;; INSTALL and CHANGELOG? This overrides the default files found in
-;; phpDocumentor.ini (this file is not a user .ini file, but the global file)
-readmeinstallchangelog = README, INSTALL, NEWS, WYSIWYG, SLOW, LICENSE, CREDITS
-
-;; limit output to the specified packages, even if others are parsed
-;; legal values: package names separated by commas
-;packageoutput = package1,package2
-
-;; comma-separated list of files to parse
-;; legal values: paths separated by commas
-;filename = /path/to/file1,/path/to/file2,fileincurrentdirectory
-
-;; comma-separated list of directories to parse
-;; legal values: directory paths separated by commas
-;directory = /path1,/path2,.,..,subdirectory
-;directory = /home/jeichorn/cvs/pear
-directory = ./
-
-;; template base directory (the equivalent directory of /phpDocumentor)
-;templatebase = /path/to/my/templates
-
-;; directory to find any example files in through @example and {@example} tags
-;examplesdir = /path/to/my/templates
-
-;; comma-separated list of files, directories or wildcards ? and * (any wildcard) to ignore
-;; legal values: any wildcard strings separated by commas
-;ignore = /path/to/ignore*,*list.php,myfile.php,subdirectory/
-ignore = pear-*,templates/,Documentation/,test*.php,Lexer.inc
-
-sourcecode = on
-
-;; comma-separated list of Converters to use in outputformat:Convertername:templatedirectory format
-;; legal values: HTML:frames:default,HTML:frames:l0l33t,HTML:frames:phpdoc.de,HTML:frames:phphtmllib,
-;; HTML:frames:earthli,
-;; HTML:frames:DOM/default,HTML:frames:DOM/l0l33t,HTML:frames:DOM/phpdoc.de,
-;; HTML:frames:DOM/phphtmllib,HTML:frames:DOM/earthli
-;; HTML:Smarty:default,HTML:Smarty:PHP,HTML:Smarty:HandS
-;; PDF:default:default,CHM:default:default,XML:DocBook/peardoc2:default
-output=HTML:frames:default
-
-;; turn this option on if you want highlighted source code for every file
-;; legal values: on/off
+;; phpDocumentor parse configuration file
+;;
+;; This file is designed to cut down on repetitive typing on the command-line or web interface
+;; You can copy this file to create a number of configuration files that can be used with the
+;; command-line switch -c, as in phpdoc -c default.ini or phpdoc -c myini.ini. The web
+;; interface will automatically generate a list of .ini files that can be used.
+;;
+;; default.ini is used to generate the online manual at http://www.phpdoc.org/docs
+;;
+;; ALL .ini files must be in the user subdirectory of phpDocumentor with an extension of .ini
+;;
+;; Copyright 2002, Greg Beaver
+;;
+;; WARNING: do not change the name of any command-line parameters, phpDocumentor will ignore them
+
+[Parse Data]
+;; title of all the documentation
+;; legal values: any string
+title = HTML Purifier API Documentation
+
+;; parse files that start with a . like .bash_profile
+;; legal values: true, false
+hidden = false
+
+;; show elements marked @access private in documentation by setting this to on
+;; legal values: on, off
+parseprivate = off
+
+;; parse with javadoc-like description (first sentence is always the short description)
+;; legal values: on, off
+javadocdesc = on
+
+;; add any custom @tags separated by commas here
+;; legal values: any legal tagname separated by commas.
+;customtags = mytag1,mytag2
+
+;; This is only used by the XML:DocBook/peardoc2 converter
+defaultcategoryname = Documentation
+
+;; what is the main package?
+;; legal values: alphanumeric string plus - and _
+defaultpackagename = HTMLPurifier
+
+;; output any parsing information? set to on for cron jobs
+;; legal values: on
+;quiet = on
+
+;; parse a PEAR-style repository. Do not turn this on if your project does
+;; not have a parent directory named "pear"
+;; legal values: on/off
+;pear = on
+
+;; where should the documentation be written?
+;; legal values: a legal path
+target = docs/phpdoc
+
+;; Which files should be parsed out as special documentation files, such as README,
+;; INSTALL and CHANGELOG? This overrides the default files found in
+;; phpDocumentor.ini (this file is not a user .ini file, but the global file)
+readmeinstallchangelog = README, INSTALL, NEWS, WYSIWYG, SLOW, LICENSE, CREDITS
+
+;; limit output to the specified packages, even if others are parsed
+;; legal values: package names separated by commas
+;packageoutput = package1,package2
+
+;; comma-separated list of files to parse
+;; legal values: paths separated by commas
+;filename = /path/to/file1,/path/to/file2,fileincurrentdirectory
+
+;; comma-separated list of directories to parse
+;; legal values: directory paths separated by commas
+;directory = /path1,/path2,.,..,subdirectory
+;directory = /home/jeichorn/cvs/pear
+directory = ./
+
+;; template base directory (the equivalent directory of /phpDocumentor)
+;templatebase = /path/to/my/templates
+
+;; directory to find any example files in through @example and {@example} tags
+;examplesdir = /path/to/my/templates
+
+;; comma-separated list of files, directories or wildcards ? and * (any wildcard) to ignore
+;; legal values: any wildcard strings separated by commas
+;ignore = /path/to/ignore*,*list.php,myfile.php,subdirectory/
+ignore = pear-*,templates/,Documentation/,test*.php,Lexer.inc
+
+sourcecode = on
+
+;; comma-separated list of Converters to use in outputformat:Convertername:templatedirectory format
+;; legal values: HTML:frames:default,HTML:frames:l0l33t,HTML:frames:phpdoc.de,HTML:frames:phphtmllib,
+;; HTML:frames:earthli,
+;; HTML:frames:DOM/default,HTML:frames:DOM/l0l33t,HTML:frames:DOM/phpdoc.de,
+;; HTML:frames:DOM/phphtmllib,HTML:frames:DOM/earthli
+;; HTML:Smarty:default,HTML:Smarty:PHP,HTML:Smarty:HandS
+;; PDF:default:default,CHM:default:default,XML:DocBook/peardoc2:default
+output=HTML:frames:default
+
+;; turn this option on if you want highlighted source code for every file
+;; legal values: on/off
sourcecode = on
\ No newline at end of file
diff --git a/tests/HTMLPurifier/ChildDefTest.php b/tests/HTMLPurifier/ChildDefTest.php
index be81831f..db00bd20 100644
--- a/tests/HTMLPurifier/ChildDefTest.php
+++ b/tests/HTMLPurifier/ChildDefTest.php
@@ -46,18 +46,23 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase
$this->def = new HTMLPurifier_ChildDef_Custom(
'(a, b?, c*, d+, (a, b)*)');
+ $inputs = array();
+ $expect = array();
+ $config = array();
+
$inputs[0] = '';
$expect[0] = false;
$inputs[1] = '';
$expect[1] = true;
- $inputs[2] = 'Dobfoofoo';
+ $inputs[2] = 'Dobfoofoo';
$expect[2] = true;
$inputs[3] = '';
$expect[3] = false;
+ $this->assertSeries($inputs, $expect, $config);
}
function test_table() {
diff --git a/tests/HTMLPurifier/ConfigTest.php b/tests/HTMLPurifier/ConfigTest.php
index b5f606f4..6812c4d2 100644
--- a/tests/HTMLPurifier/ConfigTest.php
+++ b/tests/HTMLPurifier/ConfigTest.php
@@ -8,6 +8,7 @@ class HTMLPurifier_ConfigTest extends UnitTestCase
var $our_copy, $old_copy;
function setUp() {
+ // set up a dummy schema object for testing
$our_copy = new HTMLPurifier_ConfigSchema();
$this->old_copy = HTMLPurifier_ConfigSchema::instance();
$this->our_copy =& HTMLPurifier_ConfigSchema::instance($our_copy);
@@ -93,6 +94,17 @@ class HTMLPurifier_ConfigTest extends UnitTestCase
}
+ function test_getDefinition() {
+     // a default config must hand out both definition objects
+     $config = HTMLPurifier_Config::createDefault();
+
+     $html_def = $config->getHTMLDefinition();
+     $this->assertIsA($html_def, 'HTMLPurifier_HTMLDefinition');
+
+     $css_def = $config->getCSSDefinition();
+     $this->assertIsA($css_def, 'HTMLPurifier_CSSDefinition');
+ }
+
}
?>
\ No newline at end of file
diff --git a/tests/HTMLPurifier/Lexer/DirectLexTest.php b/tests/HTMLPurifier/Lexer/DirectLexTest.php
index 2ad14476..de35c1d1 100644
--- a/tests/HTMLPurifier/Lexer/DirectLexTest.php
+++ b/tests/HTMLPurifier/Lexer/DirectLexTest.php
@@ -11,24 +11,6 @@ class HTMLPurifier_Lexer_DirectLexTest extends UnitTestCase
$this->DirectLex = new HTMLPurifier_Lexer_DirectLex();
}
- function test_parseData() {
-     $HP =& $this->DirectLex;
-
-     $this->assertIdentical('asdf', $HP->parseData('asdf'));
-     $this->assertIdentical('&', $HP->parseData('&amp;'));
-     $this->assertIdentical('"', $HP->parseData('&quot;'));
-     $this->assertIdentical("'", $HP->parseData('&#039;'));
-     $this->assertIdentical("'", $HP->parseData('&#x27;'));
-     $this->assertIdentical('&&&', $HP->parseData('&amp;&amp;&amp;'));
-     $this->assertIdentical('&&', $HP->parseData('&amp;&')); // [INVALID]
-     $this->assertIdentical('Procter & Gamble',
-                 $HP->parseData('Procter & Gamble')); // [INVALID]
-
-     // This is not special, thus not converted. Test of fault tolerance,
-     // realistically speaking, this should never happen
-     $this->assertIdentical('&#x2D;', $HP->parseData('&#x2D;'));
- }
-
// internals testing
function test_parseAttributeString() {
diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php
index 25fff13c..1ddc8a67 100644
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@@ -38,6 +38,25 @@ class HTMLPurifier_LexerTest extends UnitTestCase
$this->assertIdentical($extract, $result);
}
+ function test_parseData() {
+     $HP =& $this->Lexer;
+     // parseData() receives entity-encoded text; each expectation is the decoded literal
+     $this->assertIdentical('asdf', $HP->parseData('asdf'));
+     $this->assertIdentical('&', $HP->parseData('&amp;'));
+     $this->assertIdentical('"', $HP->parseData('&quot;'));
+     $this->assertIdentical("'", $HP->parseData('&#039;'));
+     $this->assertIdentical("'", $HP->parseData('&#x27;'));
+     $this->assertIdentical('&&&', $HP->parseData('&amp;&amp;&amp;'));
+     $this->assertIdentical('&&', $HP->parseData('&amp;&')); // [INVALID]
+     $this->assertIdentical('Procter & Gamble',
+                 $HP->parseData('Procter & Gamble')); // [INVALID]
+
+     // This is not special, thus not converted. Test of fault tolerance,
+     // realistically speaking, this should never happen
+     $this->assertIdentical('&#x2D;', $HP->parseData('&#x2D;'));
+ }
+
+
function test_extractBody() {
$this->assertExtractBody('Bold');
$this->assertExtractBody('Bold', 'Bold');
@@ -249,13 +268,16 @@ class HTMLPurifier_LexerTest extends UnitTestCase
,new HTMLPurifier_Token_Text('Link')
,new HTMLPurifier_Token_End('a')
);
- $sax_expect[16] = false; // PEARSax doesn't support it!
// test that UTF-8 is preserved
$char_hearts = $this->_entity_lookup->table['hearts'];
$input[17] = $char_hearts;
$expect[17] = array( new HTMLPurifier_Token_Text($char_hearts) );
+ // test weird characters in attributes
+ $input[18] = ' ';
+ $expect[18] = array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) );
+
$default_config = HTMLPurifier_Config::createDefault();
foreach($input as $i => $discard) {
if (!isset($config[$i])) $config[$i] = $default_config;
diff --git a/tests/HTMLPurifier/URISchemeTest.php b/tests/HTMLPurifier/URISchemeTest.php
index 6cc32b5f..7400b8d1 100644
--- a/tests/HTMLPurifier/URISchemeTest.php
+++ b/tests/HTMLPurifier/URISchemeTest.php
@@ -54,12 +54,34 @@ class HTMLPurifier_URISchemeTest extends UnitTestCase
$scheme = new HTMLPurifier_URIScheme_ftp();
$config = HTMLPurifier_Config::createDefault();
+
$this->assertIdentical(
$scheme->validateComponents(
'user', 'www.example.com', 21, '/', 's=foobar', $config),
array('user', 'www.example.com', null, '/', null)
);
+ // valid typecode
+ $this->assertIdentical(
+ $scheme->validateComponents(
+ null, 'www.example.com', null, '/file.txt;type=a', null, $config),
+ array(null, 'www.example.com', null, '/file.txt;type=a', null)
+ );
+
+ // remove invalid typecode
+ $this->assertIdentical(
+ $scheme->validateComponents(
+ null, 'www.example.com', null, '/file.txt;type=z', null, $config),
+ array(null, 'www.example.com', null, '/file.txt', null)
+ );
+
+ // encode errant semicolons
+ $this->assertIdentical(
+ $scheme->validateComponents(
+ null, 'www.example.com', null, '/too;many;semicolons=1', null, $config),
+ array(null, 'www.example.com', null, '/too%3Bmany%3Bsemicolons=1', null)
+ );
+
}
function test_news() {
diff --git a/tests/index.php b/tests/index.php
index 880c5a87..cdb87ec0 100644
--- a/tests/index.php
+++ b/tests/index.php
@@ -114,14 +114,14 @@ if (isset($_GET['file']) && isset($test_file_lookup[$_GET['file']])) {
// execute only one test
$test_file = $_GET['file'];
- $test = new GroupTest('HTMLPurifier - ' . $test_file);
+ $test = new GroupTest('HTML Purifier - ' . $test_file);
$path = 'HTMLPurifier/' . $test_file;
require_once $path;
$test->addTestClass(htmlpurifier_path2class($path));
} else {
- $test = new GroupTest('HTMLPurifier');
+ $test = new GroupTest('HTML Purifier');
foreach ($test_files as $test_file) {
$path = 'HTMLPurifier/' . $test_file;