From 81041455805fc0ff6a3ba1412bfb299de86c1b84 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Sat, 30 Sep 2006 19:10:07 +0000 Subject: [PATCH] Merged 463:474 for 1.1.2 release. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/1.1@475 48356398-32a2-884e-a903-53898d9a118a --- Doxyfile | 11 +- INSTALL | 184 ++++++++++-------- NEWS | 57 +++--- README | 24 +-- TODO | 2 + configdoc/styles/plain.css | 14 +- configdoc/styles/plain.xsl | 208 ++++++++++----------- library/HTMLPurifier.auto.php | 10 + library/HTMLPurifier.php | 4 +- library/HTMLPurifier/AttrDef.php | 11 +- library/HTMLPurifier/ChildDef.php | 2 + library/HTMLPurifier/Config.php | 4 +- library/HTMLPurifier/Lexer.php | 54 ++++++ library/HTMLPurifier/Lexer/DirectLex.php | 56 +----- library/HTMLPurifier/Lexer/PEARSax3.php | 8 + library/HTMLPurifier/URIScheme/ftp.php | 23 ++- maintenance/.htaccess | 2 +- phpdoc.ini | 198 ++++++++++---------- tests/HTMLPurifier/ChildDefTest.php | 7 +- tests/HTMLPurifier/ConfigTest.php | 12 ++ tests/HTMLPurifier/Lexer/DirectLexTest.php | 18 -- tests/HTMLPurifier/LexerTest.php | 24 ++- tests/HTMLPurifier/URISchemeTest.php | 22 +++ tests/index.php | 4 +- 24 files changed, 554 insertions(+), 405 deletions(-) create mode 100644 library/HTMLPurifier.auto.php diff --git a/Doxyfile b/Doxyfile index 8853c756..8e53534f 100644 --- a/Doxyfile +++ b/Doxyfile @@ -4,7 +4,7 @@ # Project related configuration options #--------------------------------------------------------------------------- PROJECT_NAME = HTML Purifier -PROJECT_NUMBER = 1.0.0 +PROJECT_NUMBER = 1.1.2 OUTPUT_DIRECTORY = "C:/Documents and Settings/Edward/My Documents/My Webs/htmlpurifier/docs/doxygen" CREATE_SUBDIRS = NO OUTPUT_LANGUAGE = English @@ -89,9 +89,12 @@ EXCLUDE = EXCLUDE_SYMLINKS = NO EXCLUDE_PATTERNS = */tests/* \ */benchmarks/* \ - */docs/phpdoc/* \ - */docs/doxygen/* \ - */test-settings.php + */docs/* \ + */test-settings.php \ + */configdoc/* \ + */test-settings.php \ + */maintenance/* \ + 
*/smoketests/* EXAMPLE_PATH = EXAMPLE_PATTERNS = * EXAMPLE_RECURSIVE = NO diff --git a/INSTALL b/INSTALL index b3382056..168d1026 100644 --- a/INSTALL +++ b/INSTALL @@ -2,145 +2,183 @@ Install How to install HTML Purifier -Being a library, there's no fancy GUI that will take you step-by-step through -configuring database credentials and other mumbo-jumbo. HTML Purifier is -designed to run "out of the box." Regardless, there are still a couple of -things you should be mindful of. +HTML Purifier is designed to run out of the box, so actually using the library +is extremely easy. (Although, if you were looking for a step-by-step +installation GUI, you've come to the wrong place!) The impatient can scroll +down to the bottom of this INSTALL document to see the code, but you really +should make sure a few things are properly done. -0. Compatibility +1. Compatibility -HTML Purifier works in both PHP 4 and PHP 5. I have run the test suite on -these versions: +HTML Purifier works in both PHP 4 and PHP 5, from PHP 4.3.9 and up. It has no +core dependencies with other libraries. (Whoopee!) - - 4.3.9, 4.3.11 - - 4.4.0, 4.4.4 - - 5.0.0, 5.0.4 - - 5.1.0, 5.1.6 - -And can confidently say that HTML Purifier should work in all versions -between and afterwards. HTML Purifier definitely does not support PHP 4.2, -and PHP 4.3 branch support may go further back than that, but I haven't tested -any earlier versions. - -I have been unable to get PHP 5.0.5 working on my computer, so if someone -wants to test that, be my guest. All tests were done on Windows XP Home, -but operating system should not be a major factor in the library. +Optional extensions are iconv (usually installed) and tidy (also common). +If you use UTF-8 and don't plan on pretty-printing HTML, you can get away with +not having either of these extensions. -1. Including the proper files +2. 
Including the library -The library/ directory must be added to your path: HTML Purifier will not be -able to find the necessary includes otherwise. This is as simple as: +Simply use: - set_include_path('/path/to/htmlpurifier/library' . PATH_SEPARATOR . - get_include_path() ); + require_once '/path/to/library/HTMLPurifier.auto.php'; -...replacing /path/to/htmlpurifier with the actual location of the folder. Don't -worry, HTML Purifier is namespaced so unless you have another file named -HTMLPurifier.php, the files won't collide with any of your includes. +...and you're good to go. Since HTML Purifier's codebase is fairly +large, I recommend only including HTML Purifier when you need it. -Then, it's a simple matter of including the base file: +If you don't like your include_path to be fiddled around with, simply set +HTML Purifier's library/ directory to the include path yourself and then: - require_once 'HTMLPurifier.php'; + require_once 'HTMLPurifier.php'; -...and you're good to go. The library/ folder contains all the files you need, -so you can get rid of most of everything else when using the library in a -production environment. +Only the contents in the library/ folder are necessary, so you can remove +everything else when using HTML Purifier in a production environment. -2. Preparing the proper environment +3. Preparing the proper output environment -While no configuration is necessary, you first should take precautions regarding -the other output HTML that the filtered content will be going along with. Here -is a (short) checklist: +HTML Purifier is all about web-standards, so accordingly your webpages should +be standards compliant. HTML Purifier can deal with these doctypes: - * Have I specified XHTML 1.0 Transitional as the doctype? - * Have I specified UTF-8 as the character encoding? 
+* XHTML 1.0 Transitional (default) +* HTML 4.01 Transitional + +...and these character encodings: + +* UTF-8 (default) +* Any encoding iconv supports (support is crippled for i18n though) + +The defaults are there for a reason: they are best-practice choices that +should not be changed lightly. For those of you in the dark, you can determine +the doctype from this code in your HTML documents: -To find out what these are, browse to your website and view its source code. -You can figure out the doctype from the a declaration that looks like -or no doctype. You can figure out the character encoding by looking for + +...and the character encoding from this code: + -I cannot stress the importance of these two bullets enough. Omitting either -of them could have dire consequences not only for security but for plain -old usability. You can find a more in-depth discussion of why this is needed -in docs/security.txt, in the meantime, try to change your output so this is -the case. If you can't, well, we might be able to accomodate you (read -section 3). +For legacy codebases these declarations may be missing. If that is the case, +STOP, and read up on character encodings and doctypes (in that order). Here +are some links: + +* http://www.joelonsoftware.com/articles/Unicode.html +* http://alistapart.com/stories/doctype/ + +You may currently be vulnerable to XSS and other security threats, and HTML +Purifier won't be able to fix that. -3. Configuring HTML Purifier +4. Configuration HTML Purifier is designed to run out-of-the-box, but occasionally HTML -Purifier needs to be told what to do. +Purifier needs to be told what to do. If you answered no to any of these +questions, read on, otherwise, you can skip to the next section (or, if you're +into configuring things just for the heck of it, skip to 4.3). -If, for some reason, you are unable to switch to UTF-8 immediately, you can -switch HTML Purifier's encoding. 
Note that the availability of encodings is -dependent on iconv, and you'll be missing characters if the charset you -choose doesn't have them. +* Am I using UTF-8? +* Am I using XHTML 1.0 Transitional? + +If you answered no to any of these questions, instantiate a configuration +object and read on: + + $config = HTMLPurifier_Config::createDefault(); + + + +4.1. Setting a different character encoding + +You really shouldn't use any other encoding except UTF-8, especially if you +plan to support multilingual websites (read section three for more details). +However, switching to UTF-8 is not always immediately feasible, so we can +adapt. + +HTML Purifier uses iconv to support other character encodings, as such, +any encoding that iconv supports +HTML Purifier supports with this code: $config->set('Core', 'Encoding', /* put your encoding here */); -An example usage for Latin-1 websites: +An example usage for Latin-1 websites (the most common encoding for English +websites): $config->set('Core', 'Encoding', 'ISO-8859-1'); +Note that HTML Purifier's support for non-Unicode encodings is crippled by the +fact that any character not supported by that encoding will be silently +dropped, EVEN if it is ampersand escaped. This is a current limitation of +HTML Purifier that we are NOT actively working to fix. Patches are welcome, +but there are so many other gotchas and problems in I18N for non-Unicode +encodings that this functionality is low priority. See + for a more +detailed lowdown on the topic. + + + +4.2. Setting a different doctype + For those of you stuck using HTML 4.01 Transitional, you can disable XHTML output like this: $config->set('Core', 'XHTML', false); -However, I strongly recommend that you use XHTML. Currently, we can only -guarantee transitional-complaint output, future versions will also allow strict -output.
There are more configuration directives which can be read about -here: http://hp.jpsband.org/live/configdoc/plain.html +I recommend that you use XHTML, although not as much as I recommend UTF-8. If +your HTML 4.01 page validates, good for you! + +Currently, we can only guarantee transitional-compliant output, future +versions will also allow strict-compliant output. -3. Using the code +4.3. Other settings + +There are more configuration directives which can be read about +here: http://hp.jpsband.org/live/configdoc/plain.html They're a bit boring, +but they can help out for those of you who like to exert maximum control over +your code. + + + +5. Using the code The interface is mind-numbingly simple: $purifier = new HTMLPurifier(); - $clean_html = $purifier->purify($dirty_html); + $clean_html = $purifier->purify( $dirty_html ); -Or, if you're using the configuration object: +...or, if you're using the configuration object: $purifier = new HTMLPurifier($config); - $clean_html = $purifier->purify($dirty_html); + $clean_html = $purifier->purify( $dirty_html ); -That's it. For more examples, check out docs/examples/. Also, SLOW gives -advice on what to do if HTML Purifier is slowing down your application. +That's it! For more examples, check out docs/examples/ (they aren't very +different though). Also, SLOW gives advice on what to do if HTML Purifier +is slowing down your application. -4. Quick install +6.
Quick install If your website is in UTF-8 and XHTML Transitional, use this code: purify($dirty_html); ?> If your website is in a different encoding or doctype, use this code: set('Core', 'Encoding', 'ISO-8859-1'); //replace with your encoding diff --git a/NEWS b/NEWS index bfcb753b..d312c8ce 100644 --- a/NEWS +++ b/NEWS @@ -1,24 +1,37 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| -1.1.1, released 2006-09-24 -- Various documentation updates -- Fixed parse error in configuration documentation script -- Fixed fatal error in benchmark scripts, slightly augmented -- As far as possible, whitespace is preserved in-between table children -- Configuration option to optionally Tidy up output for indentation to make up - for dropped whitespace by DOMLex (pretty-printing for the entire application - should be done by a page-wide Tidy) -- Sample test-settings.php file included += KEY ==================== + ! Feature + - Bugfix + + Sub-comment + . Internal change +========================== + +1.1.2, released 2006-09-30 +! Add HTMLPurifier.auto.php stub file that automatically configures include path +- Documentation updated + + INSTALL document rewritten + + TODO added semi-lossy conversion + + API Doxygen docs' file exclusions updated + + Added notes on HTML versus XML attribute whitespace handling + + Noted that HTMLPurifier_ChildDef_Custom isn't being used + + Noted that config object's definitions are cached versions +- Fixed lack of attribute parsing in HTMLPurifier_Lexer_PEARSax3 +- ftp:// URIs now have their typecodes checked +- Hooked up HTMLPurifier_ChildDef_Custom's unit tests (they weren't being run) +. Line endings standardized throughout project (svn:eol-style standardized) +. Refactored parseData() to general Lexer class +. Tester named "HTML Purifier" not "HTMLPurifier" 1.1.0, released 2006-09-16 +! Directive documentation generation using XSLT +! XHTML can now be turned off, output becomes
- Made URI validator more forgiving: will ignore leading and trailing quotes, apostrophes and less than or greater than signs. - Enforce alphanumeric namespace and directive names for configuration. -- Directive documentation generation using XSLT - Table child definition made more flexible, will fix up poorly ordered elements -- XHTML generation can now be turned off, allowing things like
-- Renamed ConfigDef to ConfigSchema +. Renamed ConfigDef to ConfigSchema 1.0.1, released 2006-09-04 - Fixed slight bug in DOMLex attribute parsing @@ -28,17 +41,17 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier space in them. This manifested in TinyMCE. 1.0.0, released 2006-09-01 +! Shorthand CSS properties implemented: font, border, background, list-style +! Basic color keywords translated into hexadecimal values +! Table CSS properties implemented +! Support for charsets other than UTF-8 (defined by iconv) +! Malformed UTF-8 and non-SGML character detection and cleaning implemented - Fixed broken numeric entity conversion -- Malformed UTF-8 and non-SGML character detection and cleaning implemented - API documentation completed -- Shorthand CSS properties implemented: font, border, background, list-style -- Basic color keywords translated into hexadecimal values -- Table CSS properties implemented -- (HTML|CSS)Definition de-singleton-ized -- Support for charsets other than UTF-8 (defined by iconv) +. (HTML|CSS)Definition de-singleton-ized 1.0.0beta, released 2006-08-16 -- First public release, most functionality implemented. Notable omissions are: - . Shorthand CSS properties - . Table CSS properties - . Deprecated attribute transformations +! First public release, most functionality implemented. Notable omissions are: + + Shorthand CSS properties + + Table CSS properties + + Deprecated attribute transformations diff --git a/README b/README index e318049e..78e171ad 100644 --- a/README +++ b/README @@ -1,13 +1,13 @@ - -README - All about HTMLPurifier -HTMLPurifier is an HTML filtering solution. It uses a unique combination of -robust whitelists and agressive parsing to ensure that not only are XSS -attacks thwarted, but the resulting HTML is standards compliant. - -See INSTALL on how to use the library. See docs/ for more developer-oriented -documentation as well as some code examples. Users of TinyMCE or FCKeditor -may be especially interested in WYSIWYG. 
- -HTMLPurifier can be found on the web at: http://hp.jpsband.org/ +README + All about HTMLPurifier + +HTMLPurifier is an HTML filtering solution. It uses a unique combination of +robust whitelists and aggressive parsing to ensure that not only are XSS +attacks thwarted, but the resulting HTML is standards compliant. + +See INSTALL on how to use the library. See docs/ for more developer-oriented +documentation as well as some code examples. Users of TinyMCE or FCKeditor +may be especially interested in WYSIWYG. + +HTMLPurifier can be found on the web at: http://hp.jpsband.org/ diff --git a/TODO b/TODO index 79c32c89..e6a971eb 100644 --- a/TODO +++ b/TODO @@ -45,6 +45,8 @@ Unknown release (on a scratch-an-itch basis) empty-cells:show is applied to have compatibility with Internet Explorer - Non-lossy dumb alternate character encoding transformations, achieved by numerically encoding all non-ASCII characters + - Semi-lossy dumb alternate character encoding transformations, achieved by + encoding all characters that have string entity equivalents Wontfix - Non-lossy smart alternate character encoding transformations diff --git a/configdoc/styles/plain.css b/configdoc/styles/plain.css index 4a2d2e4b..7857dc1a 100644 --- a/configdoc/styles/plain.css +++ b/configdoc/styles/plain.css @@ -1,7 +1,7 @@ -table {border-collapse:collapse;} -table td, table th {padding:0.2em;} - -table.constraints {margin:0 0 1em;} -table.constraints th {text-align:left;padding-left:0.4em;} -table.constraints td {padding-right:0.4em;} -table.constraints td pre {margin:0;} +table {border-collapse:collapse;} +table td, table th {padding:0.2em;} + +table.constraints {margin:0 0 1em;} +table.constraints th {text-align:left;padding-left:0.4em;} +table.constraints td {padding-right:0.4em;} +table.constraints td pre {margin:0;} diff --git a/configdoc/styles/plain.xsl b/configdoc/styles/plain.xsl index d0c60bcf..f4fdb3c2 100644 --- a/configdoc/styles/plain.xsl +++ b/configdoc/styles/plain.xsl @@ -1,105
+1,105 @@ - - - - - - - - - - <xsl:value-of select="/configdoc/title" /> Configuration Documentation - - - - - - - - - - -

Configuration Documentation

-
- - - - -

No configuration directives defined for this namespace.

-
-
- -

-
- -
- -
-
- - - - - -

-
- - - - - - - - -
Used by: - - , - - -
-
- -
- -
-
- - - - Type: - - - type type- - - - - - - - Allowed values: - - , - "" - - - - - - Default: -
- -
- + + + + + + + + + + <xsl:value-of select="/configdoc/title" /> Configuration Documentation + + + + + + + + + + +

Configuration Documentation

+
+ + + + +

No configuration directives defined for this namespace.

+
+
+ +

+
+ +
+ +
+
+ + + + + +

+
+ + + + + + + + +
Used by: + + , + + +
+
+ +
+ +
+
+ + + + Type: + + + type type- + + + + + + + Allowed values: + + , + "" + + + + + + Default: +
+ +
+
\ No newline at end of file diff --git a/library/HTMLPurifier.auto.php b/library/HTMLPurifier.auto.php new file mode 100644 index 00000000..a66fd2e2 --- /dev/null +++ b/library/HTMLPurifier.auto.php @@ -0,0 +1,10 @@ + \ No newline at end of file diff --git a/library/HTMLPurifier.php b/library/HTMLPurifier.php index eeb959e5..f02bf0c2 100644 --- a/library/HTMLPurifier.php +++ b/library/HTMLPurifier.php @@ -3,7 +3,7 @@ /*! * @mainpage * - * HTMLPurifier is an HTML filter that will take an arbitrary snippet of + * HTML Purifier is an HTML filter that will take an arbitrary snippet of * HTML and rigorously test, validate and filter it into a version that * is safe for output onto webpages. It achieves this by: * @@ -22,7 +22,7 @@ */ /* - HTMLPurifier - Standards Compliant HTML Filtering + HTML Purifier - Standards Compliant HTML Filtering Copyright (C) 2006 Edward Z. Yang This library is free software; you can redistribute it and/or diff --git a/library/HTMLPurifier/AttrDef.php b/library/HTMLPurifier/AttrDef.php index 31762ec4..3d04c752 100644 --- a/library/HTMLPurifier/AttrDef.php +++ b/library/HTMLPurifier/AttrDef.php @@ -48,7 +48,16 @@ class HTMLPurifier_AttrDef * * @note This method is not entirely standards compliant, as trim() removes * more types of whitespace than specified in the spec. In practice, - * this is rarely a problem. + * this is rarely a problem, as those extra characters usually have + * already been removed by HTMLPurifier_Encoder. + * + * @warning This processing is inconsistent with XML's whitespace handling + * as specified by section 3.3.3 and referenced XHTML 1.0 section + * 4.7. Compliant processing requires all line breaks normalized + * to "\n", so the fix is not as simple as fixing it in this + * function. Trim and whitespace collapsing are supposed to only + * occur in NMTOKENs. However, note that we are NOT necessarily + * parsing XML, thus, this behavior may still be correct. 
* * @public */ diff --git a/library/HTMLPurifier/ChildDef.php b/library/HTMLPurifier/ChildDef.php index e6cc93f8..793ec51a 100644 --- a/library/HTMLPurifier/ChildDef.php +++ b/library/HTMLPurifier/ChildDef.php @@ -56,6 +56,8 @@ class HTMLPurifier_ChildDef * * @warning Currently this class is an all or nothing proposition, that is, * it will only give a bool return value. + * @note This class is currently not used by any code, although it is unit + * tested. */ class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef { diff --git a/library/HTMLPurifier/Config.php b/library/HTMLPurifier/Config.php index 706dffce..9bffaab3 100644 --- a/library/HTMLPurifier/Config.php +++ b/library/HTMLPurifier/Config.php @@ -26,12 +26,12 @@ class HTMLPurifier_Config var $def; /** - * Instance of HTMLPurifier_HTMLDefinition + * Cached instance of HTMLPurifier_HTMLDefinition */ var $html_definition; /** - * Instance of HTMLPurifier_CSSDefinition + * Cached instance of HTMLPurifier_CSSDefinition */ var $css_definition; diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php index e43c7b8d..962cb7bf 100644 --- a/library/HTMLPurifier/Lexer.php +++ b/library/HTMLPurifier/Lexer.php @@ -60,6 +60,60 @@ class HTMLPurifier_Lexer $this->_entity_parser = new HTMLPurifier_EntityParser(); } + + /** + * Most common entity to raw value conversion table for special entities. + * @protected + */ + var $_special_entity2str = + array( + '&quot;' => '"', + '&amp;' => '&', + '&lt;' => '<', + '&gt;' => '>', + '&#39;' => "'", + '&#039;' => "'", + '&#x27;' => "'" + ); + + /** + * Parses special entities into the proper characters. + * + * This string will translate escaped versions of the special characters + * into the correct ones. + * + * @warning + * You should be able to treat the output of this function as + * completely parsed, but that's only because all other entities should + * have been handled previously in substituteNonSpecialEntities() + * + * @param $string String character data to be parsed.
+ * @returns Parsed character data. + */ + function parseData($string) { + + // following functions require at least one character + if ($string === '') return ''; + + // subtracts amps that cannot possibly be escaped + $num_amp = substr_count($string, '&') - substr_count($string, '& ') - + ($string[strlen($string)-1] === '&' ? 1 : 0); + + if (!$num_amp) return $string; // abort if no entities + $num_esc_amp = substr_count($string, '&amp;'); + $string = strtr($string, $this->_special_entity2str); + + // code duplication for sake of optimization, see above + $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') - + ($string[strlen($string)-1] === '&' ? 1 : 0); + + if ($num_amp_2 <= $num_esc_amp) return $string; + + // hmm... now we have some uncommon entities. Use the callback. + $string = $this->_entity_parser->substituteSpecialEntities($string); + return $string; + } + var $_encoder; /** diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php index c2d0a9b0..4b9bff1e 100644 --- a/library/HTMLPurifier/Lexer/DirectLex.php +++ b/library/HTMLPurifier/Lexer/DirectLex.php @@ -12,64 +12,12 @@ require_once 'HTMLPurifier/Lexer.php'; * completely eventually. * * @todo Reread XML spec and document differences. - * @todo Add support for CDATA sections. - * @todo Determine correct behavior in outputting comment data. (preserve dashes?) - * @todo Optimize main function tokenizeHTML(). - * @todo Less than sign (<) being prohibited (even as entity) in attr-values? + * + * @todo Determine correct behavior in transforming comment data. (preserve dashes?) */ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer { - /** - * Most common entity to raw value conversion table for special entities. - * @protected - */ - var $_special_entity2str = - array( - '&quot;' => '"', - '&amp;' => '&', - '&lt;' => '<', - '&gt;' => '>', - '&#39;' => "'", - '&#039;' => "'", - '&#x27;' => "'" - ); - - /** - * Parses special entities into the proper characters.
- * - * This string will translate escaped versions of the special characters - * into the correct ones. - * - * @warning - * You should be able to treat the output of this function as - * completely parsed, but that's only because all other entities should - * have been handled previously in substituteNonSpecialEntities() - * - * @param $string String character data to be parsed. - * @returns Parsed character data. - */ - function parseData($string) { - - // subtracts amps that cannot possibly be escaped - $num_amp = substr_count($string, '&') - substr_count($string, '& ') - - ($string[strlen($string)-1] === '&' ? 1 : 0); - - if (!$num_amp) return $string; // abort if no entities - $num_esc_amp = substr_count($string, '&amp;'); - $string = strtr($string, $this->_special_entity2str); - - // code duplication for sake of optimization, see above - $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') - - ($string[strlen($string)-1] === '&' ? 1 : 0); - - if ($num_amp_2 <= $num_esc_amp) return $string; - - // hmm... now we have some uncommon entities. Use the callback. - $string = $this->_entity_parser->substituteSpecialEntities($string); - return $string; - } - /** * Whitespace characters for str(c)spn. * @protected diff --git a/library/HTMLPurifier/Lexer/PEARSax3.php b/library/HTMLPurifier/Lexer/PEARSax3.php index d2d90a12..229b4636 100644 --- a/library/HTMLPurifier/Lexer/PEARSax3.php +++ b/library/HTMLPurifier/Lexer/PEARSax3.php @@ -18,6 +18,8 @@ require_once 'HTMLPurifier/Lexer.php'; * whatever it does for poorly formed HTML is up to it. * * @todo Generalize so that XML_HTMLSax is also supported. + * + * @warning Entity-resolution inside attributes is broken.
*/ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer @@ -41,6 +43,8 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer $parser->set_element_handler('openHandler','closeHandler'); $parser->set_data_handler('dataHandler'); $parser->set_escape_handler('escapeHandler'); + + // doesn't seem to work correctly for attributes $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1); $parser->parse($string); @@ -53,6 +57,10 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer * Open tag event handler, interface is defined by PEAR package. */ function openHandler(&$parser, $name, $attrs, $closed) { + // entities are not resolved in attrs + foreach ($attrs as $key => $attr) { + $attrs[$key] = $this->parseData($attr); + } if ($closed) { $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs); } else { diff --git a/library/HTMLPurifier/URIScheme/ftp.php b/library/HTMLPurifier/URIScheme/ftp.php index 0c97d951..c539c354 100644 --- a/library/HTMLPurifier/URIScheme/ftp.php +++ b/library/HTMLPurifier/URIScheme/ftp.php @@ -4,7 +4,6 @@ require_once 'HTMLPurifier/URIScheme.php'; /** * Validates ftp (File Transfer Protocol) URIs as defined by generic RFC 1738. 
- * @todo Typecode check on path */ class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme { @@ -16,7 +15,27 @@ class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme { list($userinfo, $host, $port, $path, $query) = parent::validateComponents( $userinfo, $host, $port, $path, $query, $config ); - // typecode check needed on path + $semicolon_pos = strrpos($path, ';'); // reverse + if ($semicolon_pos !== false) { + // typecode check + $type = substr($path, $semicolon_pos + 1); // no semicolon + $path = substr($path, 0, $semicolon_pos); + $type_ret = ''; + if (strpos($type, '=') !== false) { + // figure out whether or not the declaration is correct + list($key, $typecode) = explode('=', $type, 2); + if ($key !== 'type') { + // invalid key, tack it back on encoded + $path .= '%3B' . $type; + } elseif ($typecode === 'a' || $typecode === 'i' || $typecode === 'd') { + $type_ret = ";type=$typecode"; + } + } else { + $path .= '%3B' . $type; + } + $path = str_replace(';', '%3B', $path); + $path .= $type_ret; + } return array($userinfo, $host, $port, $path, null); } diff --git a/maintenance/.htaccess b/maintenance/.htaccess index 03688ee9..3a428827 100644 --- a/maintenance/.htaccess +++ b/maintenance/.htaccess @@ -1 +1 @@ -Deny from all +Deny from all diff --git a/phpdoc.ini b/phpdoc.ini index c95e2153..87b110ed 100644 --- a/phpdoc.ini +++ b/phpdoc.ini @@ -1,100 +1,100 @@ -;; phpDocumentor parse configuration file -;; -;; This file is designed to cut down on repetitive typing on the command-line or web interface -;; You can copy this file to create a number of configuration files that can be used with the -;; command-line switch -c, as in phpdoc -c default.ini or phpdoc -c myini.ini. The web -;; interface will automatically generate a list of .ini files that can be used. 
-;; -;; default.ini is used to generate the online manual at http://www.phpdoc.org/docs -;; -;; ALL .ini files must be in the user subdirectory of phpDocumentor with an extension of .ini -;; -;; Copyright 2002, Greg Beaver -;; -;; WARNING: do not change the name of any command-line parameters, phpDocumentor will ignore them - -[Parse Data] -;; title of all the documentation -;; legal values: any string -title = HTML Purifier API Documentation - -;; parse files that start with a . like .bash_profile -;; legal values: true, false -hidden = false - -;; show elements marked @access private in documentation by setting this to on -;; legal values: on, off -parseprivate = off - -;; parse with javadoc-like description (first sentence is always the short description) -;; legal values: on, off -javadocdesc = on - -;; add any custom @tags separated by commas here -;; legal values: any legal tagname separated by commas. -;customtags = mytag1,mytag2 - -;; This is only used by the XML:DocBook/peardoc2 converter -defaultcategoryname = Documentation - -;; what is the main package? -;; legal values: alphanumeric string plus - and _ -defaultpackagename = HTMLPurifier - -;; output any parsing information? set to on for cron jobs -;; legal values: on -;quiet = on - -;; parse a PEAR-style repository. Do not turn this on if your project does -;; not have a parent directory named "pear" -;; legal values: on/off -;pear = on - -;; where should the documentation be written? -;; legal values: a legal path -target = docs/phpdoc - -;; Which files should be parsed out as special documentation files, such as README, -;; INSTALL and CHANGELOG? 
This overrides the default files found in -;; phpDocumentor.ini (this file is not a user .ini file, but the global file) -readmeinstallchangelog = README, INSTALL, NEWS, WYSIWYG, SLOW, LICENSE, CREDITS - -;; limit output to the specified packages, even if others are parsed -;; legal values: package names separated by commas -;packageoutput = package1,package2 - -;; comma-separated list of files to parse -;; legal values: paths separated by commas -;filename = /path/to/file1,/path/to/file2,fileincurrentdirectory - -;; comma-separated list of directories to parse -;; legal values: directory paths separated by commas -;directory = /path1,/path2,.,..,subdirectory -;directory = /home/jeichorn/cvs/pear -directory = ./ - -;; template base directory (the equivalent directory of /phpDocumentor) -;templatebase = /path/to/my/templates - -;; directory to find any example files in through @example and {@example} tags -;examplesdir = /path/to/my/templates - -;; comma-separated list of files, directories or wildcards ? 
and * (any wildcard) to ignore -;; legal values: any wildcard strings separated by commas -;ignore = /path/to/ignore*,*list.php,myfile.php,subdirectory/ -ignore = pear-*,templates/,Documentation/,test*.php,Lexer.inc - -sourcecode = on - -;; comma-separated list of Converters to use in outputformat:Convertername:templatedirectory format -;; legal values: HTML:frames:default,HTML:frames:l0l33t,HTML:frames:phpdoc.de,HTML:frames:phphtmllib, -;; HTML:frames:earthli, -;; HTML:frames:DOM/default,HTML:frames:DOM/l0l33t,HTML:frames:DOM/phpdoc.de, -;; HTML:frames:DOM/phphtmllib,HTML:frames:DOM/earthli -;; HTML:Smarty:default,HTML:Smarty:PHP,HTML:Smarty:HandS -;; PDF:default:default,CHM:default:default,XML:DocBook/peardoc2:default -output=HTML:frames:default - -;; turn this option on if you want highlighted source code for every file -;; legal values: on/off +;; phpDocumentor parse configuration file +;; +;; This file is designed to cut down on repetitive typing on the command-line or web interface +;; You can copy this file to create a number of configuration files that can be used with the +;; command-line switch -c, as in phpdoc -c default.ini or phpdoc -c myini.ini. The web +;; interface will automatically generate a list of .ini files that can be used. +;; +;; default.ini is used to generate the online manual at http://www.phpdoc.org/docs +;; +;; ALL .ini files must be in the user subdirectory of phpDocumentor with an extension of .ini +;; +;; Copyright 2002, Greg Beaver +;; +;; WARNING: do not change the name of any command-line parameters, phpDocumentor will ignore them + +[Parse Data] +;; title of all the documentation +;; legal values: any string +title = HTML Purifier API Documentation + +;; parse files that start with a . 
like .bash_profile +;; legal values: true, false +hidden = false + +;; show elements marked @access private in documentation by setting this to on +;; legal values: on, off +parseprivate = off + +;; parse with javadoc-like description (first sentence is always the short description) +;; legal values: on, off +javadocdesc = on + +;; add any custom @tags separated by commas here +;; legal values: any legal tagname separated by commas. +;customtags = mytag1,mytag2 + +;; This is only used by the XML:DocBook/peardoc2 converter +defaultcategoryname = Documentation + +;; what is the main package? +;; legal values: alphanumeric string plus - and _ +defaultpackagename = HTMLPurifier + +;; output any parsing information? set to on for cron jobs +;; legal values: on +;quiet = on + +;; parse a PEAR-style repository. Do not turn this on if your project does +;; not have a parent directory named "pear" +;; legal values: on/off +;pear = on + +;; where should the documentation be written? +;; legal values: a legal path +target = docs/phpdoc + +;; Which files should be parsed out as special documentation files, such as README, +;; INSTALL and CHANGELOG? 
This overrides the default files found in +;; phpDocumentor.ini (this file is not a user .ini file, but the global file) +readmeinstallchangelog = README, INSTALL, NEWS, WYSIWYG, SLOW, LICENSE, CREDITS + +;; limit output to the specified packages, even if others are parsed +;; legal values: package names separated by commas +;packageoutput = package1,package2 + +;; comma-separated list of files to parse +;; legal values: paths separated by commas +;filename = /path/to/file1,/path/to/file2,fileincurrentdirectory + +;; comma-separated list of directories to parse +;; legal values: directory paths separated by commas +;directory = /path1,/path2,.,..,subdirectory +;directory = /home/jeichorn/cvs/pear +directory = ./ + +;; template base directory (the equivalent directory of /phpDocumentor) +;templatebase = /path/to/my/templates + +;; directory to find any example files in through @example and {@example} tags +;examplesdir = /path/to/my/templates + +;; comma-separated list of files, directories or wildcards ? 
and * (any wildcard) to ignore +;; legal values: any wildcard strings separated by commas +;ignore = /path/to/ignore*,*list.php,myfile.php,subdirectory/ +ignore = pear-*,templates/,Documentation/,test*.php,Lexer.inc + +sourcecode = on + +;; comma-separated list of Converters to use in outputformat:Convertername:templatedirectory format +;; legal values: HTML:frames:default,HTML:frames:l0l33t,HTML:frames:phpdoc.de,HTML:frames:phphtmllib, +;; HTML:frames:earthli, +;; HTML:frames:DOM/default,HTML:frames:DOM/l0l33t,HTML:frames:DOM/phpdoc.de, +;; HTML:frames:DOM/phphtmllib,HTML:frames:DOM/earthli +;; HTML:Smarty:default,HTML:Smarty:PHP,HTML:Smarty:HandS +;; PDF:default:default,CHM:default:default,XML:DocBook/peardoc2:default +output=HTML:frames:default + +;; turn this option on if you want highlighted source code for every file +;; legal values: on/off sourcecode = on \ No newline at end of file diff --git a/tests/HTMLPurifier/ChildDefTest.php b/tests/HTMLPurifier/ChildDefTest.php index be81831f..db00bd20 100644 --- a/tests/HTMLPurifier/ChildDefTest.php +++ b/tests/HTMLPurifier/ChildDefTest.php @@ -46,18 +46,23 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase $this->def = new HTMLPurifier_ChildDef_Custom( '(a, b?, c*, d+, (a, b)*)'); + $inputs = array(); + $expect = array(); + $config = array(); + $inputs[0] = ''; $expect[0] = false; $inputs[1] = ''; $expect[1] = true; - $inputs[2] = 'Dobfoofoo'; + $inputs[2] = 'Dobfoofoo'; $expect[2] = true; $inputs[3] = ''; $expect[3] = false; + $this->assertSeries($inputs, $expect, $config); } function test_table() { diff --git a/tests/HTMLPurifier/ConfigTest.php b/tests/HTMLPurifier/ConfigTest.php index b5f606f4..6812c4d2 100644 --- a/tests/HTMLPurifier/ConfigTest.php +++ b/tests/HTMLPurifier/ConfigTest.php @@ -8,6 +8,7 @@ class HTMLPurifier_ConfigTest extends UnitTestCase var $our_copy, $old_copy; function setUp() { + // set up a dummy schema object for testing $our_copy = new HTMLPurifier_ConfigSchema(); $this->old_copy = 
HTMLPurifier_ConfigSchema::instance(); $this->our_copy =& HTMLPurifier_ConfigSchema::instance($our_copy); @@ -93,6 +94,17 @@ class HTMLPurifier_ConfigTest extends UnitTestCase } + function test_getDefinition() { + + $config = HTMLPurifier_Config::createDefault(); + $def = $config->getHTMLDefinition(); + $this->assertIsA($def, 'HTMLPurifier_HTMLDefinition'); + + $def = $config->getCSSDefinition(); + $this->assertIsA($def, 'HTMLPurifier_CSSDefinition'); + + } + } ?> \ No newline at end of file diff --git a/tests/HTMLPurifier/Lexer/DirectLexTest.php b/tests/HTMLPurifier/Lexer/DirectLexTest.php index 2ad14476..de35c1d1 100644 --- a/tests/HTMLPurifier/Lexer/DirectLexTest.php +++ b/tests/HTMLPurifier/Lexer/DirectLexTest.php @@ -11,24 +11,6 @@ class HTMLPurifier_Lexer_DirectLexTest extends UnitTestCase $this->DirectLex = new HTMLPurifier_Lexer_DirectLex(); } - function test_parseData() { - $HP =& $this->DirectLex; - - $this->assertIdentical('asdf', $HP->parseData('asdf')); - $this->assertIdentical('&', $HP->parseData('&')); - $this->assertIdentical('"', $HP->parseData('"')); - $this->assertIdentical("'", $HP->parseData(''')); - $this->assertIdentical("'", $HP->parseData(''')); - $this->assertIdentical('&&&', $HP->parseData('&&&')); - $this->assertIdentical('&&', $HP->parseData('&&')); // [INVALID] - $this->assertIdentical('Procter & Gamble', - $HP->parseData('Procter & Gamble')); // [INVALID] - - // This is not special, thus not converted. 
Test of fault tolerance, - // realistically speaking, this should never happen - $this->assertIdentical('-', $HP->parseData('-')); - } - // internals testing function test_parseAttributeString() { diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php index 25fff13c..1ddc8a67 100644 --- a/tests/HTMLPurifier/LexerTest.php +++ b/tests/HTMLPurifier/LexerTest.php @@ -38,6 +38,25 @@ class HTMLPurifier_LexerTest extends UnitTestCase $this->assertIdentical($extract, $result); } + function test_parseData() { + $HP =& $this->Lexer; + + $this->assertIdentical('asdf', $HP->parseData('asdf')); + $this->assertIdentical('&', $HP->parseData('&')); + $this->assertIdentical('"', $HP->parseData('"')); + $this->assertIdentical("'", $HP->parseData(''')); + $this->assertIdentical("'", $HP->parseData(''')); + $this->assertIdentical('&&&', $HP->parseData('&&&')); + $this->assertIdentical('&&', $HP->parseData('&&')); // [INVALID] + $this->assertIdentical('Procter & Gamble', + $HP->parseData('Procter & Gamble')); // [INVALID] + + // This is not special, thus not converted. Test of fault tolerance, + // realistically speaking, this should never happen + $this->assertIdentical('-', $HP->parseData('-')); + } + + function test_extractBody() { $this->assertExtractBody('Bold'); $this->assertExtractBody('Bold', 'Bold'); @@ -249,13 +268,16 @@ class HTMLPurifier_LexerTest extends UnitTestCase ,new HTMLPurifier_Token_Text('Link') ,new HTMLPurifier_Token_End('a') ); - $sax_expect[16] = false; // PEARSax doesn't support it! // test that UTF-8 is preserved $char_hearts = $this->_entity_lookup->table['hearts']; $input[17] = $char_hearts; $expect[17] = array( new HTMLPurifier_Token_Text($char_hearts) ); + // test weird characters in attributes + $input[18] = '
'; + $expect[18] = array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) ); + $default_config = HTMLPurifier_Config::createDefault(); foreach($input as $i => $discard) { if (!isset($config[$i])) $config[$i] = $default_config; diff --git a/tests/HTMLPurifier/URISchemeTest.php b/tests/HTMLPurifier/URISchemeTest.php index 6cc32b5f..7400b8d1 100644 --- a/tests/HTMLPurifier/URISchemeTest.php +++ b/tests/HTMLPurifier/URISchemeTest.php @@ -54,12 +54,34 @@ class HTMLPurifier_URISchemeTest extends UnitTestCase $scheme = new HTMLPurifier_URIScheme_ftp(); $config = HTMLPurifier_Config::createDefault(); + $this->assertIdentical( $scheme->validateComponents( 'user', 'www.example.com', 21, '/', 's=foobar', $config), array('user', 'www.example.com', null, '/', null) ); + // valid typecode + $this->assertIdentical( + $scheme->validateComponents( + null, 'www.example.com', null, '/file.txt;type=a', null, $config), + array(null, 'www.example.com', null, '/file.txt;type=a', null) + ); + + // remove invalid typecode + $this->assertIdentical( + $scheme->validateComponents( + null, 'www.example.com', null, '/file.txt;type=z', null, $config), + array(null, 'www.example.com', null, '/file.txt', null) + ); + + // encode errant semicolons + $this->assertIdentical( + $scheme->validateComponents( + null, 'www.example.com', null, '/too;many;semicolons=1', null, $config), + array(null, 'www.example.com', null, '/too%3Bmany%3Bsemicolons=1', null) + ); + } function test_news() { diff --git a/tests/index.php b/tests/index.php index 880c5a87..cdb87ec0 100644 --- a/tests/index.php +++ b/tests/index.php @@ -114,14 +114,14 @@ if (isset($_GET['file']) && isset($test_file_lookup[$_GET['file']])) { // execute only one test $test_file = $_GET['file']; - $test = new GroupTest('HTMLPurifier - ' . $test_file); + $test = new GroupTest('HTML Purifier - ' . $test_file); $path = 'HTMLPurifier/' . 
$test_file; require_once $path; $test->addTestClass(htmlpurifier_path2class($path)); } else { - $test = new GroupTest('HTMLPurifier'); + $test = new GroupTest('HTML Purifier'); foreach ($test_files as $test_file) { $path = 'HTMLPurifier/' . $test_file;