From 80c60bb9b50d2b251996eb4fb82654c9ed92700b Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" <edwardzyang@thewritingpot.com> Date: Sun, 5 Aug 2007 02:02:46 +0000 Subject: [PATCH] Release 2.1.0, merged in 1255 to HEAD. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/strict@1368 48356398-32a2-884e-a903-53898d9a118a --- Doxyfile | 2 +- NEWS | 62 +++ TODO | 23 +- VERSION | 2 +- WHATSNEW | 22 +- benchmarks/.htaccess | 1 + benchmarks/Trace.php | 12 + configdoc/generate.php | 2 + docs/dev-code-quality.txt | 6 +- docs/enduser-security.txt | 4 +- docs/enduser-uri-filter.html | 201 +++++++ docs/enduser-utf8.html | 7 +- docs/index.html | 3 + docs/proposal-filter-levels.txt | 2 +- docs/proposal-new-directives.txt | 5 +- docs/ref-css-length.txt | 28 + docs/style.css | 3 + library/HTMLPurifier.php | 13 +- .../HTMLPurifier/AttrDef/CSS/FontFamily.php | 19 +- library/HTMLPurifier/AttrDef/CSS/URI.php | 2 +- library/HTMLPurifier/AttrDef/URI.php | 354 ++++-------- library/HTMLPurifier/Config.php | 46 +- library/HTMLPurifier/ConfigSchema.php | 201 ++++--- library/HTMLPurifier/ContentSets.php | 1 + .../DefinitionCache/Serializer.php | 2 +- library/HTMLPurifier/EntityLookup.php | 2 +- library/HTMLPurifier/HTMLDefinition.php | 8 +- library/HTMLPurifier/HTMLModule/Ruby.php | 28 + library/HTMLPurifier/HTMLModuleManager.php | 5 +- library/HTMLPurifier/Injector.php | 40 +- .../HTMLPurifier/Injector/AutoParagraph.php | 39 +- library/HTMLPurifier/Injector/Linkify.php | 8 +- .../HTMLPurifier/Injector/PurifierLinkify.php | 11 +- library/HTMLPurifier/Language/messages/en.php | 2 +- library/HTMLPurifier/LanguageFactory.php | 2 +- library/HTMLPurifier/Lexer.php | 10 + library/HTMLPurifier/Lexer/DOMLex.php | 26 + library/HTMLPurifier/Lexer/DirectLex.php | 26 +- library/HTMLPurifier/Printer/ConfigForm.css | 2 +- library/HTMLPurifier/Printer/ConfigForm.php | 56 +- .../HTMLPurifier/Strategy/MakeWellFormed.php | 65 ++- .../Strategy/RemoveForeignElements.php | 50 +- 
.../Strategy/ValidateAttributes.php | 1 + library/HTMLPurifier/URI.php | 119 +++++ library/HTMLPurifier/URIDefinition.php | 145 +++++ library/HTMLPurifier/URIFilter.php | 24 + .../URIFilter/DisableExternal.php | 34 ++ .../URIFilter/DisableExternalResources.php | 26 + .../HTMLPurifier/URIFilter/HostBlacklist.php | 28 + .../HTMLPurifier/URIFilter/MakeAbsolute.php | 115 ++++ library/HTMLPurifier/URIParser.php | 62 +++ library/HTMLPurifier/URIScheme.php | 22 +- library/HTMLPurifier/URIScheme/ftp.php | 30 +- library/HTMLPurifier/URIScheme/http.php | 12 +- library/HTMLPurifier/URIScheme/mailto.php | 13 +- library/HTMLPurifier/URIScheme/news.php | 14 +- library/HTMLPurifier/URIScheme/nntp.php | 12 +- library/HTMLPurifier/URISchemeRegistry.php | 6 +- maintenance/common.php | 9 + maintenance/flush-definition-cache.php | 36 ++ maintenance/flush-htmldefinition-cache.php | 23 - maintenance/generate-entity-file.php | 8 +- maintenance/merge-library.php | 207 +++++++ plugins/phorum/config.default.php | 56 ++ plugins/phorum/htmlpurifier.php | 272 ++++++++++ plugins/phorum/htmlpurifier/LICENSE | 504 ++++++++++++++++++ plugins/phorum/htmlpurifier/README | 1 + plugins/phorum/info.txt | 8 + plugins/phorum/init-config.php | 27 + plugins/phorum/install.txt | 33 ++ plugins/phorum/migrate.bbcode.php | 28 + plugins/phorum/settings.php | 63 +++ plugins/phorum/settings/form.php | 79 +++ plugins/phorum/settings/migrate-sigs-form.php | 21 + plugins/phorum/settings/migrate-sigs.php | 85 +++ plugins/phorum/settings/save.php | 23 + smoketests/testSchema.php | 4 + test-settings.sample.php | 20 +- tests/HTMLPurifier/AttrCollectionsTest.php | 2 +- .../AttrDef/CSS/FontFamilyTest.php | 4 + tests/HTMLPurifier/AttrDef/URITest.php | 324 ++--------- tests/HTMLPurifier/AttrDefHarness.php | 16 +- tests/HTMLPurifier/AttrDefTest.php | 2 +- tests/HTMLPurifier/AttrTransformHarness.php | 4 +- tests/HTMLPurifier/AttrTransformTest.php | 2 +- tests/HTMLPurifier/AttrTypesTest.php | 2 +- 
tests/HTMLPurifier/ChildDefHarness.php | 4 +- tests/HTMLPurifier/ComplexHarness.php | 129 +++++ tests/HTMLPurifier/ConfigSchemaTest.php | 9 +- tests/HTMLPurifier/ConfigTest.php | 2 +- tests/HTMLPurifier/ContextTest.php | 2 +- .../DefinitionCache/SerializerTest.php | 3 +- .../DefinitionCacheFactoryTest.php | 2 +- tests/HTMLPurifier/DefinitionCacheHarness.php | 2 +- tests/HTMLPurifier/DefinitionCacheTest.php | 2 +- tests/HTMLPurifier/DefinitionTest.php | 2 +- tests/HTMLPurifier/DoctypeRegistryTest.php | 2 +- tests/HTMLPurifier/ElementDefTest.php | 2 +- tests/HTMLPurifier/EncoderTest.php | 2 +- tests/HTMLPurifier/EntityLookupTest.php | 2 +- tests/HTMLPurifier/EntityParserTest.php | 2 +- tests/HTMLPurifier/ErrorCollectorTest.php | 2 +- tests/HTMLPurifier/ErrorsHarness.php | 2 +- tests/HTMLPurifier/GeneratorTest.php | 6 +- tests/HTMLPurifier/HTMLDefinitionTest.php | 22 +- tests/HTMLPurifier/HTMLModule/RubyTest.php | 56 ++ tests/HTMLPurifier/HTMLModule/TidyTest.php | 2 +- tests/HTMLPurifier/HTMLModuleManagerTest.php | 2 +- tests/HTMLPurifier/HTMLModuleTest.php | 2 +- tests/HTMLPurifier/Harness.php | 153 ++---- tests/HTMLPurifier/IDAccumulatorTest.php | 2 +- .../Injector/AutoParagraphTest.php | 34 ++ tests/HTMLPurifier/Injector/LinkifyTest.php | 5 + .../Injector/PurifierLinkifyTest.php | 5 + tests/HTMLPurifier/LanguageFactoryTest.php | 2 +- tests/HTMLPurifier/LanguageTest.php | 2 +- tests/HTMLPurifier/Lexer/DirectLexTest.php | 8 +- tests/HTMLPurifier/LexerTest.php | 37 +- tests/HTMLPurifier/PercentEncoderTest.php | 2 +- tests/HTMLPurifier/Strategy/CompositeTest.php | 2 +- .../HTMLPurifier/Strategy/FixNestingTest.php | 12 +- .../Strategy/RemoveForeignElementsTest.php | 11 + .../RemoveForeignElements_ErrorsTest.php | 4 +- tests/HTMLPurifier/StrategyHarness.php | 5 +- tests/HTMLPurifier/TagTransformTest.php | 2 +- tests/HTMLPurifier/TokenFactoryTest.php | 2 +- tests/HTMLPurifier/TokenTest.php | 2 +- tests/HTMLPurifier/URIDefinitionTest.php | 59 ++ 
.../DisableExternalResourcesTest.php | 24 + .../URIFilter/DisableExternalTest.php | 47 ++ .../URIFilter/HostBlacklistTest.php | 30 ++ .../URIFilter/MakeAbsoluteTest.php | 122 +++++ tests/HTMLPurifier/URIFilterHarness.php | 15 + tests/HTMLPurifier/URIHarness.php | 31 ++ tests/HTMLPurifier/URIParserTest.php | 140 +++++ tests/HTMLPurifier/URISchemeRegistryTest.php | 2 +- tests/HTMLPurifier/URISchemeTest.php | 252 +++++---- tests/HTMLPurifier/URITest.php | 166 ++++++ tests/HTMLPurifierTest.php | 19 +- tests/index.php | 10 +- tests/test_files.php | 8 + 141 files changed, 4250 insertions(+), 1155 deletions(-) create mode 100644 benchmarks/.htaccess create mode 100644 benchmarks/Trace.php create mode 100644 docs/enduser-uri-filter.html create mode 100644 docs/ref-css-length.txt create mode 100644 library/HTMLPurifier/HTMLModule/Ruby.php create mode 100644 library/HTMLPurifier/URI.php create mode 100644 library/HTMLPurifier/URIDefinition.php create mode 100644 library/HTMLPurifier/URIFilter.php create mode 100644 library/HTMLPurifier/URIFilter/DisableExternal.php create mode 100644 library/HTMLPurifier/URIFilter/DisableExternalResources.php create mode 100644 library/HTMLPurifier/URIFilter/HostBlacklist.php create mode 100644 library/HTMLPurifier/URIFilter/MakeAbsolute.php create mode 100644 library/HTMLPurifier/URIParser.php create mode 100644 maintenance/common.php create mode 100755 maintenance/flush-definition-cache.php delete mode 100644 maintenance/flush-htmldefinition-cache.php mode change 100644 => 100755 maintenance/generate-entity-file.php create mode 100755 maintenance/merge-library.php create mode 100644 plugins/phorum/config.default.php create mode 100644 plugins/phorum/htmlpurifier.php create mode 100644 plugins/phorum/htmlpurifier/LICENSE create mode 100644 plugins/phorum/htmlpurifier/README create mode 100644 plugins/phorum/info.txt create mode 100644 plugins/phorum/init-config.php create mode 100644 plugins/phorum/install.txt create mode 100644 
plugins/phorum/migrate.bbcode.php create mode 100644 plugins/phorum/settings.php create mode 100644 plugins/phorum/settings/form.php create mode 100644 plugins/phorum/settings/migrate-sigs-form.php create mode 100644 plugins/phorum/settings/migrate-sigs.php create mode 100644 plugins/phorum/settings/save.php create mode 100644 tests/HTMLPurifier/ComplexHarness.php create mode 100644 tests/HTMLPurifier/HTMLModule/RubyTest.php create mode 100644 tests/HTMLPurifier/URIDefinitionTest.php create mode 100644 tests/HTMLPurifier/URIFilter/DisableExternalResourcesTest.php create mode 100644 tests/HTMLPurifier/URIFilter/DisableExternalTest.php create mode 100644 tests/HTMLPurifier/URIFilter/HostBlacklistTest.php create mode 100644 tests/HTMLPurifier/URIFilter/MakeAbsoluteTest.php create mode 100644 tests/HTMLPurifier/URIFilterHarness.php create mode 100644 tests/HTMLPurifier/URIHarness.php create mode 100644 tests/HTMLPurifier/URIParserTest.php create mode 100644 tests/HTMLPurifier/URITest.php mode change 100644 => 100755 tests/index.php diff --git a/Doxyfile b/Doxyfile index 8ecf65ae..9076573d 100644 --- a/Doxyfile +++ b/Doxyfile @@ -4,7 +4,7 @@ # Project related configuration options #--------------------------------------------------------------------------- PROJECT_NAME = HTML Purifier -PROJECT_NUMBER = 2.0.1 +PROJECT_NUMBER = 2.1.1 OUTPUT_DIRECTORY = "C:/Documents and Settings/Edward/My Documents/My Webs/htmlpurifier/docs/doxygen" CREATE_SUBDIRS = NO OUTPUT_LANGUAGE = English diff --git a/NEWS b/NEWS index 19b70259..04bfa37d 100644 --- a/NEWS +++ b/NEWS @@ -9,6 +9,68 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier . Internal change ========================== +2.1.1, released 2007-08-04 +- Fix show-stopper bug in %URI.MakeAbsolute functionality +- Fix PHP4 syntax error in standalone version +. Add prefix directory to include path for standalone, this prevents + other installations from clobbering the standalone's URI schemes +. 
Single test methods can be invoked by prefixing with __only + +2.1.0, released 2007-08-02 +# flush-htmldefinition-cache.php superseded in favor of a generic + flush-definition-cache.php script, you can clear a specific cache + by passing its name as a parameter to the script +! Phorum mod implemented for HTML Purifier +! With %Core.AggressivelyFixLt, <3 and similar emoticons no longer + trigger HTML removal in PHP5 (DOMLex). This directive is not necessary + for PHP4 (DirectLex). +! Standalone file now available, which greatly reduces the amount of + includes (although there are still a few files that reside in the + standalone folder) +! Relative URIs can now be transformed into their absolute equivalents + using %URI.Base and %URI.MakeAbsolute +! Ruby implemented for XHTML 1.1 +! You can now define custom URI filtering behavior, see enduser-uri-filter.html + for more details +! UTF-8 font names now supported in CSS +- AutoFormatters emit friendly error messages if tags or attributes they + need are not allowed +- ConfigForm's compactification of directive names is now configurable +- AutoParagraph autoformatter algorithm refined after field-testing +- XHTML 1.1 now applies XHTML 1.0 Strict cleanup routines, namely + blockquote wrapping +- Contents of <style> tags removed by default when tags are removed +. HTMLPurifier_Config->getSerial() implemented, this is extremely useful + for output cache invalidation +. ConfigForm printer now can retrieve CSS and JS files as strings, in + case HTML Purifier's directory is not publically accessible +. Introduce new text/itext configuration directive values: these represent + longer strings that would be more appropriately edited with a textarea +. Allow newlines to act as separators for lists, hashes, lookups and + %HTML.Allowed +. ConfigForm generates textareas instead of text inputs for lists, hashes, + lookups, text and itext fields +. 
Hidden element content removal genericized: %Core.HiddenElements can + be used to customize this behavior, by default <script> and <style> are + hidden +. Added HTMLPURIFIER_PREFIX constant, should be used instead of dirname(__FILE__) +. Custom ChildDef added to default include list +. URIScheme reflection improved: will not attempt to include file if class + already exists. May clobber autoload, so I need to keep an eye on it +. ConfigSchema heavily optimized, will only collect information and validate + definitions when HTMLPURIFIER_SCHEMA_STRICT is true. +. AttrDef_URI unit tests and implementation refactored +. benchmarks/ directory now protected from public view with .htaccess file; + run the tests via command line +. URI scheme is munged off if there is no authority and the scheme is the + default one +. All unit tests inherit from HTMLPurifier_Harness, not UnitTestCase +. Interface for URIScheme changed +. Generic URI object to hold components of URI added, most systems involved + in URI validation have been migrated to use it +. Custom filtering for URIs factored out to URIDefinition interface for + maximum extensibility + 2.0.1, released 2007-06-27 ! Tag auto-closing now based on a ChildDef heuristic rather than a manually set auto_close array; some behavior may change diff --git a/TODO b/TODO index 5bce0a60..0fa3eb08 100644 --- a/TODO +++ b/TODO @@ -6,14 +6,9 @@ TODO List ? Maybe I'll Do It ========================== -2.1 release [Refactor, refactor!] 
- # URI validation routines tighter (see docs/dev-code-quality.html) (COMPLEX) - # Advanced URI filtering schemes (see docs/proposal-new-directives.txt) - # Ruby support - - Configuration profiles: predefined directives set with one func call - - Implement IDREF support (harder than it seems, since you cannot have - IDREFs to non-existent IDs) - - Allow non-ASCII characters in font names +If no interest is expressed for a feature that may require a considerable +amount of effort to implement, it may get endlessly delayed. Do not be +afraid to cast your vote for the next feature to be implemented! 2.2 release [Error'ed] # Error logging for filtering/cleanup procedures @@ -36,6 +31,8 @@ TODO List 2.4 release [It's All About Trust] (floating) # Implement untrusted, dangerous elements/attributes + # Implement IDREF support (harder than it seems, since you cannot have + IDREFs to non-existent IDs) 3.0 release [Beyond HTML] # Legit token based CSS parsing (will require revamping almost every @@ -60,9 +57,7 @@ TODO List Ongoing - Lots of profiling, make it faster! - Plugins for major CMSes (COMPLEX) - - WordPress (mostly written, needs beta-testing) - phpBB - - Phorum - eFiction - more! (look for ones that use WYSIWYGs) - Complete basic smoketests @@ -71,13 +66,15 @@ Unknown release (on a scratch-an-itch basis) ? Semi-lossy dumb alternate character encoding transfor ? Have 'lang' attribute be checked against official lists, achieved by encoding all characters that have string entity equivalents - - Explain how to use HTML Purifier in non-PHP languages / create - a simple command line stub - Abstract ChildDef_BlockQuote to work with all elements that only allow blocks in them, required or optional - Reorganize Unit Tests - - Refactor loop tests (esp. AttrDef_URI) + - Refactor loop tests: Lexer - Reorganize configuration directives (Create more namespaces! Get messy!) 
+ - Advanced URI filtering schemes (see docs/proposal-new-directives.txt) + - Implement lenient <ruby> child validation + - Explain how to use HTML Purifier in non-PHP languages / create + a simple command line stub (or complicated?) Requested diff --git a/VERSION b/VERSION index 10bf840e..7c327287 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.0.1 \ No newline at end of file +2.1.1 \ No newline at end of file diff --git a/WHATSNEW b/WHATSNEW index 2f0b2d9d..a08edbb5 100644 --- a/WHATSNEW +++ b/WHATSNEW @@ -1,12 +1,10 @@ -The 2.0.1 release introduces a number of stability and usability fixes, -as well as a number of (disabled by default) experimental features. The -security-minded should note that a reflected XSS vulnerability was patched -in smoketests/configForm.php; if you cannot upgrade immediately, please -delete that file (if that directory is not publically accessible, there -is no security risk). The maintenance changes include more helpful file -permissions errors, internal newline normalization, reordered includes -to prevent a missing class definition in some setups, and better cache -revision and id handling. The two experimental features are auto-formatting -(auto-paragraphing and linkification) and error collection, these can -be enabled with %AutoFormat.AutoParagraph, %AutoFormat.Linkify and -%Core.CollectErrors respectively. +In version 2.1, HTML Purifier's URI validation and filtering handling +system has been revamped with a new, extensible URIFilter system. Also +notable features include preservation of emoticons in PHP5 with +%Core.AggressivelyFixLt, standalone and lite download versions, +transforming relative URIs to absolute URIs, Ruby in XHTML 1.1, a Phorum +mod, and UTF-8 font names. Notable bug-fixes include refinement of +the auto-paragraphing algorithm (no longer experimental), better XHTML +1.1 support and the removal of the contents of <style> elements. 
Version +2.1.1 amends a few bugs in some of the newly introduced features, namely +running the standalone download version in PHP4 and %URI.MakeAbsolute. diff --git a/benchmarks/.htaccess b/benchmarks/.htaccess new file mode 100644 index 00000000..3a428827 --- /dev/null +++ b/benchmarks/.htaccess @@ -0,0 +1 @@ +Deny from all diff --git a/benchmarks/Trace.php b/benchmarks/Trace.php new file mode 100644 index 00000000..fa98ffac --- /dev/null +++ b/benchmarks/Trace.php @@ -0,0 +1,12 @@ +<?php + +ini_set('xdebug.trace_format', 1); +ini_set('xdebug.show_mem_delta', true); + +xdebug_start_trace(dirname(__FILE__) . '/Trace'); +require_once '../library/HTMLPurifier.auto.php'; + +$purifier = new HTMLPurifier(); + +$data = $purifier->purify(file_get_contents('samples/Lexer/4.html')); +xdebug_stop_trace(); diff --git a/configdoc/generate.php b/configdoc/generate.php index 97e96433..9e73f4c7 100644 --- a/configdoc/generate.php +++ b/configdoc/generate.php @@ -18,6 +18,8 @@ TODO: if (version_compare('5', PHP_VERSION, '>')) exit('Requires PHP 5 or higher.'); error_reporting(E_ALL); // probably not possible to use E_STRICT +define('HTMLPURIFIER_SCHEMA_STRICT', true); // description data needs to be collected + // load dual-libraries require_once '../library/HTMLPurifier.auto.php'; require_once 'library/ConfigDoc.auto.php'; diff --git a/docs/dev-code-quality.txt b/docs/dev-code-quality.txt index 7c09a22c..10e21cb7 100644 --- a/docs/dev-code-quality.txt +++ b/docs/dev-code-quality.txt @@ -11,8 +11,7 @@ docs/examples/demo.php - ad hoc HTML/PHP soup to the extreme AttrDef - a lot of duplication, more generic classes need to be created; a lot of strtolower() calls, no legit casing - Class - doesn't support Unicode characters (fringe); uses regular - expressions + Class - doesn't support Unicode characters (fringe); uses regular expressions Lang - code duplication; premature optimization Length - easily mistaken for CSSLength URI - multiple regular expressions; missing validation for parts 
(?) @@ -22,9 +21,6 @@ ConfigSchema - redefinition is a mess Strategy FixNesting - cannot bubble nodes out of structures, duplicated checks for special-case parent node - MakeWellFormed - insufficient automatic closing definitions (check HTML - spec for optional end tags, also, closing based on type (block/inline) - might be efficient). RemoveForeignElements - should be run in parallel with MakeWellFormed URIScheme - needs to have callable generic checks mailto - doesn't validate emails, doesn't validate querystring diff --git a/docs/enduser-security.txt b/docs/enduser-security.txt index 49aff331..dd856395 100644 --- a/docs/enduser-security.txt +++ b/docs/enduser-security.txt @@ -10,9 +10,7 @@ to be effective. Things to remember: 2. IDs: see enduser-id.html for more info -3. Links: document pending feature completion -Rudimentary blacklisting, we should also allow only relative URIs. We -need a doc to explain the stuff. +3. URIs: see enduser-uri-filter.html 4. CSS: document pending Explain which CSS styles we blocked and why. diff --git a/docs/enduser-uri-filter.html b/docs/enduser-uri-filter.html new file mode 100644 index 00000000..04a611f8 --- /dev/null +++ b/docs/enduser-uri-filter.html @@ -0,0 +1,201 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head> +<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> +<meta name="description" content="Tutorial for creating custom URI filters." 
/> +<link rel="stylesheet" type="text/css" href="style.css" /> + +<title>URI Filters - HTML Purifier</title> + +</head><body> + +<h1>URI Filters</h1> + +<div id="filing">Filed under End-User</div> +<div id="index">Return to the <a href="index.html">index</a>.</div> +<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div> + +<p> + This is a quick and dirty document to get you on your way to writing + custom URI filters for your own URL filtering needs. Why would you + want to write a URI filter? If you need URIs your users put into + HTML to magically change into a different URI, this is + exactly what you need! +</p> + +<h2>Creating the class</h2> + +<p> + Any URI filter you make will be a subclass of <code>HTMLPurifier_URIFilter</code>. + The scaffolding is thus: +</p> + +<pre>class HTMLPurifier_URIFilter_<strong>NameOfFilter</strong> extends HTMLPurifier_URIFilter +{ + var $name = '<strong>NameOfFilter</strong>'; + function prepare($config) {} + function filter(&$uri, $config, &$context) {} +}</pre> + +<p> + Fill in the variable <code>$name</code> with the name of your filter, and + take a look at the two methods. <code>prepare()</code> is an initialization + method that is called only once, before any filtering has been done of the + HTML. Use it to perform any costly setup work that only needs to be done + once. <code>filter()</code> is the guts and innards of our filter: + it takes the URI and does whatever needs to be done to it. +</p> + +<p> + If you've worked with HTML Purifier, you'll recognize the <code>$config</code> + and <code>$context</code> parameters. On the other hand, <code>$uri</code> + is something unique to this section of the application: it's a + <code>HTMLPurifier_URI</code> object. 
The interface is thus: +</p> + +<pre>class HTMLPurifier_URI +{ + var $scheme, $userinfo, $host, $port, $path, $query, $fragment; + function HTMLPurifier_URI($scheme, $userinfo, $host, $port, $path, $query, $fragment); + function toString(); + function copy(); + function getSchemeObj($config, &$context); + function validate($config, &$context); +}</pre> + +<p> + The first three methods are fairly self-explanatory: you have a constructor, + a serializer, and a cloner. Generally, you won't be using them when + you are manipulating the URI objects themselves. + <code>getSchemeObj()</code> is a special purpose method that returns + a <code>HTMLPurifier_URIScheme</code> object corresponding to the specific + URI at hand. <code>validate()</code> performs general-purpose validation + on the internal components of a URI. Once again, you don't need to + worry about these: they've already been handled for you. +</p> + +<h2>URI format</h2> + +<p> + As a URIFilter, we're interested in the member variables of the URI object. 
+</p> + +<table class="quick"><tbody> + <tr><th>Scheme</th> <td>The protocol for identifying (and possibly locating) a resource (http, ftp, https)</td></tr> + <tr><th>Userinfo</th> <td>User information such as a username (bob)</td></tr> + <tr><th>Host</th> <td>Domain name or IP address of the server (example.com, 127.0.0.1)</td></tr> + <tr><th>Port</th> <td>Network port number for the server (80, 12345)</td></tr> + <tr><th>Path</th> <td>Data that identifies the resource, possibly hierarchical (/path/to, ed@example.com)</td></tr> + <tr><th>Query</th> <td>String of information to be interpreted by the resource (?q=search-term)</td></tr> + <tr><th>Fragment</th> <td>Additional information for the resource after retrieval (#bookmark)</td></tr> +</tbody></table> + +<p> + Because the URI is presented to us in this form, and not + <code>http://bob@example.com:8080/foo.php?q=string#hash</code>, it saves us + a lot of trouble in having to parse the URI every time we want to filter + it. For the record, the above URI has the following components: +</p> + +<table class="quick"><tbody> + <tr><th>Scheme</th> <td>http</td></tr> + <tr><th>Userinfo</th> <td>bob</td></tr> + <tr><th>Host</th> <td>example.com</td></tr> + <tr><th>Port</th> <td>8080</td></tr> + <tr><th>Path</th> <td>/foo.php</td></tr> + <tr><th>Query</th> <td>q=string</td></tr> + <tr><th>Fragment</th> <td>hash</td></tr> +</tbody></table> + +<p> + Note that there is no question mark or octothorpe in the query or + fragment: these get removed during parsing. +</p> + +<p> + With this information, you can get straight to implementing your + <code>filter()</code> method. But one more thing... +</p> + +<h2>Return value: Boolean, not URI</h2> + +<p> + You may have noticed that the URI is being passed in by reference. + This means that whatever changes you make to it, those changes will + be reflected in the URI object the caller had. 
<strong>Do not + return the URI object: it is unnecessary and will cause bugs.</strong> + Instead, return a boolean value, true if the filtering was successful, + or false if the URI is beyond repair and needs to be axed. +</p> + +<p> + Let's suppose I wanted to write a filter that de-internationalized domain + names by converting them to <a href="http://en.wikipedia.org/wiki/Punycode">Punycode</a>. + Assuming that <code>punycode_encode($input)</code> converts <code>$input</code> to + Punycode and returns <code>false</code> on failure: +</p> + +<pre>class HTMLPurifier_URIFilter_ConvertIDNToPunycode extends HTMLPurifier_URIFilter +{ + var $name = 'ConvertIDNToPunycode'; + function filter(&$uri, $config, &$context) { + if (is_null($uri->host)) return true; + if ($uri->host == utf8_decode($uri->host)) { + // is ASCII, abort + return true; + } + $host = punycode_encode($uri->host); + if ($host === false) return false; + $uri->host = $host; + return true; + } +}</pre> + +<p> + Notice I did not <code>return $uri;</code>. +</p> + +<h2>Activating your filter</h2> + +<p> + Having a filter is all well and good, but you need to tell HTML Purifier + to use it. Fortunately, this part's simple: +</p> + +<pre>$uri =& $config->getDefinition('URI'); +$uri->addFilter(new HTMLPurifier_URIFilter_<strong>NameOfFilter</strong>());</pre> + +<p> + If you want to be really fancy, you can define a configuration directive + for your filter and have HTML Purifier automatically manage whether or + not your filter gets loaded or not (this is how internal filters manage + things): +</p> + +<pre>HTMLPurifier_ConfigSchema::define( + 'URI', '<strong>NameOfFilter</strong>', false, 'bool', + '<strong>What your filter does.</strong>' +); +$uri =& $config->getDefinition('URI', true); +$uri->registerFilter(new HTMLPurifier_URIFilter_<strong>NameOfFilter</strong>()); +</pre> + +<p> + Now, your filter will only be called when %URI.<strong>NameOfFilter</strong> + is set to true. 
+</p> + +<h2>Examples</h2> + +<p> + Check the + <a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/URIFilter/">URIFilter</a> + directory for more implementation examples, and see <a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/docs/proposal-new-directives.txt">the + new directives proposal document</a> for ideas on what could be implemented + as a filter. +</p> + +<div id="version">$Id$</div> + +</body></html> diff --git a/docs/enduser-utf8.html b/docs/enduser-utf8.html index ef8c136b..b8cee57d 100644 --- a/docs/enduser-utf8.html +++ b/docs/enduser-utf8.html @@ -231,7 +231,7 @@ of your real encoding.</p> why the character encoding should be explicitly stated. When the browser isn't told what the character encoding of a text is, it has to guess: and sometimes the guess is wrong. Hackers can manipulate - this guess in order to slip XSS pass filters and then fool the + this guess in order to slip XSS past filters and then fool the browser into executing it as active code. A great example of this is the <a href="http://shiflett.org/archive/177">Google UTF-7 exploit</a>.</p> @@ -567,10 +567,11 @@ which may be used by POST, and is required when you want to upload files.</p> <p>The following is a summarization of notes from -<a href="http://ppewww.physics.gla.ac.uk/~flavell/charset/form-i18n.html"> +<a href="http://web.archive.org/web/20060427015200/ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html"> <code>FORM</code> submission and i18n</a>. That document contains lots of useful information, but is written in a rambly manner, so -here I try to get right to the point.</p> +here I try to get right to the point. 
(Note: the original has +disappeared off the web, so I am linking to the Web Archive copy.)</p> <h4 id="whyutf8-forms-urlencoded"><code>application/x-www-form-urlencoded</code></h4> diff --git a/docs/index.html b/docs/index.html index 437a8bfd..8d295dda 100644 --- a/docs/index.html +++ b/docs/index.html @@ -40,6 +40,9 @@ information for casual developers using HTML Purifier.</p> <dt><a href="enduser-customize.html">Customize</a></dt> <dd>Tutorial for customizing HTML Purifier's tag and attribute sets.</dd> +<dt><a href="enduser-uri-filter.html">URI Filters</a></dt> +<dd>Tutorial for creating custom URI filters.</dd> + </dl> <h2>Development</h2> diff --git a/docs/proposal-filter-levels.txt b/docs/proposal-filter-levels.txt index 9e9cfbb0..3118c644 100644 --- a/docs/proposal-filter-levels.txt +++ b/docs/proposal-filter-levels.txt @@ -32,7 +32,7 @@ Here are some fuzzy levels you could set: One final note: when you start axing tags that are more commonly used, you run the risk of accidentally destroying user data, especially if the data -is incoming from a WYSIWYG eidtor that hasn't been synced accordingly. This may +is incoming from a WYSIWYG editor that hasn't been synced accordingly. This may make forbidden element to text transformations desirable (for example, images). diff --git a/docs/proposal-new-directives.txt b/docs/proposal-new-directives.txt index 2c08ddbb..1ce1b93b 100644 --- a/docs/proposal-new-directives.txt +++ b/docs/proposal-new-directives.txt @@ -2,7 +2,8 @@ Configuration Ideas Here are some theoretical configuration ideas that we could implement some -time. Note the naming convention: %Namespace.Directive +time. Note the naming convention: %Namespace.Directive. If you want one +implemented, give us a ring, and we'll move it up the priority chain. %Attr.RewriteFragments - if there's %Attr.IDPrefix we may want to transparently rewrite the URLs we parse too. However, we can only do it when it's a pure @@ -22,8 +23,6 @@ time. 
Note the naming convention: %Namespace.Directive %URI.AddRelNofollow - will add rel="nofollow" to all links, preventing the spread of ill-gotten pagerank -%URI.RelativeToAbsolute - transforms all relative URIs to absolute form - %URI.HostBlacklistRegex - regexes that if matching the host are disallowed %URI.HostWhitelist - domain names that are excluded from the host blacklist %URI.HostPolicy - determines whether or not its reject all and then whitelist diff --git a/docs/ref-css-length.txt b/docs/ref-css-length.txt new file mode 100644 index 00000000..284ec8b2 --- /dev/null +++ b/docs/ref-css-length.txt @@ -0,0 +1,28 @@ + +CSS Length Reference + To bound, or not to bound, that is the question + +It's quite a reasonable request, really, and it's already been implemented +for HTML. That is, length bounding. It makes little sense to let users +define text blocks that have a font-size of 63,360 inches (that's a mile, +by the way) or a width of forty-fold the parent container. + +But it's a little more complicated than that. There are multiple units +one can use, and we have to do a little unit conversion to get things working. +Here's what we have: + +Absolute: + 1 in ~= 2.54 cm + 1 cm = 10 mm + 1 pt = 1/72 in + 1 pc = 12 pt + +Relative: + 1 em ~= 10.0667 px + 1 ex ~= 0.5 em, though Mozilla Firefox says 1 ex = 6px + 1 px ~= 1 pt + +Watch out: font-sizes can also be nested to get successively larger +(although I do not relish having to keep track of context font-sizes, +this may be necessary, especially for some of the more advanced features +for preventing things like white on white). 
diff --git a/docs/style.css b/docs/style.css index db2dd7d7..40e732c1 100644 --- a/docs/style.css +++ b/docs/style.css @@ -33,6 +33,9 @@ blockquote .label {font-weight:bold; font-size:1em; margin:0 0 .1em; .table thead th:first-child {-moz-border-radius-topleft:1em;} .table tbody td {border-bottom:1px solid #CCC; padding-right:0.6em;padding-left:0.6em;} +/* A quick table*/ +table.quick tbody th {text-align:right; padding-right:1em;} + /* Category of the file */ #filing {font-weight:bold; font-size:smaller; } diff --git a/library/HTMLPurifier.php b/library/HTMLPurifier.php index ebbde869..af61751b 100644 --- a/library/HTMLPurifier.php +++ b/library/HTMLPurifier.php @@ -22,7 +22,7 @@ */ /* - HTML Purifier 2.0.1 - Standards Compliant HTML Filtering + HTML Purifier 2.1.1 - Standards Compliant HTML Filtering Copyright (C) 2006 Edward Z. Yang This library is free software; you can redistribute it and/or @@ -40,6 +40,9 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +// constants are slow, but we'll make one exception +define('HTMLPURIFIER_PREFIX', dirname(__FILE__)); + // almost every class has an undocumented dependency to these, so make sure // they get included require_once 'HTMLPurifier/ConfigSchema.php'; // important @@ -74,7 +77,7 @@ This directive has been available since 2.0.0. 
class HTMLPurifier { - var $version = '2.0.1'; + var $version = '2.1.1'; var $config; var $filters; @@ -196,13 +199,13 @@ class HTMLPurifier /** * Singleton for enforcing just one HTML Purifier in your system */ - function &getInstance($prototype = null) { + static function &getInstance($prototype = null) { static $htmlpurifier; if (!$htmlpurifier || $prototype) { - if (is_a($prototype, 'HTMLPurifier')) { + if ($prototype instanceof HTMLPurifier) { $htmlpurifier = $prototype; } elseif ($prototype) { - $htmlpurifier = new HTMLPurifier(HTMLPurifier_Config::create($prototype)); + $htmlpurifier = new HTMLPurifier($prototype); } else { $htmlpurifier = new HTMLPurifier(); } diff --git a/library/HTMLPurifier/AttrDef/CSS/FontFamily.php b/library/HTMLPurifier/AttrDef/CSS/FontFamily.php index 223e7769..dfd89b95 100644 --- a/library/HTMLPurifier/AttrDef/CSS/FontFamily.php +++ b/library/HTMLPurifier/AttrDef/CSS/FontFamily.php @@ -38,19 +38,24 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef $quote = $font[0]; if ($font[$length - 1] !== $quote) continue; $font = substr($font, 1, $length - 2); + // double-backslash processing is buggy + $font = str_replace("\\$quote", $quote, $font); // de-escape quote + $font = str_replace("\\\n", "\n", $font); // de-escape newlines } - // process font + // $font is a pure representation of the font name + if (ctype_alnum($font)) { // very simple font, allow it in unharmed $final .= $font . 
', '; continue; } - $nospace = str_replace(array(' ', '.', '!'), '', $font); - if (ctype_alnum($nospace)) { - // font with spaces in it - $final .= "'$font', "; - continue; - } + + // complicated font, requires quoting + + // armor single quotes and new lines + $font = str_replace("'", "\\'", $font); + $font = str_replace("\n", "\\\n", $font); + $final .= "'$font', "; } $final = rtrim($final, ', '); if ($final === '') return false; diff --git a/library/HTMLPurifier/AttrDef/CSS/URI.php b/library/HTMLPurifier/AttrDef/CSS/URI.php index 107545cc..b71a8585 100644 --- a/library/HTMLPurifier/AttrDef/CSS/URI.php +++ b/library/HTMLPurifier/AttrDef/CSS/URI.php @@ -15,7 +15,7 @@ class HTMLPurifier_AttrDef_CSS_URI extends HTMLPurifier_AttrDef_URI { function HTMLPurifier_AttrDef_CSS_URI() { - $this->HTMLPurifier_AttrDef_URI(true); // always embedded + parent::HTMLPurifier_AttrDef_URI(true); // always embedded } function validate($uri_string, $config, &$context) { diff --git a/library/HTMLPurifier/AttrDef/URI.php b/library/HTMLPurifier/AttrDef/URI.php index 6250d08e..dcf9849c 100644 --- a/library/HTMLPurifier/AttrDef/URI.php +++ b/library/HTMLPurifier/AttrDef/URI.php @@ -1,90 +1,65 @@ <?php require_once 'HTMLPurifier/AttrDef.php'; +require_once 'HTMLPurifier/URIParser.php'; require_once 'HTMLPurifier/URIScheme.php'; require_once 'HTMLPurifier/URISchemeRegistry.php'; require_once 'HTMLPurifier/AttrDef/URI/Host.php'; require_once 'HTMLPurifier/PercentEncoder.php'; -HTMLPurifier_ConfigSchema::define( - 'URI', 'DefaultScheme', 'http', 'string', - 'Defines through what scheme the output will be served, in order to '. - 'select the proper object validator when no scheme information is present.' -); +// special case filtering directives HTMLPurifier_ConfigSchema::define( - 'URI', 'Host', null, 'string/null', - 'Defines the domain name of the server, so we can determine whether or '. - 'an absolute URI is from your website or not. Not strictly necessary, '. 
- 'as users should be using relative URIs to reference resources on your '. - 'website. It will, however, let you use absolute URIs to link to '. - 'subdomains of the domain you post here: i.e. example.com will allow '. - 'sub.example.com. However, higher up domains will still be excluded: '. - 'if you set %URI.Host to sub.example.com, example.com will be blocked. '. - 'This directive has been available since 1.2.0.' -); + 'URI', 'Munge', null, 'string/null', ' +<p> + Munges all browsable (usually http, https and ftp) + absolute URI\'s into another URI, usually a URI redirection service. + This directive accepts a URI, formatted with a <code>%s</code> where + the url-encoded original URI should be inserted (sample: + <code>http://www.google.com/url?q=%s</code>). +</p> +<p> + Uses for this directive: +</p> +<ul> + <li> + Prevent PageRank leaks, while being fairly transparent + to users (you may also want to add some client side JavaScript to + override the text in the statusbar). <strong>Notice</strong>: + Many security experts believe that this form of protection does not deter spam-bots. + </li> + <li> + Redirect users to a splash page telling them they are leaving your + website. While this is poor usability practice, it is often mandated + in corporate environments. + </li> +</ul> +<p> + This directive has been available since 1.3.0. +</p> +'); + +// disabling directives HTMLPurifier_ConfigSchema::define( - 'URI', 'DisableExternal', false, 'bool', - 'Disables links to external websites. This is a highly effective '. - 'anti-spam and anti-pagerank-leech measure, but comes at a hefty price: no'. - 'links or images outside of your domain will be allowed. Non-linkified '. - 'URIs will still be preserved. If you want to be able to link to '. - 'subdomains or use absolute URIs, specify %URI.Host for your website. '. - 'This directive has been available since 1.2.0.' 
-); - -HTMLPurifier_ConfigSchema::define( - 'URI', 'DisableExternalResources', false, 'bool', - 'Disables the embedding of external resources, preventing users from '. - 'embedding things like images from other hosts. This prevents '. - 'access tracking (good for email viewers), bandwidth leeching, '. - 'cross-site request forging, goatse.cx posting, and '. - 'other nasties, but also results in '. - 'a loss of end-user functionality (they can\'t directly post a pic '. - 'they posted from Flickr anymore). Use it if you don\'t have a '. - 'robust user-content moderation team. This directive has been '. - 'available since 1.3.0.' -); - -HTMLPurifier_ConfigSchema::define( - 'URI', 'DisableResources', false, 'bool', - 'Disables embedding resources, essentially meaning no pictures. You can '. - 'still link to them though. See %URI.DisableExternalResources for why '. - 'this might be a good idea. This directive has been available since 1.3.0.' -); - -HTMLPurifier_ConfigSchema::define( - 'URI', 'Munge', null, 'string/null', - 'Munges all browsable (usually http, https and ftp) URI\'s into some URL '. - 'redirection service. Pass this directive a URI, with %s inserted where '. - 'the url-encoded original URI should be inserted (sample: '. - '<code>http://www.google.com/url?q=%s</code>). '. - 'This prevents PageRank leaks, while being as transparent as possible '. - 'to users (you may also want to add some client side JavaScript to '. - 'override the text in the statusbar). Warning: many security experts '. - 'believe that this form of protection does not deter spam-bots. '. - 'You can also use this directive to redirect users to a splash page '. - 'telling them they are leaving your website. '. - 'This directive has been available since 1.3.0.' -); - -HTMLPurifier_ConfigSchema::define( - 'URI', 'HostBlacklist', array(), 'list', - 'List of strings that are forbidden in the host of any URI. Use it to '. - 'kill domain names of spam, etc. Note that it will catch anything in '. 
- 'the domain, so <tt>moo.com</tt> will catch <tt>moo.com.example.com</tt>. '. - 'This directive has been available since 1.3.0.' -); - -HTMLPurifier_ConfigSchema::define( - 'URI', 'Disable', false, 'bool', - 'Disables all URIs in all forms. Not sure why you\'d want to do that '. - '(after all, the Internet\'s founded on the notion of a hyperlink). '. - 'This directive has been available since 1.3.0.' -); + 'URI', 'Disable', false, 'bool', ' +<p> + Disables all URIs in all forms. Not sure why you\'d want to do that + (after all, the Internet\'s founded on the notion of a hyperlink). + This directive has been available since 1.3.0. +</p> +'); HTMLPurifier_ConfigSchema::defineAlias('Attr', 'DisableURI', 'URI', 'Disable'); +HTMLPurifier_ConfigSchema::define( + 'URI', 'DisableResources', false, 'bool', ' +<p> + Disables embedding resources, essentially meaning no pictures. You can + still link to them though. See %URI.DisableExternalResources for why + this might be a good idea. This directive has been available since 1.3.0. +</p> +'); + /** * Validates a URI as defined by RFC 3986. * @note Scheme-specific mechanics deferred to HTMLPurifier_URIScheme @@ -92,214 +67,83 @@ HTMLPurifier_ConfigSchema::defineAlias('Attr', 'DisableURI', 'URI', 'Disable'); class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef { - var $host; - var $embeds_resource; + var $parser, $percentEncoder; + var $embedsResource; /** * @param $embeds_resource_resource Does the URI here result in an extra HTTP request? 
*/ function HTMLPurifier_AttrDef_URI($embeds_resource = false) { - $this->host = new HTMLPurifier_AttrDef_URI_Host(); - $this->embeds_resource = (bool) $embeds_resource; + $this->parser = new HTMLPurifier_URIParser(); + $this->percentEncoder = new HTMLPurifier_PercentEncoder(); + $this->embedsResource = (bool) $embeds_resource; } function validate($uri, $config, &$context) { - static $PercentEncoder = null; - if ($PercentEncoder === null) $PercentEncoder = new HTMLPurifier_PercentEncoder(); - - // We'll write stack-based parsers later, for now, use regexps to - // get things working as fast as possible (irony) - if ($config->get('URI', 'Disable')) return false; - // parse as CDATA + // initial operations $uri = $this->parseCDATA($uri); + $uri = $this->percentEncoder->normalize($uri); - // fix up percent-encoding - $uri = $PercentEncoder->normalize($uri); + // parse the URI + $uri = $this->parser->parse($uri); + if ($uri === false) return false; - // while it would be nice to use parse_url(), that's specifically - // for HTTP and thus won't work for our generic URI parsing + // add embedded flag to context for validators + $context->register('EmbeddedURI', $this->embedsResource); - // according to the RFC... (but this cuts corners, i.e. non-validating) - $r_URI = '!'. - '(([^:/?#<>\'"]+):)?'. // 2. Scheme - '(//([^/?#<>\'"]*))?'. // 4. Authority - '([^?#<>\'"]*)'. // 5. Path - '(\?([^#<>\'"]*))?'. // 7. Query - '(#([^<>\'"]*))?'. // 8. Fragment - '!'; - - $matches = array(); - $result = preg_match($r_URI, $uri, $matches); - - if (!$result) return false; // invalid URI - - // seperate out parts - $scheme = !empty($matches[1]) ? $matches[2] : null; - $authority = !empty($matches[3]) ? $matches[4] : null; - $path = $matches[5]; // always present, can be empty - $query = !empty($matches[6]) ? $matches[7] : null; - $fragment = !empty($matches[8]) ? 
$matches[9] : null; - - - - $registry =& HTMLPurifier_URISchemeRegistry::instance(); - if ($scheme !== null) { - // no need to validate the scheme's fmt since we do that when we - // retrieve the specific scheme object from the registry - $scheme = ctype_lower($scheme) ? $scheme : strtolower($scheme); - $scheme_obj = $registry->getScheme($scheme, $config, $context); - if (!$scheme_obj) return false; // invalid scheme, clean it out - } else { - $scheme_obj = $registry->getScheme( - $config->get('URI', 'DefaultScheme'), $config, $context - ); - } - - // something funky weird happened in the registry, abort! - if (!$scheme_obj) { - trigger_error( - 'Default scheme object "' . $config->get('URI', 'DefaultScheme') . '" was not readable', - E_USER_WARNING - ); - return false; - } - - // the URI we're processing embeds_resource a resource in the page, but the URI - // it references cannot be located - if ($this->embeds_resource && !$scheme_obj->browsable) { - return false; - } - - - if ($authority !== null) { + $ok = false; + do { - // remove URI if it's absolute and we disabled externals or - // if it's absolute and embedded and we disabled external resources - unset($our_host); - if ( - $config->get('URI', 'DisableExternal') || - ( - $config->get('URI', 'DisableExternalResources') && - $this->embeds_resource - ) - ) { - $our_host = $config->get('URI', 'Host'); - if ($our_host === null) return false; + // generic validation + $result = $uri->validate($config, $context); + if (!$result) break; + + // chained validation + $uri_def =& $config->getDefinition('URI'); + $result = $uri_def->filter($uri, $config, $context); + if (!$result) break; + + // scheme-specific validation + $scheme_obj = $uri->getSchemeObj($config, $context); + if (!$scheme_obj) break; + if ($this->embedsResource && !$scheme_obj->browsable) break; + $result = $scheme_obj->validate($uri, $config, $context); + if (!$result) break; + + // survived gauntlet + $ok = true; + + } while (false); + + 
$context->destroy('EmbeddedURI'); + if (!$ok) return false; + + // munge scheme off if necessary (this must be last) + if (!is_null($uri->scheme) && is_null($uri->host)) { + if ($uri_def->defaultScheme == $uri->scheme) { + $uri->scheme = null; } - - $HEXDIG = '[A-Fa-f0-9]'; - $unreserved = 'A-Za-z0-9-._~'; // make sure you wrap with [] - $sub_delims = '!$&\'()'; // needs [] - $pct_encoded = "%$HEXDIG$HEXDIG"; - $r_userinfo = "(?:[$unreserved$sub_delims:]|$pct_encoded)*"; - $r_authority = "/^(($r_userinfo)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/"; - $matches = array(); - preg_match($r_authority, $authority, $matches); - // overloads regexp! - $userinfo = !empty($matches[1]) ? $matches[2] : null; - $host = !empty($matches[3]) ? $matches[3] : null; - $port = !empty($matches[4]) ? $matches[5] : null; - - // validate port - if ($port !== null) { - $port = (int) $port; - if ($port < 1 || $port > 65535) $port = null; - } - - $host = $this->host->validate($host, $config, $context); - if ($host === false) $host = null; - - if ($this->checkBlacklist($host, $config, $context)) return false; - - // more lenient absolute checking - if (isset($our_host)) { - $host_parts = array_reverse(explode('.', $host)); - // could be cached - $our_host_parts = array_reverse(explode('.', $our_host)); - foreach ($our_host_parts as $i => $discard) { - if (!isset($host_parts[$i])) return false; - if ($host_parts[$i] != $our_host_parts[$i]) return false; - } - } - - // userinfo and host are validated within the regexp - - } else { - $port = $host = $userinfo = null; } + // back to string + $result = $uri->toString(); - // query and fragment are quite simple in terms of definition: - // *( pchar / "/" / "?" ), so define their validation routines - // when we start fixing percent encoding - - - - // path gets to be validated against a hodge-podge of rules depending - // on the status of authority and scheme, but it's not that important, - // esp. 
since it won't be applicable to everyone - - - - // okay, now we defer execution to the subobject for more processing - // note that $fragment is omitted - list($userinfo, $host, $port, $path, $query) = - $scheme_obj->validateComponents( - $userinfo, $host, $port, $path, $query, $config, $context - ); - - - // reconstruct authority - $authority = null; - if (!is_null($userinfo) || !is_null($host) || !is_null($port)) { - $authority = ''; - if($userinfo !== null) $authority .= $userinfo . '@'; - $authority .= $host; - if($port !== null) $authority .= ':' . $port; - } - - // reconstruct the result - $result = ''; - if ($scheme !== null) $result .= "$scheme:"; - if ($authority !== null) $result .= "//$authority"; - $result .= $path; - if ($query !== null) $result .= "?$query"; - if ($fragment !== null) $result .= "#$fragment"; - - // munge if necessary - $munge = $config->get('URI', 'Munge'); - if (!empty($scheme_obj->browsable) && $munge !== null) { - if ($authority !== null) { - $result = str_replace('%s', rawurlencode($result), $munge); - } + // munge entire URI if necessary + if ( + !is_null($uri->host) && // indicator for authority + !empty($scheme_obj->browsable) && + !is_null($munge = $config->get('URI', 'Munge')) + ) { + $result = str_replace('%s', rawurlencode($result), $munge); } return $result; } - /** - * Checks a host against an array blacklist - * @param $host Host to check - * @param $config HTMLPurifier_Config instance - * @param $context HTMLPurifier_Context instance - * @return bool Is spam? 
- */ - function checkBlacklist($host, &$config, &$context) { - $blacklist = $config->get('URI', 'HostBlacklist'); - if (!empty($blacklist)) { - foreach($blacklist as $blacklisted_host_fragment) { - if (strpos($host, $blacklisted_host_fragment) !== false) { - return true; - } - } - } - return false; - } - } diff --git a/library/HTMLPurifier/Config.php b/library/HTMLPurifier/Config.php index b6dff4b5..73be27b1 100644 --- a/library/HTMLPurifier/Config.php +++ b/library/HTMLPurifier/Config.php @@ -5,6 +5,7 @@ require_once 'HTMLPurifier/ConfigSchema.php'; // member variables require_once 'HTMLPurifier/HTMLDefinition.php'; require_once 'HTMLPurifier/CSSDefinition.php'; +require_once 'HTMLPurifier/URIDefinition.php'; require_once 'HTMLPurifier/Doctype.php'; require_once 'HTMLPurifier/DefinitionCacheFactory.php'; @@ -41,7 +42,7 @@ class HTMLPurifier_Config /** * HTML Purifier's version */ - var $version = '2.0.1'; + var $version = '2.1.1'; /** * Two-level associative array of configuration directives @@ -75,6 +76,11 @@ class HTMLPurifier_Config */ var $serials = array(); + /** + * Serial for entire configuration object + */ + var $serial; + /** * @param $definition HTMLPurifier_ConfigSchema that defines what directives * are allowed. 
@@ -98,7 +104,6 @@ class HTMLPurifier_Config $ret = HTMLPurifier_Config::createDefault(); if (is_string($config)) $ret->loadIni($config); elseif (is_array($config)) $ret->loadArray($config); - if (isset($revision)) $ret->revision = $revision; return $ret; } @@ -165,6 +170,17 @@ class HTMLPurifier_Config return $this->serials[$namespace]; } + /** + * Returns a md5 signature for the entire configuration object + * that uniquely identifies that particular configuration + */ + function getSerial() { + if (empty($this->serial)) { + $this->serial = md5(serialize($this->getAll())); + } + return $this->serial; + } + /** * Retrieves all directives, organized by namespace */ @@ -295,6 +311,8 @@ class HTMLPurifier_Config $this->definitions[$type] = new HTMLPurifier_HTMLDefinition(); } elseif ($type == 'CSS') { $this->definitions[$type] = new HTMLPurifier_CSSDefinition(); + } elseif ($type == 'URI') { + $this->definitions[$type] = new HTMLPurifier_URIDefinition(); } else { trigger_error("Definition of $type type not supported"); $false = false; @@ -393,6 +411,26 @@ class HTMLPurifier_Config * @static */ static function loadArrayFromForm($array, $index, $allowed = true, $mq_fix = true) { + $ret = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix); + $config = HTMLPurifier_Config::create($ret); + return $config; + } + + /** + * Merges in configuration values from $_GET/$_POST to object. NOT STATIC. + * @note Same parameters as loadArrayFromForm + */ + function mergeArrayFromForm($array, $index, $allowed = true, $mq_fix = true) { + $ret = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix); + $this->loadArray($ret); + } + + /** + * Prepares an array from a form into something usable for the more + * strict parts of HTMLPurifier_Config + * @static + */ + static function prepareArrayFromForm($array, $index, $allowed = true, $mq_fix = true) { $array = (isset($array[$index]) && is_array($array[$index])) ? 
$array[$index] : array(); $mq = get_magic_quotes_gpc() && $mq_fix; @@ -409,9 +447,7 @@ class HTMLPurifier_Config $value = $mq ? stripslashes($array[$skey]) : $array[$skey]; $ret[$ns][$directive] = $value; } - - $config = HTMLPurifier_Config::create($ret); - return $config; + return $ret; } /** diff --git a/library/HTMLPurifier/ConfigSchema.php b/library/HTMLPurifier/ConfigSchema.php index 13ad6036..83e1616e 100644 --- a/library/HTMLPurifier/ConfigSchema.php +++ b/library/HTMLPurifier/ConfigSchema.php @@ -6,6 +6,8 @@ require_once 'HTMLPurifier/ConfigDef/Namespace.php'; require_once 'HTMLPurifier/ConfigDef/Directive.php'; require_once 'HTMLPurifier/ConfigDef/DirectiveAlias.php'; +if (!defined('HTMLPURIFIER_SCHEMA_STRICT')) define('HTMLPURIFIER_SCHEMA_STRICT', false); + /** * Configuration definition, defines directives and their defaults. * @note If you update this, please update Printer_ConfigForm @@ -49,6 +51,8 @@ class HTMLPurifier_ConfigSchema { var $types = array( 'string' => 'String', 'istring' => 'Case-insensitive string', + 'text' => 'Text', + 'itext' => 'Case-insensitive text', 'int' => 'Integer', 'float' => 'Float', 'bool' => 'Boolean', @@ -100,27 +104,30 @@ class HTMLPurifier_ConfigSchema { * HTMLPurifier_DirectiveDef::$type for allowed values * @param $description Description of directive for documentation */ - static function define( - $namespace, $name, $default, $type, - $description - ) { + static function define($namespace, $name, $default, $type, $description) { $def =& HTMLPurifier_ConfigSchema::instance(); - if (!isset($def->info[$namespace])) { - trigger_error('Cannot define directive for undefined namespace', - E_USER_ERROR); - return; - } - if (!ctype_alnum($name)) { - trigger_error('Directive name must be alphanumeric', - E_USER_ERROR); - return; - } - if (empty($description)) { - trigger_error('Description must be non-empty', - E_USER_ERROR); - return; + + // basic sanity checks + if (HTMLPURIFIER_SCHEMA_STRICT) { + if 
(!isset($def->info[$namespace])) { + trigger_error('Cannot define directive for undefined namespace', + E_USER_ERROR); + return; + } + if (!ctype_alnum($name)) { + trigger_error('Directive name must be alphanumeric', + E_USER_ERROR); + return; + } + if (empty($description)) { + trigger_error('Description must be non-empty', + E_USER_ERROR); + return; + } } + if (isset($def->info[$namespace][$name])) { + // already defined if ( $def->info[$namespace][$name]->type !== $type || $def->defaults[$namespace][$name] !== $default @@ -129,29 +136,35 @@ class HTMLPurifier_ConfigSchema { return; } } else { - // process modifiers + // needs defining + + // process modifiers (OPTIMIZE!) $type_values = explode('/', $type, 2); $type = $type_values[0]; $modifier = isset($type_values[1]) ? $type_values[1] : false; $allow_null = ($modifier === 'null'); - if (!isset($def->types[$type])) { - trigger_error('Invalid type for configuration directive', - E_USER_ERROR); - return; - } - $default = $def->validate($default, $type, $allow_null); - if ($def->isError($default)) { - trigger_error('Default value does not match directive type', - E_USER_ERROR); - return; + if (HTMLPURIFIER_SCHEMA_STRICT) { + if (!isset($def->types[$type])) { + trigger_error('Invalid type for configuration directive', + E_USER_ERROR); + return; + } + $default = $def->validate($default, $type, $allow_null); + if ($def->isError($default)) { + trigger_error('Default value does not match directive type', + E_USER_ERROR); + return; + } } + $def->info[$namespace][$name] = new HTMLPurifier_ConfigDef_Directive(); $def->info[$namespace][$name]->type = $type; $def->info[$namespace][$name]->allow_null = $allow_null; $def->defaults[$namespace][$name] = $default; } + if (!HTMLPURIFIER_SCHEMA_STRICT) return; $backtrace = debug_backtrace(); $file = $def->mungeFilename($backtrace[0]['file']); $line = $backtrace[0]['line']; @@ -166,19 +179,21 @@ class HTMLPurifier_ConfigSchema { */ static function defineNamespace($namespace, 
$description) { $def =& HTMLPurifier_ConfigSchema::instance(); - if (isset($def->info[$namespace])) { - trigger_error('Cannot redefine namespace', E_USER_ERROR); - return; - } - if (!ctype_alnum($namespace)) { - trigger_error('Namespace name must be alphanumeric', - E_USER_ERROR); - return; - } - if (empty($description)) { - trigger_error('Description must be non-empty', - E_USER_ERROR); - return; + if (HTMLPURIFIER_SCHEMA_STRICT) { + if (isset($def->info[$namespace])) { + trigger_error('Cannot redefine namespace', E_USER_ERROR); + return; + } + if (!ctype_alnum($namespace)) { + trigger_error('Namespace name must be alphanumeric', + E_USER_ERROR); + return; + } + if (empty($description)) { + trigger_error('Description must be non-empty', + E_USER_ERROR); + return; + } } $def->info[$namespace] = array(); $def->info_namespace[$namespace] = new HTMLPurifier_ConfigDef_Namespace(); @@ -199,23 +214,25 @@ class HTMLPurifier_ConfigSchema { */ static function defineValueAliases($namespace, $name, $aliases) { $def =& HTMLPurifier_ConfigSchema::instance(); - if (!isset($def->info[$namespace][$name])) { + if (HTMLPURIFIER_SCHEMA_STRICT && !isset($def->info[$namespace][$name])) { trigger_error('Cannot set value alias for non-existant directive', E_USER_ERROR); return; } foreach ($aliases as $alias => $real) { - if (!$def->info[$namespace][$name] !== true && - !isset($def->info[$namespace][$name]->allowed[$real]) - ) { - trigger_error('Cannot define alias to value that is not allowed', - E_USER_ERROR); - return; - } - if (isset($def->info[$namespace][$name]->allowed[$alias])) { - trigger_error('Cannot define alias over allowed value', - E_USER_ERROR); - return; + if (HTMLPURIFIER_SCHEMA_STRICT) { + if (!$def->info[$namespace][$name] !== true && + !isset($def->info[$namespace][$name]->allowed[$real]) + ) { + trigger_error('Cannot define alias to value that is not allowed', + E_USER_ERROR); + return; + } + if (isset($def->info[$namespace][$name]->allowed[$alias])) { + 
trigger_error('Cannot define alias over allowed value', + E_USER_ERROR); + return; + } } $def->info[$namespace][$name]->aliases[$alias] = $real; } @@ -230,14 +247,14 @@ class HTMLPurifier_ConfigSchema { */ static function defineAllowedValues($namespace, $name, $allowed_values) { $def =& HTMLPurifier_ConfigSchema::instance(); - if (!isset($def->info[$namespace][$name])) { + if (HTMLPURIFIER_SCHEMA_STRICT && !isset($def->info[$namespace][$name])) { trigger_error('Cannot define allowed values for undefined directive', E_USER_ERROR); return; } $directive =& $def->info[$namespace][$name]; $type = $directive->type; - if ($type != 'string' && $type != 'istring') { + if (HTMLPURIFIER_SCHEMA_STRICT && $type != 'string' && $type != 'istring') { trigger_error('Cannot define allowed values for directive whose type is not string', E_USER_ERROR); return; @@ -248,8 +265,11 @@ class HTMLPurifier_ConfigSchema { foreach ($allowed_values as $value) { $directive->allowed[$value] = true; } - if ($def->defaults[$namespace][$name] !== null && - !isset($directive->allowed[$def->defaults[$namespace][$name]])) { + if ( + HTMLPURIFIER_SCHEMA_STRICT && + $def->defaults[$namespace][$name] !== null && + !isset($directive->allowed[$def->defaults[$namespace][$name]]) + ) { trigger_error('Default value must be in allowed range of variables', E_USER_ERROR); $directive->allowed = true; // undo undo! 
@@ -267,30 +287,32 @@ class HTMLPurifier_ConfigSchema { */ static function defineAlias($namespace, $name, $new_namespace, $new_name) { $def =& HTMLPurifier_ConfigSchema::instance(); - if (!isset($def->info[$namespace])) { - trigger_error('Cannot define directive alias in undefined namespace', - E_USER_ERROR); - return; - } - if (!ctype_alnum($name)) { - trigger_error('Directive name must be alphanumeric', - E_USER_ERROR); - return; - } - if (isset($def->info[$namespace][$name])) { - trigger_error('Cannot define alias over directive', - E_USER_ERROR); - return; - } - if (!isset($def->info[$new_namespace][$new_name])) { - trigger_error('Cannot define alias to undefined directive', - E_USER_ERROR); - return; - } - if ($def->info[$new_namespace][$new_name]->class == 'alias') { - trigger_error('Cannot define alias to alias', - E_USER_ERROR); - return; + if (HTMLPURIFIER_SCHEMA_STRICT) { + if (!isset($def->info[$namespace])) { + trigger_error('Cannot define directive alias in undefined namespace', + E_USER_ERROR); + return; + } + if (!ctype_alnum($name)) { + trigger_error('Directive name must be alphanumeric', + E_USER_ERROR); + return; + } + if (isset($def->info[$namespace][$name])) { + trigger_error('Cannot define alias over directive', + E_USER_ERROR); + return; + } + if (!isset($def->info[$new_namespace][$new_name])) { + trigger_error('Cannot define alias to undefined directive', + E_USER_ERROR); + return; + } + if ($def->info[$new_namespace][$new_name]->class == 'alias') { + trigger_error('Cannot define alias to alias', + E_USER_ERROR); + return; + } } $def->info[$namespace][$name] = new HTMLPurifier_ConfigDef_DirectiveAlias( @@ -313,8 +335,10 @@ class HTMLPurifier_ConfigSchema { return $var; case 'istring': case 'string': + case 'text': // no difference, just is longer/multiple line string + case 'itext': if (!is_string($var)) break; - if ($type === 'istring') $var = strtolower($var); + if ($type === 'istring' || $type === 'itext') $var = strtolower($var); return 
$var; case 'int': if (is_string($var) && ctype_digit($var)) $var = (int) $var; @@ -345,9 +369,13 @@ class HTMLPurifier_ConfigSchema { // a single empty string item, but having an empty // array is more intuitive if ($var == '') return array(); - // simplistic string to array method that only works - // for simple lists of tag names or alphanumeric characters - $var = explode(',',$var); + if (strpos($var, "\n") === false && strpos($var, "\r") === false) { + // simplistic string to array method that only works + // for simple lists of tag names or alphanumeric characters + $var = explode(',',$var); + } else { + $var = preg_split('/(,|[\n\r]+)/', $var); + } // remove spaces foreach ($var as $i => $j) $var[$i] = trim($j); if ($type === 'hash') { @@ -388,6 +416,7 @@ class HTMLPurifier_ConfigSchema { * Takes an absolute path and munges it into a more manageable relative path */ function mungeFilename($filename) { + if (!HTMLPURIFIER_SCHEMA_STRICT) return $filename; $offset = strrpos($filename, 'HTMLPurifier'); $filename = substr($filename, $offset); $filename = str_replace('\\', '/', $filename); diff --git a/library/HTMLPurifier/ContentSets.php b/library/HTMLPurifier/ContentSets.php index 001f4814..7baf7a31 100644 --- a/library/HTMLPurifier/ContentSets.php +++ b/library/HTMLPurifier/ContentSets.php @@ -5,6 +5,7 @@ require_once 'HTMLPurifier/ChildDef.php'; require_once 'HTMLPurifier/ChildDef/Empty.php'; require_once 'HTMLPurifier/ChildDef/Required.php'; require_once 'HTMLPurifier/ChildDef/Optional.php'; +require_once 'HTMLPurifier/ChildDef/Custom.php'; // NOT UNIT TESTED!!! 
diff --git a/library/HTMLPurifier/DefinitionCache/Serializer.php b/library/HTMLPurifier/DefinitionCache/Serializer.php index 2b07da13..1830e37e 100644 --- a/library/HTMLPurifier/DefinitionCache/Serializer.php +++ b/library/HTMLPurifier/DefinitionCache/Serializer.php @@ -99,7 +99,7 @@ class HTMLPurifier_DefinitionCache_Serializer extends */ function generateBaseDirectoryPath($config) { $base = $config->get('Cache', 'SerializerPath'); - $base = is_null($base) ? dirname(__FILE__) . '/Serializer' : $base; + $base = is_null($base) ? HTMLPURIFIER_PREFIX . '/HTMLPurifier/DefinitionCache/Serializer' : $base; return $base; } diff --git a/library/HTMLPurifier/EntityLookup.php b/library/HTMLPurifier/EntityLookup.php index ed3ea3df..212cf780 100644 --- a/library/HTMLPurifier/EntityLookup.php +++ b/library/HTMLPurifier/EntityLookup.php @@ -19,7 +19,7 @@ class HTMLPurifier_EntityLookup { */ function setup($file = false) { if (!$file) { - $file = dirname(__FILE__) . '/EntityLookup/entities.ser'; + $file = HTMLPURIFIER_PREFIX . '/HTMLPurifier/EntityLookup/entities.ser'; } $this->table = unserialize(file_get_contents($file)); } diff --git a/library/HTMLPurifier/HTMLDefinition.php b/library/HTMLPurifier/HTMLDefinition.php index 9ed413c7..aaeb8bae 100644 --- a/library/HTMLPurifier/HTMLDefinition.php +++ b/library/HTMLPurifier/HTMLDefinition.php @@ -110,12 +110,13 @@ HTMLPurifier_ConfigSchema::define( '); HTMLPurifier_ConfigSchema::define( - 'HTML', 'Allowed', null, 'string/null', ' + 'HTML', 'Allowed', null, 'itext/null', ' <p> This is a convenience directive that rolls the functionality of %HTML.AllowedElements and %HTML.AllowedAttributes into one directive. Specify elements and attributes that are allowed using: - <code>element1[attr1|attr2],element2...</code>. + <code>element1[attr1|attr2],element2...</code>. You can also use + newlines instead of commas to separate elements. 
</p> <p> <strong>Warning</strong>: @@ -426,8 +427,9 @@ class HTMLPurifier_HTMLDefinition extends HTMLPurifier_Definition $elements = array(); $attributes = array(); - $chunks = explode(',', $list); + $chunks = preg_split('/(,|[\n\r]+)/', $list); foreach ($chunks as $chunk) { + if (empty($chunk)) continue; // remove TinyMCE element control characters if (!strpos($chunk, '[')) { $element = $chunk; diff --git a/library/HTMLPurifier/HTMLModule/Ruby.php b/library/HTMLPurifier/HTMLModule/Ruby.php new file mode 100644 index 00000000..f5432446 --- /dev/null +++ b/library/HTMLPurifier/HTMLModule/Ruby.php @@ -0,0 +1,28 @@ +<?php + +require_once 'HTMLPurifier/HTMLModule.php'; + +/** + * XHTML 1.1 Ruby Annotation Module, defines elements that indicate + * short runs of text alongside base text for annotation or pronounciation. + */ +class HTMLPurifier_HTMLModule_Ruby extends HTMLPurifier_HTMLModule +{ + + var $name = 'Ruby'; + + function HTMLPurifier_HTMLModule_Ruby() { + $this->addElement('ruby', true, 'Inline', + 'Custom: ((rb, (rt | (rp, rt, rp))) | (rbc, rtc, rtc?))', + 'Common'); + $this->addElement('rbc', true, false, 'Required: rb', 'Common'); + $this->addElement('rtc', true, false, 'Required: rt', 'Common'); + $rb =& $this->addElement('rb', true, false, 'Inline', 'Common'); + $rb->excludes = array('ruby' => true); + $rt =& $this->addElement('rt', true, false, 'Inline', 'Common', array('rbspan' => 'Number')); + $rt->excludes = array('ruby' => true); + $this->addElement('rp', true, false, 'Optional: #PCDATA', 'Common'); + } + +} + diff --git a/library/HTMLPurifier/HTMLModuleManager.php b/library/HTMLPurifier/HTMLModuleManager.php index 69628dd8..d4f10d0c 100644 --- a/library/HTMLPurifier/HTMLModuleManager.php +++ b/library/HTMLPurifier/HTMLModuleManager.php @@ -28,6 +28,7 @@ require_once 'HTMLPurifier/HTMLModule/Target.php'; require_once 'HTMLPurifier/HTMLModule/Scripting.php'; require_once 'HTMLPurifier/HTMLModule/XMLCommonAttributes.php'; require_once 
'HTMLPurifier/HTMLModule/NonXMLCommonAttributes.php'; +require_once 'HTMLPurifier/HTMLModule/Ruby.php'; // tidy modules require_once 'HTMLPurifier/HTMLModule/Tidy.php'; @@ -215,8 +216,8 @@ class HTMLPurifier_HTMLModuleManager $this->doctypes->register( 'XHTML 1.1', true, - array_merge($common, $xml), - array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary'), // Tidy_XHTML1_1 + array_merge($common, $xml, array('Ruby')), + array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_XHTMLStrict'), // Tidy_XHTML1_1 array(), '-//W3C//DTD XHTML 1.1//EN', 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd' diff --git a/library/HTMLPurifier/Injector.php b/library/HTMLPurifier/Injector.php index 939f307e..59017163 100644 --- a/library/HTMLPurifier/Injector.php +++ b/library/HTMLPurifier/Injector.php @@ -8,6 +8,11 @@ class HTMLPurifier_Injector { + /** + * Advisory name of injector, this is for friendly error messages + */ + var $name; + /** * Amount of tokens the injector needs to skip + 1. Because * the decrement is the first thing that happens, this needs to @@ -40,16 +45,37 @@ class HTMLPurifier_Injector var $inputIndex; /** - * Prepares the injector by giving it the config and context objects, - * so that important variables can be extracted and not passed via - * parameter constantly. Remember: always instantiate a new injector - * when handling a set of HTML. + * Array of elements and attributes this injector creates and therefore + * need to be allowed by the definition. Takes form of + * array('element' => array('attr', 'attr2'), 'element2') + */ + var $needed = array(); + + /** + * Prepares the injector by giving it the config and context objects: + * this allows references to important variables to be made within + * the injector. This function also checks if the HTML environment + * will work with the Injector: if p tags are not allowed, the + * Auto-Paragraphing injector should not be enabled. 
+ * @param $config Instance of HTMLPurifier_Config + * @param $context Instance of HTMLPurifier_Context + * @return Boolean false if success, string of missing needed element/attribute if failure */ function prepare($config, &$context) { $this->htmlDefinition = $config->getHTMLDefinition(); + // perform $needed checks + foreach ($this->needed as $element => $attributes) { + if (is_int($element)) $element = $attributes; + if (!isset($this->htmlDefinition->info[$element])) return $element; + if (!is_array($attributes)) continue; + foreach ($attributes as $name) { + if (!isset($this->htmlDefinition->info[$element]->attr[$name])) return "$element.$name"; + } + } $this->currentNesting =& $context->get('CurrentNesting'); $this->inputTokens =& $context->get('InputTokens'); $this->inputIndex =& $context->get('InputIndex'); + return false; } /** @@ -74,12 +100,12 @@ class HTMLPurifier_Injector /** * Handler that is called when a text token is processed */ - function handleText(&$token, $config, &$context) {} + function handleText(&$token) {} /** - * Handler that is called when a start token is processed + * Handler that is called when a start or empty token is processed */ - function handleStart(&$token, $config, &$context) {} + function handleElement(&$token) {} } diff --git a/library/HTMLPurifier/Injector/AutoParagraph.php b/library/HTMLPurifier/Injector/AutoParagraph.php index e8e2e34f..6e0a6a3e 100644 --- a/library/HTMLPurifier/Injector/AutoParagraph.php +++ b/library/HTMLPurifier/Injector/AutoParagraph.php @@ -15,6 +15,11 @@ HTMLPurifier_ConfigSchema::define( block elements in nodes that allow paragraph tags</li> <li>There are double newlines in paragraph tags</li> </ul> +<p> + <code>p</code> tags must be allowed for this directive to take effect. + We do not use <code>br</code> tags for paragraphing, as that is + semantically incorrect. +</p> <p> This directive has been available since 2.0.1. 
</p> @@ -27,13 +32,16 @@ HTMLPurifier_ConfigSchema::define( class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector { + var $name = 'AutoParagraph'; + var $needed = array('p'); + function _pStart() { $par = new HTMLPurifier_Token_Start('p'); $par->armor['MakeWellFormed_TagClosedError'] = true; return $par; } - function handleText(&$token, $config, &$context) { + function handleText(&$token) { $text = $token->data; if (empty($this->currentNesting)) { if (!$this->allowsElement('p')) return; @@ -79,7 +87,7 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector } - function handleStart(&$token, $config, &$context) { + function handleElement(&$token) { // check if we're inside a tag already if (!empty($this->currentNesting)) { if ($this->allowsElement('p')) { @@ -88,11 +96,19 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector // this token is already paragraph, abort if ($token->name == 'p') return; - // check if this token is adjacent to the parent - if ($this->inputTokens[$this->inputIndex - 1]->type != 'start') { + // this token is a block level, abort + if (!$this->_isInline($token)) return; + + // check if this token is adjacent to the parent token + $prev = $this->inputTokens[$this->inputIndex - 1]; + if ($prev->type != 'start') { // not adjacent, we can abort early // add lead paragraph tag if our token is inline - if ($this->_isInline($token)) { + // and the previous tag was an end paragraph + if ( + $prev->name == 'p' && $prev->type == 'end' && + $this->_isInline($token) + ) { $token = array($this->_pStart(), $token); } return; @@ -105,8 +121,8 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector $ok = false; // maintain a mini-nesting counter, this lets us bail out // early if possible - $j = 2; // current nesting, is two due to parent and this start - for ($i = $this->inputIndex + 1; isset($this->inputTokens[$i]); $i++) { + $j = 1; // current nesting, one is due to parent (we 
recalculate current token) + for ($i = $this->inputIndex; isset($this->inputTokens[$i]); $i++) { if ($this->inputTokens[$i]->type == 'start') $j++; if ($this->inputTokens[$i]->type == 'end') $j--; if ($this->inputTokens[$i]->type == 'text') { @@ -150,7 +166,14 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector $needs_start = false; $needs_end = false; - for ($i = 0, $c = count($raw_paragraphs); $i < $c; $i++) { + $c = count($raw_paragraphs); + if ($c == 1) { + // there were no double-newlines, abort quickly + $result[] = new HTMLPurifier_Token_Text($data); + return; + } + + for ($i = 0; $i < $c; $i++) { $par = $raw_paragraphs[$i]; if (trim($par) !== '') { $paragraphs[] = $par; diff --git a/library/HTMLPurifier/Injector/Linkify.php b/library/HTMLPurifier/Injector/Linkify.php index 7ada1d7a..bf7abfa9 100644 --- a/library/HTMLPurifier/Injector/Linkify.php +++ b/library/HTMLPurifier/Injector/Linkify.php @@ -6,7 +6,8 @@ HTMLPurifier_ConfigSchema::define( 'AutoFormat', 'Linkify', false, 'bool', ' <p> This directive turns on linkification, auto-linking http, ftp and - https URLs. This directive has been available since 2.0.1. + https URLs. <code>a</code> tags with the <code>href</code> attribute + must be allowed. This directive has been available since 2.0.1. 
</p> '); @@ -16,7 +17,10 @@ HTMLPurifier_ConfigSchema::define( class HTMLPurifier_Injector_Linkify extends HTMLPurifier_Injector { - function handleText(&$token, $config, &$context) { + var $name = 'Linkify'; + var $needed = array('a' => array('href')); + + function handleText(&$token) { if (!$this->allowsElement('a')) return; if (strpos($token->data, '://') === false) { diff --git a/library/HTMLPurifier/Injector/PurifierLinkify.php b/library/HTMLPurifier/Injector/PurifierLinkify.php index d6f2e607..a7686297 100644 --- a/library/HTMLPurifier/Injector/PurifierLinkify.php +++ b/library/HTMLPurifier/Injector/PurifierLinkify.php @@ -6,8 +6,9 @@ HTMLPurifier_ConfigSchema::define( 'AutoFormat', 'PurifierLinkify', false, 'bool', ' <p> Internal auto-formatter that converts configuration directives in - syntax <a>%Namespace.Directive</a> to links. This directive has been available - since 2.0.1. + syntax <a>%Namespace.Directive</a> to links. <code>a</code> tags + with the <code>href</code> attribute must be allowed. + This directive has been available since 2.0.1. 
</p> '); @@ -27,14 +28,16 @@ HTMLPurifier_ConfigSchema::define( class HTMLPurifier_Injector_PurifierLinkify extends HTMLPurifier_Injector { + var $name = 'PurifierLinkify'; var $docURL; + var $needed = array('a' => array('href')); function prepare($config, &$context) { - parent::prepare($config, $context); $this->docURL = $config->get('AutoFormatParam', 'PurifierLinkifyDocURL'); + return parent::prepare($config, $context); } - function handleText(&$token, $config, &$context) { + function handleText(&$token) { if (!$this->allowsElement('a')) return; if (strpos($token->data, '%') === false) return; diff --git a/library/HTMLPurifier/Language/messages/en.php b/library/HTMLPurifier/Language/messages/en.php index a64cf301..b16c3ff3 100644 --- a/library/HTMLPurifier/Language/messages/en.php +++ b/library/HTMLPurifier/Language/messages/en.php @@ -28,7 +28,7 @@ $messages = array( 'Strategy_RemoveForeignElements: Foreign element to text' => 'Unrecognized $CurrentToken.Serialized tag converted to text', 'Strategy_RemoveForeignElements: Foreign element removed' => 'Unrecognized $CurrentToken.Serialized tag removed', 'Strategy_RemoveForeignElements: Comment removed' => 'Comment containing "$CurrentToken.Data" removed', -'Strategy_RemoveForeignElements: Script removed' => 'Script removed', +'Strategy_RemoveForeignElements: Foreign meta element removed' => 'Unrecognized $CurrentToken.Serialized meta tag and all descendants removed', 'Strategy_RemoveForeignElements: Token removed to end' => 'Tags and text starting from $1 element where removed to end', 'Strategy_MakeWellFormed: Unnecessary end tag removed' => 'Unnecessary $CurrentToken.Serialized tag removed', diff --git a/library/HTMLPurifier/LanguageFactory.php b/library/HTMLPurifier/LanguageFactory.php index 71539ded..ac6e7dbf 100644 --- a/library/HTMLPurifier/LanguageFactory.php +++ b/library/HTMLPurifier/LanguageFactory.php @@ -82,7 +82,7 @@ class HTMLPurifier_LanguageFactory */ function setup() { $this->validator = new 
HTMLPurifier_AttrDef_Lang(); - $this->dir = dirname(__FILE__); + $this->dir = HTMLPURIFIER_PREFIX . '/HTMLPurifier'; } /** diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php index 839782ca..29295db7 100644 --- a/library/HTMLPurifier/Lexer.php +++ b/library/HTMLPurifier/Lexer.php @@ -66,6 +66,16 @@ HTMLPurifier_ConfigSchema::define( </p> '); +HTMLPurifier_ConfigSchema::define( + 'Core', 'AggressivelyFixLt', false, 'bool', ' +This directive enables aggressive pre-filter fixes HTML Purifier can +perform in order to ensure that open angled-brackets do not get killed +during parsing stage. Enabling this will result in two preg_replace_callback +calls and one preg_replace call for every bit of HTML passed through here. +It is not necessary and will have no effect for PHP 4. +This directive has been available since 2.1.0. +'); + /** * Forgivingly lexes HTML (SGML-style) markup into tokens. * diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php index 82865673..17f23e34 100644 --- a/library/HTMLPurifier/Lexer/DOMLex.php +++ b/library/HTMLPurifier/Lexer/DOMLex.php @@ -42,6 +42,16 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer $html = $this->normalize($html, $config, $context); + // attempt to armor stray angled brackets that cannot possibly + // form tags and thus are probably being used as emoticons + if ($config->get('Core', 'AggressivelyFixLt')) { + $char = '[^a-z!\/]'; + $comment = "/<!--(.*?)(-->|\z)/is"; + $html = preg_replace_callback($comment, array('HTMLPurifier_Lexer_DOMLex', 'callbackArmorCommentEntities'), $html); + $html = preg_replace("/<($char)/i", '&lt;\\1', $html); + $html = preg_replace_callback($comment, array('HTMLPurifier_Lexer_DOMLex', 'callbackUndoCommentSubst'), $html); // fix comments + } + // preprocess html, essential for UTF-8 $html = '<!DOCTYPE html '. 
@@ -151,5 +161,21 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer */ public function muteErrorHandler($errno, $errstr) {} + /** + * Callback function for undoing escaping of stray angled brackets + * in comments + */ + static public function callbackUndoCommentSubst($matches) { + return '<!--' . strtr($matches[1], array('&amp;'=>'&','&lt;'=>'<')) . $matches[2]; + } + + /** + * Callback function that entity-izes ampersands in comments so that + * callbackUndoCommentSubst doesn't clobber them + */ + static public function callbackArmorCommentEntities($matches) { + return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2]; + } + } diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php index 883f4956..cd7cb4c1 100644 --- a/library/HTMLPurifier/Lexer/DirectLex.php +++ b/library/HTMLPurifier/Lexer/DirectLex.php @@ -150,6 +150,14 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer // We are in tag and it is well formed // Grab the internals of the tag $strlen_segment = $position_next_gt - $cursor; + + if ($strlen_segment < 1) { + // there's nothing to process! + $token = new HTMLPurifier_Token_Text('<'); + $cursor++; + continue; + } + $segment = substr($html, $cursor, $strlen_segment); // Check if it's a comment @@ -204,7 +212,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer // Check leading character is alnum, if not, we may // have accidently grabbed an emoticon. 
Translate into // text and go our merry way - if (!ctype_alnum($segment[0])) { + if (!ctype_alpha($segment[0])) { + // XML: $segment[0] !== '_' && $segment[0] !== ':' if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt'); $token = new HTMLPurifier_Token_Text( @@ -371,6 +380,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $value = $quoted_value; } } + if ($value === false) $value = ''; return array($key => $value); } @@ -385,7 +395,6 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer // infinite loop protection $loops = 0; - while(true) { // infinite loop protection @@ -399,7 +408,6 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer } $cursor += ($value = strspn($string, $this->_whitespace, $cursor)); - // grab the key $key_begin = $cursor; //we're currently at the start of the key @@ -435,6 +443,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $cursor++; $cursor += strspn($string, $this->_whitespace, $cursor); + if ($cursor === false) { + $array[$key] = ''; + break; + } + // we might be in front of a quote right now $char = @$string[$cursor]; @@ -452,7 +465,14 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $value_end = $cursor; } + // we reached a premature end + if ($cursor === false) { + $cursor = $size; + $value_end = $cursor; + } + $value = substr($string, $value_begin, $value_end - $value_begin); + if ($value === false) $value = ''; $array[$key] = $this->parseData($value); $cursor++; diff --git a/library/HTMLPurifier/Printer/ConfigForm.css b/library/HTMLPurifier/Printer/ConfigForm.css index 23c7f999..0653bbb0 100644 --- a/library/HTMLPurifier/Printer/ConfigForm.css +++ b/library/HTMLPurifier/Printer/ConfigForm.css @@ -1,7 +1,7 @@ .hp-config {} -.hp-config tbody th {text-align:right;} +.hp-config tbody th {text-align:right; padding-right:0.5em;} .hp-config thead, .hp-config .namespace {background:#3C578C; color:#FFF;} .hp-config .namespace th {text-align:center;} .hp-config .verbose 
{display:none;} diff --git a/library/HTMLPurifier/Printer/ConfigForm.php b/library/HTMLPurifier/Printer/ConfigForm.php index fb86f5f3..31da35f8 100644 --- a/library/HTMLPurifier/Printer/ConfigForm.php +++ b/library/HTMLPurifier/Printer/ConfigForm.php @@ -23,18 +23,52 @@ class HTMLPurifier_Printer_ConfigForm extends HTMLPurifier_Printer */ var $name; + /** + * Whether or not to compress directive names, clipping them off + * after a certain amount of letters + */ + var $compress = false; + /** * @param $name Form element name for directives to be stuffed into * @param $doc_url String documentation URL, will have fragment tagged on + * @param $compress Integer max length before compressing a directive name, set to false to turn off */ - function HTMLPurifier_Printer_ConfigForm($name, $doc_url = null) { + function HTMLPurifier_Printer_ConfigForm( + $name, $doc_url = null, $compress = false + ) { parent::HTMLPurifier_Printer(); $this->docURL = $doc_url; $this->name = $name; + $this->compress = $compress; $this->fields['default'] = new HTMLPurifier_Printer_ConfigForm_default(); $this->fields['bool'] = new HTMLPurifier_Printer_ConfigForm_bool(); } + /** + * @param $cols Integer columns of textarea, null to use default + * @param $rows Integer rows of textarea, null to use default + */ + function setTextareaDimensions($cols = null, $rows = null) { + if ($cols) $this->fields['default']->cols = $cols; + if ($rows) $this->fields['default']->rows = $rows; + } + + /** + * Retrieves styling, in case the directory it's in is not publically + * available + */ + function getCSS() { + return file_get_contents(HTMLPURIFIER_PREFIX . '/HTMLPurifier/Printer/ConfigForm.css'); + } + + /** + * Retrieves JavaScript, in case directory is not public + */ + function getJavaScript() { + return file_get_contents(HTMLPURIFIER_PREFIX . 
'/HTMLPurifier/Printer/ConfigForm.js'); + } + /** * Returns HTML output for a configuration form * @param $config Configuration object of current form state @@ -98,11 +132,12 @@ class HTMLPurifier_Printer_ConfigForm extends HTMLPurifier_Printer $ret .= $this->start('a', array('href' => $url)); } $attr = array('for' => "{$this->name}:$ns.$directive"); + // crop directive name if it's too long - if (strlen($directive) < 14) { + if (!$this->compress || (strlen($directive) < $this->compress)) { $directive_disp = $directive; } else { - $directive_disp = substr($directive, 0, 12) . '...'; + $directive_disp = substr($directive, 0, $this->compress - 2) . '...'; $attr['title'] = $directive; } @@ -176,6 +211,8 @@ class HTMLPurifier_Printer_ConfigForm_NullDecorator extends HTMLPurifier_Printer * Swiss-army knife configuration form field printer */ class HTMLPurifier_Printer_ConfigForm_default extends HTMLPurifier_Printer { + var $cols = 18; + var $rows = 5; function render($ns, $directive, $value, $name, $config) { $this->prepareGenerator($config); // this should probably be split up a little @@ -190,12 +227,12 @@ class HTMLPurifier_Printer_ConfigForm_default extends HTMLPurifier_Printer { $value[] = $val; } case 'list': - $value = implode(',', $value); + $value = implode(PHP_EOL, $value); break; case 'hash': $nvalue = ''; foreach ($value as $i => $v) { - $nvalue .= "$i:$v,"; + $nvalue .= "$i:$v" . 
PHP_EOL; } $value = $nvalue; break; @@ -220,6 +257,15 @@ class HTMLPurifier_Printer_ConfigForm_default extends HTMLPurifier_Printer { $ret .= $this->element('option', $val, $attr); } $ret .= $this->end('select'); + } elseif ( + $def->type == 'text' || $def->type == 'itext' || + $def->type == 'list' || $def->type == 'hash' || $def->type == 'lookup' + ) { + $attr['cols'] = $this->cols; + $attr['rows'] = $this->rows; + $ret .= $this->start('textarea', $attr); + $ret .= $this->text($value); + $ret .= $this->end('textarea'); } else { $attr['value'] = $value; $attr['type'] = 'text'; diff --git a/library/HTMLPurifier/Strategy/MakeWellFormed.php b/library/HTMLPurifier/Strategy/MakeWellFormed.php index 3a8109cf..b3e8aa74 100644 --- a/library/HTMLPurifier/Strategy/MakeWellFormed.php +++ b/library/HTMLPurifier/Strategy/MakeWellFormed.php @@ -67,7 +67,8 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy unset($injectors['Custom']); // special case foreach ($injectors as $injector => $b) { $injector = "HTMLPurifier_Injector_$injector"; - if ($b) $this->injectors[] = new $injector; + if (!$b) continue; + $this->injectors[] = new $injector; } foreach ($custom_injectors as $injector) { if (is_string($injector)) { @@ -87,7 +88,11 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy // give the injectors references to the definition and context // variables for performance reasons foreach ($this->injectors as $i => $x) { - $this->injectors[$i]->prepare($config, $context); + $error = $this->injectors[$i]->prepare($config, $context); + if (!$error) continue; + list($injector) = array_splice($this->injectors, $i, 1); + $name = $injector->name; + trigger_error("Cannot enable $name injector because $error is not allowed", E_USER_WARNING); } // -- end INJECTOR -- @@ -109,7 +114,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy if ($token->type === 'text') { // injector handler code; duplicated for performance 
reasons foreach ($this->injectors as $i => $x) { - if (!$x->skip) $x->handleText($token, $config, $context); + if (!$x->skip) $x->handleText($token); if (is_array($token)) { $this->currentInjector = $i; break; @@ -122,26 +127,24 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy $info = $definition->info[$token->name]->child; - // quick checks: - // test if it claims to be a start tag but is empty + // quick tag checks: anything that's *not* an end tag + $ok = false; if ($info->type == 'empty' && $token->type == 'start') { - $result[] = new HTMLPurifier_Token_Empty($token->name, $token->attr); - continue; - } - // test if it claims to be empty but really is a start tag - if ($info->type != 'empty' && $token->type == 'empty' ) { - $result[] = new HTMLPurifier_Token_Start($token->name, $token->attr); - $result[] = new HTMLPurifier_Token_End($token->name); - continue; - } - // automatically insert empty tags - if ($token->type == 'empty') { - $result[] = $token; - continue; - } - - // start tags have precedence, so they get passed through... 
- if ($token->type == 'start') { + // test if it claims to be a start tag but is empty + $token = new HTMLPurifier_Token_Empty($token->name, $token->attr); + $ok = true; + } elseif ($info->type != 'empty' && $token->type == 'empty' ) { + // claims to be empty but really is a start tag + $token = array( + new HTMLPurifier_Token_Start($token->name, $token->attr), + new HTMLPurifier_Token_End($token->name) + ); + $ok = true; + } elseif ($token->type == 'empty') { + // real empty token + $ok = true; + } elseif ($token->type == 'start') { + // start tag // ...unless they also have to close their parent if (!empty($this->currentNesting)) { @@ -163,16 +166,18 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy $this->currentNesting[] = $parent; // undo the pop } - - // injector handler code; duplicated for performance reasons + $ok = true; + } + + // injector handler code; duplicated for performance reasons + if ($ok) { foreach ($this->injectors as $i => $x) { - if (!$x->skip) $x->handleStart($token, $config, $context); + if (!$x->skip) $x->handleElement($token); if (is_array($token)) { $this->currentInjector = $i; break; } } - $this->processToken($token, $config, $context); continue; } @@ -280,9 +285,11 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy array_splice($this->inputTokens, $this->inputIndex--, 1, $token); // adjust the injector skips based on the array substitution - $offset = count($token) + 1; - for ($i = 0; $i <= $this->currentInjector; $i++) { - $this->injectors[$i]->skip += $offset; + if ($this->injectors) { + $offset = count($token) + 1; + for ($i = 0; $i <= $this->currentInjector; $i++) { + $this->injectors[$i]->skip += $offset; + } } } elseif ($token) { // regular case diff --git a/library/HTMLPurifier/Strategy/RemoveForeignElements.php b/library/HTMLPurifier/Strategy/RemoveForeignElements.php index c14662c3..2c280b23 100644 --- a/library/HTMLPurifier/Strategy/RemoveForeignElements.php +++ 
b/library/HTMLPurifier/Strategy/RemoveForeignElements.php @@ -8,19 +8,38 @@ require_once 'HTMLPurifier/TagTransform.php'; require_once 'HTMLPurifier/AttrValidator.php'; HTMLPurifier_ConfigSchema::define( - 'Core', 'RemoveInvalidImg', true, 'bool', - 'This directive enables pre-emptive URI checking in <code>img</code> '. - 'tags, as the attribute validation strategy is not authorized to '. - 'remove elements from the document. This directive has been available '. - 'since 1.3.0, revert to pre-1.3.0 behavior by setting to false.' + 'Core', 'RemoveInvalidImg', true, 'bool', ' +<p> + This directive enables pre-emptive URI checking in <code>img</code> + tags, as the attribute validation strategy is not authorized to + remove elements from the document. This directive has been available + since 1.3.0, revert to pre-1.3.0 behavior by setting to false. +</p> +' ); HTMLPurifier_ConfigSchema::define( - 'Core', 'RemoveScriptContents', true, 'bool', ' + 'Core', 'RemoveScriptContents', null, 'bool/null', ' <p> This directive enables HTML Purifier to remove not only script tags - but all of their contents. This directive has been available since 2.0.0, - revert to pre-2.0.0 behavior by setting to false. + but all of their contents. This directive has been deprecated since 2.1.0, + and when not set the value of %Core.HiddenElements will take + precedence. This directive has been available since 2.0.0, and can be used to + revert to pre-2.0.0 behavior by setting it to false. +</p> +' +); + +HTMLPurifier_ConfigSchema::define( + 'Core', 'HiddenElements', array('script' => true, 'style' => true), 'lookup', ' +<p> + This directive is a lookup array of elements which should have their + contents removed when they are not allowed by the HTML definition. + For example, the contents of a <code>script</code> tag are not + normally shown in a document, so if script tags are to be removed, + their contents should be removed to. 
This is opposed to a <code>b</code> + tag, which defines some presentational changes but does not hide its + contents. </p> ' ); @@ -43,7 +62,16 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy $escape_invalid_tags = $config->get('Core', 'EscapeInvalidTags'); $remove_invalid_img = $config->get('Core', 'RemoveInvalidImg'); + $remove_script_contents = $config->get('Core', 'RemoveScriptContents'); + $hidden_elements = $config->get('Core', 'HiddenElements'); + + // remove script contents compatibility + if ($remove_script_contents === true) { + $hidden_elements['script'] = true; + } elseif ($remove_script_contents === false && isset($hidden_elements['script'])) { + unset($hidden_elements['script']); + } $attr_validator = new HTMLPurifier_AttrValidator(); @@ -107,7 +135,7 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy } // CAN BE GENERICIZED - if ($token->name == 'script' && $token->type == 'start') { + if (isset($hidden_elements[$token->name]) && $token->type == 'start') { $textify_comments = $token->name; } elseif ($token->name === $textify_comments && $token->type == 'end') { $textify_comments = false; @@ -122,7 +150,7 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy } else { // check if we need to destroy all of the tag's children // CAN BE GENERICIZED - if ($token->name == 'script' && $remove_script_contents) { + if (isset($hidden_elements[$token->name])) { if ($token->type == 'start') { $remove_until = $token->name; } elseif ($token->type == 'empty') { @@ -130,7 +158,7 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy } else { $remove_until = false; } - if ($e) $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Script removed'); + if ($e) $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Foreign meta element removed'); } else { if ($e) $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Foreign element removed'); } diff --git 
a/library/HTMLPurifier/Strategy/ValidateAttributes.php b/library/HTMLPurifier/Strategy/ValidateAttributes.php index 4b3d7486..869f3fab 100644 --- a/library/HTMLPurifier/Strategy/ValidateAttributes.php +++ b/library/HTMLPurifier/Strategy/ValidateAttributes.php @@ -46,6 +46,7 @@ class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy } $context->destroy('IDAccumulator'); + $context->destroy('CurrentToken'); return $tokens; } diff --git a/library/HTMLPurifier/URI.php b/library/HTMLPurifier/URI.php new file mode 100644 index 00000000..ed7ffdd6 --- /dev/null +++ b/library/HTMLPurifier/URI.php @@ -0,0 +1,119 @@ +<?php + +require_once 'HTMLPurifier/URIParser.php'; +require_once 'HTMLPurifier/URIFilter.php'; + +/** + * HTML Purifier's internal representation of a URI + */ +class HTMLPurifier_URI +{ + + var $scheme, $userinfo, $host, $port, $path, $query, $fragment; + + /** + * @note Automatically normalizes scheme and port + */ + function HTMLPurifier_URI($scheme, $userinfo, $host, $port, $path, $query, $fragment) { + $this->scheme = is_null($scheme) || ctype_lower($scheme) ? $scheme : strtolower($scheme); + $this->userinfo = $userinfo; + $this->host = $host; + $this->port = is_null($port) ? 
$port : (int) $port; + $this->path = $path; + $this->query = $query; + $this->fragment = $fragment; + } + + /** + * Retrieves a scheme object corresponding to the URI's scheme/default + * @param $config Instance of HTMLPurifier_Config + * @param $context Instance of HTMLPurifier_Context + * @return Scheme object appropriate for validating this URI + */ + function getSchemeObj($config, &$context) { + $registry =& HTMLPurifier_URISchemeRegistry::instance(); + if ($this->scheme !== null) { + $scheme_obj = $registry->getScheme($this->scheme, $config, $context); + if (!$scheme_obj) return false; // invalid scheme, clean it out + } else { + // no scheme: retrieve the default one + $def = $config->getDefinition('URI'); + $scheme_obj = $registry->getScheme($def->defaultScheme, $config, $context); + if (!$scheme_obj) { + // something funky happened to the default scheme object + trigger_error( + 'Default scheme object "' . $def->defaultScheme . '" was not readable', + E_USER_WARNING + ); + return false; + } + } + return $scheme_obj; + } + + /** + * Generic validation method applicable for all schemes + * @param $config Instance of HTMLPurifier_Config + * @param $context Instance of HTMLPurifier_Context + * @return True if validation/filtering succeeds, false if failure + */ + function validate($config, &$context) { + + // validate host + if (!is_null($this->host)) { + $host_def = new HTMLPurifier_AttrDef_URI_Host(); + $this->host = $host_def->validate($this->host, $config, $context); + if ($this->host === false) $this->host = null; + } + + // validate port + if (!is_null($this->port)) { + if ($this->port < 1 || $this->port > 65535) $this->port = null; + } + + // query and fragment are quite simple in terms of definition: + // *( pchar / "/" / "?" 
), so define their validation routines + // when we start fixing percent encoding + + // path gets to be validated against a hodge-podge of rules depending + // on the status of authority and scheme, but it's not that important, + // esp. since it won't be applicable to everyone + + return true; + + } + + /** + * Convert URI back to string + * @return String URI appropriate for output + */ + function toString() { + // reconstruct authority + $authority = null; + if (!is_null($this->host)) { + $authority = ''; + if(!is_null($this->userinfo)) $authority .= $this->userinfo . '@'; + $authority .= $this->host; + if(!is_null($this->port)) $authority .= ':' . $this->port; + } + + // reconstruct the result + $result = ''; + if (!is_null($this->scheme)) $result .= $this->scheme . ':'; + if (!is_null($authority)) $result .= '//' . $authority; + $result .= $this->path; + if (!is_null($this->query)) $result .= '?' . $this->query; + if (!is_null($this->fragment)) $result .= '#' . $this->fragment; + + return $result; + } + + /** + * Returns a copy of the URI object + */ + function copy() { + return unserialize(serialize($this)); + } + +} + diff --git a/library/HTMLPurifier/URIDefinition.php b/library/HTMLPurifier/URIDefinition.php new file mode 100644 index 00000000..45c505ed --- /dev/null +++ b/library/HTMLPurifier/URIDefinition.php @@ -0,0 +1,145 @@ +<?php + +require_once 'HTMLPurifier/Definition.php'; +require_once 'HTMLPurifier/URIFilter.php'; +require_once 'HTMLPurifier/URIParser.php'; + +require_once 'HTMLPurifier/URIFilter/DisableExternal.php'; +require_once 'HTMLPurifier/URIFilter/DisableExternalResources.php'; +require_once 'HTMLPurifier/URIFilter/HostBlacklist.php'; +require_once 'HTMLPurifier/URIFilter/MakeAbsolute.php'; + +HTMLPurifier_ConfigSchema::define( + 'URI', 'DefinitionID', null, 'string/null', ' +<p> + Unique identifier for a custom-built URI definition. If you want + to add custom URIFilters, you must specify this value. 
+ This directive has been available since 2.1.0. +</p> +'); + +HTMLPurifier_ConfigSchema::define( + 'URI', 'DefinitionRev', 1, 'int', ' +<p> + Revision identifier for your custom definition. See + %HTML.DefinitionRev for details. This directive has been available + since 2.1.0. +</p> +'); + +// informative URI directives + +HTMLPurifier_ConfigSchema::define( + 'URI', 'DefaultScheme', 'http', 'string', ' +<p> + Defines through what scheme the output will be served, in order to + select the proper object validator when no scheme information is present. +</p> +'); + +HTMLPurifier_ConfigSchema::define( + 'URI', 'Host', null, 'string/null', ' +<p> + Defines the domain name of the server, so we can determine whether or + an absolute URI is from your website or not. Not strictly necessary, + as users should be using relative URIs to reference resources on your + website. It will, however, let you use absolute URIs to link to + subdomains of the domain you post here: i.e. example.com will allow + sub.example.com. However, higher up domains will still be excluded: + if you set %URI.Host to sub.example.com, example.com will be blocked. + <strong>Note:</strong> This directive overrides %URI.Base because + a given page may be on a sub-domain, but you wish HTML Purifier to be + more relaxed and allow some of the parent domains too. + This directive has been available since 1.2.0. +</p> +'); + +HTMLPurifier_ConfigSchema::define( + 'URI', 'Base', null, 'string/null', ' +<p> + The base URI is the URI of the document this purified HTML will be + inserted into. This information is important if HTML Purifier needs + to calculate absolute URIs from relative URIs, such as when %URI.MakeAbsolute + is on. You may use a non-absolute URI for this value, but behavior + may vary (%URI.MakeAbsolute deals nicely with both absolute and + relative paths, but forwards-compatibility is not guaranteed). 
+ <strong>Warning:</strong> If set, the scheme on this URI + overrides the one specified by %URI.DefaultScheme. This directive has + been available since 2.1.0. +</p> +'); + +class HTMLPurifier_URIDefinition extends HTMLPurifier_Definition +{ + + var $type = 'URI'; + var $filters = array(); + var $registeredFilters = array(); + + /** + * HTMLPurifier_URI object of the base specified at %URI.Base + */ + var $base; + + /** + * String host to consider "home" base + */ + var $host; + + /** + * Name of default scheme based on %URI.DefaultScheme and %URI.Base + */ + var $defaultScheme; + + function HTMLPurifier_URIDefinition() { + $this->registerFilter(new HTMLPurifier_URIFilter_DisableExternal()); + $this->registerFilter(new HTMLPurifier_URIFilter_DisableExternalResources()); + $this->registerFilter(new HTMLPurifier_URIFilter_HostBlacklist()); + $this->registerFilter(new HTMLPurifier_URIFilter_MakeAbsolute()); + } + + function registerFilter($filter) { + $this->registeredFilters[$filter->name] = $filter; + } + + function addFilter($filter, $config) { + $filter->prepare($config); + $this->filters[$filter->name] = $filter; + } + + function doSetup($config) { + $this->setupMemberVariables($config); + $this->setupFilters($config); + } + + function setupFilters($config) { + foreach ($this->registeredFilters as $name => $filter) { + $conf = $config->get('URI', $name); + if ($conf !== false && $conf !== null) { + $this->addFilter($filter, $config); + } + } + unset($this->registeredFilters); + } + + function setupMemberVariables($config) { + $this->host = $config->get('URI', 'Host'); + $base_uri = $config->get('URI', 'Base'); + if (!is_null($base_uri)) { + $parser = new HTMLPurifier_URIParser(); + $this->base = $parser->parse($base_uri); + $this->defaultScheme = $this->base->scheme; + if (is_null($this->host)) $this->host = $this->base->host; + } + if (is_null($this->defaultScheme)) $this->defaultScheme = $config->get('URI', 'DefaultScheme'); + } + + function filter(&$uri, 
$config, &$context) { + foreach ($this->filters as $name => $x) { + $result = $this->filters[$name]->filter($uri, $config, $context); + if (!$result) return false; + } + return true; + } + +} diff --git a/library/HTMLPurifier/URIFilter.php b/library/HTMLPurifier/URIFilter.php new file mode 100644 index 00000000..e0066f3b --- /dev/null +++ b/library/HTMLPurifier/URIFilter.php @@ -0,0 +1,24 @@ +<?php + +/** + * Chainable filters for custom URI processing + */ +class HTMLPurifier_URIFilter +{ + var $name; + + /** + * Performs initialization for the filter + */ + function prepare($config) {} + + /** + * Filter a URI object + * @param &$uri Reference to URI object + * @param $config Instance of HTMLPurifier_Config + * @param &$context Instance of HTMLPurifier_Context + */ + function filter(&$uri, $config, &$context) { + trigger_error('Cannot call abstract function', E_USER_ERROR); + } +} diff --git a/library/HTMLPurifier/URIFilter/DisableExternal.php b/library/HTMLPurifier/URIFilter/DisableExternal.php new file mode 100644 index 00000000..4e6dc187 --- /dev/null +++ b/library/HTMLPurifier/URIFilter/DisableExternal.php @@ -0,0 +1,48 @@ +<?php + +require_once 'HTMLPurifier/URIFilter.php'; + +HTMLPurifier_ConfigSchema::define( + 'URI', 'DisableExternal', false, 'bool', + 'Disables links to external websites. This is a highly effective '. + 'anti-spam and anti-pagerank-leech measure, but comes at a hefty price: no '. + 'links or images outside of your domain will be allowed. Non-linkified '. + 'URIs will still be preserved. If you want to be able to link to '. + 'subdomains or use absolute URIs, specify %URI.Host for your website. '. + 'This directive has been available since 1.2.0.' 
+); + +/** + * URI filter that rejects URIs whose host is neither %URI.Host nor one + * of its subdomains; URIs without a host always pass. + */ +class HTMLPurifier_URIFilter_DisableExternal extends HTMLPurifier_URIFilter +{ + var $name = 'DisableExternal'; + /** Labels of %URI.Host in reverse order, or false if %URI.Host is not set */ + var $ourHostParts = false; + /** + * Caches the reversed, dot-separated labels of %URI.Host + */ + function prepare($config) { + $our_host = $config->get('URI', 'Host'); + if ($our_host !== null) $this->ourHostParts = array_reverse(explode('.', $our_host)); + } + /** + * @return True if the URI has no host or its host ends with the labels + * of %URI.Host; false otherwise, including when %URI.Host is not set + */ + function filter(&$uri, $config, &$context) { + if (is_null($uri->host)) return true; + if ($this->ourHostParts === false) return false; + $host_parts = array_reverse(explode('.', $uri->host)); + foreach ($this->ourHostParts as $i => $x) { + if (!isset($host_parts[$i])) return false; + // compare strictly: a loose != treats numeric-looking labels + // such as "1e3" and "1000" as equal + if ($host_parts[$i] !== $this->ourHostParts[$i]) return false; + } + return true; + } +} + diff --git a/library/HTMLPurifier/URIFilter/DisableExternalResources.php b/library/HTMLPurifier/URIFilter/DisableExternalResources.php new file mode 100644 index 00000000..dc00e741 --- /dev/null +++ b/library/HTMLPurifier/URIFilter/DisableExternalResources.php @@ -0,0 +1,26 @@ +<?php + +require_once 'HTMLPurifier/URIFilter/DisableExternal.php'; + +HTMLPurifier_ConfigSchema::define( + 'URI', 'DisableExternalResources', false, 'bool', + 'Disables the embedding of external resources, preventing users from '. + 'embedding things like images from other hosts. This prevents '. + 'access tracking (good for email viewers), bandwidth leeching, '. + 'cross-site request forging, goatse.cx posting, and '. + 'other nasties, but also results in '. 
+ 'a loss of end-user functionality (they can\'t directly post a pic '. + 'they posted from Flickr anymore). Use it if you don\'t have a '. + 'robust user-content moderation team. This directive has been '. + 'available since 1.3.0.' 
+); + +class HTMLPurifier_URIFilter_DisableExternalResources extends HTMLPurifier_URIFilter_DisableExternal +{ + var $name = 'DisableExternalResources'; + function filter(&$uri, $config, &$context) { + if (!$context->get('EmbeddedURI', true)) return true; + return parent::filter($uri, $config, $context); + } +} + diff --git a/library/HTMLPurifier/URIFilter/HostBlacklist.php b/library/HTMLPurifier/URIFilter/HostBlacklist.php new file mode 100644 index 00000000..d3429d5c --- /dev/null +++ b/library/HTMLPurifier/URIFilter/HostBlacklist.php @@ -0,0 +1,28 @@ +<?php + +require_once 'HTMLPurifier/URIFilter.php'; + +HTMLPurifier_ConfigSchema::define( + 'URI', 'HostBlacklist', array(), 'list', + 'List of strings that are forbidden in the host of any URI. Use it to '. + 'kill domain names of spam, etc. Note that it will catch anything in '. + 'the domain, so <tt>moo.com</tt> will catch <tt>moo.com.example.com</tt>. '. + 'This directive has been available since 1.3.0.' +); + +class HTMLPurifier_URIFilter_HostBlacklist extends HTMLPurifier_URIFilter +{ + var $name = 'HostBlacklist'; + var $blacklist = array(); + function prepare($config) { + $this->blacklist = $config->get('URI', 'HostBlacklist'); + } + function filter(&$uri, $config, &$context) { + foreach($this->blacklist as $blacklisted_host_fragment) { + if (strpos($uri->host, $blacklisted_host_fragment) !== false) { + return false; + } + } + return true; + } +} diff --git a/library/HTMLPurifier/URIFilter/MakeAbsolute.php b/library/HTMLPurifier/URIFilter/MakeAbsolute.php new file mode 100644 index 00000000..9935dc6e --- /dev/null +++ b/library/HTMLPurifier/URIFilter/MakeAbsolute.php @@ -0,0 +1,141 @@ +<?php + +// does not support network paths + +require_once 'HTMLPurifier/URIFilter.php'; + +HTMLPurifier_ConfigSchema::define( + 'URI', 'MakeAbsolute', false, 'bool', ' +<p> + Converts all URIs into absolute forms. 
This is useful when the HTML + being filtered assumes a specific base path, but will actually be + viewed in a different context (and setting an alternate base URI is + not possible). %URI.Base must be set for this directive to work. + This directive has been available since 2.1.0. +</p> +'); + +/** + * URI filter that resolves relative URIs against the base URI of the + * URI definition (%URI.Base) + */ +class HTMLPurifier_URIFilter_MakeAbsolute extends HTMLPurifier_URIFilter +{ + var $name = 'MakeAbsolute'; + var $base; + var $basePathStack = array(); + /** + * Fetches the base URI from the URI definition and pre-computes the + * collapsed stack of its path segments + */ + function prepare($config) { + $def = $config->getDefinition('URI'); + $this->base = $def->base; + if (is_null($this->base)) { + // warn instead of raising a fatal error: filter() turns into a + // no-op when there is no base, so processing can safely continue + trigger_error('URI.MakeAbsolute is being ignored due to lack of value for URI.Base configuration', E_USER_WARNING); + return; + } + $this->base->fragment = null; // fragment is invalid for base URI + $stack = explode('/', $this->base->path); + array_pop($stack); // discard last segment + $stack = $this->_collapseStack($stack); // do pre-parsing + $this->basePathStack = $stack; + } + /** + * Rewrites a relative URI in place into its absolute form + * @param &$uri Reference to URI object + * @param $config Instance of HTMLPurifier_Config + * @param &$context Instance of HTMLPurifier_Context + * @return True if the URI should be kept, false if its scheme object + * could not be retrieved + */ + function filter(&$uri, $config, &$context) { + if (is_null($this->base)) return true; // abort early + if ( + $uri->path === '' && is_null($uri->scheme) && + is_null($uri->host) && is_null($uri->query) && is_null($uri->fragment) + ) { + // reference to current document + $uri = $this->base->copy(); + return true; + } + if (!is_null($uri->scheme)) { + // absolute 
URI already: don't change + if (!is_null($uri->host)) return true; + $scheme_obj = $uri->getSchemeObj($config, $context); + if (!$scheme_obj) { + // getSchemeObj() returns false for schemes it cannot + // retrieve; reject instead of reading a property of false + return false; + } + if (!$scheme_obj->hierarchical) { + // non-hierarchal URI with explicit scheme, don't change + return true; + } + // special case: had a scheme but always is hierarchical and had no authority + } + if (!is_null($uri->host)) { + // network path, don't bother + return true; + } + if ($uri->path === '') { + $uri->path = $this->base->path; + }elseif ($uri->path[0] !== '/') { + // relative path, needs more complicated processing + $stack = explode('/', $uri->path); + $new_stack = array_merge($this->basePathStack, $stack); + $new_stack = 
$this->_collapseStack($new_stack); + $uri->path = implode('/', $new_stack); + } + // re-combine + $uri->scheme = $this->base->scheme; + if (is_null($uri->userinfo)) $uri->userinfo = $this->base->userinfo; + if (is_null($uri->host)) $uri->host = $this->base->host; + if (is_null($uri->port)) $uri->port = $this->base->port; + return true; + } + + /** + * Resolve dots and double-dots in a path stack + * @param $stack Array of path segments + * @return Array of path segments with dot segments resolved + * @private + */ + function _collapseStack($stack) { + $result = array(); + $is_folder = false; // stays false when $stack is empty + for ($i = 0; isset($stack[$i]); $i++) { + $is_folder = false; + // absorb an internally duplicated slash + if ($stack[$i] == '' && $i && isset($stack[$i+1])) continue; + if ($stack[$i] == '..') { + if (!empty($result)) { + $segment = array_pop($result); + if ($segment === '' && empty($result)) { + // error case: attempted to back out too far: + // restore the leading slash + $result[] = ''; + } elseif ($segment === '..') { + $result[] = '..'; // cannot remove .. with .. + } + } else { + // relative path, preserve the double-dots + $result[] = '..'; + } + $is_folder = true; + continue; + } + if ($stack[$i] == '.') { + // silently absorb + $is_folder = true; + continue; + } + $result[] = $stack[$i]; + } + if ($is_folder) $result[] = ''; + return $result; + } +} + diff --git a/library/HTMLPurifier/URIParser.php b/library/HTMLPurifier/URIParser.php new file mode 100644 index 00000000..dff7e28e --- /dev/null +++ b/library/HTMLPurifier/URIParser.php @@ -0,0 +1,62 @@ +<?php + +require_once 'HTMLPurifier/URI.php'; + +/** + * Parses a URI into the components and fragment identifier as specified + * by RFC 2396. 
+ * @todo Replace regexps with a native PHP parser + */ +class HTMLPurifier_URIParser +{ + + /** + * Parses a URI + * @param $uri string URI to parse + * @return HTMLPurifier_URI representation of URI + */ + function parse($uri) { + $r_URI = '!'. + '(([^:/?#<>\'"]+):)?'. // 2. Scheme + '(//([^/?#<>\'"]*))?'. // 4. Authority + '([^?#<>\'"]*)'. // 5. Path + '(\?([^#<>\'"]*))?'. // 7. 
Query + '(#([^<>\'"]*))?'. // 8. Fragment + '!'; + + $matches = array(); + $result = preg_match($r_URI, $uri, $matches); + + if (!$result) return false; // *really* invalid URI + + // separate out parts + $scheme = !empty($matches[1]) ? $matches[2] : null; + $authority = !empty($matches[3]) ? $matches[4] : null; + $path = $matches[5]; // always present, can be empty + $query = !empty($matches[6]) ? $matches[7] : null; + $fragment = !empty($matches[8]) ? $matches[9] : null; + + // further parse authority + if ($authority !== null) { + // ridiculously inefficient: it's a stacked regex! + $HEXDIG = '[A-Fa-f0-9]'; + $unreserved = 'A-Za-z0-9-._~'; // make sure you wrap with [] + $sub_delims = '!$&\'()'; // needs [] + $pct_encoded = "%$HEXDIG$HEXDIG"; + $r_userinfo = "(?:[$unreserved$sub_delims:]|$pct_encoded)*"; + $r_authority = "/^(($r_userinfo)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/"; + $matches = array(); + preg_match($r_authority, $authority, $matches); + $userinfo = !empty($matches[1]) ? $matches[2] : null; + $host = isset($matches[3]) ? $matches[3] : ''; // isset, not !empty: empty() would discard a host of "0" + $port = !empty($matches[4]) ? 
(int) $matches[5] : null; + } else { + $port = $host = $userinfo = null; + } + + return new HTMLPurifier_URI( + $scheme, $userinfo, $host, $port, $path, $query, $fragment); + } + +} + diff --git a/library/HTMLPurifier/URIScheme.php b/library/HTMLPurifier/URIScheme.php index 9be99752..41c02f70 100644 --- a/library/HTMLPurifier/URIScheme.php +++ b/library/HTMLPurifier/URIScheme.php @@ -19,24 +19,24 @@ class HTMLPurifier_URIScheme */ var $browsable = false; + /** + * Whether or not the URI always uses <hier_part>, resolves edge cases + * with making relative URIs absolute + */ + var $hierarchical = false; + /** * Validates the components of a URI * @note This implementation should be called by children if they define * a default port, as it does port processing. 
- * @note Fragment is omitted as that is scheme independent - * @param $userinfo User info found before at sign in authority - * @param $host Hostname in authority - * @param $port Port found after colon in authority - * @param $path Path of URI - * @param $query Query of URI, found after question mark + * @param $uri Instance of HTMLPurifier_URI * @param $config HTMLPurifier_Config object * @param $context HTMLPurifier_Context object + * @return Bool success or failure */ - function validateComponents( - $userinfo, $host, $port, $path, $query, $config, &$context - ) { - if ($this->default_port == $port) $port = null; - return array($userinfo, $host, $port, $path, $query); + function validate(&$uri, $config, &$context) { + if ($this->default_port == $uri->port) $uri->port = null; + return true; } } diff --git a/library/HTMLPurifier/URIScheme/ftp.php b/library/HTMLPurifier/URIScheme/ftp.php index 3dbb1446..5555ef33 100644 --- a/library/HTMLPurifier/URIScheme/ftp.php +++ b/library/HTMLPurifier/URIScheme/ftp.php @@ -9,35 +9,35 @@ class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme { var $default_port = 21; var $browsable = true; // usually + var $hierarchical = true; - function validateComponents( - $userinfo, $host, $port, $path, $query, $config, &$context - ) { - list($userinfo, $host, $port, $path, $query) = - parent::validateComponents( - $userinfo, $host, $port, $path, $query, $config, $context ); - $semicolon_pos = strrpos($path, ';'); // reverse + function validate(&$uri, $config, &$context) { + parent::validate($uri, $config, $context); + $uri->query = null; + + // typecode check + $semicolon_pos = strrpos($uri->path, ';'); // reverse if ($semicolon_pos !== false) { - // typecode check - $type = substr($path, $semicolon_pos + 1); // no semicolon - $path = substr($path, 0, $semicolon_pos); + $type = substr($uri->path, $semicolon_pos + 1); // no semicolon + $uri->path = substr($uri->path, 0, $semicolon_pos); $type_ret = ''; if (strpos($type, '=') !== 
false) { // figure out whether or not the declaration is correct list($key, $typecode) = explode('=', $type, 2); if ($key !== 'type') { // invalid key, tack it back on encoded - $path .= '%3B' . $type; + $uri->path .= '%3B' . $type; } elseif ($typecode === 'a' || $typecode === 'i' || $typecode === 'd') { $type_ret = ";type=$typecode"; } } else { - $path .= '%3B' . $type; + $uri->path .= '%3B' . $type; } - $path = str_replace(';', '%3B', $path); - $path .= $type_ret; + $uri->path = str_replace(';', '%3B', $uri->path); + $uri->path .= $type_ret; } - return array($userinfo, $host, $port, $path, null); + + return true; } } diff --git a/library/HTMLPurifier/URIScheme/http.php b/library/HTMLPurifier/URIScheme/http.php index 18a1cf87..7abc6680 100644 --- a/library/HTMLPurifier/URIScheme/http.php +++ b/library/HTMLPurifier/URIScheme/http.php @@ -9,14 +9,12 @@ class HTMLPurifier_URIScheme_http extends HTMLPurifier_URIScheme { var $default_port = 80; var $browsable = true; + var $hierarchical = true; - function validateComponents( - $userinfo, $host, $port, $path, $query, $config, &$context - ) { - list($userinfo, $host, $port, $path, $query) = - parent::validateComponents( - $userinfo, $host, $port, $path, $query, $config, $context ); - return array(null, $host, $port, $path, $query); + function validate(&$uri, $config, &$context) { + parent::validate($uri, $config, $context); + $uri->userinfo = null; + return true; } } diff --git a/library/HTMLPurifier/URIScheme/mailto.php b/library/HTMLPurifier/URIScheme/mailto.php index 8e552f5c..f6acc6af 100644 --- a/library/HTMLPurifier/URIScheme/mailto.php +++ b/library/HTMLPurifier/URIScheme/mailto.php @@ -15,14 +15,13 @@ class HTMLPurifier_URIScheme_mailto extends HTMLPurifier_URIScheme { var $browsable = false; - function validateComponents( - $userinfo, $host, $port, $path, $query, $config, &$context - ) { - list($userinfo, $host, $port, $path, $query) = - parent::validateComponents( - $userinfo, $host, $port, $path, $query, 
$config, $context ); + function validate(&$uri, $config, &$context) { + parent::validate($uri, $config, $context); + $uri->userinfo = null; + $uri->host = null; + $uri->port = null; // we need to validate path against RFC 2368's addr-spec - return array(null, null, null, $path, $query); + return true; } } diff --git a/library/HTMLPurifier/URIScheme/news.php b/library/HTMLPurifier/URIScheme/news.php index 7b81834f..87bda63c 100644 --- a/library/HTMLPurifier/URIScheme/news.php +++ b/library/HTMLPurifier/URIScheme/news.php @@ -9,14 +9,14 @@ class HTMLPurifier_URIScheme_news extends HTMLPurifier_URIScheme { var $browsable = false; - function validateComponents( - $userinfo, $host, $port, $path, $query, $config, &$context - ) { - list($userinfo, $host, $port, $path, $query) = - parent::validateComponents( - $userinfo, $host, $port, $path, $query, $config, $context ); + function validate(&$uri, $config, &$context) { + parent::validate($uri, $config, $context); + $uri->userinfo = null; + $uri->host = null; + $uri->port = null; + $uri->query = null; // typecode check needed on path - return array(null, null, null, $path, null); + return true; } } diff --git a/library/HTMLPurifier/URIScheme/nntp.php b/library/HTMLPurifier/URIScheme/nntp.php index 8f513419..caa85b26 100644 --- a/library/HTMLPurifier/URIScheme/nntp.php +++ b/library/HTMLPurifier/URIScheme/nntp.php @@ -10,13 +10,11 @@ class HTMLPurifier_URIScheme_nntp extends HTMLPurifier_URIScheme { var $default_port = 119; var $browsable = false; - function validateComponents( - $userinfo, $host, $port, $path, $query, $config, &$context - ) { - list($userinfo, $host, $port, $path, $query) = - parent::validateComponents( - $userinfo, $host, $port, $path, $query, $config, $context ); - return array(null, $host, $port, $path, null); + function validate(&$uri, $config, &$context) { + parent::validate($uri, $config, $context); + $uri->userinfo = null; + $uri->query = null; + return true; } } diff --git 
a/library/HTMLPurifier/URISchemeRegistry.php b/library/HTMLPurifier/URISchemeRegistry.php index 5d8c462c..7716042d 100644 --- a/library/HTMLPurifier/URISchemeRegistry.php +++ b/library/HTMLPurifier/URISchemeRegistry.php @@ -79,12 +79,14 @@ class HTMLPurifier_URISchemeRegistry } if (isset($this->schemes[$scheme])) return $this->schemes[$scheme]; - if (empty($this->_dir)) $this->_dir = dirname(__FILE__) . '/URIScheme/'; + if (empty($this->_dir)) $this->_dir = HTMLPURIFIER_PREFIX . '/HTMLPurifier/URIScheme/'; if (!isset($allowed_schemes[$scheme])) return $null; - @include_once $this->_dir . $scheme . '.php'; + // this bit of reflection is not very efficient, and a bit + // hacky too $class = 'HTMLPurifier_URIScheme_' . $scheme; + if (!class_exists($class)) include_once $this->_dir . $scheme . '.php'; if (!class_exists($class)) return $null; $this->schemes[$scheme] = new $class(); return $this->schemes[$scheme]; diff --git a/maintenance/common.php b/maintenance/common.php new file mode 100644 index 00000000..d5437b77 --- /dev/null +++ b/maintenance/common.php @@ -0,0 +1,9 @@ +<?php + +function assertCli() { + if (php_sapi_name() != 'cli' && !getenv('PHP_IS_CLI')) { + echo 'Script cannot be called from web-browser (if you are calling via cli, +set environment variable PHP_IS_CLI to work around this).'; + exit; + } +} diff --git a/maintenance/flush-definition-cache.php b/maintenance/flush-definition-cache.php new file mode 100755 index 00000000..6d51ab06 --- /dev/null +++ b/maintenance/flush-definition-cache.php @@ -0,0 +1,36 @@ +#!/usr/bin/php +<?php + +require_once 'common.php'; +assertCli(); + +/** + * Flushes the default HTMLDefinition serial cache + * @param Accepts one argument, cache type to flush; otherwise flushes all + * the caches. + */ + +echo "Flushing cache... \n"; + +require_once(dirname(__FILE__) . 
'/../library/HTMLPurifier.auto.php'); + +$config = HTMLPurifier_Config::createDefault(); + +$names = array('HTML', 'CSS', 'URI', 'Test'); +if (isset($argv[1])) { + if (in_array($argv[1], $names)) { + $names = array($argv[1]); + } else { + echo "Did not recognized cache parameter {$argv[1]} as valid cache, aborting.\n"; + exit; + } +} + +foreach ($names as $name) { + echo " - Flushing $name\n"; + $cache = new HTMLPurifier_DefinitionCache_Serializer($name); + $cache->flush($config); +} + +echo 'Cache flushed successfully.'; + diff --git a/maintenance/flush-htmldefinition-cache.php b/maintenance/flush-htmldefinition-cache.php deleted file mode 100644 index c6d31bfb..00000000 --- a/maintenance/flush-htmldefinition-cache.php +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/php -<?php - -/** - * Flushes the default HTMLDefinition serial cache - */ - -if (php_sapi_name() != 'cli') { - echo 'Script cannot be called from web-browser.'; - exit; -} - -echo 'Flushing cache... '; - -require_once(dirname(__FILE__) . '/../library/HTMLPurifier.auto.php'); - -$config = HTMLPurifier_Config::createDefault(); - -$cache = new HTMLPurifier_DefinitionCache_Serializer('HTML'); -$cache->flush($config); - -echo 'Cache flushed successfully.'; - diff --git a/maintenance/generate-entity-file.php b/maintenance/generate-entity-file.php old mode 100644 new mode 100755 index 062fed1c..01aca19a --- a/maintenance/generate-entity-file.php +++ b/maintenance/generate-entity-file.php @@ -1,16 +1,14 @@ #!/usr/bin/php <?php +require_once 'common.php'; +assertCli(); + /** * Parses *.ent files into an entity lookup table, and then serializes and * writes the whole kaboodle to a file. The resulting file should be versioned. 
*/ -if (php_sapi_name() != 'cli') { - echo 'Script cannot be called from web-browser.'; - exit; -} - chdir( dirname(__FILE__) ); // here's where the entity files are located, assuming working directory diff --git a/maintenance/merge-library.php b/maintenance/merge-library.php new file mode 100755 index 00000000..46c3c891 --- /dev/null +++ b/maintenance/merge-library.php @@ -0,0 +1,207 @@ +#!/usr/bin/php +<?php + +require_once 'common.php'; +assertCli(); + +/** + * Compiles all of HTML Purifier's library files into one big file + * named HTMLPurifier.standalone.php. Operates recursively, and will + * barf if there are conditional includes. + * + * Details: also creates blank "include" files in the test/blank directory + * in order to simulate require_once's inside the test files. + */ + +/** + * Global array that tracks already loaded includes + */ +$GLOBALS['loaded'] = array('HTMLPurifier.php' => true); + +/** + * @param $text Text to replace includes from + */ +function replace_includes($text) { + return preg_replace_callback( + "/require_once ['\"]([^'\"]+)['\"];/", + 'replace_includes_callback', + $text + ); +} + +/** + * Removes leading PHP tags from included files. Assumes that there is + * no trailing tag. + */ +function remove_php_tags($text) { + return substr($text, 5); +} + +/** + * Creates an appropriate blank file, recursively generating directories + * if necessary + */ +function create_blank($file) { + $dir = dirname($file); + $base = realpath('../tests/blanks/') . DIRECTORY_SEPARATOR ; + if ($dir != '.') mkdir_deep($base . $dir); + file_put_contents($base . 
$file, ''); +} + +/** + * Recursively creates a directory, making each missing path component in turn + * @note Adapted from the PHP manual comment 76612 + */ +function mkdir_deep($folder) { + $folders = preg_split("#[\\\\/]#", $folder); + $base = ''; + for($i = 0, $c = count($folders); $i < $c; $i++) { + if($folders[$i] === '') { + if (!$i) { + // leading separator: the path is absolute, so start at the root + $base .= DIRECTORY_SEPARATOR; + } + continue; + } + $base .= $folders[$i]; + if(!is_dir($base)){ + mkdir($base); + } + $base .= DIRECTORY_SEPARATOR; + } +} + +/** + * Copy a file, or recursively copy a folder and its contents + * + * @author Aidan Lister <aidan@php.net> + * @version 1.0.1 + * @link http://aidanlister.com/repos/v/function.copyr.php + * @param string $source Source path + * @param string $dest Destination path + * @return bool Returns TRUE on success, FALSE on failure + */ +function copyr($source, $dest) { + // Simple copy for a file + if (is_file($source)) { + return copy($source, $dest); + } + // Make destination directory + if (!is_dir($dest)) { + mkdir($dest); + } + // Loop through the folder + $dir = dir($source); + while (false !== $entry = $dir->read()) { + // Skip pointers + if ($entry == '.' 
|| $entry == '..') { + continue; + } + // Skip hidden files + if ($entry[0] == '.') { + continue; + } + // Recurse into the entry, unless it is the destination itself + if ($dest !== "$source/$entry") { + copyr("$source/$entry", "$dest/$entry"); + } + } + // Clean up + $dir->close(); + return true; +} + +/** + * Delete a file, or a folder and its contents + * + * @author Aidan Lister <aidan@php.net> + * @version 1.0.3 + * @link http://aidanlister.com/repos/v/function.rmdirr.php + * @param string $dirname Directory to delete + * @return bool Returns TRUE on success, FALSE on failure + */ +function rmdirr($dirname) +{ + // Sanity check + if (!file_exists($dirname)) { + return false; + } + + // Simple delete for a file + if (is_file($dirname) || is_link($dirname)) { + return unlink($dirname); + } + + // Loop through the folder + $dir = dir($dirname); + while (false !== $entry = $dir->read()) { + // Skip pointers + if ($entry == '.' || $entry == '..') { + continue; + } + + // Recurse + rmdirr($dirname . DIRECTORY_SEPARATOR . $entry); + } + + // Clean up + $dir->close(); + return rmdir($dirname); +} + +/** + * Copies the contents of a directory to the standalone directory + */ +function make_dir_standalone($dir) { + return copyr($dir, 'standalone/' . $dir); +} + +function make_file_standalone($file) { + mkdir_deep('standalone/' . dirname($file)); + return copy($file, 'standalone/' . $file); +} + +/** + * @param $matches preg_replace_callback matches array, where index 1 + * is the filename to include + */ +function replace_includes_callback($matches) { + $file = $matches[1]; + // PHP 5 only file: keep its require_once statement rather than inlining it + if ($file == 'HTMLPurifier/Lexer/DOMLex.php') { + return $matches[0]; + } + if (isset($GLOBALS['loaded'][$file])) return ''; + $GLOBALS['loaded'][$file] = true; + create_blank($file); + return replace_includes(remove_php_tags(file_get_contents($file))); +} + +chdir(dirname(__FILE__) . 
'/../library/'); +create_blank('HTMLPurifier.php'); + +echo 'Creating full file...'; +$contents = replace_includes(file_get_contents('HTMLPurifier.php')); +$contents = str_replace( + "define('HTMLPURIFIER_PREFIX', dirname(__FILE__));", + "define('HTMLPURIFIER_PREFIX', dirname(__FILE__) . '/standalone'); +set_include_path(HTMLPURIFIER_PREFIX . PATH_SEPARATOR . get_include_path());", + $contents +); +file_put_contents('HTMLPurifier.standalone.php', $contents); +echo ' done!' . PHP_EOL; + +echo 'Creating standalone directory...'; +rmdirr('standalone'); // ensure a clean copy +mkdir_deep('standalone/HTMLPurifier/DefinitionCache/Serializer'); +make_dir_standalone('HTMLPurifier/EntityLookup'); +make_dir_standalone('HTMLPurifier/Language'); +make_file_standalone('HTMLPurifier/Printer/ConfigForm.js'); +make_file_standalone('HTMLPurifier/Printer/ConfigForm.css'); +make_dir_standalone('HTMLPurifier/URIScheme'); +// PHP 5 only file +mkdir_deep('standalone/HTMLPurifier/Lexer'); +make_file_standalone('HTMLPurifier/Lexer/DOMLex.php'); +make_file_standalone('HTMLPurifier/TokenFactory.php'); +echo ' done!' . 
PHP_EOL; + diff --git a/plugins/phorum/config.default.php b/plugins/phorum/config.default.php new file mode 100644 index 00000000..2f9031cc --- /dev/null +++ b/plugins/phorum/config.default.php @@ -0,0 +1,56 @@ +<?php + +if(!defined("PHORUM")) exit; + +// default HTML Purifier configuration settings +$config->set('HTML', 'Allowed', + // alphabetically sorted +'a[href|title] +abbr[title] +acronym[title] +b +blockquote[cite] +br +caption +cite +code +dd +del +dfn +div +dl +dt +em +i +img[src|alt|title|class] +ins +kbd +li +ol +p +pre +s +strike +strong +sub +sup +table +tbody +td +tfoot +th +thead +tr +tt +u +ul +var'); +$config->set('AutoFormat', 'AutoParagraph', true); +$config->set('AutoFormat', 'Linkify', true); +$config->set('HTML', 'Doctype', 'XHTML 1.0 Transitional'); +$config->set('Core', 'AggressivelyFixLt', true); +$config->set('Core', 'Encoding', $GLOBALS['PHORUM']['DATA']['CHARSET']); // we'll change this eventually +if (strtolower($GLOBALS['PHORUM']['DATA']['CHARSET']) !== 'utf-8') { + $config->set('Core', 'EscapeNonASCIICharacters', true); +} + diff --git a/plugins/phorum/htmlpurifier.php b/plugins/phorum/htmlpurifier.php new file mode 100644 index 00000000..4654c65d --- /dev/null +++ b/plugins/phorum/htmlpurifier.php @@ -0,0 +1,272 @@ +<?php + +/** + * HTML Purifier Phorum Mod. Filter your HTML the Standards-Compliant Way! + * + * This Phorum mod enables users to post raw HTML into Phorum. But never + * fear: with the help of HTML Purifier, this HTML will be beat into + * de-XSSed and standards-compliant form, safe for general consumption. + * It is not recommended, but possible to run this mod in parallel + * with other formatters (in short, please DISABLE the BBcode mod). + * + * For help migrating from your previous markup language to pure HTML + * please check the migrate.bbcode.php file. + * + * If you'd like to use this with a WYSIWYG editor, make sure that + * editor sets $PHORUM['mod_htmlpurifier']['wysiwyg'] to true. 
Otherwise, + * administrators who need to edit other people's comments may be at + * risk for some nasty attacks. + * + * Tested with Phorum 5.1.22. This module will almost definitely need + * to be upgraded when Phorum 6 rolls around. + */ + +// Note: Cache data is base64 encoded because Phorum insists on flinging it +// to the user and expecting it to come back unharmed, newlines and +// all, which ain't happening. It's slower, it takes up more space, but +// at least it won't get mutilated + +/** + * Purifies the body of every message in a Phorum data array (format hook) + */ +function phorum_htmlpurifier_format($data) +{ + $PHORUM = $GLOBALS["PHORUM"]; + + $purifier =& HTMLPurifier::getInstance(); + $cache_serial = $PHORUM['mod_htmlpurifier']['body_cache_serial']; + + foreach($data as $message_id => $message){ + if(isset($message['body'])) { + + if ($message_id) { + // we're dealing with a real message, not a fake, so + // there are a number of shortcuts that can be taken + + if (isset($message['meta']['htmlpurifier_light'])) { + // format hook was called outside of Phorum's normal + // functions, do the abridged purification + $data[$message_id]['body'] = $purifier->purify($message['body']); + continue; + } + + if (!empty($PHORUM['args']['purge'])) { + // purge the cache; must run before the cache check below + unset($message['meta']['body_cache']); + } + + if ( + isset($message['meta']['body_cache']) && + isset($message['meta']['body_cache_serial']) && + $message['meta']['body_cache_serial'] == $cache_serial + ) { + // cached version is present, bail out early + $data[$message_id]['body'] = base64_decode($message['meta']['body_cache']); + continue; + } + } + + // migration might edit this array, that's why it's defined + // so early + $updated_message = array(); + + // create the $body variable + if ( + $message_id && // message must be real to migrate + !isset($message['meta']['body_cache_serial']) + ) { + // perform migration + $fake_data = array(); + list($signature, $edit_message) = 
phorum_htmlpurifier_remove_sig_and_editmessage($message); + $fake_data[$message_id] = $message; + $fake_data = phorum_htmlpurifier_migrate($fake_data); + $body = $fake_data[$message_id]['body']; + $body = str_replace("<phorum break>", '', $body); + $updated_message['body'] = $body; // save it in + $body .= $signature . $edit_message; // add it back in + } else { + // reverse Phorum's pre-processing + $body = $message['body']; + // order is important: &amp; must be decoded last + $body = str_replace("<phorum break>\n", "\n", $body); + $body = str_replace(array('&lt;','&gt;','&amp;'), array('<','>','&'), $body); + if (!$message_id && defined('PHORUM_CONTROL_CENTER')) { + // we're in control.php, so it was double-escaped + $body = str_replace(array('&lt;','&gt;','&amp;', '&quot;'), array('<','>','&','"'), $body); + } + } + + $body = $purifier->purify($body); + + // dynamically update the cache (MUST BE DONE HERE!) + // this is inefficient because it's one db call per + // cache miss, but once the cache is in place things are + // a lot zippier. + + if ($message_id) { // make sure it's not a fake id + $updated_message['meta'] = $message['meta']; + $updated_message['meta']['body_cache'] = base64_encode($body); + $updated_message['meta']['body_cache_serial'] = $cache_serial; + phorum_db_update_message($message_id, $updated_message); + } + + // must not get overloaded until after we cache it, otherwise + // we'll inadvertently change the original text + $data[$message_id]['body'] = $body; + + } + } + + return $data; +} + +// ----------------------------------------------------------------------- +// This is fragile code, copied from read.php:359. 
It will break if +// that is changed + +/** + * Generates a signature based on a message array + */ +function phorum_htmlpurifier_generate_sig($row) { + $phorum_sig = ''; + if(isset($row["user"]["signature"]) + && isset($row['meta']['show_signature']) && $row['meta']['show_signature']==1){ + $phorum_sig=trim($row["user"]["signature"]); + if(!empty($phorum_sig)){ + $phorum_sig="\n\n$phorum_sig"; + } + } + return $phorum_sig; +} + +/** + * Generates an edit message based on a message array + */ +function phorum_htmlpurifier_generate_editmessage($row) { + $PHORUM = $GLOBALS['PHORUM']; + $editmessage = ''; + if(isset($row['meta']['edit_count']) && $row['meta']['edit_count'] > 0) { + $editmessage = str_replace ("%count%", $row['meta']['edit_count'], $PHORUM["DATA"]["LANG"]["EditedMessage"]); + $editmessage = str_replace ("%lastedit%", phorum_date($PHORUM["short_date"],$row['meta']['edit_date']), $editmessage); + $editmessage = str_replace ("%lastuser%", $row['meta']['edit_username'], $editmessage); + $editmessage="\n\n\n\n$editmessage"; + } + return $editmessage; +} + +// End fragile code +// ----------------------------------------------------------------------- + +/** + * Removes the signature and edit message from a message so they are not purified + * @param $row Message passed by reference; its body is modified in place + */ +function phorum_htmlpurifier_remove_sig_and_editmessage(&$row) { + $signature = phorum_htmlpurifier_generate_sig($row); + $editmessage = phorum_htmlpurifier_generate_editmessage($row); + $pairs = array($signature => '', $editmessage => ''); + unset($pairs['']); // an absent part is '', and strtr() fails on an empty-string key + if ($pairs) $row['body'] = strtr($row['body'], $pairs); + return array($signature, $editmessage); +} + +/** + * Indicate that data is fully HTML and not from migration, invalidate + * previous caches + * @note This function used to generate the actual cache entries, but + * since there's data missing that must be deferred to the first read + */ +function 
phorum_htmlpurifier_posting($message) { + $PHORUM = $GLOBALS["PHORUM"]; + unset($message['meta']['body_cache']); // invalidate the cache + $message['meta']['body_cache_serial'] = $PHORUM['mod_htmlpurifier']['body_cache_serial']; + return $message; +} + +/** + * Overrides the default, mail-style quoting: purifies $array[1] and wraps it in a blockquote citing $array[0] + */ +function phorum_htmlpurifier_quote($array) { + $PHORUM = $GLOBALS["PHORUM"]; + $purifier =& HTMLPurifier::getInstance(); + $text = $purifier->purify($array[1]); + return "<blockquote cite=\"$array[0]\">\n$text\n</blockquote>"; +} + +/** + * Ensure that our format hook is processed last. Also, loads the library. + * @credits <http://secretsauce.phorum.org/snippets/make_bbcode_last_formatter.php.txt> + */ +function phorum_htmlpurifier_common() { + + require_once(dirname(__FILE__).'/htmlpurifier/HTMLPurifier.auto.php'); + require(dirname(__FILE__).'/init-config.php'); + + $config = phorum_htmlpurifier_get_config(); + HTMLPurifier::getInstance($config); + + // increment revision.txt if you want to invalidate the cache + $GLOBALS['PHORUM']['mod_htmlpurifier']['body_cache_serial'] = $config->getSerial(); + + // load migration from migrate.php next to this file, if it exists + if (file_exists(dirname(__FILE__) . '/migrate.php')) { + include(dirname(__FILE__) . 
'/migrate.php'); + } else { + echo '<strong>Error:</strong> No migration path specified for HTML Purifier, please check + <tt>mods/htmlpurifier/migrate.bbcode.php</tt> for instructions on + how to migrate from your previous markup language.'; + exit; + } + + // see if our hooks need to be bubbled to the end + phorum_htmlpurifier_bubble_hook('format'); + +} + +/** + * Moves this mod (and its hook function) to the end of the given hook's + * handler lists, so that it is processed last + */ +function phorum_htmlpurifier_bubble_hook($hook) { + global $PHORUM; + if (!isset($PHORUM['hooks'][$hook]['mods'])) return; + $our_idx = array_search('htmlpurifier', $PHORUM['hooks'][$hook]['mods']); + if ($our_idx === false) return; // not registered for this hook: nothing to move + list($mod) = array_splice($PHORUM['hooks'][$hook]['mods'], $our_idx, 1); + $PHORUM['hooks'][$hook]['mods'][] = $mod; + list($func) = array_splice($PHORUM['hooks'][$hook]['funcs'], $our_idx, 1); + $PHORUM['hooks'][$hook]['funcs'][] = $func; +} + +/** + * Pre-emptively purifies the message body if it looks like a WYSIWYG + * editor is being used, and returns the message with the cleaned body + */ +function phorum_htmlpurifier_before_editor($message) { + if (!empty($GLOBALS['PHORUM']['mod_htmlpurifier']['wysiwyg'])) { + if (!empty($message['body'])) { + $body = $message['body']; + // de-entity-ize contents + $body = str_replace(array('&lt;','&gt;','&amp;'), array('<','>','&'), $body); + $purifier =& HTMLPurifier::getInstance(); + $body = $purifier->purify($body); + // re-entity-ize contents and store them back on the message + $message['body'] = htmlspecialchars($body, ENT_QUOTES, $GLOBALS['PHORUM']['DATA']['CHARSET']); + } + } + return $message; +} + +function phorum_htmlpurifier_editor_after_subject() { + // don't show this message if it's a WYSIWYG editor, since it will + // then be handled automatically + if (!empty($GLOBALS['PHORUM']['mod_htmlpurifier']['wysiwyg'])) return; + ?><tr><td colspan="2" style="padding:1em 0.3em;"> + HTML input is <strong>on</strong>. 
Make sure you escape all HTML and + angled-brackets with &amp;lt; and &amp;gt; (you can also use CDATA + tags, simply wrap the suspect text with +&lt;![CDATA[&lt;em&gt;text&lt;/em&gt;]]&gt;). Paragraphs will only be applied to +double-spaces; single-spaces will not generate <tt>&lt;br&gt;</tt> tags. + </td></tr><?php +} + diff --git a/plugins/phorum/htmlpurifier/LICENSE b/plugins/phorum/htmlpurifier/LICENSE new file mode 100644 index 00000000..5ab7695a --- /dev/null +++ b/plugins/phorum/htmlpurifier/LICENSE @@ -0,0 +1,504 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. 
Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. 
Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. 
+ + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. 
(Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. 
+ + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. 
+ +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". 
Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. 
+ + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. (It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. 
+ + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. 
You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. 
For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. 
+Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + <one line to give the library's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! + + diff --git a/plugins/phorum/htmlpurifier/README b/plugins/phorum/htmlpurifier/README new file mode 100644 index 00000000..65334fb3 --- /dev/null +++ b/plugins/phorum/htmlpurifier/README @@ -0,0 +1 @@ +The contents of the library/ folder should be here. diff --git a/plugins/phorum/info.txt b/plugins/phorum/info.txt new file mode 100644 index 00000000..ed2f4ae5 --- /dev/null +++ b/plugins/phorum/info.txt @@ -0,0 +1,8 @@ +hook: format|phorum_htmlpurifier_format +hook: quote|phorum_htmlpurifier_quote +hook: posting_custom_action|phorum_htmlpurifier_posting +hook: common|phorum_htmlpurifier_common +hook: before_editor|phorum_htmlpurifier_before_editor +hook: tpl_editor_after_subject|phorum_htmlpurifier_editor_after_subject +title: HTML Purifier Phorum Mod +desc: This module enables standards-compliant HTML filtering on Phorum. Please check migrate.bbcode.php before enabling this mod. 
\ No newline at end of file diff --git a/plugins/phorum/init-config.php b/plugins/phorum/init-config.php new file mode 100644 index 00000000..c279f67d --- /dev/null +++ b/plugins/phorum/init-config.php @@ -0,0 +1,27 @@ +<?php + +/** + * Initializes the appropriate configuration from either a PHP file + * or a module configuration value + * @return Instance of HTMLPurifier_Config + */ +function phorum_htmlpurifier_get_config() { + global $PHORUM; + $config_exists = phorum_htmlpurifier_config_file_exists(); + if ($config_exists || !isset($PHORUM['mod_htmlpurifier']['config'])) { + $config = HTMLPurifier_Config::createDefault(); + include(dirname(__FILE__) . '/config.default.php'); + if ($config_exists) { + include(dirname(__FILE__) . '/config.php'); + } + unset($PHORUM['mod_htmlpurifier']['config']); // unnecessary + } else { + $config = HTMLPurifier_Config::create($PHORUM['mod_htmlpurifier']['config']); + } + return $config; +} + +function phorum_htmlpurifier_config_file_exists() { + return file_exists(dirname(__FILE__) . '/config.php'); +} + diff --git a/plugins/phorum/install.txt b/plugins/phorum/install.txt new file mode 100644 index 00000000..d1848938 --- /dev/null +++ b/plugins/phorum/install.txt @@ -0,0 +1,33 @@ + +HTML Purifier Phorum Mod - Filter your HTML the Standards-Compliant Way! + +This Phorum mod enables HTML posting on Phorum. Under normal circumstances, +this would cause a huge security risk, but because we are running +HTML through HTML Purifier, output is guaranteed to be XSS free and +standards-compliant. + +This mod requires HTML input, and previous markup languages need to be +converted accordingly. Thus, it is vital that you create a 'migrate.php' +file that works with your installation. If you're using the built-in +BBCode formatting, simply move migrate.bbcode.php to that place; for +other markup languages, consult said file for instructions on how +to adapt it to your needs. 
+ +This module will not work if 'migrate.php' is not created, and an improperly +made migration file may *CORRUPT* Phorum, so please take your time to +do this correctly. It should go without saying to *BACKUP YOUR DATABASE* +before attempting anything here. + +This module will not automatically migrate user signatures, because this +process may take a long time. After installing the HTML Purifier module and +then configuring 'migrate.php', navigate to Settings and click 'Migrate +Signatures' to migrate all user signatures. + +The version of HTML Purifier bundled with this mod is a custom modified 2.0.1. +Do not attempt to replace it with a version equal to or less than 2.0.1 +downloaded from the HTML Purifier website: the module will combust +spectacularly. (Greater versions, however, are okay, because the changes +made to accommodate this module have been committed to the trunk). + +Visit HTML Purifier at <http://htmlpurifier.org/>. May the force +be with you. diff --git a/plugins/phorum/migrate.bbcode.php b/plugins/phorum/migrate.bbcode.php new file mode 100644 index 00000000..58316b07 --- /dev/null +++ b/plugins/phorum/migrate.bbcode.php @@ -0,0 +1,28 @@ +<?php + +/** + * This file is responsible for migrating from a specific markup language + * like BBCode or Markdown to HTML. WARNING: THIS PROCESS IS NOT REVERSIBLE + * + * Copy this file to 'migrate.php' and it will automatically work for + * BBCode; you may need to tweak this a little to get it to work for other + * languages (usually, just replace the include name and the function name). + * + * If you do NOT want to have any migration performed (for instance, you + * are installing the module on a new forum with no posts), simply remove + * phorum_htmlpurifier_migrate() function. You still need migrate.php + * present, otherwise the module won't work. + */ + +if(!defined("PHORUM")) exit; + +require_once(dirname(__FILE__) .
"/../bbcode/bbcode.php"); + +/** + * 'format' hook style function that will be called to convert + * legacy markup into HTML. + */ +function phorum_htmlpurifier_migrate($data) { + return phorum_bb_code($data); // bbcode's 'format' hook +} + diff --git a/plugins/phorum/settings.php b/plugins/phorum/settings.php new file mode 100644 index 00000000..4754d8b0 --- /dev/null +++ b/plugins/phorum/settings.php @@ -0,0 +1,63 @@ +<?php + +// based off of BBCode's settings file + +/** + * HTML Purifier Phorum mod settings configuration. This provides + * a convenient web-interface for editing the most common HTML Purifier + * configuration directives. You can also specify custom configuration + * by creating a 'config.php' file. + */ + +if(!defined("PHORUM_ADMIN")) exit; + +// error reporting is good! +error_reporting(E_ALL ^ E_NOTICE); + +// load library and other paraphenalia +require_once './include/admin/PhorumInputForm.php'; +require_once (dirname(__FILE__) . '/htmlpurifier/HTMLPurifier.auto.php'); +require_once (dirname(__FILE__) . '/init-config.php'); +require_once (dirname(__FILE__) . '/settings/migrate-sigs-form.php'); +require_once (dirname(__FILE__) . '/settings/migrate-sigs.php'); +require_once (dirname(__FILE__) . '/settings/form.php'); +require_once (dirname(__FILE__) . '/settings/save.php'); + +// define friendly configuration directives. 
you can expand this array +// to get more web-definable directives +$PHORUM['mod_htmlpurifier']['directives'] = array( + 'URI.Host', // auto-detectable + 'URI.DisableExternal', + 'URI.DisableExternalResources', + 'URI.DisableResources', + 'URI.Munge', + 'URI.HostBlacklist', + 'URI.Disable', + 'HTML.TidyLevel', + 'HTML.Doctype', // auto-detectable + 'HTML.Allowed', + 'AutoFormat', + '-AutoFormat.Custom', + '-AutoFormat.PurifierLinkify', + 'Output.TidyFormat', +); + +// lower this setting if you're getting time outs/out of memory +$PHORUM['mod_htmlpurifier']['migrate-sigs-increment'] = 100; + +if (isset($_POST['reset'])) { + unset($PHORUM['mod_htmlpurifier']['config']); +} + +if ($offset = phorum_htmlpurifier_migrate_sigs_check()) { + // migrate signatures + phorum_htmlpurifier_migrate_sigs($offset); +} elseif(!empty($_POST)){ + // save settings + phorum_htmlpurifier_save_settings(); +} + +phorum_htmlpurifier_show_migrate_sigs_form(); +echo '<br />'; +phorum_htmlpurifier_show_form(); + diff --git a/plugins/phorum/settings/form.php b/plugins/phorum/settings/form.php new file mode 100644 index 00000000..b957b8d2 --- /dev/null +++ b/plugins/phorum/settings/form.php @@ -0,0 +1,79 @@ +<?php + +function phorum_htmlpurifier_show_form() { + if (phorum_htmlpurifier_config_file_exists()) { + phorum_htmlpurifier_show_config_info(); + return; + } + + global $PHORUM; + + $config = phorum_htmlpurifier_get_config(); + + $frm = new PhorumInputForm ("", "post", "Save"); + $frm->hidden("module", "modsettings"); + $frm->hidden("mod", "htmlpurifier"); // this is the directory name that the Settings file lives in + + if (!empty($error)){ + echo "$error<br />"; + } + + $frm->addbreak("Edit settings for the HTML Purifier module"); + + $frm->addMessage('<p>Click on directive links to read what each option does + (links do not open in new windows).</p> + <p>For more flexibility (for instance, you want to edit the full + range of configuration directives), you can create a <tt>config.php</tt> 
+ file in your <tt>mods/htmlpurifier/</tt> directory. Doing so will, + however, make the web configuration interface unavailable.</p>'); + + require_once 'HTMLPurifier/Printer/ConfigForm.php'; + $htmlpurifier_form = new HTMLPurifier_Printer_ConfigForm('config', 'http://htmlpurifier.org/live/configdoc/plain.html#%s'); + $htmlpurifier_form->setTextareaDimensions(23, 7); // widen a little, since we have space + + $frm->addMessage($htmlpurifier_form->render( + $config, $PHORUM['mod_htmlpurifier']['directives'], false)); + + $frm->addMessage("<strong>Warning: Changing HTML Purifier's configuration will invalidate + the cache. Expect to see a flurry of database activity after you change + any of these settings.</strong>"); + + $frm->addrow('Reset to defaults:', $frm->checkbox("reset", "1", "", false)); + + // hack to include extra styling + echo '<style type="text/css">' . $htmlpurifier_form->getCSS() . ' + .hp-config {margin-left:auto;margin-right:auto;} + </style>'; + $js = $htmlpurifier_form->getJavaScript(); + echo '<script type="text/javascript">'."<!--\n$js\n//-->".'</script>'; + + $frm->show(); +} + +function phorum_htmlpurifier_show_config_info() { + global $PHORUM; + + // update mod_htmlpurifier for housekeeping + phorum_htmlpurifier_commit_settings(); + + // politely tell user how to edit settings manually +?> + <div class="input-form-td-break">How to edit settings for HTML Purifier module</div> + <p> + A <tt>config.php</tt> file exists in your <tt>mods/htmlpurifier/</tt> + directory. This file contains your custom configuration: in order to + change it, please navigate to that file and edit it accordingly. + </p> + <p> + To use the web interface, delete <tt>config.php</tt> (or rename it to + <tt>config.php.bak</tt>). + </p> + <p> + <strong>Warning: Changing HTML Purifier's configuration will invalidate + the cache. 
Expect to see a flurry of database activity after you change + any of these settings.</strong> + </p> +<?php + +} + diff --git a/plugins/phorum/settings/migrate-sigs-form.php b/plugins/phorum/settings/migrate-sigs-form.php new file mode 100644 index 00000000..ad4877b5 --- /dev/null +++ b/plugins/phorum/settings/migrate-sigs-form.php @@ -0,0 +1,21 @@ +<?php + +function phorum_htmlpurifier_show_migrate_sigs_form() { + + $frm = new PhorumInputForm ('', "post", "Migrate"); + $frm->hidden("module", "modsettings"); + $frm->hidden("mod", "htmlpurifier"); + $frm->hidden("migrate-sigs", "1"); + $frm->addbreak("Migrate user signatures to HTML"); + $frm->addMessage('This operation will migrate your users signatures + to HTML. <strong>This process is irreversible and must only be performed once.</strong> + Type in yes in the confirmation field to migrate.'); + if (!file_exists(dirname(__FILE__) . '/../migrate.php')) { + $frm->addMessage('Migration file does not exist, cannot migrate signatures. + Please check <tt>migrate.bbcode.php</tt> on how to create an appropriate file.'); + } else { + $frm->addrow('Confirm:', $frm->text_box("confirmation", "")); + } + $frm->show(); +} + diff --git a/plugins/phorum/settings/migrate-sigs.php b/plugins/phorum/settings/migrate-sigs.php new file mode 100644 index 00000000..7896be36 --- /dev/null +++ b/plugins/phorum/settings/migrate-sigs.php @@ -0,0 +1,85 @@ +<?php + +function phorum_htmlpurifier_migrate_sigs_check() { + global $PHORUM; + $offset = 0; + if (!empty($_POST['migrate-sigs'])) { + if (!isset($_POST['confirmation']) || strtolower($_POST['confirmation']) !== 'yes') { + echo 'Invalid confirmation code.'; + exit; + } + $PHORUM['mod_htmlpurifier']['migrate-sigs'] = true; + phorum_db_update_settings(array("mod_htmlpurifier"=>$PHORUM["mod_htmlpurifier"])); + $offset = 1; + } elseif (!empty($_GET['migrate-sigs']) && $PHORUM['mod_htmlpurifier']['migrate-sigs']) { + $offset = (int) $_GET['migrate-sigs']; + } + return $offset; +} + +function 
phorum_htmlpurifier_migrate_sigs($offset) { + global $PHORUM; + + if(!$offset) return; // bail out quick of $offset == 0 + + @set_time_limit(0); // attempt to let this run + $increment = $PHORUM['mod_htmlpurifier']['migrate-sigs-increment']; + + require_once(dirname(__FILE__) . '/../migrate.php'); + // migrate signatures + // do this in batches so we don't run out of time/space + $end = $offset + $increment; + $user_ids = array(); + for ($i = $offset; $i < $end; $i++) { + $user_ids[] = $i; + } + $userinfos = phorum_db_user_get_fields($user_ids, 'signature'); + foreach ($userinfos as $i => $user) { + if (empty($user['signature'])) continue; + $sig = $user['signature']; + // perform standard Phorum processing on the sig + $sig = str_replace(array("&","<",">"), array("&","<",">"), $sig); + $sig = preg_replace("/<((http|https|ftp):\/\/[a-z0-9;\/\?:@=\&\$\-_\.\+!*'\(\),~%]+?)>/i", "$1", $sig); + // prepare fake data to pass to migration function + $fake_data = array(array("author"=>"", "email"=>"", "subject"=>"", 'body' => $sig)); + list($fake_message) = phorum_htmlpurifier_migrate($fake_data); + $user['signature'] = $fake_message['body']; + if (!phorum_user_save($user)) { + exit('Error while saving user data'); + } + } + unset($userinfos); // free up memory + + // query for highest ID in database + $type = $PHORUM['DBCONFIG']['type']; + if ($type == 'mysql') { + $conn = phorum_db_mysql_connect(); + $sql = "select MAX(user_id) from {$PHORUM['user_table']}"; + $res = mysql_query($sql, $conn); + $row = mysql_fetch_row($res); + $top_id = (int) $row[0]; + } elseif ($type == 'mysqli') { + $conn = phorum_db_mysqli_connect(); + $sql = "select MAX(user_id) from {$PHORUM['user_table']}"; + $res = mysqli_query($conn, $sql); + $row = mysqli_fetch_row($res); + $top_id = (int) $row[0]; + } else { + exit('Unrecognized database!'); + } + + $offset += $increment; + if ($offset > $top_id) { // test for end condition + echo 'Migration finished'; + 
$PHORUM['mod_htmlpurifier']['migrate-sigs'] = false; + phorum_htmlpurifier_commit_settings(); + return true; + } + $host = $_SERVER['HTTP_HOST']; + $uri = rtrim(dirname($_SERVER['PHP_SELF']), '/\\'); + $extra = 'admin.php?module=modsettings&mod=htmlpurifier&migrate-sigs=' . $offset; + // relies on output buffering to work + header("Location: http://$host$uri/$extra"); + exit; + +} diff --git a/plugins/phorum/settings/save.php b/plugins/phorum/settings/save.php new file mode 100644 index 00000000..a08b8314 --- /dev/null +++ b/plugins/phorum/settings/save.php @@ -0,0 +1,23 @@ +<?php + +function phorum_htmlpurifier_save_settings() { + global $PHORUM; + if (phorum_htmlpurifier_config_file_exists()) { + echo "Cannot update settings, <code>mods/htmlpurifier/config.php</code> already exists. To change + settings, edit that file. To use the web form, delete that file.<br />"; + } else { + $config = phorum_htmlpurifier_get_config(); + if (!isset($_POST['reset'])) $config->mergeArrayFromForm($_POST, 'config', $PHORUM['mod_htmlpurifier']['directives']); + $PHORUM['mod_htmlpurifier']['config'] = $config->getAll(); + if(!phorum_htmlpurifier_commit_settings()){ + $error="Database error while updating settings."; + } else { + echo "Settings Updated<br />"; + } + } +} + +function phorum_htmlpurifier_commit_settings() { + global $PHORUM; + return phorum_db_update_settings(array("mod_htmlpurifier"=>$PHORUM["mod_htmlpurifier"])); +} diff --git a/smoketests/testSchema.php b/smoketests/testSchema.php index 6e8bc74b..e6b721dc 100644 --- a/smoketests/testSchema.php +++ b/smoketests/testSchema.php @@ -37,3 +37,7 @@ HTMLPurifier_ConfigSchema::defineNamespace('ReportCard', 'It is for grades.'); HTMLPurifier_ConfigSchema::define('ReportCard', 'English', null, 'string/null', 'Grade from English class.'); HTMLPurifier_ConfigSchema::define('ReportCard', 'Absences', 0, 'int', 'How many times missing from school?'); +HTMLPurifier_ConfigSchema::defineNamespace('Text', 'This stuff is long, boring, 
and English.'); +HTMLPurifier_ConfigSchema::define('Text', 'AboutUs', 'Nothing much, but this should be decently long so that a textarea would be better', 'text', 'Who are we? What are we up to?'); +HTMLPurifier_ConfigSchema::define('Text', 'Hash', "not-case-sensitive\nstill-not-case-sensitive\nsuper-not-case-sensitive", 'itext', 'This is of limited utility, but of course it ends up being used.'); + diff --git a/test-settings.sample.php b/test-settings.sample.php index bd1f622b..74e2de54 100644 --- a/test-settings.sample.php +++ b/test-settings.sample.php @@ -1,16 +1,20 @@ <?php -// This file is necessary to run the unit tests and profiling -// scripts. +// ATTENTION! DO NOT EDIT THIS FILE! +// This file is necessary to run the unit tests and profiling scripts. +// Please copy it to 'test-settings.php' and make the necessary edits. -// Is PEAR available on your system? If it isn't, set to false. If PEAR -// is not part of the default include_path, add it. -$GLOBALS['HTMLPurifierTest']['PEAR'] = true; +// Some of these scripts run a long time, so it is recommended that you +// turn off the time limit +set_time_limit(0); + +// Turning off output buffering will prevent mysterious errors from core dumps +@ob_end_flush(); + +// Where is SimpleTest located? +$simpletest_location = '/path/to/simpletest/'; // How many times should profiling scripts iterate over the function? More runs // means more accurate results, but they'll take longer to perform. $GLOBALS['HTMLPurifierTest']['Runs'] = 2; -// Where is SimpleTest located? 
-$simpletest_location = '/path/to/simpletest/'; - diff --git a/tests/HTMLPurifier/AttrCollectionsTest.php b/tests/HTMLPurifier/AttrCollectionsTest.php index 6420a6ac..52fc88f1 100644 --- a/tests/HTMLPurifier/AttrCollectionsTest.php +++ b/tests/HTMLPurifier/AttrCollectionsTest.php @@ -9,7 +9,7 @@ class HTMLPurifier_AttrCollectionsTest_NoConstructor extends HTMLPurifier_AttrCo function performInclusions(&$a) {} } -class HTMLPurifier_AttrCollectionsTest extends UnitTestCase +class HTMLPurifier_AttrCollectionsTest extends HTMLPurifier_Harness { function testConstruction() { diff --git a/tests/HTMLPurifier/AttrDef/CSS/FontFamilyTest.php b/tests/HTMLPurifier/AttrDef/CSS/FontFamilyTest.php index 861cbb32..25571128 100644 --- a/tests/HTMLPurifier/AttrDef/CSS/FontFamilyTest.php +++ b/tests/HTMLPurifier/AttrDef/CSS/FontFamilyTest.php @@ -16,6 +16,10 @@ class HTMLPurifier_AttrDef_CSS_FontFamilyTest extends HTMLPurifier_AttrDefHarnes $this->assertDef('01234'); $this->assertDef(',', false); $this->assertDef('Times New Roman, serif', '\'Times New Roman\', serif'); + $this->assertDef($d = "'John\\'s Font'"); + $this->assertDef("John's Font", $d); + $this->assertDef($d = "'\xE5\xAE\x8B\xE4\xBD\x93'"); + $this->assertDef("\xE5\xAE\x8B\xE4\xBD\x93", $d); } diff --git a/tests/HTMLPurifier/AttrDef/URITest.php b/tests/HTMLPurifier/AttrDef/URITest.php index a4d2521e..58b77248 100644 --- a/tests/HTMLPurifier/AttrDef/URITest.php +++ b/tests/HTMLPurifier/AttrDef/URITest.php @@ -2,317 +2,86 @@ require_once 'HTMLPurifier/AttrDefHarness.php'; require_once 'HTMLPurifier/AttrDef/URI.php'; +require_once 'HTMLPurifier/URIParser.php'; -// WARNING: INCOMPLETE UNIT TESTS! -// we also need to test all the configuration directives defined by this class - -// http: is returned quite often when a URL is invalid. We have to change -// this behavior to just a plain old "FALSE"! 
- +/** + * @todo Aim for complete code coverage with mocks + */ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness { - var $scheme, $components, $return_components; - - function testGenericURI() { - - generate_mock_once('HTMLPurifier_URIScheme'); - generate_mock_once('HTMLPurifier_URISchemeRegistry'); - - $old_registry = HTMLPurifier_URISchemeRegistry::instance(); - - // finally, lets get a copy of the actual class + function setUp() { $this->def = new HTMLPurifier_AttrDef_URI(); - - // initialize test inputs - $uri = // input URI - $components = // what components the URI should be parsed to - $return_components = // return components - $expect_uri = array(); // what reassembled URI to expect - - ////////////////////////////////////////////////////////////////////// - - // test a regular instance, return identical URI - $uri[0] = 'http://www.example.com/webhp?q=foo#result2'; - $components[0] = array( - null, // userinfo - 'www.example.com', // host - null, // port - '/webhp', // path - 'q=foo' // query - ); - - // test an amended URI (the actual logic is irrelevant) - // test that user and port get parsed correctly (3.2.1 and 3.2.3) - $uri[1] = 'http://user@authority.part:80/now/the/path?query#fragment'; - $components[1] = array( - 'user', 'authority.part', 80, - '/now/the/path', 'query' - ); - $return_components[1] = array( // removed port (it's standard) - 'user', 'authority.part', null, '/now/the/path', 'query' - ); - $expect_uri[1] = 'http://user@authority.part/now/the/path?query#fragment'; - - // percent encoded characters are not resolved during generic URI - // parsing even though RFC 3986 defines this notation - // also test what happens when query/fragment are missing - $uri[2] = 'http://en.wikipedia.org/wiki/Clich%C3%A9'; - $components[2] = array( - null, 'en.wikipedia.org', null, '/wiki/Clich%C3%A9', null - ); - - // test distinction between empty query and undefined query (above) - $uri[3] = 'http://www.example.com/?#'; - 
$components[3] = array(null, 'www.example.com', null, '/', ''); - - // path is always defined, even if empty - $uri[4] = 'http://www.example.com'; - $components[4] = array(null, 'www.example.com', null, '', null); - - // test parsing of an opaque URI - $uri[5] = 'mailto:bob@example.com'; - $components[5] = array(null, null, null, 'bob@example.com', null); - - // even though we don't resolve percent entities, we have to fix - // improper percent-encodes. Taken one at a time: - // %56 - V, which is an unreserved character - // %fc - u with an umlaut, normalize to uppercase - // %GJ - invalid characters in entity, encode % - // %5 - prematurely terminated, encode % - // %FC - u with umlaut, correct - // note that Apache doesn't do such fixing, rather, it just claims - // that the browser sent a "Bad Request". See PercentEncoder.php - // for more details - $uri[6] = 'http://www.example.com/%56%fc%GJ%5%FC'; - $components[6] = array(null, 'www.example.com', null, '/V%FC%25GJ%255%FC', null); - $expect_uri[6] = 'http://www.example.com/V%FC%25GJ%255%FC'; - - // test IPv4 address (behavior may vary with configuration) - $uri[7] = 'http://192.0.34.166/'; - $components[7] = array(null, '192.0.34.166', null, '/', null); - - // while it may look like an IPv4 address, it's really a reg-name. - // don't destroy it - $uri[8] = 'http://333.123.32.123/'; - $components[8] = array(null, '333.123.32.123', null, '/', null); - - // test IPv6 address, using amended form of RFC's example - $uri[9] = 'http://[2001:db8::7]/c=GB?objectClass?one'; - $components[9] = array(null, '[2001:db8::7]', null, '/c=GB', - 'objectClass?one'); - - // We will not implement punycode encoding, that's up to the browsers - // We also will not implement percent to IDNA encoding transformations: - // if you need to use an international domain in a link, make sure that - // you've got it in UTF-8 and send it in raw (no encoding). 
- - // break the RFC a little and allow international characters - // WARNING: UTF-8 encoded! - $uri[10] = 'http://tūdaliņ.lv'; - $components[10] = array(null, 'tūdaliņ.lv', null, '', null); - - // test invalid IPv6 address and invalid reg-name - $uri[11] = 'http://[2001:0db8:85z3:08d3:1319:8a2e:0370:7334]'; - $components[11] = array(null, null, null, '', null); - $expect_uri[11] = 'http:'; - - // test invalid port - $uri[12] = 'http://example.com:foobar'; - $components[12] = array(null, 'example.com', null, '', null); - $expect_uri[12] = 'http://example.com'; - - // test overlarge port (max is 65535, although this isn't official) - $uri[13] = 'http://example.com:65536'; - $components[13] = array(null, 'example.com', null, '', null); - $expect_uri[13] = 'http://example.com'; - - // some spec abnf tests - - // "authority . path-abempty" omitted, it is a trivial case - - // "path-absolute", note this is different from path-rootless - $uri[14] = 'http:/this/is/path'; - $components[14] = array(null, null, null, '/this/is/path', null); - $expect_uri[14] = 'http:/this/is/path'; // do not munge scheme off - - // scheme munging is not being tested yet, it's an extra feature - - // "path-rootless" - this should not be used but is allowed - $uri[15] = 'http:this/is/path'; - $components[15] = array(null, null, null, 'this/is/path', null); - //$expect_uri[15] = 'this/is/path'; // munge scheme off - - // "path-empty" - a rather interesting case, remove the scheme - $uri[16] = 'http:'; - $components[16] = array(null, null, null, '', null); - //$expect_uri[16] = ''; // munge scheme off - - // test invalid scheme, components shouldn't be passed - $uri[17] = 'javascript:alert("moo");'; - $expect_uri[17] = false; - - // relative URIs - basic case - $uri[18] = '/a/b'; - $components[18] = array(null, null, null, '/a/b', null); - - // result of malformed tag, gracefully handle error - $uri[19] = 'http://www.google.com/\'>"'; - $components[19] = array(null, 'www.google.com', null, '/', 
null); - $expect_uri[19] = 'http://www.google.com/'; - - // test empty - $uri[20] = ''; - $components[20] = array(null, null, null, '', null); - $expect_uri[20] = ''; - - foreach ($uri as $i => $value) { - - // the read in values - $this->config = isset($config[$i]) ? $config[$i] : HTMLPurifier_Config::createDefault(); - $this->context = isset($context[$i]) ? $context[$i] : new HTMLPurifier_Context(); - - // setUpAssertDef - if ( isset($components[$i]) ) { - $this->components = $components[$i]; - } else { - $this->components = false; - } - if ( isset($return_components[$i]) ) { - $this->return_components = $return_components[$i]; - } else { - $this->return_components = $this->components; - } - - // parameters - if (!isset($expect_uri[$i])) { - $expect_uri[$i] = $value; // untouched - } - - $this->assertDef($value, $expect_uri[$i], true, "Test $i: %s"); - - } - - // reset to regular implementation - HTMLPurifier_URISchemeRegistry::instance($old_registry); - - } - - function setUpAssertDef() { - // $fake_registry isn't the real mock, because due to PHP 4 weirdness - // I cannot set a default value to function parameters that are passed - // by reference. So we use the value instance() returns. 
- $fake_registry = new HTMLPurifier_URISchemeRegistryMock(); - $registry =& HTMLPurifier_URISchemeRegistry::instance($fake_registry); - - // now, let's add a pseudo-scheme to the registry - $this->scheme = new HTMLPurifier_URISchemeMock(); - - // here are the schemes we will support with overloaded mocks - $registry->setReturnReference('getScheme', $this->scheme, array('http', '*', '*')); - $registry->setReturnReference('getScheme', $this->scheme, array('mailto', '*', '*')); - - // default return value is false (meaning no scheme defined: reject) - $registry->setReturnValue('getScheme', false, array('*', '*', '*')); - - if ($this->components === false) { - $this->scheme->expectNever('validateComponents'); - } else { - $this->components[] = '*'; // append the configuration - $this->components[] = '*'; // append context - $this->scheme->setReturnValue( - 'validateComponents', $this->return_components, $this->components); - $this->scheme->expectOnce('validateComponents', $this->components); - } - } - - function tearDownAssertDef() { - $this->scheme->tally(); + parent::setUp(); } function testIntegration() { - - $this->def = new HTMLPurifier_AttrDef_URI(); - $this->assertDef('http://www.google.com/'); + $this->assertDef('http:', ''); + $this->assertDef('http:/foo', '/foo'); $this->assertDef('javascript:bad_stuff();', false); $this->assertDef('ftp://www.example.com/'); $this->assertDef('news:rec.alt'); $this->assertDef('nntp://news.example.com/324234'); $this->assertDef('mailto:bob@example.com'); - } - function testDisableExternal() { - - $this->def = new HTMLPurifier_AttrDef_URI(); - $this->config->set('URI', 'DisableExternal', true); - $this->config->set('URI', 'Host', 'sub.example.com'); - - $this->assertDef('/foobar.txt'); - $this->assertDef('http://google.com/', false); - $this->assertDef('http://sub.example.com/alas?foo=asd'); - $this->assertDef('http://example.com/teehee', false); - $this->assertDef('http://www.example.com/#man', false); - 
$this->assertDef('http://go.sub.example.com/perhaps?p=foo'); - + function testIntegrationWithPercentEncoder() { + $this->assertDef( + 'http://www.example.com/%56%fc%GJ%5%FC', + 'http://www.example.com/V%FC%25GJ%255%FC' + ); } function testEmbeds() { - - // embedded URI $this->def = new HTMLPurifier_AttrDef_URI(true); - $this->assertDef('http://sub.example.com/alas?foo=asd'); $this->assertDef('mailto:foo@example.com', false); - } - function testDisableExternalResources() { - - $this->config->set('URI', 'DisableExternalResources', true); - - $this->def = new HTMLPurifier_AttrDef_URI(); - $this->assertDef('http://sub.example.com/alas?foo=asd'); - $this->assertDef('/img.png'); - - $this->def = new HTMLPurifier_AttrDef_URI(true); - $this->assertDef('http://sub.example.com/alas?foo=asd', false); - $this->assertDef('/img.png'); - - } - - function testMunge() { - + function testConfigMunge() { $this->config->set('URI', 'Munge', 'http://www.google.com/url?q=%s'); - $this->def = new HTMLPurifier_AttrDef_URI(); - $this->assertDef( 'http://www.example.com/', 'http://www.google.com/url?q=http%3A%2F%2Fwww.example.com%2F' ); - $this->assertDef('index.html'); $this->assertDef('javascript:foobar();', false); - } - function testBlacklist() { - - $this->config->set('URI', 'HostBlacklist', array('example.com', 'moo')); - - $this->assertDef('foo.txt'); - $this->assertDef('http://www.google.com/example.com/moo'); - - $this->assertDef('http://example.com/#23', false); - $this->assertDef('https://sub.domain.example.com/foobar', false); - $this->assertDef('http://example.com.example.net/?whoo=foo', false); - $this->assertDef('ftp://moo-moo.net/foo/foo/', false); - + function testDefaultSchemeRemovedInBlank() { + $this->assertDef('http:', ''); } - function testWhitelist() { - /* + function testDefaultSchemeRemovedInRelativeURI() { + $this->assertDef('http:/foo/bar', '/foo/bar'); + } + + function testDefaultSchemeNotRemovedInAbsoluteURI() { + $this->assertDef('http://example.com/foo/bar'); + 
} + + function testAltSchemeNotRemoved() { + $this->assertDef('mailto:this-looks-like-a-path@example.com'); + } + + function testURIDefinitionValidation() { + $parser = new HTMLPurifier_URIParser(); + $uri = $parser->parse('http://example.com'); + $this->config->set('URI', 'DefinitionID', 'HTMLPurifier_AttrDef_URITest->testURIDefinitionValidation'); + $uri_def =& $this->config->getDefinition('URI'); + // overload with mock + generate_mock_once('HTMLPurifier_URIDefinition'); + $uri_def = new HTMLPurifier_URIDefinitionMock(); + $uri_def->expectOnce('filter', array($uri, '*', '*')); + $uri_def->setReturnValue('filter', true, array($uri, '*', '*')); + $uri_def->setup = true; + $this->assertDef('http://example.com'); + } + + /* + function test_validate_configWhitelist() { + $this->config->set('URI', 'HostPolicy', 'DenyAll'); $this->config->set('URI', 'HostWhitelist', array(null, 'google.com')); @@ -320,8 +89,9 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness $this->assertDef('server.txt'); $this->assertDef('ftp://www.google.com/?t=a'); $this->assertDef('http://google.com.tricky.spamsite.net', false); - */ + } + */ } diff --git a/tests/HTMLPurifier/AttrDefHarness.php b/tests/HTMLPurifier/AttrDefHarness.php index 84d8cc9e..9d811484 100644 --- a/tests/HTMLPurifier/AttrDefHarness.php +++ b/tests/HTMLPurifier/AttrDefHarness.php @@ -1,11 +1,10 @@ <?php -class HTMLPurifier_AttrDefHarness extends UnitTestCase +class HTMLPurifier_AttrDefHarness extends HTMLPurifier_Harness { var $def; - var $context; - var $config; + var $context, $config; function setUp() { $this->config = HTMLPurifier_Config::createDefault(); @@ -13,20 +12,15 @@ class HTMLPurifier_AttrDefHarness extends UnitTestCase } // cannot be used for accumulator - function assertDef($string, $expect = true, $ini = false, $message = '%s') { + function assertDef($string, $expect = true) { // $expect can be a string or bool - if ($ini) $this->setUpAssertDef(); $result = $this->def->validate($string, 
$this->config, $this->context); if ($expect === true) { - $this->assertIdentical($string, $result, $message); + $this->assertIdentical($string, $result); } else { - $this->assertIdentical($expect, $result, $message); + $this->assertIdentical($expect, $result); } - if ($ini) $this->tearDownAssertDef(); } - function setUpAssertDef() {} - function tearDownAssertDef() {} - } diff --git a/tests/HTMLPurifier/AttrDefTest.php b/tests/HTMLPurifier/AttrDefTest.php index 0cd11310..84889bb3 100644 --- a/tests/HTMLPurifier/AttrDefTest.php +++ b/tests/HTMLPurifier/AttrDefTest.php @@ -2,7 +2,7 @@ require_once 'HTMLPurifier/AttrDef.php'; -class HTMLPurifier_AttrDefTest extends UnitTestCase +class HTMLPurifier_AttrDefTest extends HTMLPurifier_Harness { function test_parseCDATA() { diff --git a/tests/HTMLPurifier/AttrTransformHarness.php b/tests/HTMLPurifier/AttrTransformHarness.php index 1f7839d0..e6ae1a93 100644 --- a/tests/HTMLPurifier/AttrTransformHarness.php +++ b/tests/HTMLPurifier/AttrTransformHarness.php @@ -1,8 +1,8 @@ <?php -require_once 'HTMLPurifier/Harness.php'; +require_once 'HTMLPurifier/ComplexHarness.php'; -class HTMLPurifier_AttrTransformHarness extends HTMLPurifier_Harness +class HTMLPurifier_AttrTransformHarness extends HTMLPurifier_ComplexHarness { function setUp() { diff --git a/tests/HTMLPurifier/AttrTransformTest.php b/tests/HTMLPurifier/AttrTransformTest.php index e75352fb..5694fdd4 100644 --- a/tests/HTMLPurifier/AttrTransformTest.php +++ b/tests/HTMLPurifier/AttrTransformTest.php @@ -2,7 +2,7 @@ require_once 'HTMLPurifier/AttrTransform.php'; -class HTMLPurifier_AttrTransformTest extends UnitTestCase +class HTMLPurifier_AttrTransformTest extends HTMLPurifier_Harness { function test_prependCSS() { diff --git a/tests/HTMLPurifier/AttrTypesTest.php b/tests/HTMLPurifier/AttrTypesTest.php index 3f09dd1f..c207c320 100644 --- a/tests/HTMLPurifier/AttrTypesTest.php +++ b/tests/HTMLPurifier/AttrTypesTest.php @@ -2,7 +2,7 @@ require_once 'HTMLPurifier/AttrTypes.php'; 
-class HTMLPurifier_AttrTypesTest extends UnitTestCase +class HTMLPurifier_AttrTypesTest extends HTMLPurifier_Harness { function test_get() { diff --git a/tests/HTMLPurifier/ChildDefHarness.php b/tests/HTMLPurifier/ChildDefHarness.php index 1ea04089..b0acb0bf 100644 --- a/tests/HTMLPurifier/ChildDefHarness.php +++ b/tests/HTMLPurifier/ChildDefHarness.php @@ -1,9 +1,9 @@ <?php -require_once 'HTMLPurifier/Harness.php'; +require_once 'HTMLPurifier/ComplexHarness.php'; require_once 'HTMLPurifier/ChildDef.php'; -class HTMLPurifier_ChildDefHarness extends HTMLPurifier_Harness +class HTMLPurifier_ChildDefHarness extends HTMLPurifier_ComplexHarness { function setUp() { diff --git a/tests/HTMLPurifier/ComplexHarness.php b/tests/HTMLPurifier/ComplexHarness.php new file mode 100644 index 00000000..8ea7378d --- /dev/null +++ b/tests/HTMLPurifier/ComplexHarness.php @@ -0,0 +1,129 @@ +<?php + +require_once 'HTMLPurifier/Lexer/DirectLex.php'; + +/** + * General-purpose test-harness that makes testing functions that require + * configuration and context objects easier when those two parameters are + * meaningless. See HTMLPurifier_ChildDefTest for a good example of usage. + */ +class HTMLPurifier_ComplexHarness extends HTMLPurifier_Harness +{ + + /** + * Instance of the object that will execute the method + */ + var $obj; + + /** + * Name of the function to be executed + */ + var $func; + + /** + * Whether or not the method deals in tokens. If set to true, assertResult() + * will transparently convert HTML to and back from tokens. + */ + var $to_tokens = false; + + /** + * Whether or not to convert tokens back into HTML before performing + * equality check, has no effect on bools. + */ + var $to_html = false; + + /** + * Instance of an HTMLPurifier_Lexer implementation. 
+ */ + var $lexer; + + /** + * Instance of HTMLPurifier_Generator + */ + var $generator; + + /** + * Default config to fall back on if no config is available + */ + var $config; + + /** + * Default context to fall back on if no context is available + */ + var $context; + + function HTMLPurifier_ComplexHarness() { + $this->lexer = new HTMLPurifier_Lexer_DirectLex(); + $this->generator = new HTMLPurifier_Generator(); + parent::HTMLPurifier_Harness(); + } + + /** + * Asserts a specific result from a one parameter + config/context function + * @param $input Input parameter + * @param $expect Expectation + * @param $config Configuration array in form of Ns.Directive => Value. + * Has no effect if $this->config is set. + * @param $context_array Context array in form of Key => Value or an actual + * context object. + */ + function assertResult($input, $expect = true, + $config_array = array(), $context_array = array() + ) { + + // setup config + if ($this->config) { + $config = HTMLPurifier_Config::create($this->config); + $config->autoFinalize = false; + $config->loadArray($config_array); + } else { + $config = HTMLPurifier_Config::create($config_array); + } + + // setup context object. Note that we are operating on a copy of it! 
+ // When necessary, extend the test harness to allow post-tests + // on the context object + if (empty($this->context)) { + $context = new HTMLPurifier_Context(); + $context->loadArray($context_array); + } else { + $context =& $this->context; + } + + if ($this->to_tokens && is_string($input)) { + // $func may cause $input to change, so "clone" another copy + // to sacrifice + $input = $this->lexer->tokenizeHTML($s = $input, $config, $context); + $input_c = $this->lexer->tokenizeHTML($s, $config, $context); + } else { + $input_c = $input; + } + + // call the function + $func = $this->func; + $result = $this->obj->$func($input_c, $config, $context); + + // test a bool result + if (is_bool($result)) { + $this->assertIdentical($expect, $result); + return; + } elseif (is_bool($expect)) { + $expect = $input; + } + + if ($this->to_html) { + $result = $this->generator-> + generateFromTokens($result, $config, $context); + if (is_array($expect)) { + $expect = $this->generator-> + generateFromTokens($expect, $config, $context); + } + } + + $this->assertIdentical($expect, $result); + + } + +} + + diff --git a/tests/HTMLPurifier/ConfigSchemaTest.php b/tests/HTMLPurifier/ConfigSchemaTest.php index 8dfb8f4c..8ad8f03c 100644 --- a/tests/HTMLPurifier/ConfigSchemaTest.php +++ b/tests/HTMLPurifier/ConfigSchemaTest.php @@ -6,7 +6,7 @@ if (!class_exists('CS')) { class CS extends HTMLPurifier_ConfigSchema {} } -class HTMLPurifier_ConfigSchemaTest extends UnitTestCase +class HTMLPurifier_ConfigSchemaTest extends HTMLPurifier_Harness { /** @@ -260,7 +260,9 @@ class HTMLPurifier_ConfigSchemaTest extends UnitTestCase function testValidate() { $this->assertValid('foobar', 'string'); + $this->assertValid('foobar', 'text'); // aliases, lstring = long string $this->assertValid('FOOBAR', 'istring', 'foobar'); + $this->assertValid('FOOBAR', 'itext', 'foobar'); $this->assertValid(34, 'int'); @@ -278,10 +280,14 @@ class HTMLPurifier_ConfigSchemaTest extends UnitTestCase 
$this->assertValid(array('1', '2', '3'), 'list'); $this->assertValid('foo,bar, cow', 'list', array('foo', 'bar', 'cow')); $this->assertValid('', 'list', array()); + $this->assertValid("foo\nbar", 'list', array('foo', 'bar')); + $this->assertValid("foo\nbar,baz", 'list', array('foo', 'bar', 'baz')); $this->assertValid(array('1' => true, '2' => true), 'lookup'); $this->assertValid(array('1', '2'), 'lookup', array('1' => true, '2' => true)); $this->assertValid('foo,bar', 'lookup', array('foo' => true, 'bar' => true)); + $this->assertValid("foo\nbar", 'lookup', array('foo' => true, 'bar' => true)); + $this->assertValid("foo\nbar,baz", 'lookup', array('foo' => true, 'bar' => true, 'baz' => true)); $this->assertValid('', 'lookup', array()); $this->assertValid(array('foo' => 'bar'), 'hash'); @@ -289,6 +295,7 @@ class HTMLPurifier_ConfigSchemaTest extends UnitTestCase $this->assertInvalid(array(0 => 'moo'), 'hash'); $this->assertValid('', 'hash', array()); $this->assertValid('foo:bar,too:two', 'hash', array('foo' => 'bar', 'too' => 'two')); + $this->assertValid("foo:bar\ntoo:two,three:free", 'hash', array('foo' => 'bar', 'too' => 'two', 'three' => 'free')); $this->assertValid('foo:bar,too', 'hash', array('foo' => 'bar')); $this->assertValid('foo:bar,', 'hash', array('foo' => 'bar')); $this->assertValid('foo:bar:baz', 'hash', array('foo' => 'bar:baz')); diff --git a/tests/HTMLPurifier/ConfigTest.php b/tests/HTMLPurifier/ConfigTest.php index a2dff2d2..941dc3da 100644 --- a/tests/HTMLPurifier/ConfigTest.php +++ b/tests/HTMLPurifier/ConfigTest.php @@ -6,7 +6,7 @@ if (!class_exists('CS')) { class CS extends HTMLPurifier_ConfigSchema {} } -class HTMLPurifier_ConfigTest extends UnitTestCase +class HTMLPurifier_ConfigTest extends HTMLPurifier_Harness { var $our_copy, $old_copy; diff --git a/tests/HTMLPurifier/ContextTest.php b/tests/HTMLPurifier/ContextTest.php index 8e038159..b072542f 100644 --- a/tests/HTMLPurifier/ContextTest.php +++ b/tests/HTMLPurifier/ContextTest.php @@ -5,7 
+5,7 @@ require_once 'HTMLPurifier/Context.php'; // mocks require_once 'HTMLPurifier/IDAccumulator.php'; -class HTMLPurifier_ContextTest extends UnitTestCase +class HTMLPurifier_ContextTest extends HTMLPurifier_Harness { var $context; diff --git a/tests/HTMLPurifier/DefinitionCache/SerializerTest.php b/tests/HTMLPurifier/DefinitionCache/SerializerTest.php index 4925ff8e..cf4249af 100644 --- a/tests/HTMLPurifier/DefinitionCache/SerializerTest.php +++ b/tests/HTMLPurifier/DefinitionCache/SerializerTest.php @@ -17,8 +17,7 @@ class HTMLPurifier_DefinitionCache_SerializerTest extends HTMLPurifier_Definitio $config_md5 = '1.0.0-serial-2'; $file = realpath( - $rel_file = dirname(__FILE__) . - '/../../../library/HTMLPurifier/DefinitionCache/Serializer/Test/' . + $rel_file = HTMLPURIFIER_PREFIX . '/HTMLPurifier/DefinitionCache/Serializer/Test/' . $config_md5 . '.ser' ); if($file && file_exists($file)) unlink($file); // prevent previous failures from causing problems diff --git a/tests/HTMLPurifier/DefinitionCacheFactoryTest.php b/tests/HTMLPurifier/DefinitionCacheFactoryTest.php index beabc33c..a2768d7b 100644 --- a/tests/HTMLPurifier/DefinitionCacheFactoryTest.php +++ b/tests/HTMLPurifier/DefinitionCacheFactoryTest.php @@ -2,7 +2,7 @@ require_once 'HTMLPurifier/DefinitionCacheFactory.php'; -class HTMLPurifier_DefinitionCacheFactoryTest extends UnitTestCase +class HTMLPurifier_DefinitionCacheFactoryTest extends HTMLPurifier_Harness { var $newFactory; diff --git a/tests/HTMLPurifier/DefinitionCacheHarness.php b/tests/HTMLPurifier/DefinitionCacheHarness.php index 7304ecdd..e6bd839f 100644 --- a/tests/HTMLPurifier/DefinitionCacheHarness.php +++ b/tests/HTMLPurifier/DefinitionCacheHarness.php @@ -1,6 +1,6 @@ <?php -class HTMLPurifier_DefinitionCacheHarness extends UnitTestCase +class HTMLPurifier_DefinitionCacheHarness extends HTMLPurifier_Harness { /** diff --git a/tests/HTMLPurifier/DefinitionCacheTest.php b/tests/HTMLPurifier/DefinitionCacheTest.php index eef49f14..70fb1760 
100644 --- a/tests/HTMLPurifier/DefinitionCacheTest.php +++ b/tests/HTMLPurifier/DefinitionCacheTest.php @@ -2,7 +2,7 @@ require_once 'HTMLPurifier/DefinitionCache.php'; -class HTMLPurifier_DefinitionCacheTest extends UnitTestCase +class HTMLPurifier_DefinitionCacheTest extends HTMLPurifier_Harness { function test_isOld() { diff --git a/tests/HTMLPurifier/DefinitionTest.php b/tests/HTMLPurifier/DefinitionTest.php index 250e0845..e48817ea 100644 --- a/tests/HTMLPurifier/DefinitionTest.php +++ b/tests/HTMLPurifier/DefinitionTest.php @@ -7,7 +7,7 @@ Mock::generatePartial( 'HTMLPurifier_Definition_Testable', array('doSetup')); -class HTMLPurifier_DefinitionTest extends UnitTestCase +class HTMLPurifier_DefinitionTest extends HTMLPurifier_Harness { function test_setup() { $def = new HTMLPurifier_Definition_Testable(); diff --git a/tests/HTMLPurifier/DoctypeRegistryTest.php b/tests/HTMLPurifier/DoctypeRegistryTest.php index 23375411..cec9dfcd 100644 --- a/tests/HTMLPurifier/DoctypeRegistryTest.php +++ b/tests/HTMLPurifier/DoctypeRegistryTest.php @@ -2,7 +2,7 @@ require_once 'HTMLPurifier/DoctypeRegistry.php'; -class HTMLPurifier_DoctypeRegistryTest extends UnitTestCase +class HTMLPurifier_DoctypeRegistryTest extends HTMLPurifier_Harness { function test_register() { diff --git a/tests/HTMLPurifier/ElementDefTest.php b/tests/HTMLPurifier/ElementDefTest.php index 04b4b0af..a947b4c4 100644 --- a/tests/HTMLPurifier/ElementDefTest.php +++ b/tests/HTMLPurifier/ElementDefTest.php @@ -2,7 +2,7 @@ require_once 'HTMLPurifier/ElementDef.php'; -class HTMLPurifier_ElementDefTest extends UnitTestCase +class HTMLPurifier_ElementDefTest extends HTMLPurifier_Harness { function test_mergeIn() { diff --git a/tests/HTMLPurifier/EncoderTest.php b/tests/HTMLPurifier/EncoderTest.php index 5cf6a240..6007bf6a 100644 --- a/tests/HTMLPurifier/EncoderTest.php +++ b/tests/HTMLPurifier/EncoderTest.php @@ -2,7 +2,7 @@ require_once 'HTMLPurifier/Encoder.php'; -class HTMLPurifier_EncoderTest extends 
UnitTestCase +class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness { var $_entity_lookup; diff --git a/tests/HTMLPurifier/EntityLookupTest.php b/tests/HTMLPurifier/EntityLookupTest.php index 706b7c18..f50ee611 100644 --- a/tests/HTMLPurifier/EntityLookupTest.php +++ b/tests/HTMLPurifier/EntityLookupTest.php @@ -4,7 +4,7 @@ require_once 'HTMLPurifier/EntityLookup.php'; -class HTMLPurifier_EntityLookupTest extends UnitTestCase +class HTMLPurifier_EntityLookupTest extends HTMLPurifier_Harness { function test() { diff --git a/tests/HTMLPurifier/EntityParserTest.php b/tests/HTMLPurifier/EntityParserTest.php index 2d3a4d29..c3b605c1 100644 --- a/tests/HTMLPurifier/EntityParserTest.php +++ b/tests/HTMLPurifier/EntityParserTest.php @@ -2,7 +2,7 @@ require_once 'HTMLPurifier/EntityParser.php'; -class HTMLPurifier_EntityParserTest extends UnitTestCase +class HTMLPurifier_EntityParserTest extends HTMLPurifier_Harness { var $EntityParser; diff --git a/tests/HTMLPurifier/ErrorCollectorTest.php b/tests/HTMLPurifier/ErrorCollectorTest.php index 508efcab..0c8db720 100644 --- a/tests/HTMLPurifier/ErrorCollectorTest.php +++ b/tests/HTMLPurifier/ErrorCollectorTest.php @@ -2,7 +2,7 @@ require_once 'HTMLPurifier/ErrorCollector.php'; -class HTMLPurifier_ErrorCollectorTest extends UnitTestCase +class HTMLPurifier_ErrorCollectorTest extends HTMLPurifier_Harness { function setup() { diff --git a/tests/HTMLPurifier/ErrorsHarness.php b/tests/HTMLPurifier/ErrorsHarness.php index 359492a3..67f7c6b3 100644 --- a/tests/HTMLPurifier/ErrorsHarness.php +++ b/tests/HTMLPurifier/ErrorsHarness.php @@ -3,7 +3,7 @@ require_once 'HTMLPurifier/ErrorCollectorEMock.php'; require_once 'HTMLPurifier/Lexer/DirectLex.php'; -class HTMLPurifier_ErrorsHarness extends UnitTestCase +class HTMLPurifier_ErrorsHarness extends HTMLPurifier_Harness { var $config, $context; diff --git a/tests/HTMLPurifier/GeneratorTest.php b/tests/HTMLPurifier/GeneratorTest.php index 9039d1fb..b18d9ad1 100644 --- 
a/tests/HTMLPurifier/GeneratorTest.php +++ b/tests/HTMLPurifier/GeneratorTest.php @@ -3,16 +3,16 @@ require_once 'HTMLPurifier/Generator.php'; require_once 'HTMLPurifier/EntityLookup.php'; -require_once 'HTMLPurifier/Harness.php'; +require_once 'HTMLPurifier/ComplexHarness.php'; -class HTMLPurifier_GeneratorTest extends HTMLPurifier_Harness +class HTMLPurifier_GeneratorTest extends HTMLPurifier_ComplexHarness { var $gen; var $_entity_lookup; function HTMLPurifier_GeneratorTest() { - $this->UnitTestCase(); + $this->HTMLPurifier_Harness(); $this->gen = new HTMLPurifier_Generator(); $this->_entity_lookup = HTMLPurifier_EntityLookup::instance(); } diff --git a/tests/HTMLPurifier/HTMLDefinitionTest.php b/tests/HTMLPurifier/HTMLDefinitionTest.php index 3581f8cf..28fb28cc 100644 --- a/tests/HTMLPurifier/HTMLDefinitionTest.php +++ b/tests/HTMLPurifier/HTMLDefinitionTest.php @@ -2,13 +2,22 @@ require_once 'HTMLPurifier/HTMLDefinition.php'; -class HTMLPurifier_HTMLDefinitionTest extends UnitTestCase +class HTMLPurifier_HTMLDefinitionTest extends HTMLPurifier_Harness { function test_parseTinyMCEAllowedList() { $def = new HTMLPurifier_HTMLDefinition(); + // note: this is case-sensitive, but its config schema + // counterpart is not. 
This is generally a good thing for users, + // but it's a slight internal inconsistency + + $this->assertEqual( + $def->parseTinyMCEAllowedList(''), + array(array(), array()) + ); + $this->assertEqual( $def->parseTinyMCEAllowedList('a,b,c'), array(array('a' => true, 'b' => true, 'c' => true), array()) @@ -35,6 +44,17 @@ class HTMLPurifier_HTMLDefinitionTest extends UnitTestCase array('span.style' => true, 'a.href' => true, 'a.title' => true)) ); + $this->assertEqual( + // alternate form: + $def->parseTinyMCEAllowedList( +'span[style] +strong +a[href|title] +'), + array(array('span' => true, 'strong' => true, 'a' => true), + array('span.style' => true, 'a.href' => true, 'a.title' => true)) + ); + } function test_Allowed() { diff --git a/tests/HTMLPurifier/HTMLModule/RubyTest.php b/tests/HTMLPurifier/HTMLModule/RubyTest.php new file mode 100644 index 00000000..15abbcb7 --- /dev/null +++ b/tests/HTMLPurifier/HTMLModule/RubyTest.php @@ -0,0 +1,56 @@ +<?php + +require_once 'HTMLPurifier/HTMLModuleHarness.php'; + +class HTMLPurifier_HTMLModule_RubyTest extends HTMLPurifier_HTMLModuleHarness +{ + + function setUp() { + parent::setUp(); + $this->config->set('HTML', 'Doctype', 'XHTML 1.1'); + } + + function testBasicUse() { + $this->assertResult( + '<ruby><rb>WWW</rb><rt>World Wide Web</rt></ruby>' + ); + } + + function testRPUse() { + $this->assertResult( + '<ruby><rb>WWW</rb><rp>(</rp><rt>World Wide Web</rt><rp>)</rp></ruby>' + ); + } + + function testComplexUse() { + $this->assertResult( +'<ruby> + <rbc> + <rb>10</rb> + <rb>31</rb> + <rb>2002</rb> + </rbc> + <rtc> + <rt>Month</rt> + <rt>Day</rt> + <rt>Year</rt> + </rtc> + <rtc> + <rt rbspan="3">Expiration Date</rt> + </rtc> +</ruby>' + ); + + /* not implemented + function testBackwardsCompat() { + $this->assertResult( + '<ruby>A<rp>(</rp><rt>aaa</rt><rp>)</rp></ruby>', + '<ruby><rb>A</rb><rp>(</rp><rt>aaa</rt><rp>)</rp></ruby>' + ); + } + */ + + } + +} + diff --git a/tests/HTMLPurifier/HTMLModule/TidyTest.php 
b/tests/HTMLPurifier/HTMLModule/TidyTest.php index ff8d844d..f2522d9d 100644 --- a/tests/HTMLPurifier/HTMLModule/TidyTest.php +++ b/tests/HTMLPurifier/HTMLModule/TidyTest.php @@ -8,7 +8,7 @@ Mock::generatePartial( array('makeFixes', 'makeFixesForLevel', 'populate') ); -class HTMLPurifier_HTMLModule_TidyTest extends UnitTestCase +class HTMLPurifier_HTMLModule_TidyTest extends HTMLPurifier_Harness { function test_getFixesForLevel() { diff --git a/tests/HTMLPurifier/HTMLModuleManagerTest.php b/tests/HTMLPurifier/HTMLModuleManagerTest.php index d5219ca1..50dcb154 100644 --- a/tests/HTMLPurifier/HTMLModuleManagerTest.php +++ b/tests/HTMLPurifier/HTMLModuleManagerTest.php @@ -2,7 +2,7 @@ require_once 'HTMLPurifier/HTMLModuleManager.php'; -class HTMLPurifier_HTMLModuleManagerTest extends UnitTestCase +class HTMLPurifier_HTMLModuleManagerTest extends HTMLPurifier_Harness { function test_addModule() { diff --git a/tests/HTMLPurifier/HTMLModuleTest.php b/tests/HTMLPurifier/HTMLModuleTest.php index 238ca42a..ace5805f 100644 --- a/tests/HTMLPurifier/HTMLModuleTest.php +++ b/tests/HTMLPurifier/HTMLModuleTest.php @@ -3,7 +3,7 @@ require_once 'HTMLPurifier/HTMLModule.php'; require_once 'HTMLPurifier/AttrDef.php'; -class HTMLPurifier_HTMLModuleTest extends UnitTestCase +class HTMLPurifier_HTMLModuleTest extends HTMLPurifier_Harness { function test_addElementToContentSet() { diff --git a/tests/HTMLPurifier/Harness.php b/tests/HTMLPurifier/Harness.php index 84cea5eb..4af4384b 100644 --- a/tests/HTMLPurifier/Harness.php +++ b/tests/HTMLPurifier/Harness.php @@ -1,128 +1,69 @@ <?php -require_once 'HTMLPurifier/Lexer/DirectLex.php'; +require_once 'HTMLPurifier/URIParser.php'; /** - * General-purpose test-harness that makes testing functions that require - * configuration and context objects easier when those two parameters are - * meaningless. See HTMLPurifier_ChildDefTest for a good example of usage. 
+ * All-use harness, use this rather than SimpleTest's */ class HTMLPurifier_Harness extends UnitTestCase { - /** - * Instance of the object that will execute the method - */ - var $obj; - - /** - * Name of the function to be executed - */ - var $func; - - /** - * Whether or not the method deals in tokens. If set to true, assertResult() - * will transparently convert HTML to and back from tokens. - */ - var $to_tokens = false; - - /** - * Whether or not to convert tokens back into HTML before performing - * equality check, has no effect on bools. - */ - var $to_html = false; - - /** - * Instance of an HTMLPurifier_Lexer implementation. - */ - var $lexer; - - /** - * Instance of HTMLPurifier_Generator - */ - var $generator; - - /** - * Default config to fall back on if no config is available - */ - var $config; - - /** - * Default context to fall back on if no context is available - */ - var $context; - function HTMLPurifier_Harness() { - $this->lexer = new HTMLPurifier_Lexer_DirectLex(); - $this->generator = new HTMLPurifier_Generator(); parent::UnitTestCase(); } + var $config, $context; + /** - * Asserts a specific result from a one parameter + config/context function - * @param $input Input parameter - * @param $expect Expectation - * @param $config Configuration array in form of Ns.Directive => Value. - * Has no effect if $this->config is set. - * @param $context_array Context array in form of Key => Value or an actual - * context object. 
+ * Generates easily accessible default config/context */ - function assertResult($input, $expect = true, - $config_array = array(), $context_array = array() - ) { - - // setup config - if ($this->config) { - $config = HTMLPurifier_Config::create($this->config); - $config->loadArray($config_array); + function setUp() { + list($this->config, $this->context) = $this->createCommon(); + } + + /** + * Accepts config and context and prepares them into a valid state + * @param &$config Reference to config variable + * @param &$context Reference to context variable + */ + function prepareCommon(&$config, &$context) { + $config = HTMLPurifier_Config::create($config); + if (!$context) $context = new HTMLPurifier_Context(); + } + + /** + * Generates default configuration and context objects + * @return Defaults in form of array($config, $context) + */ + function createCommon() { + return array(HTMLPurifier_Config::createDefault(), new HTMLPurifier_Context); + } + + /** + * If $expect is false, ignore $result and check if status failed. + * Otherwise, check if $status if true and $result === $expect. + * @param $status Boolean status + * @param $result Mixed result from processing + * @param $expect Mixed expectation for result + */ + function assertEitherFailOrIdentical($status, $result, $expect) { + if ($expect === false) { + $this->assertFalse($status, 'Expected false result, got true'); } else { - $config = HTMLPurifier_Config::create($config_array); + $this->assertTrue($status, 'Expected true result, got false'); + $this->assertIdentical($result, $expect); } - - // setup context object. Note that we are operating on a copy of it! 
- // When necessary, extend the test harness to allow post-tests - // on the context object - if (empty($this->context)) { - $context = new HTMLPurifier_Context(); - $context->loadArray($context_array); - } else { - $context =& $this->context; - } - - if ($this->to_tokens && is_string($input)) { - // $func may cause $input to change, so "clone" another copy - // to sacrifice - $input = $this->lexer->tokenizeHTML($s = $input, $config, $context); - $input_c = $this->lexer->tokenizeHTML($s, $config, $context); - } else { - $input_c = $input; - } - - // call the function - $func = $this->func; - $result = $this->obj->$func($input_c, $config, $context); - - // test a bool result - if (is_bool($result)) { - $this->assertIdentical($expect, $result); - return; - } elseif (is_bool($expect)) { - $expect = $input; - } - - if ($this->to_html) { - $result = $this->generator-> - generateFromTokens($result, $config, $context); - if (is_array($expect)) { - $expect = $this->generator-> - generateFromTokens($expect, $config, $context); + } + + function getTests() { + // __onlytest makes only one test get triggered + foreach (get_class_methods(get_class($this)) as $method) { + if (strtolower(substr($method, 0, 10)) == '__onlytest') { + return array($method); } } - - $this->assertIdentical($expect, $result); - + return parent::getTests(); } } - diff --git a/tests/HTMLPurifier/IDAccumulatorTest.php b/tests/HTMLPurifier/IDAccumulatorTest.php index 05db0b2a..006d689c 100644 --- a/tests/HTMLPurifier/IDAccumulatorTest.php +++ b/tests/HTMLPurifier/IDAccumulatorTest.php @@ -2,7 +2,7 @@ require_once 'HTMLPurifier/IDAccumulator.php'; -class HTMLPurifier_IDAccumulatorTest extends UnitTestCase +class HTMLPurifier_IDAccumulatorTest extends HTMLPurifier_Harness { function test() { diff --git a/tests/HTMLPurifier/Injector/AutoParagraphTest.php b/tests/HTMLPurifier/Injector/AutoParagraphTest.php index 8611fd9b..0f0b5e5e 100644 --- a/tests/HTMLPurifier/Injector/AutoParagraphTest.php +++ 
b/tests/HTMLPurifier/Injector/AutoParagraphTest.php @@ -237,6 +237,35 @@ Par1 '<p>Par1</p><p><b>Par2</b></p>' ); + $this->assertResult( +'<img /> Foo', +'<p><img /> Foo</p>' + ); + + $this->assertResult( +'<li>Foo <a>bar</a></li>' + ); + + $this->assertResult( +'<li><b>baz</b><a>bar</a></li>' + ); + + $this->assertResult( +'<div><div>asdf</div><b>asdf</b></div>' + ); + + $this->assertResult( +'<div><div>asdf</div> + +<b>asdf</b></div>', +'<div><div>asdf</div><p><b>asdf</b></p></div>' + ); + + $this->assertResult( +'<b>One</b> <i>Two</i>', +'<p><b>One</b> <i>Two</i></p>' + ); + } function testInlineRootNode() { @@ -249,5 +278,10 @@ Par2', ); } + function testNeeded() { + $this->expectError('Cannot enable AutoParagraph injector because p is not allowed'); + $this->assertResult('<b>foobar</b>', true, array('AutoFormat.AutoParagraph' => true, 'HTML.Allowed' => 'b')); + } + } diff --git a/tests/HTMLPurifier/Injector/LinkifyTest.php b/tests/HTMLPurifier/Injector/LinkifyTest.php index b91908ee..66a06956 100644 --- a/tests/HTMLPurifier/Injector/LinkifyTest.php +++ b/tests/HTMLPurifier/Injector/LinkifyTest.php @@ -34,5 +34,10 @@ class HTMLPurifier_Injector_LinkifyTest extends HTMLPurifier_InjectorHarness } + function testNeeded() { + $this->expectError('Cannot enable Linkify injector because a is not allowed'); + $this->assertResult('http://example.com/', true, array('AutoFormat.Linkify' => true, 'HTML.Allowed' => 'b')); + } + } diff --git a/tests/HTMLPurifier/Injector/PurifierLinkifyTest.php b/tests/HTMLPurifier/Injector/PurifierLinkifyTest.php index d538c489..e820d677 100644 --- a/tests/HTMLPurifier/Injector/PurifierLinkifyTest.php +++ b/tests/HTMLPurifier/Injector/PurifierLinkifyTest.php @@ -38,5 +38,10 @@ class HTMLPurifier_Injector_PurifierLinkifyTest extends HTMLPurifier_InjectorHar } + function testNeeded() { + $this->expectError('Cannot enable PurifierLinkify injector because a is not allowed'); + $this->assertResult('%Namespace.Directive', true, 
array('AutoFormat.PurifierLinkify' => true, 'HTML.Allowed' => 'b')); + } + } diff --git a/tests/HTMLPurifier/LanguageFactoryTest.php b/tests/HTMLPurifier/LanguageFactoryTest.php index eb0f4556..2cadb1c1 100644 --- a/tests/HTMLPurifier/LanguageFactoryTest.php +++ b/tests/HTMLPurifier/LanguageFactoryTest.php @@ -2,7 +2,7 @@ require_once 'HTMLPurifier/LanguageFactory.php'; -class HTMLPurifier_LanguageFactoryTest extends UnitTestCase +class HTMLPurifier_LanguageFactoryTest extends HTMLPurifier_Harness { function test() { diff --git a/tests/HTMLPurifier/LanguageTest.php b/tests/HTMLPurifier/LanguageTest.php index f846c619..ec4244a8 100644 --- a/tests/HTMLPurifier/LanguageTest.php +++ b/tests/HTMLPurifier/LanguageTest.php @@ -2,7 +2,7 @@ require_once 'HTMLPurifier/Language.php'; -class HTMLPurifier_LanguageTest extends UnitTestCase +class HTMLPurifier_LanguageTest extends HTMLPurifier_Harness { var $lang; diff --git a/tests/HTMLPurifier/Lexer/DirectLexTest.php b/tests/HTMLPurifier/Lexer/DirectLexTest.php index ba7d0fe7..37835790 100644 --- a/tests/HTMLPurifier/Lexer/DirectLexTest.php +++ b/tests/HTMLPurifier/Lexer/DirectLexTest.php @@ -2,7 +2,7 @@ require_once 'HTMLPurifier/Lexer/DirectLex.php'; -class HTMLPurifier_Lexer_DirectLexTest extends UnitTestCase +class HTMLPurifier_Lexer_DirectLexTest extends HTMLPurifier_Harness { var $DirectLex; @@ -59,6 +59,12 @@ class HTMLPurifier_Lexer_DirectLexTest extends UnitTestCase $input[12] = '="" =""'; $expect[12] = array('"' => ''); // tough to say, just don't throw a loop + $input[13] = 'href="'; + $expect[13] = array('href' => ''); + + $input[14] = 'href=" <'; + $expect[14] = array('href' => ' <'); + $config = HTMLPurifier_Config::createDefault(); $context = new HTMLPurifier_Context(); $size = count($input); diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php index c16b0611..75c05b78 100644 --- a/tests/HTMLPurifier/LexerTest.php +++ b/tests/HTMLPurifier/LexerTest.php @@ -2,7 +2,7 @@ require_once 
'HTMLPurifier/Lexer/DirectLex.php'; -class HTMLPurifier_LexerTest extends UnitTestCase +class HTMLPurifier_LexerTest extends HTMLPurifier_Harness { var $Lexer; @@ -287,16 +287,21 @@ class HTMLPurifier_LexerTest extends UnitTestCase $expect[18] = array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) ); // test emoticon protection - $input[19] = '<b>Whoa! >.< That\'s not good >.></b>'; + $input[19] = '<b>Whoa! <3 That\'s not good >.></b>'; $expect[19] = array( new HTMLPurifier_Token_Start('b'), - new HTMLPurifier_Token_Text('Whoa! >.'), - new HTMLPurifier_Token_Text('< That\'s not good >'), + new HTMLPurifier_Token_Text('Whoa! '), + new HTMLPurifier_Token_Text('<3 That\'s not good >'), new HTMLPurifier_Token_Text('.>'), new HTMLPurifier_Token_End('b'), ); + $dom_expect[19] = array( + new HTMLPurifier_Token_Start('b'), + new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'), + new HTMLPurifier_Token_End('b'), + ); $sax_expect[19] = false; // SAX drops the < character - $dom_expect[19] = false; // DOM drops the entire pseudo-tag + $config[19] = HTMLPurifier_Config::create(array('Core.AggressivelyFixLt' => true)); // test comment parsing with funky characters inside $input[20] = '<!-- This >< comment --><br />'; @@ -305,6 +310,7 @@ class HTMLPurifier_LexerTest extends UnitTestCase new HTMLPurifier_Token_Empty('br') ); $sax_expect[20] = false; + $config[20] = HTMLPurifier_Config::create(array('Core.AggressivelyFixLt' => true)); // test comment parsing of missing end $input[21] = '<!-- This >< comment'; @@ -313,6 +319,7 @@ class HTMLPurifier_LexerTest extends UnitTestCase ); $sax_expect[21] = false; $dom_expect[21] = false; + $config[21] = HTMLPurifier_Config::create(array('Core.AggressivelyFixLt' => true)); // test CDATA tags $input[22] = '<script>alert("<foo>");</script>'; @@ -323,7 +330,25 @@ class HTMLPurifier_LexerTest extends UnitTestCase ); $config[22] = HTMLPurifier_Config::create(array('HTML.Trusted' => true)); $sax_expect[22] = false; - 
//$dom_expect[22] = false; + + // test escaping + $input[23] = '<!-- This comment < < & -->'; + $expect[23] = array( + new HTMLPurifier_Token_Comment(' This comment < < & ') ); + $sax_expect[23] = false; $config[23] = + HTMLPurifier_Config::create(array('Core.AggressivelyFixLt' => + true)); + + // more DirectLex edge-cases + $input[24] = '<a href="><>">'; + $expect[24] = array( + new HTMLPurifier_Token_Start('a', array('href' => '')), + new HTMLPurifier_Token_Text('<">') + ); + $sax_expect[24] = false; + $dom_expect[24] = array( + new HTMLPurifier_Token_Empty('a', array('href' => '><>')) + ); $default_config = HTMLPurifier_Config::createDefault(); $default_context = new HTMLPurifier_Context(); diff --git a/tests/HTMLPurifier/PercentEncoderTest.php b/tests/HTMLPurifier/PercentEncoderTest.php index ea52021d..4b01ac3a 100644 --- a/tests/HTMLPurifier/PercentEncoderTest.php +++ b/tests/HTMLPurifier/PercentEncoderTest.php @@ -2,7 +2,7 @@ require_once 'HTMLPurifier/PercentEncoder.php'; -class HTMLPurifier_PercentEncoderTest extends UnitTestCase +class HTMLPurifier_PercentEncoderTest extends HTMLPurifier_Harness { var $PercentEncoder; diff --git a/tests/HTMLPurifier/Strategy/CompositeTest.php b/tests/HTMLPurifier/Strategy/CompositeTest.php index 606d786b..db4ab040 100644 --- a/tests/HTMLPurifier/Strategy/CompositeTest.php +++ b/tests/HTMLPurifier/Strategy/CompositeTest.php @@ -15,7 +15,7 @@ class HTMLPurifier_Strategy_Composite_Test } // doesn't use Strategy harness -class HTMLPurifier_Strategy_CompositeTest extends UnitTestCase +class HTMLPurifier_Strategy_CompositeTest extends HTMLPurifier_Harness { function test() { diff --git a/tests/HTMLPurifier/Strategy/FixNestingTest.php b/tests/HTMLPurifier/Strategy/FixNestingTest.php index 2a323c57..ac651684 100644 --- a/tests/HTMLPurifier/Strategy/FixNestingTest.php +++ b/tests/HTMLPurifier/Strategy/FixNestingTest.php @@ -63,12 +63,6 @@ class HTMLPurifier_Strategy_FixNestingTest extends HTMLPurifier_StrategyHarness 
'<span><ins>Not allowed!</ins></span>' ); - $this->assertResult( // alt config - '<span><ins><div>Not allowed!</div></ins></span>', - '<span><ins><div>Not allowed!</div></ins></span>', - array('Core.EscapeInvalidChildren' => true) - ); - // test block element that has inline content $this->assertResult( '<h1><ins><div>Not allowed!</div></ins></h1>', @@ -84,6 +78,12 @@ class HTMLPurifier_Strategy_FixNestingTest extends HTMLPurifier_StrategyHarness '<div><ins><del><div>Allowed!</div></del></ins></div>' ); + $this->assertResult( // alt config + '<span><ins><div>Not allowed!</div></ins></span>', + '<span><ins><div>Not allowed!</div></ins></span>', + array('Core.EscapeInvalidChildren' => true) + ); + } function testExclusionsIntegration() { diff --git a/tests/HTMLPurifier/Strategy/RemoveForeignElementsTest.php b/tests/HTMLPurifier/Strategy/RemoveForeignElementsTest.php index c4289d98..87a4b38c 100644 --- a/tests/HTMLPurifier/Strategy/RemoveForeignElementsTest.php +++ b/tests/HTMLPurifier/Strategy/RemoveForeignElementsTest.php @@ -30,12 +30,23 @@ class HTMLPurifier_Strategy_RemoveForeignElementsTest '' ); + $this->assertResult( + '<style>.foo {blink;}</style>', + '' + ); + $this->assertResult( '<script>alert();</script>', 'alert();', array('Core.RemoveScriptContents' => false) ); + $this->assertResult( + '<script>alert();</script>', + 'alert();', + array('Core.HiddenElements' => array()) + ); + $this->assertResult( '<menu><li>Item 1</li></menu>', '<ul><li>Item 1</li></ul>' diff --git a/tests/HTMLPurifier/Strategy/RemoveForeignElements_ErrorsTest.php b/tests/HTMLPurifier/Strategy/RemoveForeignElements_ErrorsTest.php index 6b7ac74f..5843b722 100644 --- a/tests/HTMLPurifier/Strategy/RemoveForeignElements_ErrorsTest.php +++ b/tests/HTMLPurifier/Strategy/RemoveForeignElements_ErrorsTest.php @@ -48,8 +48,8 @@ class HTMLPurifier_Strategy_RemoveForeignElements_ErrorsTest extends HTMLPurifie $this->invoke('<!-- test -->'); } - function testScriptRemoved() { - 
$this->collector->expectAt(0, 'send', array(E_ERROR, 'Strategy_RemoveForeignElements: Script removed')); + function testForeignMetaElementRemoved() { + $this->collector->expectAt(0, 'send', array(E_ERROR, 'Strategy_RemoveForeignElements: Foreign meta element removed')); $this->collector->expectContextAt(0, 'CurrentToken', new HTMLPurifier_Token_Start('script', array(), 1)); $this->collector->expectAt(1, 'send', array(E_ERROR, 'Strategy_RemoveForeignElements: Token removed to end', 'script')); $this->invoke('<script>asdf'); diff --git a/tests/HTMLPurifier/StrategyHarness.php b/tests/HTMLPurifier/StrategyHarness.php index d8b2d5fd..fe20b646 100644 --- a/tests/HTMLPurifier/StrategyHarness.php +++ b/tests/HTMLPurifier/StrategyHarness.php @@ -1,11 +1,12 @@ <?php -require_once 'HTMLPurifier/Harness.php'; +require_once 'HTMLPurifier/ComplexHarness.php'; -class HTMLPurifier_StrategyHarness extends HTMLPurifier_Harness +class HTMLPurifier_StrategyHarness extends HTMLPurifier_ComplexHarness { function setUp() { + parent::setUp(); $this->func = 'execute'; $this->to_tokens = true; $this->to_html = true; diff --git a/tests/HTMLPurifier/TagTransformTest.php b/tests/HTMLPurifier/TagTransformTest.php index b3d6f461..92b28fb0 100644 --- a/tests/HTMLPurifier/TagTransformTest.php +++ b/tests/HTMLPurifier/TagTransformTest.php @@ -6,7 +6,7 @@ require_once 'HTMLPurifier/TagTransform.php'; require_once 'HTMLPurifier/TagTransform/Font.php'; require_once 'HTMLPurifier/TagTransform/Simple.php'; -class HTMLPurifier_TagTransformTest extends UnitTestCase +class HTMLPurifier_TagTransformTest extends HTMLPurifier_Harness { /** diff --git a/tests/HTMLPurifier/TokenFactoryTest.php b/tests/HTMLPurifier/TokenFactoryTest.php index 8a35cbbc..54be5305 100644 --- a/tests/HTMLPurifier/TokenFactoryTest.php +++ b/tests/HTMLPurifier/TokenFactoryTest.php @@ -2,7 +2,7 @@ require_once 'HTMLPurifier/TokenFactory.php'; -class HTMLPurifier_TokenFactoryTest extends UnitTestCase +class HTMLPurifier_TokenFactoryTest 
extends HTMLPurifier_Harness { public function test() { diff --git a/tests/HTMLPurifier/TokenTest.php b/tests/HTMLPurifier/TokenTest.php index 22926fb5..2f440ff9 100644 --- a/tests/HTMLPurifier/TokenTest.php +++ b/tests/HTMLPurifier/TokenTest.php @@ -2,7 +2,7 @@ require_once 'HTMLPurifier/Token.php'; -class HTMLPurifier_TokenTest extends UnitTestCase +class HTMLPurifier_TokenTest extends HTMLPurifier_Harness { function assertTokenConstruction($name, $attr, diff --git a/tests/HTMLPurifier/URIDefinitionTest.php b/tests/HTMLPurifier/URIDefinitionTest.php new file mode 100644 index 00000000..149f89b2 --- /dev/null +++ b/tests/HTMLPurifier/URIDefinitionTest.php @@ -0,0 +1,59 @@ +<?php + +require_once 'HTMLPurifier/URIHarness.php'; +require_once 'HTMLPurifier/URIDefinition.php'; + +class HTMLPurifier_URIDefinitionTest extends HTMLPurifier_URIHarness +{ + + function createFilterMock($expect = true, $result = true) { + generate_mock_once('HTMLPurifier_URIFilter'); + $mock = new HTMLPurifier_URIFilterMock(); + if ($expect) $mock->expectOnce('filter'); + else $mock->expectNever('filter'); + $mock->setReturnValue('filter', $result); + return $mock; + } + + function test_filter() { + $def = new HTMLPurifier_URIDefinition(); + $def->filters[] = $this->createFilterMock(); + $def->filters[] = $this->createFilterMock(); + $uri = $this->createURI('test'); + $this->assertTrue($def->filter($uri, $this->config, $this->context)); + } + + function test_filter_earlyAbortIfFail() { + $def = new HTMLPurifier_URIDefinition(); + $def->filters[] = $this->createFilterMock(true, false); + $def->filters[] = $this->createFilterMock(false); // never called + $uri = $this->createURI('test'); + $this->assertFalse($def->filter($uri, $this->config, $this->context)); + } + + function test_setupMemberVariables_collisionPrecedenceIsHostBaseScheme() { + $this->config->set('URI', 'Host', $host = 'example.com'); + $this->config->set('URI', 'Base', $base = 'http://sub.example.com/foo/bar.html'); + 
$this->config->set('URI', 'DefaultScheme', 'ftp'); + $def = new HTMLPurifier_URIDefinition(); + $def->setupMemberVariables($this->config); + $this->assertIdentical($def->host, $host); + $this->assertIdentical($def->base, $this->createURI($base)); + $this->assertIdentical($def->defaultScheme, 'http'); // not ftp! + } + + function test_setupMemberVariables_onlyScheme() { + $this->config->set('URI', 'DefaultScheme', 'ftp'); + $def = new HTMLPurifier_URIDefinition(); + $def->setupMemberVariables($this->config); + $this->assertIdentical($def->defaultScheme, 'ftp'); + } + + function test_setupMemberVariables_onlyBase() { + $this->config->set('URI', 'Base', 'http://sub.example.com/foo/bar.html'); + $def = new HTMLPurifier_URIDefinition(); + $def->setupMemberVariables($this->config); + $this->assertIdentical($def->host, 'sub.example.com'); + } + +} diff --git a/tests/HTMLPurifier/URIFilter/DisableExternalResourcesTest.php b/tests/HTMLPurifier/URIFilter/DisableExternalResourcesTest.php new file mode 100644 index 00000000..545e421b --- /dev/null +++ b/tests/HTMLPurifier/URIFilter/DisableExternalResourcesTest.php @@ -0,0 +1,24 @@ +<?php + +require_once 'HTMLPurifier/URIFilter/DisableExternalTest.php'; +require_once 'HTMLPurifier/URIFilter/DisableExternalResources.php'; + +class HTMLPurifier_URIFilter_DisableExternalResourcesTest extends + HTMLPurifier_URIFilter_DisableExternalTest +{ + + function setUp() { + parent::setUp(); + $this->filter = new HTMLPurifier_URIFilter_DisableExternalResources(); + $var = true; + $this->context->register('EmbeddedURI', $var); + } + + function testPreserveWhenNotEmbedded() { + $this->context->destroy('EmbeddedURI'); // undo setUp + $this->assertFiltering( + 'http://example.com' + ); + } + +} diff --git a/tests/HTMLPurifier/URIFilter/DisableExternalTest.php b/tests/HTMLPurifier/URIFilter/DisableExternalTest.php new file mode 100644 index 00000000..e4a0e89f --- /dev/null +++ b/tests/HTMLPurifier/URIFilter/DisableExternalTest.php @@ -0,0 +1,47 @@ 
+<?php + +require_once 'HTMLPurifier/URIFilter/DisableExternal.php'; +require_once 'HTMLPurifier/URIFilterHarness.php'; + +class HTMLPurifier_URIFilter_DisableExternalTest extends HTMLPurifier_URIFilterHarness +{ + + function setUp() { + parent::setUp(); + $this->filter = new HTMLPurifier_URIFilter_DisableExternal(); + } + + function testRemoveExternal() { + $this->assertFiltering( + 'http://example.com', false + ); + } + + function testPreserveInternal() { + $this->assertFiltering( + '/foo/bar' + ); + } + + function testPreserveOurHost() { + $this->config->set('URI', 'Host', 'example.com'); + $this->assertFiltering( + 'http://example.com' + ); + } + + function testPreserveOurSubdomain() { + $this->config->set('URI', 'Host', 'example.com'); + $this->assertFiltering( + 'http://www.example.com' + ); + } + + function testRemoveSuperdomain() { + $this->config->set('URI', 'Host', 'www.example.com'); + $this->assertFiltering( + 'http://example.com', false + ); + } + +} diff --git a/tests/HTMLPurifier/URIFilter/HostBlacklistTest.php b/tests/HTMLPurifier/URIFilter/HostBlacklistTest.php new file mode 100644 index 00000000..d9a3fdd2 --- /dev/null +++ b/tests/HTMLPurifier/URIFilter/HostBlacklistTest.php @@ -0,0 +1,30 @@ +<?php + +require_once 'HTMLPurifier/URIFilter/HostBlacklist.php'; +require_once 'HTMLPurifier/URIFilterHarness.php'; + +class HTMLPurifier_URIFilter_HostBlacklistTest extends HTMLPurifier_URIFilterHarness +{ + + function setUp() { + parent::setUp(); + $this->filter = new HTMLPurifier_URIFilter_HostBlacklist(); + } + + function testRejectBlacklistedHost() { + $this->config->set('URI', 'HostBlacklist', 'example.com'); + $this->assertFiltering('http://example.com', false); + } + + function testRejectBlacklistedHostThoughNotTrue() { + // maybe this behavior should change + $this->config->set('URI', 'HostBlacklist', 'example.com'); + $this->assertFiltering('http://example.comcast.com', false); + } + + function testPreserveNonBlacklistedHost() { + 
$this->config->set('URI', 'HostBlacklist', 'example.com'); + $this->assertFiltering('http://google.com'); + } + +} diff --git a/tests/HTMLPurifier/URIFilter/MakeAbsoluteTest.php b/tests/HTMLPurifier/URIFilter/MakeAbsoluteTest.php new file mode 100644 index 00000000..d509a6a1 --- /dev/null +++ b/tests/HTMLPurifier/URIFilter/MakeAbsoluteTest.php @@ -0,0 +1,122 @@ +<?php + +require_once 'HTMLPurifier/URIFilter/MakeAbsolute.php'; +require_once 'HTMLPurifier/URIFilterHarness.php'; + +class HTMLPurifier_URIFilter_MakeAbsoluteTest extends HTMLPurifier_URIFilterHarness +{ + + function setUp() { + parent::setUp(); + $this->filter = new HTMLPurifier_URIFilter_MakeAbsolute(); + $this->setBase(); + } + + function setBase($base = 'http://example.com/foo/bar.html?q=s#frag') { + $this->config->set('URI', 'Base', $base); + } + + // corresponding to RFC 2396 + + function testPreserveAbsolute() { + $this->assertFiltering('http://example.com/foo.html'); + } + + function testFilterBlank() { + $this->assertFiltering('', 'http://example.com/foo/bar.html?q=s'); + } + + function testFilterEmptyPath() { + $this->assertFiltering('?q=s#frag', 'http://example.com/foo/bar.html?q=s#frag'); + } + + function testPreserveAltScheme() { + $this->assertFiltering('mailto:bob@example.com'); + } + + function testFilterIgnoreHTTPSpecialCase() { + $this->assertFiltering('http:/', 'http://example.com/'); + } + + function testFilterAbsolutePath() { + $this->assertFiltering('/foo.txt', 'http://example.com/foo.txt'); + } + + function testFilterRelativePath() { + $this->assertFiltering('baz.txt', 'http://example.com/foo/baz.txt'); + } + + function testFilterRelativePathWithInternalDot() { + $this->assertFiltering('./baz.txt', 'http://example.com/foo/baz.txt'); + } + + function testFilterRelativePathWithEndingDot() { + $this->assertFiltering('baz/.', 'http://example.com/foo/baz/'); + } + + function testFilterRelativePathDot() { + $this->assertFiltering('.', 'http://example.com/foo/'); + } + + function 
testFilterRelativePathWithInternalDotDot() { + $this->assertFiltering('../baz.txt', 'http://example.com/baz.txt'); + } + + function testFilterRelativePathWithEndingDotDot() { + $this->assertFiltering('..', 'http://example.com/'); + } + + function testFilterRelativePathTooManyDotDots() { + $this->assertFiltering('../../', 'http://example.com/'); + } + + function testFilterAppendingQueryAndFragment() { + $this->assertFiltering('/foo.php?q=s#frag', 'http://example.com/foo.php?q=s#frag'); + } + + // edge cases below + + function testFilterAbsolutePathBase() { + $this->setBase('/foo/baz.txt'); + $this->assertFiltering('test.php', '/foo/test.php'); + } + + function testFilterAbsolutePathBaseDirectory() { + $this->setBase('/foo/'); + $this->assertFiltering('test.php', '/foo/test.php'); + } + + function testFilterAbsolutePathBaseBelow() { + $this->setBase('/foo/baz.txt'); + $this->assertFiltering('../../test.php', '/test.php'); + } + + function testFilterRelativePathBase() { + $this->setBase('foo/baz.html'); + $this->assertFiltering('foo.php', 'foo/foo.php'); + } + + function testFilterRelativePathBaseBelow() { + $this->setBase('../baz.html'); + $this->assertFiltering('test/strike.html', '../test/strike.html'); + } + + function testFilterRelativePathBaseWithAbsoluteURI() { + $this->setBase('../baz.html'); + $this->assertFiltering('/test/strike.html'); + } + + function testFilterRelativePathBaseWithDot() { + $this->setBase('../baz.html'); + $this->assertFiltering('.', '../'); + } + + // error case + + function testErrorNoBase() { + $this->setBase(null); + $this->expectError('URI.MakeAbsolute is being ignored due to lack of value for URI.Base configuration'); + $this->assertFiltering('foo/bar.txt'); + } + +} diff --git a/tests/HTMLPurifier/URIFilterHarness.php b/tests/HTMLPurifier/URIFilterHarness.php new file mode 100644 index 00000000..04e101f2 --- /dev/null +++ b/tests/HTMLPurifier/URIFilterHarness.php @@ -0,0 +1,15 @@ +<?php + +require_once 'HTMLPurifier/URIHarness.php'; 
+ +class HTMLPurifier_URIFilterHarness extends HTMLPurifier_URIHarness +{ + + function assertFiltering($uri, $expect_uri = true) { + $this->prepareURI($uri, $expect_uri); + $this->filter->prepare($this->config, $this->context); + $result = $this->filter->filter($uri, $this->config, $this->context); + $this->assertEitherFailOrIdentical($result, $uri, $expect_uri); + } + +} diff --git a/tests/HTMLPurifier/URIHarness.php b/tests/HTMLPurifier/URIHarness.php new file mode 100644 index 00000000..63e6d7d6 --- /dev/null +++ b/tests/HTMLPurifier/URIHarness.php @@ -0,0 +1,31 @@ +<?php + +require_once 'HTMLPurifier/URIParser.php'; + +class HTMLPurifier_URIHarness extends HTMLPurifier_Harness +{ + + /** + * Prepares two URIs into object form + * @param &$uri Reference to string input URI + * @param &$expect_uri Reference to string expectation URI + * @note If $expect_uri is false, it will stay false + */ + function prepareURI(&$uri, &$expect_uri) { + $parser = new HTMLPurifier_URIParser(); + if ($expect_uri === true) $expect_uri = $uri; + $uri = $parser->parse($uri); + if ($expect_uri !== false) { + $expect_uri = $parser->parse($expect_uri); + } + } + + /** + * Generates a URI object from the corresponding string + */ + function createURI($uri) { + $parser = new HTMLPurifier_URIParser(); + return $parser->parse($uri); + } + +} diff --git a/tests/HTMLPurifier/URIParserTest.php b/tests/HTMLPurifier/URIParserTest.php new file mode 100644 index 00000000..370e90ca --- /dev/null +++ b/tests/HTMLPurifier/URIParserTest.php @@ -0,0 +1,140 @@ +<?php + +require_once 'HTMLPurifier/URIParser.php'; +require_once 'HTMLPurifier/URI.php'; + +class HTMLPurifier_URIParserTest extends HTMLPurifier_Harness +{ + + function assertParsing( + $uri, $scheme, $userinfo, $host, $port, $path, $query, $fragment, $config = null, $context = null + ) { + $this->prepareCommon($config, $context); + $parser = new HTMLPurifier_URIParser(); + $result = $parser->parse($uri, $config, $context); + $expect = new 
HTMLPurifier_URI($scheme, $userinfo, $host, $port, $path, $query, $fragment); + $this->assertEqual($result, $expect); + } + + function testRegular() { + $this->assertParsing( + 'http://www.example.com/webhp?q=foo#result2', + 'http', null, 'www.example.com', null, '/webhp', 'q=foo', 'result2' + ); + } + + function testPortAndUsername() { + $this->assertParsing( + 'http://user@authority.part:80/now/the/path?query#fragment', + 'http', 'user', 'authority.part', 80, '/now/the/path', 'query', 'fragment' + ); + } + + function testPercentEncoding() { + $this->assertParsing( + 'http://en.wikipedia.org/wiki/Clich%C3%A9', + 'http', null, 'en.wikipedia.org', null, '/wiki/Clich%C3%A9', null, null + ); + } + + function testEmptyQuery() { + $this->assertParsing( + 'http://www.example.com/?#', + 'http', null, 'www.example.com', null, '/', '', null + ); + } + + function testEmptyPath() { + $this->assertParsing( + 'http://www.example.com', + 'http', null, 'www.example.com', null, '', null, null + ); + } + + function testOpaqueURI() { + $this->assertParsing( + 'mailto:bob@example.com', + 'mailto', null, null, null, 'bob@example.com', null, null + ); + } + + function testIPv4Address() { + $this->assertParsing( + 'http://192.0.34.166/', + 'http', null, '192.0.34.166', null, '/', null, null + ); + } + + function testFakeIPv4Address() { + $this->assertParsing( + 'http://333.123.32.123/', + 'http', null, '333.123.32.123', null, '/', null, null + ); + } + + function testIPv6Address() { + $this->assertParsing( + 'http://[2001:db8::7]/c=GB?objectClass?one', + 'http', null, '[2001:db8::7]', null, '/c=GB', 'objectClass?one', null + ); + } + + function testInternationalizedDomainName() { + $this->assertParsing( + "http://t\xC5\xABdali\xC5\x86.lv", + 'http', null, "t\xC5\xABdali\xC5\x86.lv", null, '', null, null + ); + } + + function testInvalidPort() { + $this->assertParsing( + 'http://example.com:foobar', + 'http', null, 'example.com', null, '', null, null + ); + } + + function 
testPathAbsolute() { + $this->assertParsing( + 'http:/this/is/path', + 'http', null, null, null, '/this/is/path', null, null + ); + } + + function testPathRootless() { + // this should not be used but is allowed + $this->assertParsing( + 'http:this/is/path', + 'http', null, null, null, 'this/is/path', null, null + ); + } + + function testPathEmpty() { + $this->assertParsing( + 'http:', + 'http', null, null, null, '', null, null + ); + } + + function testRelativeURI() { + $this->assertParsing( + '/a/b', + null, null, null, null, '/a/b', null, null + ); + } + + function testMalformedTag() { + $this->assertParsing( + 'http://www.example.com/\'>"', + 'http', null, 'www.example.com', null, '/', null, null + ); + } + + function testEmpty() { + $this->assertParsing( + '', + null, null, null, null, '', null, null + ); + } + +} + diff --git a/tests/HTMLPurifier/URISchemeRegistryTest.php b/tests/HTMLPurifier/URISchemeRegistryTest.php index 02b938e4..497d0f66 100644 --- a/tests/HTMLPurifier/URISchemeRegistryTest.php +++ b/tests/HTMLPurifier/URISchemeRegistryTest.php @@ -2,7 +2,7 @@ require_once 'HTMLPurifier/URISchemeRegistry.php'; -class HTMLPurifier_URISchemeRegistryTest extends UnitTestCase +class HTMLPurifier_URISchemeRegistryTest extends HTMLPurifier_Harness { function test() { diff --git a/tests/HTMLPurifier/URISchemeTest.php b/tests/HTMLPurifier/URISchemeTest.php index 1e236a80..5b1f99a3 100644 --- a/tests/HTMLPurifier/URISchemeTest.php +++ b/tests/HTMLPurifier/URISchemeTest.php @@ -1,6 +1,10 @@ <?php +require_once 'HTMLPurifier/URI.php'; +require_once 'HTMLPurifier/URIHarness.php'; + require_once 'HTMLPurifier/URIScheme.php'; +require_once 'HTMLPurifier/URISchemeRegistry.php'; require_once 'HTMLPurifier/URIScheme/http.php'; require_once 'HTMLPurifier/URIScheme/ftp.php'; @@ -12,145 +16,133 @@ require_once 'HTMLPurifier/URIScheme/nntp.php'; // WARNING: All the URI schemes are far to relaxed, we need to tighten // the checks. 
-class HTMLPurifier_URISchemeTest extends UnitTestCase +class HTMLPurifier_URISchemeTest extends HTMLPurifier_URIHarness { - function test_http() { - $scheme = new HTMLPurifier_URIScheme_http(); - $config = HTMLPurifier_Config::createDefault(); - $context = new HTMLPurifier_Context(); - - $this->assertIdentical( - $scheme->validateComponents( - null, 'www.example.com', null, '/', 's=foobar', $config, $context), - array(null, 'www.example.com', null, '/', 's=foobar') - ); - - // absorb default port and userinfo - $this->assertIdentical( - $scheme->validateComponents( - 'user', 'www.example.com', 80, '/', 's=foobar', $config, $context), - array(null, 'www.example.com', null, '/', 's=foobar') - ); - - // do not absorb non-default port - $this->assertIdentical( - $scheme->validateComponents( - null, 'www.example.com', 8080, '/', 's=foobar', $config, $context), - array(null, 'www.example.com', 8080, '/', 's=foobar') - ); - - // https is basically the same - - $scheme = new HTMLPurifier_URIScheme_https(); - $this->assertIdentical( - $scheme->validateComponents( - 'user', 'www.example.com', 443, '/', 's=foobar', $config, $context), - array(null, 'www.example.com', null, '/', 's=foobar') - ); - + function assertValidation($uri, $expect_uri = true) { + $this->prepareURI($uri, $expect_uri); + // convenience hack: the scheme should be explicitly specified + $scheme = $uri->getSchemeObj($this->config, $this->context); + $result = $scheme->validate($uri, $this->config, $this->context); + $this->assertEitherFailOrIdentical($result, $uri, $expect_uri); } - function test_ftp() { - - $scheme = new HTMLPurifier_URIScheme_ftp(); - $config = HTMLPurifier_Config::createDefault(); - $context = new HTMLPurifier_Context(); - - $this->assertIdentical( - $scheme->validateComponents( - 'user', 'www.example.com', 21, '/', 's=foobar', $config, $context), - array('user', 'www.example.com', null, '/', null) - ); - - // valid typecode - $this->assertIdentical( - $scheme->validateComponents( - 
null, 'www.example.com', null, '/file.txt;type=a', null, $config, $context), - array(null, 'www.example.com', null, '/file.txt;type=a', null) - ); - - // remove invalid typecode - $this->assertIdentical( - $scheme->validateComponents( - null, 'www.example.com', null, '/file.txt;type=z', null, $config, $context), - array(null, 'www.example.com', null, '/file.txt', null) - ); - - // encode errant semicolons - $this->assertIdentical( - $scheme->validateComponents( - null, 'www.example.com', null, '/too;many;semicolons=1', null, $config, $context), - array(null, 'www.example.com', null, '/too%3Bmany%3Bsemicolons=1', null) - ); - - } - - function test_news() { - - $scheme = new HTMLPurifier_URIScheme_news(); - $config = HTMLPurifier_Config::createDefault(); - $context = new HTMLPurifier_Context(); - - $this->assertIdentical( - $scheme->validateComponents( - null, null, null, 'gmane.science.linguistics', null, $config, $context), - array(null, null, null, 'gmane.science.linguistics', null) - ); - - $this->assertIdentical( - $scheme->validateComponents( - null, null, null, '642@eagle.ATT.COM', null, $config, $context), - array(null, null, null, '642@eagle.ATT.COM', null) - ); - - // test invalid field removal - $this->assertIdentical( - $scheme->validateComponents( - 'user', 'www.google.com', 80, 'rec.music', 'path=foo', $config, $context), - array(null, null, null, 'rec.music', null) - ); - - } - - function test_nntp() { - - $scheme = new HTMLPurifier_URIScheme_nntp(); - $config = HTMLPurifier_Config::createDefault(); - $context = new HTMLPurifier_Context(); - - $this->assertIdentical( - $scheme->validateComponents( - null, 'news.example.com', null, '/alt.misc/12345', null, $config, $context), - array(null, 'news.example.com', null, '/alt.misc/12345', null) - ); - - - $this->assertIdentical( - $scheme->validateComponents( - 'user', 'news.example.com', 119, '/alt.misc/12345', 'foo=asdf', $config, $context), - array(null, 'news.example.com', null, '/alt.misc/12345', null) 
+ function test_http_regular() { + $this->assertValidation( + 'http://example.com/?s=q#fragment' ); } - function test_mailto() { - - $scheme = new HTMLPurifier_URIScheme_mailto(); - $config = HTMLPurifier_Config::createDefault(); - $context = new HTMLPurifier_Context(); - - $this->assertIdentical( - $scheme->validateComponents( - null, null, null, 'bob@example.com', null, $config, $context), - array(null, null, null, 'bob@example.com', null) + function test_http_removeDefaultPort() { + $this->assertValidation( + 'http://example.com:80', + 'http://example.com' ); - - $this->assertIdentical( - $scheme->validateComponents( - 'user', 'example.com', 80, 'bob@example.com', 'subject=Foo!', $config, $context), - array(null, null, null, 'bob@example.com', 'subject=Foo!') + } + + function test_http_removeUserInfo() { + $this->assertValidation( + 'http://bob@example.com', + 'http://example.com' + ); + } + + function test_http_preserveNonDefaultPort() { + $this->assertValidation( + 'http://example.com:8080' + ); + } + + function test_https_regular() { + $this->assertValidation( + 'https://user@example.com:443/?s=q#frag', + 'https://example.com/?s=q#frag' + ); + } + + function test_ftp_regular() { + $this->assertValidation( + 'ftp://user@example.com/path' + ); + } + + function test_ftp_removeDefaultPort() { + $this->assertValidation( + 'ftp://example.com:21', + 'ftp://example.com' + ); + } + + function test_ftp_removeQueryString() { + $this->assertValidation( + 'ftp://example.com?s=q', + 'ftp://example.com' + ); + } + + function test_ftp_preserveValidTypecode() { + $this->assertValidation( + 'ftp://example.com/file.txt;type=a' + ); + } + + function test_ftp_removeInvalidTypecode() { + $this->assertValidation( + 'ftp://example.com/file.txt;type=z', + 'ftp://example.com/file.txt' + ); + } + + function test_ftp_encodeExtraSemicolons() { + $this->assertValidation( + 'ftp://example.com/too;many;semicolons=1', + 'ftp://example.com/too%3Bmany%3Bsemicolons=1' + ); + } + + function 
test_news_regular() { + $this->assertValidation( + 'news:gmane.science.linguistics' + ); + } + + function test_news_explicit() { + $this->assertValidation( + 'news:642@eagle.ATT.COM' + ); + } + + function test_news_removeNonPathComponents() { + $this->assertValidation( + 'news://user@example.com:80/rec.music?path=foo#frag', + 'news:/rec.music#frag' + ); + } + + function test_nntp_regular() { + $this->assertValidation( + 'nntp://news.example.com/alt.misc/42#frag' + ); + } + + function test_nntp_removalOfRedundantOrUselessComponents() { + $this->assertValidation( + 'nntp://user@news.example.com:119/alt.misc/42?s=q#frag', + 'nntp://news.example.com/alt.misc/42#frag' + ); + } + + function test_mailto_regular() { + $this->assertValidation( + 'mailto:bob@example.com' + ); + } + + function test_mailto_removalOfRedundantOrUselessComponents() { + $this->assertValidation( + 'mailto://user@example.com:80/bob@example.com?subject=Foo#frag', + 'mailto:/bob@example.com?subject=Foo#frag' ); - } } diff --git a/tests/HTMLPurifier/URITest.php b/tests/HTMLPurifier/URITest.php new file mode 100644 index 00000000..9da37a7a --- /dev/null +++ b/tests/HTMLPurifier/URITest.php @@ -0,0 +1,166 @@ +<?php + +require_once 'HTMLPurifier/URI.php'; +require_once 'HTMLPurifier/URIParser.php'; + +class HTMLPurifier_URITest extends HTMLPurifier_URIHarness +{ + + function createURI($uri) { + $parser = new HTMLPurifier_URIParser(); + return $parser->parse($uri); + } + + function test_construct() { + $uri1 = new HTMLPurifier_URI('HTTP', 'bob', 'example.com', '23', '/foo', 'bar=2', 'slash'); + $uri2 = new HTMLPurifier_URI('http', 'bob', 'example.com', 23, '/foo', 'bar=2', 'slash'); + $this->assertIdentical($uri1, $uri2); + } + + var $oldRegistry; + + function &setUpSchemeRegistryMock() { + $this->oldRegistry = HTMLPurifier_URISchemeRegistry::instance(); + generate_mock_once('HTMLPurifier_URIScheme'); + generate_mock_once('HTMLPurifier_URISchemeRegistry'); + $registry =& 
HTMLPurifier_URISchemeRegistry::instance( + new HTMLPurifier_URISchemeRegistryMock() + ); + return $registry; + } + + function &setUpSchemeMock($name) { + $registry =& $this->setUpSchemeRegistryMock(); + $scheme_mock = new HTMLPurifier_URISchemeMock(); + $registry->setReturnValue('getScheme', $scheme_mock, array($name, '*', '*')); + return $scheme_mock; + } + + function setUpNoValidSchemes() { + $registry =& $this->setUpSchemeRegistryMock(); + $registry->setReturnValue('getScheme', false, array('*', '*', '*')); + } + + function tearDownSchemeRegistryMock() { + HTMLPurifier_URISchemeRegistry::instance($this->oldRegistry); + } + + function test_getSchemeObj() { + $scheme_mock =& $this->setUpSchemeMock('http'); + + $uri = $this->createURI('http:'); + $scheme_obj = $uri->getSchemeObj($this->config, $this->context); + $this->assertIdentical($scheme_obj, $scheme_mock); + + $this->tearDownSchemeRegistryMock(); + } + + function test_getSchemeObj_invalidScheme() { + $this->setUpNoValidSchemes(); + + $uri = $this->createURI('http:'); + $result = $uri->getSchemeObj($this->config, $this->context); + $this->assertIdentical($result, false); + + $this->tearDownSchemeRegistryMock(); + } + + function test_getSchemaObj_defaultScheme() { + $scheme = 'foobar'; + + $scheme_mock =& $this->setUpSchemeMock($scheme); + $this->config->set('URI', 'DefaultScheme', $scheme); + + $uri = $this->createURI('hmm'); + $scheme_obj = $uri->getSchemeObj($this->config, $this->context); + $this->assertIdentical($scheme_obj, $scheme_mock); + + $this->tearDownSchemeRegistryMock(); + } + + function test_getSchemaObj_invalidDefaultScheme() { + $this->setUpNoValidSchemes(); + $this->config->set('URI', 'DefaultScheme', 'foobar'); + + $uri = $this->createURI('hmm'); + + $this->expectError('Default scheme object "foobar" was not readable'); + $result = $uri->getSchemeObj($this->config, $this->context); + $this->assertIdentical($result, false); + + $this->tearDownSchemeRegistryMock(); + } + + function 
assertToString($expect_uri, $scheme, $userinfo, $host, $port, $path, $query, $fragment) { + $uri = new HTMLPurifier_URI($scheme, $userinfo, $host, $port, $path, $query, $fragment); + $string = $uri->toString(); + $this->assertIdentical($string, $expect_uri); + } + + function test_toString_full() { + $this->assertToString( + 'http://bob@example.com:300/foo?bar=baz#fragment', + 'http', 'bob', 'example.com', 300, '/foo', 'bar=baz', 'fragment' + ); + } + + function test_toString_scheme() { + $this->assertToString( + 'http:', + 'http', null, null, null, '', null, null + ); + } + + function test_toString_authority() { + $this->assertToString( + '//bob@example.com:8080', + null, 'bob', 'example.com', 8080, '', null, null + ); + } + + function test_toString_path() { + $this->assertToString( + '/path/to', + null, null, null, null, '/path/to', null, null + ); + } + + function test_toString_query() { + $this->assertToString( + '?q=string', + null, null, null, null, '', 'q=string', null + ); + } + + function test_toString_fragment() { + $this->assertToString( + '#fragment', + null, null, null, null, '', null, 'fragment' + ); + } + + function assertValidation($uri, $expect_uri = true) { + if ($expect_uri === true) $expect_uri = $uri; + $uri = $this->createURI($uri); + $result = $uri->validate($this->config, $this->context); + if ($expect_uri === false) { + $this->assertFalse($result); + } else { + $this->assertTrue($result); + $this->assertIdentical($uri->toString(), $expect_uri); + } + } + + function test_validate_overlongPort() { + $this->assertValidation('http://example.com:65536', 'http://example.com'); + } + + function test_validate_zeroPort() { + $this->assertValidation('http://example.com:00', 'http://example.com'); + } + + function test_validate_invalidHostThatLooksLikeIPv6() { + $this->assertValidation('http://[2001:0db8:85z3:08d3:1319:8a2e:0370:7334]', 'http:'); + } + +} diff --git a/tests/HTMLPurifierTest.php b/tests/HTMLPurifierTest.php index ba338fba..3ad307bb 
100644 --- a/tests/HTMLPurifierTest.php +++ b/tests/HTMLPurifierTest.php @@ -4,7 +4,7 @@ require_once 'HTMLPurifier.php'; // integration test -class HTMLPurifierTest extends UnitTestCase +class HTMLPurifierTest extends HTMLPurifier_Harness { var $purifier; @@ -132,5 +132,22 @@ alert("<This is compatible with XHTML>"); ); } + function testGetInstance() { + $purifier =& HTMLPurifier::getInstance(); + $purifier2 =& HTMLPurifier::getInstance(); + $this->assertReference($purifier, $purifier2); + } + + function testMakeAbsolute() { + $this->assertPurification( + '<a href="foo.txt">Foobar</a>', + '<a href="http://example.com/bar/foo.txt">Foobar</a>', + array( + 'URI.Base' => 'http://example.com/bar/baz.php', + 'URI.MakeAbsolute' => true + ) + ); + } + } diff --git a/tests/index.php b/tests/index.php old mode 100644 new mode 100755 index aad6a94c..69be2981 --- a/tests/index.php +++ b/tests/index.php @@ -5,6 +5,7 @@ error_reporting(E_ALL | E_STRICT); define('HTMLPurifierTest', 1); +define('HTMLPURIFIER_SCHEMA_STRICT', true); // wishlist: automated calling of this file from multiple PHP versions so we // don't have to constantly switch around @@ -38,7 +39,14 @@ if ( is_string($GLOBALS['HTMLPurifierTest']['PEAR']) ) { } // initialize and load HTML Purifier -require_once '../library/HTMLPurifier.auto.php'; +// use ?standalone to load the alternative standalone stub +if (isset($_GET['standalone']) || (isset($argv[1]) && $argv[1] == 'standalone')) { + set_include_path(realpath('blanks') . PATH_SEPARATOR . 
get_include_path()); + require_once '../library/HTMLPurifier.standalone.php'; +} else { + require_once '../library/HTMLPurifier.auto.php'; +} +require_once 'HTMLPurifier/Harness.php'; // setup special DefinitionCacheFactory decorator $factory =& HTMLPurifier_DefinitionCacheFactory::instance(); diff --git a/tests/test_files.php b/tests/test_files.php index 5920981e..f9fa71c1 100644 --- a/tests/test_files.php +++ b/tests/test_files.php @@ -79,6 +79,7 @@ $test_files[] = 'HTMLPurifier/GeneratorTest.php'; $test_files[] = 'HTMLPurifier/HTMLDefinitionTest.php'; $test_files[] = 'HTMLPurifier/HTMLModuleManagerTest.php'; $test_files[] = 'HTMLPurifier/HTMLModuleTest.php'; +$test_files[] = 'HTMLPurifier/HTMLModule/RubyTest.php'; $test_files[] = 'HTMLPurifier/HTMLModule/ScriptingTest.php'; $test_files[] = 'HTMLPurifier/HTMLModule/TidyTest.php'; $test_files[] = 'HTMLPurifier/IDAccumulatorTest.php'; @@ -102,8 +103,15 @@ $test_files[] = 'HTMLPurifier/Strategy/RemoveForeignElements_ErrorsTest.php'; $test_files[] = 'HTMLPurifier/Strategy/ValidateAttributesTest.php'; $test_files[] = 'HTMLPurifier/TagTransformTest.php'; $test_files[] = 'HTMLPurifier/TokenTest.php'; +$test_files[] = 'HTMLPurifier/URIDefinitionTest.php'; +$test_files[] = 'HTMLPurifier/URIFilter/DisableExternalTest.php'; +$test_files[] = 'HTMLPurifier/URIFilter/DisableExternalResourcesTest.php'; +$test_files[] = 'HTMLPurifier/URIFilter/HostBlacklistTest.php'; +$test_files[] = 'HTMLPurifier/URIFilter/MakeAbsoluteTest.php'; +$test_files[] = 'HTMLPurifier/URIParserTest.php'; $test_files[] = 'HTMLPurifier/URISchemeRegistryTest.php'; $test_files[] = 'HTMLPurifier/URISchemeTest.php'; +$test_files[] = 'HTMLPurifier/URITest.php'; $test_files[] = 'HTMLPurifierTest.php'; if (version_compare(PHP_VERSION, '5', '>=')) {