From 2bf912d5284d49bf0b20bcf4242f234899295b8d Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Tue, 16 Jan 2007 21:59:29 +0000 Subject: [PATCH] Commit strict version of HTML Purifier. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk-strict@647 48356398-32a2-884e-a903-53898d9a118a --- INSTALL | 1 + NEWS | 10 +- README | 25 +- SLOW | 40 -- WYSIWYG | 3 +- docs/dev-progress.html | 8 +- docs/enduser-slow.html | 116 ++++ docs/enduser-utf8.html | 623 ++++++++++++++++++ docs/examples/basic.php | 9 +- docs/index.html | 3 + docs/style.css | 4 +- library/HTMLPurifier.func.php | 2 +- library/HTMLPurifier.php | 2 +- library/HTMLPurifier/AttrDef/CSS.php | 8 + library/HTMLPurifier/AttrDef/CSSURI.php | 58 ++ library/HTMLPurifier/AttrDef/ListStyle.php | 71 +- library/HTMLPurifier/AttrDef/URI.php | 4 +- library/HTMLPurifier/AttrTransform/BdoDir.php | 2 +- .../AttrTransform/ImgRequired.php | 2 +- library/HTMLPurifier/AttrTransform/Lang.php | 2 +- .../HTMLPurifier/AttrTransform/TextAlign.php | 2 +- library/HTMLPurifier/CSSDefinition.php | 27 +- library/HTMLPurifier/Config.php | 6 +- library/HTMLPurifier/ConfigSchema.php | 12 +- library/HTMLPurifier/Encoder.php | 4 +- library/HTMLPurifier/EntityLookup.php | 2 +- library/HTMLPurifier/HTMLDefinition.php | 13 +- library/HTMLPurifier/Lexer.php | 6 +- library/HTMLPurifier/Lexer/DOMLex.php | 5 + library/HTMLPurifier/Lexer/PEARSax3.php | 2 +- .../HTMLPurifier/Printer/HTMLDefinition.php | 4 +- library/HTMLPurifier/URISchemeRegistry.php | 2 +- smoketests/common.php | 1 + smoketests/printDefinition.php | 6 +- smoketests/utf8.php | 2 + smoketests/xssAttacks.xml | 2 - tests/Debugger.php | 2 +- tests/HTMLPurifier/AttrDef/CSSTest.php | 7 + tests/HTMLPurifier/AttrDef/CSSURITest.php | 37 ++ tests/HTMLPurifier/AttrDef/CompositeTest.php | 18 +- tests/HTMLPurifier/AttrDef/ListStyleTest.php | 11 + tests/HTMLPurifier/AttrDef/URITest.php | 2 +- tests/HTMLPurifier/ContextTest.php | 2 +- tests/HTMLPurifier/LexerTest.php | 4 +- 
tests/index.php | 3 +- 45 files changed, 1022 insertions(+), 153 deletions(-) delete mode 100644 SLOW create mode 100644 docs/enduser-slow.html create mode 100644 docs/enduser-utf8.html create mode 100644 library/HTMLPurifier/AttrDef/CSSURI.php create mode 100644 tests/HTMLPurifier/AttrDef/CSSURITest.php diff --git a/INSTALL b/INSTALL index 52991e33..0013705c 100644 --- a/INSTALL +++ b/INSTALL @@ -8,6 +8,7 @@ installation GUI, you've come to the wrong place!) The impatient can scroll down to the bottom of this INSTALL document to see the code, but you really should make sure a few things are properly done. +Todo: Convert to using the array syntax for configuration. 1. Compatibility diff --git a/NEWS b/NEWS index 34d9e1c8..b8ae6e24 100644 --- a/NEWS +++ b/NEWS @@ -10,10 +10,14 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier ========================== 1.4.0, unknown release date -(major feature release) +! Implemented list-style-image, URIs now allowed in list-style +! Implemented background-image, background-repeat and background-attachment + CSS properties. background shorthand property HAS NOT been extended + to allow these, and background-position IS NOT implemented yet. +. Implemented AttrDef_CSSURI for url(http://google.com) style declarations -1.3.3, unknown release date, may be dropped -(security/bugfix/minor feature release) +1.3.3, unknown release date, likely to be dropped +! Moved SLOW to docs/enduser-slow.html and added code examples 1.3.2, released 2006-12-25 ! HTMLPurifier object now accepts configuration arrays, no need to manually diff --git a/README b/README index 78e171ad..bfd270d8 100644 --- a/README +++ b/README @@ -1,13 +1,22 @@ README - All about HTMLPurifier + All about HTML Purifier -HTMLPurifier is an HTML filtering solution. It uses a unique combination of -robust whitelists and agressive parsing to ensure that not only are XSS -attacks thwarted, but the resulting HTML is standards compliant. 
+HTML Purifier is an HTML filtering solution that uses a unique combination +of robust whitelists and agressive parsing to ensure that not only are +XSS attacks thwarted, but the resulting HTML is standards compliant. -See INSTALL on how to use the library. See docs/ for more developer-oriented -documentation as well as some code examples. Users of TinyMCE or FCKeditor -may be especially interested in WYSIWYG. +HTML Purifier is oriented towards richly formatted documents from +untrusted sources that require CSS and a full tag-set. This library can +be configured to accept a more restrictive set of tags, but it won't be +as efficient as more bare-bones parsers. It will, however, do the job +right, which may be more important. -HTMLPurifier can be found on the web at: http://hp.jpsband.org/ +Places to go: + +* See INSTALL for a quick installation guide +* See docs/ for developer-oriented documentation, code examples and + an in-depth installation guide. +* See WYSIWYG for information on editors like TinyMCE and FCKeditor + +HTML Purifier can be found on the web at: http://hp.jpsband.org/ diff --git a/SLOW b/SLOW deleted file mode 100644 index bc8616d9..00000000 --- a/SLOW +++ /dev/null @@ -1,40 +0,0 @@ - -SLOW - also known as the HELP ME LIBRARY IS TOO SLOW MY PAGE TAKE TOO LONG LOAD page - -HTML Purifier is a very powerful library. But with power comes great -responsibility, or, at least, longer execution times. Remember, this -library isn't lightly grazing over submitted HTML: it's deconstructing -the whole thing, rigorously checking the parts, and then putting it -back together. - -So, if it so turns out that HTML Purifier is kinda too slow for outbound -filtering, you've got a few options: - -1. Inbound filtering - perform filtering of HTML when it's submitted by the -user. Since the user is already submitting something, an extra half a -second tacked on to the load time probably isn't going to be that huge of -a problem. 
Then, displaying the content is a simple a manner of outputting -it directly from your database/filesystem. The trouble with this method is -that your user loses the original text, and when doing edits, will be -handling the filtered text. While this may be a good thing, especially if -you're using a WYSIWYG editor, it can also result in data-loss if a user -makes a typo. - -2. Caching the filtered output - accept the submitted text and put it -unaltered into the database, but then also generate a filtered version and -stash that in the database. Serve the filtered version to readers, and the -unaltered version to editors. If need be, you can invalidate the cache and -have the cached filtered version be regenerated on the first page view. Pros? -Full data retention. Cons? It's more complicated, and opens other editors -up to XSS if they are using a WYSIWYG editor (to fix that, they'd have to -be able to get their hands on the *really* original text served in plaintext -mode). - -In short, inbound filtering is almost as simple as outbound filtering, but -it has some drawbacks which cannot be fixed unless you save both the original -and the filtered versions. - -There is a third option: profile and optimize HTMLPurifier yourself. Be sure -to report back your results if you decide to do that! Especially if you -port HTML Purifier to C++. ;-) diff --git a/WYSIWYG b/WYSIWYG index 6fab8bcc..718f8959 100644 --- a/WYSIWYG +++ b/WYSIWYG @@ -18,4 +18,5 @@ HTML Purifier is perfect for filtering pure-HTML input from WYSIWYG editors. Enough said. 
There is a proof-of-concept integration of HTML Purifier with the Mantis -bugtracker at http://hp.jpsband.org/mantis/ +bugtracker at http://hp.jpsband.org/mantis/ You can see notes on how +this integration was acheived at http://hp.jpsband.org/mantis_notes.txt diff --git a/docs/dev-progress.html b/docs/dev-progress.html index 78cd56fc..0262f170 100644 --- a/docs/dev-progress.html +++ b/docs/dev-progress.html @@ -59,7 +59,7 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;} Standard background-colorCOMPOSITE(<color>, transparent) -backgroundSHORTHAND, only for color, see below for info on background-image and friends +backgroundSHORTHAND borderSHORTHAND, MULTIPLE border-colorMULTIPLE border-styleMULTIPLE @@ -141,8 +141,8 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;} Unknown -background-imageDangerous, target milestone 1.3 -background-attachmentENUM(scroll, fixed), +background-imageDangerous, target milestone 1.3 +background-attachmentENUM(scroll, fixed), Depends on background-image background-positionDepends on background-image cursorDangerous but fluffy @@ -151,7 +151,7 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;} inline-block has incomplete IE6 support and requires -moz-inline-box for Mozilla. Unknown target milestone. heightInteresting, why use it? Unknown target milestone. -list-style-imageDangerous? Target milestone 1.3 +list-style-imageDangerous? max-heightNo IE 5/6 min-height max-width diff --git a/docs/enduser-slow.html b/docs/enduser-slow.html new file mode 100644 index 00000000..bac0704d --- /dev/null +++ b/docs/enduser-slow.html @@ -0,0 +1,116 @@ + + + + + + + +Speeding up HTML Purifier - HTML Purifier + + + +

Speeding up HTML Purifier

+
...also known as the HELP ME LIBRARY IS TOO SLOW MY PAGE TAKE TOO LONG page
+ +
Filed under End-User
+
Return to the index.
+ +

HTML Purifier is a very powerful library. But with power comes great +responsibility, in the form of longer execution times. Remember, this +library isn't lightly grazing over submitted HTML: it's deconstructing +the whole thing, rigorously checking the parts, and then putting it back +together.

+ +

So, if it so turns out that HTML Purifier is kinda too slow for outbound +filtering, you've got a few options:

+ +

Inbound filtering

+ +

Perform filtering of HTML when it's submitted by the user. Since the +user is already submitting something, an extra half a second tacked on +to the load time probably isn't going to be that huge of a problem. +Then, displaying the content is simply a matter of outputting it +directly from your database/filesystem. The trouble with this method is +that your user loses the original text, and when doing edits, will be +handling the filtered text. While this may be a good thing, especially +if you're using a WYSIWYG editor, it can also result in data-loss if a +user makes a typo.

+ +

Example (non-functional):

+ +
<?php
+    /**
+     * FORM SUBMISSION PAGE
+     * display_error($message) : displays nice error page with message
+     * display_success() : displays a nice success page
+     * display_form() : displays the HTML submission form
+     * database_insert($html) : inserts data into database as new row
+     */
+    if (!empty($_POST)) {
+        require_once '/path/to/library/HTMLPurifier.auto.php';
+        require_once 'HTMLPurifier.func.php';
+        $dirty_html = isset($_POST['html']) ? $_POST['html'] : false;
+        if (!$dirty_html) {
+            display_error('You must write some HTML!');
+        }
+        $html = HTMLPurifier($dirty_html);
+        database_insert($html);
+        display_success();
+        // notice that $dirty_html is *not* saved
+    } else {
+        display_form();
+    }
+?>
+ +

Caching the filtered output

+ +

Accept the submitted text and put it unaltered into the database, but +then also generate a filtered version and stash that in the database. +Serve the filtered version to readers, and the unaltered version to +editors. If need be, you can invalidate the cache and have the cached +filtered version be regenerated on the first page view. Pros? Full data +retention. Cons? It's more complicated, and opens other editors up to +XSS if they are using a WYSIWYG editor (to fix that, they'd have to be +able to get their hands on the *really* original text served in +plaintext mode).

+ +

Example (non-functional):

+ +
<?php
+    /**
+     * VIEW PAGE
+     * display_error($message) : displays nice error page with message
+     * cache_get($id) : retrieves HTML from fast cache (db or file)
+     * cache_insert($id, $html) : inserts good HTML into cache system
+     * database_get($id) : retrieves raw HTML from database
+     */
+    $id = isset($_GET['id']) ? (int) $_GET['id'] : false;
+    if (!$id) {
+        display_error('Must specify ID.');
+        exit;
+    }
+    $html = cache_get($id); // filesystem or database
+    if ($html === false) {
+        // cache didn't have the HTML, generate it
+        $raw_html = database_get($id);
+        require_once '/path/to/library/HTMLPurifier.auto.php';
+        require_once 'HTMLPurifier.func.php';
+        $html = HTMLPurifier($raw_html);
+        cache_insert($id, $html);
+    }
+    echo $html;
+?>
+ +

Summary

+ +

In short, inbound filtering is the simple option and caching is the +robust option (albeit with bigger storage requirements).

+ +

There is a third option, independent of the two we've discussed: profile +and optimize HTML Purifier yourself. Be sure to report back your results +if you decide to do that! Especially if you port HTML Purifier to C++. +;-)

+ + + \ No newline at end of file diff --git a/docs/enduser-utf8.html b/docs/enduser-utf8.html new file mode 100644 index 00000000..2b8338f4 --- /dev/null +++ b/docs/enduser-utf8.html @@ -0,0 +1,623 @@ + + + + + + + + + +UTF-8 - HTML Purifier + + + + + +

UTF-8

+ +
Filed under End-User
+
Return to the index.
+ +

Character encoding and character sets, in truth, are not that +difficult to understand. But if you don't understand them, you are going +to be caught by surprise by some of HTML Purifier's behavior, namely +the fact that it operates in UTF-8 or the limitations of the character +encoding transformations it does. This document will walk you through +determining the encoding of your system and how you should handle +this information. It will stay away from excessive discussion on +the internals of character encoding, but offer the information in +asides that can easily be skipped.

+ +
+
Asides
+

Text in this formatting is an aside, + interesting tidbits for the curious but not strictly necessary material to + do the tutorial. If you read this text, you'll come out + with a greater understanding of the underlying issues.

+
+ +

Finding the real encoding

+ +

In the beginning, there was ASCII, and things were simple. But they +weren't good, for no one could write in Cyrillic or Thai. So there +exploded a proliferation of character encodings to remedy the problem +by extending the characters ASCII could express. This ridiculously +simplified version of the history of character encodings shows us that +there are now many character encodings floating around.

+ +
+

A character encoding tells the computer how to + interpret raw zeroes and ones into real characters. It + usually does this by pairing numbers with characters.

+

There are many different types of character encodings floating + around, but the ones we deal most frequently with are ASCII, + 8-bit encodings, and Unicode-based encodings.

+ +
+ +

The first step of our journey is to find out what the encoding of +your website is. The most reliable way is to ask your +browser:

+ +
+
Mozilla Firefox
+
Tools > Page Info: Encoding
+
Internet Explorer
+
View > Encoding: bulleted item is unofficial name
+
+ +

Internet Explorer won't give you the mime (i.e. useful/real) name of the +character encoding, so you'll have to look it up using their description. +Some common ones:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
IE's DescriptionMime Name
Windows
Arabic (Windows)Windows-1256
Baltic (Windows)Windows-1257
Central European (Windows)Windows-1250
Cyrillic (Windows)Windows-1251
Greek (Windows)Windows-1253
Hebrew (Windows)Windows-1255
Thai (Windows)TIS-620
Turkish (Windows)Windows-1254
Vietnamese (Windows)Windows-1258
Western European (Windows)Windows-1252
ISO
Arabic (ISO)ISO-8859-6
Baltic (ISO)ISO-8859-4
Central European (ISO)ISO-8859-2
Cyrillic (ISO)ISO-8859-5
Estonian (ISO)ISO-8859-13
Greek (ISO)ISO-8859-7
Hebrew (ISO-Logical)ISO-8859-8-I
Hebrew (ISO-Visual)ISO-8859-8
Latin 9 (ISO)ISO-8859-15
Turkish (ISO)ISO-8859-9
Western European (ISO)ISO-8859-1
Other
Chinese Simplified (GB18030)GB18030
Chinese Simplified (GB2312)GB2312
Chinese Simplified (HZ)HZ
Chinese Traditional (Big5)Big5
Japanese (Shift-JIS)Shift_JIS
Japanese (EUC)EUC-JP
KoreanEUC-KR
Unicode (UTF-8)UTF-8
+ +

Internet Explorer does not recognize some of the more obscure +character encodings, and having to look up the real names with a table +is a pain, so I recommend using Mozilla Firefox to find out your +character encoding.

+ +

Finding the embedded encoding

+ +

At this point, you may be asking, "Didn't we already find out our +encoding?" Well, as it turns out, there are multiple places where +a web developer can specify a character encoding, and one such place +is in a META tag:

+ +
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+ +

You'll find this in the HEAD section of an HTML document. +The text to the right of charset= is the "claimed" +encoding: the HTML claims to be this encoding, but whether or not this +is actually the case depends on other factors. For now, take note +if your META tag claims that either:

+ +
    +
  1. The character encoding is the same as the one reported by the + browser,
  2. +
  3. The character encoding is different from the browser's, or
  4. +
  5. There is no META tag at all! (horror, horror!)
  6. +
+ +

Fixing the encoding

+ +

If your META encoding and your real encoding match, +savvy! You can skip this section. If they don't...

+ +

No embedded encoding

+ +

If this is the case, you'll want to add in the appropriate +META tag to your website. It's as simple as copy-pasting +the code snippet above and replacing UTF-8 with whatever is the mime name +of your real encoding.

+ +
+

For all those skeptics out there, there is a very good reason + why the character encoding should be explicitly stated. When the + browser isn't told what the character encoding of a text is, it + has to guess: and sometimes the guess is wrong. Hackers can manipulate + this guess in order to slip XSS past filters and then fool the + browser into executing it as active code. A great example of this + is the Google UTF-7 + exploit.

+

You might be able to get away with not specifying a character + encoding with the META tag as long as your webserver + sends the right Content-Type header, but why risk it? Besides, if + the user downloads the HTML file, there is no longer any webserver + to define the character encoding.

+
+ +

Embedded encoding disagrees

+ +

This is an extremely common mistake: another source is telling +the browser what the +character encoding is and is overriding the embedded encoding. This +source usually is the Content-Type HTTP header that the webserver (i.e. +Apache) sends. A usual Content-Type header sent with a page might +look like this:

+ +
Content-Type: text/html; charset=ISO-8859-1
+ +

Notice how there is a charset parameter: this is the webserver's +way of telling a browser what the character encoding is, much like +the META tags we touched upon previously.

+ +

In fact, the META tag is +designed as a substitute for the HTTP header for contexts where +sending headers is impossible (such as locally stored files without +a webserver). Thus the name http-equiv (HTTP equivalent). +

+ +

There are two ways to go about fixing this: changing the META +tag to match the HTTP header, or changing the HTTP header to match +the META tag. How do we know which to do? It depends +on the website's content: after all, headers and tags are only ways of +describing the actual characters on the web page.

+ +

If your website:

+ +
+
...only uses ASCII characters,
+
Either way is fine, but I recommend switching both to + UTF-8 (more on this later).
+
...uses special characters, and they display + properly,
+
Change the embedded encoding to the server encoding.
+
...uses special characters, but users often complain that + they come out garbled,
+
Change the server encoding to the embedded encoding.
+
+ +

Changing a META tag is easy: just swap out the old encoding +for the new. Changing the server (HTTP header) encoding, however, +is slightly more difficult.

+ +

Changing the server encoding

+ +

PHP header() function

+ +

The simplest way to handle this problem is to send the encoding +yourself, via your programming language. Since you're using HTML +Purifier, I'll assume PHP, although it's not too difficult to do +similar things in +other +languages. The appropriate code is:

+ +
header('Content-Type:text/html; charset=UTF-8');
+ +

...replacing UTF-8 with whatever your embedded encoding is. +This code must come before any output, so be careful about +stray whitespace in your application.

+ +

PHP ini directive

+ +

PHP also has a neat little ini directive that can save you a +header call: default_charset. Using this code:

+ +
ini_set('default_charset', 'UTF-8');
+ +

...will also do the trick. If PHP is running as an Apache module (and +not as FastCGI, consult +phpinfo() for details), you can even use htaccess to apply this property +globally:

+ +
php_value default_charset "UTF-8"
+ +

As with all INI directives, this can +also go in your php.ini file. Some hosting providers allow you to customize +your own php.ini file; ask your support for details. Use:

+
default_charset = "utf-8"
+ +

Non-PHP

+ +

You may, for whatever reason, need to set the character encoding +on non-PHP files, usually plain ol' HTML files. Doing this +is more of a hit-or-miss process: depending on the software being +used as a webserver and the configuration of that software, certain +techniques may work, or may not work.

+ +

.htaccess

+ +

On Apache, you can use an .htaccess file to change the character +encoding. I'll defer to +W3C +for the in-depth explanation, but it boils down to creating a file +named .htaccess with the contents:

+ +
AddCharset UTF-8 .html
+ +

Where UTF-8 is replaced with the character encoding you want to +use and .html is a file extension that this will be applied to. This +character encoding will then be set for any file directly in +or in the subdirectories of the directory you place this file in.

+ +

If you're feeling particularly courageous, you can use:

+ +
AddDefaultCharset UTF-8
+ +

...which changes the character set Apache adds to any document that +doesn't have any Content-Type parameters. This directive, which the +default configuration file sets to iso-8859-1 for security +reasons, is probably why your headers mismatch +with the META tag. If you would prefer Apache not to be +butting in on your character encodings, you can tell it not +to send anything at all:

+ +
AddDefaultCharset Off
+ +

...making your META tags the sole source of +character encoding information. In these cases, it is +especially important to make sure you have valid META +tags on your pages and all the text before them is ASCII.

+ +

These directives can also be +placed in httpd.conf file for Apache, but +in most shared hosting situations you won't be able to edit this file. +

+ +

File extensions

+ +

If you're not allowed to use .htaccess files, you can often +piggy-back off of Apache's default AddCharset declarations to get +your files in the proper extension. Here are Apache's default +character set declarations:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CharsetFile extension(s)
ISO-8859-1.iso8859-1 .latin1
ISO-8859-2.iso8859-2 .latin2 .cen
ISO-8859-3.iso8859-3 .latin3
ISO-8859-4.iso8859-4 .latin4
ISO-8859-5.iso8859-5 .latin5 .cyr .iso-ru
ISO-8859-6.iso8859-6 .latin6 .arb
ISO-8859-7.iso8859-7 .latin7 .grk
ISO-8859-8.iso8859-8 .latin8 .heb
ISO-8859-9.iso8859-9 .latin9 .trk
ISO-2022-JP.iso2022-jp .jis
ISO-2022-KR.iso2022-kr .kis
ISO-2022-CN.iso2022-cn .cis
Big5.Big5 .big5 .b5
WINDOWS-1251.cp-1251 .win-1251
CP866.cp866
KOI8-r.koi8-r .koi8-ru
KOI8-ru.koi8-uk .ua
ISO-10646-UCS-2.ucs2
ISO-10646-UCS-4.ucs4
UTF-8.utf8
GB2312.gb2312 .gb
utf-7.utf7
EUC-TW.euc-tw
EUC-JP.euc-jp
EUC-KR.euc-kr
shift_jis.sjis
+ +

So, for example, a file named page.utf8.html or +page.html.utf8 will probably be sent with the UTF-8 charset +attached, the difference being that if there is an +AddCharset charset .html declaration, it will override +the .utf8 extension in page.utf8.html (precedence moves +from right to left). By default, Apache has no such declaration.

+ +

Microsoft IIS

+ +

If anyone can contribute information on how to configure Microsoft +IIS to change character encodings, I'd be grateful.

+ +

XML

+ +

META tags are the most common source of embedded +encodings, but they can also come from somewhere else: XML +processing instructions. They look like:

+ +
<?xml version="1.0" encoding="UTF-8"?>
+ +

...and are most often found in XML documents (including XHTML).

+ +

For XHTML, this processing instruction theoretically +overrides the META tag. In reality, this happens only when the +XHTML is actually served as legit XML and not HTML, which is almost +never due to Internet Explorer's lack of support for +application/xhtml+xml (even though doing so is often +argued to be good practice).

+ +

For XML, however, this processing instruction is extremely important. +Since most webservers are not configured to send charsets for .xml files, +this is the only thing a parser has to go on. Furthermore, the default +for XML files is UTF-8, which often butts heads with the more common +ISO-8859-1 encoding (you see this in garbled RSS feeds).

+ +

In short, if you use XHTML and have gone through the +trouble of adding the XML header, make sure it jibes +with your META tags and HTTP headers.

+ +

Inside the process

+ +

This section is not required reading, +but may answer some of your questions on what's going on in all +this character encoding hocus pocus. If you're interested in +moving on to the next phase, skip this section.

+ +

A logical question that follows all of our wheeling and dealing +with multiple sources of character encodings is "Why are there +so many options?" To answer this question, we have to turn +back to our definition of character encodings: they allow a program +to interpret bytes into human-readable characters.

+ +

Thus, a chicken-egg problem: a character encoding +is necessary to interpret the +text of a document. A META tag is in the text of a document. +The META tag gives the character encoding. How can we +determine the contents of a META tag, inside the text, +if we don't know its character encoding? And how do we figure out +the character encoding, if we don't know the contents of the +META tag?

+ +

Fortunately for us, the characters we need to write the +META are in ASCII, which is pretty much universal +over every character encoding that is in common use today. So, +all the web-browser has to do is parse all the way down until +it gets to the Content-Type tag, extract the character encoding +tag, then re-parse the document according to this new information.

+ +

Obviously this is complicated, so browsers prefer the simpler +and more efficient solution: get the character encoding from +somewhere other than the document itself, i.e. the HTTP headers, +much to the chagrin of HTML authors who can't set these headers.

+ +

Why UTF-8?

+ +

So, you've gone through all the trouble of ensuring that your +server and embedded characters all line up properly and are +present. Good job: at +this point, you could quit and rest easy knowing that your pages +are not vulnerable to character encoding style XSS attacks. +However, just as having a character encoding is better than +having no character encoding at all, having UTF-8 as your +character encoding is better than having some other random +character encoding, and the next step is to convert to UTF-8. +But why?

+ +

Internationalization

+ +

Many software projects, at one point or another, suddenly realize +that they should be supporting more than one language. Even regular +usage in one language sometimes requires the occasional special character +that, without surprise, is not available in your character set. Sometimes +developers get around this by adding support for multiple encodings: when +using Chinese, use Big5, when using Japanese, use Shift-JIS, when +using Greek, etc. Other times, they use character entities with great +zeal.

+ +

UTF-8, however, obviates the need for any of these complicated +measures. After getting the system to use UTF-8 and adjusting for +sources that are outside the hand of the browser (more on this later), +UTF-8 just works. You can use it for any language, even many languages +at once, you don't have to worry about managing multiple encodings, +you don't have to use those user-unfriendly entities.

+ +

User-friendly

+ +

Websites encoded in Latin-1 (ISO-8859-1) which occasionally need +a special character outside of their scope often will use a character +entity to achieve the desired effect. For instance, θ can be +written &theta;, regardless of the character encoding's +support of Greek letters.

+ +

This works nicely for limited use of special characters, but +say you wanted this sentence of Chinese text: 激光, +這兩個字是甚麼意思. +The entity-ized version would look like this:

+ +
&#28608;&#20809;, &#36889;&#20841;&#20491;&#23383;&#26159;&#29978;&#40636;&#24847;&#24605;
+ +

Extremely inconvenient for those of us who actually know what +character entities are, totally unintelligible to poor users who don't! +Even the slightly more user-friendly, "intelligible" character +entities like &theta; will leave users who are +uninterested in learning HTML scratching their heads. On the other +hand, if they see θ in an edit box, they'll know that it's a +special character, and treat it accordingly, even if they don't know +how to write that character themselves.

+ +

Wikipedia is a great case study for +an application that originally used ISO-8859-1 but switched to UTF-8 +when it became far too cumbersome to support foreign languages. Bots +will now actually go through articles and convert character entities +to their corresponding real characters for the sake of user-friendliness +and searchability. See +Meta's +page on special characters for more details. +

+ +

Forms

+ +

While we're on the tack of users, how do non-UTF-8 web forms deal +with characters that are outside of their character set? Rather than +discuss what UTF-8 does right, we're going to show what could go wrong +if you didn't use UTF-8 and people tried to use characters outside +of your character encoding.

+ +

The troubles are large, extensive, and extremely difficult to fix (or, +at least, difficult enough that if you had the time and resources to invest +in doing the fix, you would probably be better off migrating to UTF-8). +There are two types of form submission: application/x-www-form-urlencoded +which is used for GET and by default for POST, and multipart/form-data +which may be used by POST, and is required when you want to upload +files.

+ +

The following is a summarization of notes from + +FORM submission and i18n. That document contains lots +of useful information, but is written in a rambly manner, so +here I try to get right to the point.

+ +

application/x-www-form-urlencoded

+ +

This is the Content-Type that GET requests must use, and POST requests +use by default. It involves the ubiquitous percent encoding format that +looks something like: %C3%86. There is no official way of +determining the character encoding of such a request, since the percent +encoding operates on a byte level, so it is usually assumed that it +is the same as the encoding the page containing the form was submitted +in. You'll run into very few problems if you only use characters in +the character encoding you chose.

+ +

However, once you start adding characters outside of your encoding +(and this is a lot more common than you may think: take curly +"smart" quotes from Microsoft as an example), +all manner of strange things start to happen. Depending on the +browser you're using, they might:

+ + + +

To properly guard against these behaviors, you'd have to sniff out +the browser agent, compile a database of different behaviors, and +take appropriate conversion action against the string (disregarding +a spate of extremely mysterious, random and devastating bugs Internet +Explorer manifests every once in a while). Or you could +use UTF-8 and rest easy knowing that none of this could possibly happen +since UTF-8 supports every character.

+ +

multipart/form-data

+ +

Multipart form submission takes away a lot of the ambiguity +that percent-encoding had: the server now can explicitly ask for +certain encodings, and the client can explicitly tell the server +during the form submission what encoding the fields are in.

+ +

There are two ways you can go with this functionality: leave it +unset and have the browser send in the same encoding as the page, +or set it to UTF-8 and then do another conversion server-side. +Each method has deficiencies, especially the former.

+ +

If you tell the browser to send the form in the same encoding as +the page, you still have the trouble of what to do with characters +that are outside of the character encoding's range. The behavior, once +again, varies: Firefox 2.0 entity-izes them while Internet Explorer +7.0 mangles them beyond intelligibility. For serious I18N purposes, +this is not an option.

+ +

The other possibility is to set the form's accept-charset attribute to UTF-8, which +begs the question: Why aren't you using UTF-8 for everything then? +This route is more palatable, but there's a notable caveat: your data +will come in as UTF-8, so you will have to explicitly convert it into +your favored local character encoding.

+ +

I object to this approach on ideological grounds: you're +digging yourself deeper into +the hole when you could have been converting to UTF-8 +instead. And, of course, you can't use this method for GET requests.

+ +

Well supported

+ +

HTML Purifier

+ +

Migrate to UTF-8

+ +

Text editor

+ +

Configuring your database

+ +

Convert old text

+ +

Byte Order Mark (headers already sent!)

+ +

Dealing with variable width in functions

+ + + +

Many other developers have already discussed the subject of Unicode, +UTF-8 and internationalization, and I would like to defer to them for +a more in-depth look into character sets and encodings.

+ + + + + \ No newline at end of file diff --git a/docs/examples/basic.php b/docs/examples/basic.php index 244afc96..60258512 100644 --- a/docs/examples/basic.php +++ b/docs/examples/basic.php @@ -1,15 +1,14 @@ -Simple and short'; $pure_html = $purifier->purify($html); +echo $pure_html; + ?> \ No newline at end of file diff --git a/docs/index.html b/docs/index.html index 12d839db..5179205a 100644 --- a/docs/index.html +++ b/docs/index.html @@ -28,6 +28,9 @@ information for casual developers using HTML Purifier.

Embedding YouTube videos
Explains how to safely allow the embedding of flash from trusted sites.
+
Speeding up HTML Purifier
+
Explains how to speed up HTML Purifier through caching or inbound filtering.
+

Development

diff --git a/docs/style.css b/docs/style.css index bc7e85a4..f60b333c 100644 --- a/docs/style.css +++ b/docs/style.css @@ -23,6 +23,8 @@ h4 {font-family:sans-serif; font-size:0.9em; font-weight:bold; } /* Marks off asides, discussions on why something is the way it is */ .aside {margin-left:2em; font-family:sans-serif; font-size:0.9em; } +blockquote .label {font-weight:bold; font-size:1em; margin:0 0 .1em; + border-bottom:1px solid #CCC;} /* A regular table */ .table {border-collapse:collapse; border-bottom:2px solid #888; margin-left:2em; } @@ -37,4 +39,4 @@ h4 {font-family:sans-serif; font-size:0.9em; font-weight:bold; } #index {font-size:smaller; } /* Contains, without exception, $Id$, for SVN version info. */ -#version {text-align:right; font-style:italic; margin:2em 0;} \ No newline at end of file +#version {text-align:right; font-style:italic; margin:2em 0;} diff --git a/library/HTMLPurifier.func.php b/library/HTMLPurifier.func.php index 50e7c9b9..876ad7b2 100644 --- a/library/HTMLPurifier.func.php +++ b/library/HTMLPurifier.func.php @@ -6,12 +6,12 @@ * this is efficient for instances when you only use HTML Purifier * on a few of your pages, it murders bytecode caching. You still * need to add HTML Purifier to your path. + * @note ''HTMLPurifier()'' is NOT the same as ''new HTMLPurifier()'' */ function HTMLPurifier($html, $config = null) { static $purifier = false; if (!$purifier) { - $init = true; require_once 'HTMLPurifier.php'; $purifier = new HTMLPurifier(); } diff --git a/library/HTMLPurifier.php b/library/HTMLPurifier.php index 88ced00f..c758b439 100644 --- a/library/HTMLPurifier.php +++ b/library/HTMLPurifier.php @@ -109,7 +109,7 @@ class HTMLPurifier $config = $config ? 
HTMLPurifier_Config::create($config) : $this->config; - $context =& new HTMLPurifier_Context(); + $context = new HTMLPurifier_Context(); $html = $this->encoder->convertToUTF8($html, $config, $context); // purified HTML diff --git a/library/HTMLPurifier/AttrDef/CSS.php b/library/HTMLPurifier/AttrDef/CSS.php index 404c7000..220ec0d0 100644 --- a/library/HTMLPurifier/AttrDef/CSS.php +++ b/library/HTMLPurifier/AttrDef/CSS.php @@ -8,6 +8,11 @@ require_once 'HTMLPurifier/CSSDefinition.php'; * @note We don't implement the whole CSS specification, so it might be * difficult to reuse this component in the context of validating * actual stylesheet declarations. + * @note If we were really serious about validating the CSS, we would + * tokenize the styles and then parse the tokens. Obviously, we + * are not doing that. Doing that could seriously harm performance, + * but would make these components a lot more viable for a CSS + * filtering solution. */ class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef { @@ -20,6 +25,9 @@ class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef // we're going to break the spec and explode by semicolons. 
// This is because semicolon rarely appears in escaped form + // Doing this is generally flaky but fast + // IT MIGHT APPEAR IN URIs, see HTMLPurifier_AttrDef_CSSURI + // for details $declarations = explode(';', $css); $propvalues = array(); diff --git a/library/HTMLPurifier/AttrDef/CSSURI.php b/library/HTMLPurifier/AttrDef/CSSURI.php new file mode 100644 index 00000000..a92b6263 --- /dev/null +++ b/library/HTMLPurifier/AttrDef/CSSURI.php @@ -0,0 +1,58 @@ +HTMLPurifier_AttrDef_URI(true); // always embedded + } + + function validate($uri_string, $config, &$context) { + // parse the URI out of the string and then pass it onto + // the parent object + + $uri_string = $this->parseCDATA($uri_string); + if (strpos($uri_string, 'url(') !== 0) return false; + $uri_string = substr($uri_string, 4); + $new_length = strlen($uri_string) - 1; + if ($uri_string[$new_length] != ')') return false; + $uri = trim(substr($uri_string, 0, $new_length)); + + if (isset($uri[0]) && ($uri[0] == "'" || $uri[0] == '"')) { + $quote = $uri[0]; + $new_length = strlen($uri) - 1; + if ($uri[$new_length] !== $quote) return false; + $uri = substr($uri, 1, $new_length - 1); + } + + $keys = array( '(', ')', ',', ' ', '"', "'"); + $values = array('\\(', '\\)', '\\,', '\\ ', '\\"', "\\'"); + $uri = str_replace($values, $keys, $uri); + + $result = parent::validate($uri, $config, $context); + + if ($result === false) return false; + + // escape necessary characters according to CSS spec + // except for the comma, none of these should appear in the + // URI at all + $result = str_replace($keys, $values, $result); + + return "url($result)"; + + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/AttrDef/ListStyle.php b/library/HTMLPurifier/AttrDef/ListStyle.php index a2df527a..b866798c 100644 --- a/library/HTMLPurifier/AttrDef/ListStyle.php +++ b/library/HTMLPurifier/AttrDef/ListStyle.php @@ -4,8 +4,7 @@ require_once 'HTMLPurifier/AttrDef.php'; /** * Validates shorthand CSS property 
list-style. - * @note This currently does not support list-style-image, as that functionality - * is not implemented yet elsewhere. + * @warning Does not support url tokens that have internal spaces. */ class HTMLPurifier_AttrDef_ListStyle extends HTMLPurifier_AttrDef { @@ -20,6 +19,7 @@ class HTMLPurifier_AttrDef_ListStyle extends HTMLPurifier_AttrDef $def = $config->getCSSDefinition(); $this->info['list-style-type'] = $def->info['list-style-type']; $this->info['list-style-position'] = $def->info['list-style-position']; + $this->info['list-style-image'] = $def->info['list-style-image']; } function validate($string, $config, &$context) { @@ -28,48 +28,49 @@ class HTMLPurifier_AttrDef_ListStyle extends HTMLPurifier_AttrDef $string = $this->parseCDATA($string); if ($string === '') return false; + // assumes URI doesn't have spaces in it $bits = explode(' ', strtolower($string)); // bits to process - $caught_type = false; - $caught_position = false; - $caught_none = false; // as in keyword none, which is in all of them + $caught = array(); + $caught['type'] = false; + $caught['position'] = false; + $caught['image'] = false; - $ret = ''; + $i = 0; // number of catches + $none = false; foreach ($bits as $bit) { - if ($caught_none && ($caught_type || $caught_position)) break; - if ($caught_type && $caught_position) break; - + if ($i >= 3) return; // optimization bit if ($bit === '') continue; - - if ($bit === 'none') { - if ($caught_none) continue; - $caught_none = true; - $ret .= 'none '; - continue; - } - - // if we add anymore, roll it into a loop - - $r = $this->info['list-style-type']->validate($bit, $config, $context); - if ($r !== false) { - if ($caught_type) continue; - $caught_type = true; - $ret .= $r . ' '; - continue; - } - - $r = $this->info['list-style-position']->validate($bit, $config, $context); - if ($r !== false) { - if ($caught_position) continue; - $caught_position = true; - $ret .= $r . 
' '; - continue; + foreach ($caught as $key => $status) { + if ($status !== false) continue; + $r = $this->info['list-style-' . $key]->validate($bit, $config, $context); + if ($r === false) continue; + if ($r === 'none') { + if ($none) continue; + else $none = true; + if ($key == 'image') continue; + } + $caught[$key] = $r; + $i++; } } - $ret = rtrim($ret); - return $ret ? $ret : false; + if (!$i) return false; + + $ret = array(); + + // construct type + if ($caught['type']) $ret[] = $caught['type']; + + // construct image + if ($caught['image']) $ret[] = $caught['image']; + + // construct position + if ($caught['position']) $ret[] = $caught['position']; + + if (empty($ret)) return false; + return implode(' ', $ret); } diff --git a/library/HTMLPurifier/AttrDef/URI.php b/library/HTMLPurifier/AttrDef/URI.php index d5a36434..a3ce6ded 100644 --- a/library/HTMLPurifier/AttrDef/URI.php +++ b/library/HTMLPurifier/AttrDef/URI.php @@ -139,10 +139,10 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef // no need to validate the scheme's fmt since we do that when we // retrieve the specific scheme object from the registry $scheme = ctype_lower($scheme) ? 
$scheme : strtolower($scheme); - $scheme_obj =& $registry->getScheme($scheme, $config, $context); + $scheme_obj = $registry->getScheme($scheme, $config, $context); if (!$scheme_obj) return false; // invalid scheme, clean it out } else { - $scheme_obj =& $registry->getScheme( + $scheme_obj = $registry->getScheme( $config->get('URI', 'DefaultScheme'), $config, $context ); } diff --git a/library/HTMLPurifier/AttrTransform/BdoDir.php b/library/HTMLPurifier/AttrTransform/BdoDir.php index dd20f26a..0ea5eb6d 100644 --- a/library/HTMLPurifier/AttrTransform/BdoDir.php +++ b/library/HTMLPurifier/AttrTransform/BdoDir.php @@ -20,7 +20,7 @@ HTMLPurifier_ConfigSchema::defineAllowedValues( class HTMLPurifier_AttrTransform_BdoDir extends HTMLPurifier_AttrTransform { - function transform($attr, $config, $context) { + function transform($attr, $config, &$context) { if (isset($attr['dir'])) return $attr; $attr['dir'] = $config->get('Attr', 'DefaultTextDir'); return $attr; diff --git a/library/HTMLPurifier/AttrTransform/ImgRequired.php b/library/HTMLPurifier/AttrTransform/ImgRequired.php index c943d696..4ff356d8 100644 --- a/library/HTMLPurifier/AttrTransform/ImgRequired.php +++ b/library/HTMLPurifier/AttrTransform/ImgRequired.php @@ -25,7 +25,7 @@ HTMLPurifier_ConfigSchema::define( class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform { - function transform($attr, $config, $context) { + function transform($attr, $config, &$context) { $src = true; if (!isset($attr['src'])) { diff --git a/library/HTMLPurifier/AttrTransform/Lang.php b/library/HTMLPurifier/AttrTransform/Lang.php index 97fd8064..acb1786a 100644 --- a/library/HTMLPurifier/AttrTransform/Lang.php +++ b/library/HTMLPurifier/AttrTransform/Lang.php @@ -10,7 +10,7 @@ require_once 'HTMLPurifier/AttrTransform.php'; class HTMLPurifier_AttrTransform_Lang extends HTMLPurifier_AttrTransform { - function transform($attr, $config, $context) { + function transform($attr, $config, &$context) { $lang = 
isset($attr['lang']) ? $attr['lang'] : false; $xml_lang = isset($attr['xml:lang']) ? $attr['xml:lang'] : false; diff --git a/library/HTMLPurifier/AttrTransform/TextAlign.php b/library/HTMLPurifier/AttrTransform/TextAlign.php index e42354a0..84e5a016 100644 --- a/library/HTMLPurifier/AttrTransform/TextAlign.php +++ b/library/HTMLPurifier/AttrTransform/TextAlign.php @@ -8,7 +8,7 @@ require_once 'HTMLPurifier/AttrTransform.php'; class HTMLPurifier_AttrTransform_TextAlign extends HTMLPurifier_AttrTransform { - function transform($attr, $config, $context) { + function transform($attr, $config, &$context) { if (!isset($attr['align'])) return $attr; diff --git a/library/HTMLPurifier/CSSDefinition.php b/library/HTMLPurifier/CSSDefinition.php index 5647c4b7..d2227e03 100644 --- a/library/HTMLPurifier/CSSDefinition.php +++ b/library/HTMLPurifier/CSSDefinition.php @@ -11,6 +11,7 @@ require_once 'HTMLPurifier/AttrDef/FontFamily.php'; require_once 'HTMLPurifier/AttrDef/Font.php'; require_once 'HTMLPurifier/AttrDef/Border.php'; require_once 'HTMLPurifier/AttrDef/ListStyle.php'; +require_once 'HTMLPurifier/AttrDef/CSSURI.php'; /** * Defines allowed CSS attributes and what their values are. 
@@ -51,11 +52,19 @@ class HTMLPurifier_CSSDefinition $this->info['font-variant'] = new HTMLPurifier_AttrDef_Enum( array('normal', 'small-caps'), false); + $uri_or_none = new HTMLPurifier_AttrDef_Composite( + array( + new HTMLPurifier_AttrDef_Enum(array('none')), + new HTMLPurifier_AttrDef_CSSURI() + ) + ); + $this->info['list-style-position'] = new HTMLPurifier_AttrDef_Enum( array('inside', 'outside'), false); $this->info['list-style-type'] = new HTMLPurifier_AttrDef_Enum( array('disc', 'circle', 'square', 'decimal', 'lower-roman', - 'upper-roman', 'lower-alpha', 'upper-alpha'), false); + 'upper-roman', 'lower-alpha', 'upper-alpha', 'none'), false); + $this->info['list-style-image'] = $uri_or_none; $this->info['list-style'] = new HTMLPurifier_AttrDef_ListStyle($config); @@ -63,13 +72,15 @@ class HTMLPurifier_CSSDefinition array('capitalize', 'uppercase', 'lowercase', 'none'), false); $this->info['color'] = new HTMLPurifier_AttrDef_Color(); - // technically speaking, this one should get its own validator, but - // since we don't support background images, it effectively is - // equivalent to color. The only trouble is that if the author - // specifies an image and a color, they'll both end up getting dropped, - // even though we ought to implement it and just discard the image - // info. This will be fixed in a later version (see TODO) when - // better URI filtering is implemented. 
+ $this->info['background-image'] = $uri_or_none; + $this->info['background-repeat'] = new HTMLPurifier_AttrDef_Enum( + array('repeat', 'repeat-x', 'repeat-y', 'no-repeat') + ); + $this->info['background-attachment'] = new HTMLPurifier_AttrDef_Enum( + array('scroll', 'fixed') + ); + + // pending its own validator as a shorthand $this->info['background'] = $border_color = diff --git a/library/HTMLPurifier/Config.php b/library/HTMLPurifier/Config.php index 39f62855..91b8d0ab 100644 --- a/library/HTMLPurifier/Config.php +++ b/library/HTMLPurifier/Config.php @@ -51,8 +51,8 @@ class HTMLPurifier_Config * an array of directives based on loadArray(). * @return Configured HTMLPurifier_Config object */ - function create($config) { - if (is_a($config, 'HTMLPurifier_Config')) return $config; + static function create($config) { + if ($config instanceof HTMLPurifier_Config) return $config; $ret = HTMLPurifier_Config::createDefault(); if (is_array($config)) $ret->loadArray($config); return $ret; @@ -62,7 +62,7 @@ class HTMLPurifier_Config * Convenience constructor that creates a default configuration object. * @return Default HTMLPurifier_Config object. */ - function createDefault() { + static function createDefault() { $definition =& HTMLPurifier_ConfigSchema::instance(); $config = new HTMLPurifier_Config($definition); return $config; diff --git a/library/HTMLPurifier/ConfigSchema.php b/library/HTMLPurifier/ConfigSchema.php index 9b3c6e2f..5daeebee 100644 --- a/library/HTMLPurifier/ConfigSchema.php +++ b/library/HTMLPurifier/ConfigSchema.php @@ -68,7 +68,7 @@ class HTMLPurifier_ConfigSchema { /** * Retrieves an instance of the application-wide configuration definition. 
*/ - function &instance($prototype = null) { + static function &instance($prototype = null) { static $instance; if ($prototype !== null) { $instance = $prototype; @@ -89,7 +89,7 @@ class HTMLPurifier_ConfigSchema { * HTMLPurifier_DirectiveDef::$type for allowed values * @param $description Description of directive for documentation */ - function define( + static function define( $namespace, $name, $default, $type, $description ) { @@ -147,7 +147,7 @@ class HTMLPurifier_ConfigSchema { * @param $namespace Namespace's name * @param $description Description of the namespace */ - function defineNamespace($namespace, $description) { + static function defineNamespace($namespace, $description) { $def =& HTMLPurifier_ConfigSchema::instance(); if (isset($def->info[$namespace])) { trigger_error('Cannot redefine namespace', E_USER_ERROR); @@ -174,7 +174,7 @@ class HTMLPurifier_ConfigSchema { * @param $alias Name of aliased value * @param $real Value aliased value will be converted into */ - function defineValueAliases($namespace, $name, $aliases) { + static function defineValueAliases($namespace, $name, $aliases) { $def =& HTMLPurifier_ConfigSchema::instance(); if (!isset($def->info[$namespace][$name])) { trigger_error('Cannot set value alias for non-existant directive', @@ -204,7 +204,7 @@ class HTMLPurifier_ConfigSchema { * @param $name Name of directive * @param $allowed_values Arraylist of allowed values */ - function defineAllowedValues($namespace, $name, $allowed_values) { + static function defineAllowedValues($namespace, $name, $allowed_values) { $def =& HTMLPurifier_ConfigSchema::instance(); if (!isset($def->info[$namespace][$name])) { trigger_error('Cannot define allowed values for undefined directive', @@ -305,7 +305,7 @@ class HTMLPurifier_ConfigSchema { */ function isError($var) { if (!is_object($var)) return false; - if (!is_a($var, 'HTMLPurifier_Error')) return false; + if (!($var instanceof HTMLPurifier_Error)) return false; return true; } } diff --git 
a/library/HTMLPurifier/Encoder.php b/library/HTMLPurifier/Encoder.php index 8465c709..d39a05d0 100644 --- a/library/HTMLPurifier/Encoder.php +++ b/library/HTMLPurifier/Encoder.php @@ -67,7 +67,7 @@ class HTMLPurifier_Encoder * would need that, and I'm probably not going to implement them. * Once again, PHP 6 should solve all our problems. */ - function cleanUTF8($str, $force_php = false) { + static function cleanUTF8($str, $force_php = false) { static $non_sgml_chars = array(); if (empty($non_sgml_chars)) { @@ -249,7 +249,7 @@ class HTMLPurifier_Encoder // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes // +----------+----------+----------+----------+ - function unichr($code) { + static function unichr($code) { if($code > 1114111 or $code < 0 or ($code >= 55296 and $code <= 57343) ) { // bits are set outside the "valid" range as defined diff --git a/library/HTMLPurifier/EntityLookup.php b/library/HTMLPurifier/EntityLookup.php index 9816f865..2d11aed4 100644 --- a/library/HTMLPurifier/EntityLookup.php +++ b/library/HTMLPurifier/EntityLookup.php @@ -28,7 +28,7 @@ class HTMLPurifier_EntityLookup { * Retrieves sole instance of the object. * @param Optional prototype of custom lookup table to overload with. 
*/ - function instance($prototype = false) { + static function instance($prototype = false) { // no references, since PHP doesn't copy unless modified static $instance = null; if ($prototype) { diff --git a/library/HTMLPurifier/HTMLDefinition.php b/library/HTMLPurifier/HTMLDefinition.php index 7ca086a9..16f2b724 100644 --- a/library/HTMLPurifier/HTMLDefinition.php +++ b/library/HTMLPurifier/HTMLDefinition.php @@ -300,9 +300,6 @@ class HTMLPurifier_HTMLDefinition $this->info['b']->child = $this->info['big']->child = $this->info['small']->child= - $this->info['u']->child = - $this->info['s']->child = - $this->info['strike']->child = $this->info['bdo']->child = $this->info['span']->child = $this->info['dt']->child = @@ -314,6 +311,12 @@ class HTMLPurifier_HTMLDefinition $this->info['h5']->child = $this->info['h6']->child = $e_Inline; + if (!$this->strict) { + $this->info['u']->child = + $this->info['s']->child = + $this->info['strike']->child = $e_Inline; + } + // the only three required definitions, besides custom table code $this->info['ol']->child = $this->info['ul']->child = new HTMLPurifier_ChildDef_Required('li'); @@ -355,10 +358,12 @@ class HTMLPurifier_HTMLDefinition // reuses $e_Inline and $e_Block foreach ($e_Inline->elements as $name => $bool) { if ($name == '#PCDATA') continue; + if (!isset($this->info[$name])) continue; $this->info[$name]->type = 'inline'; } foreach ($e_Block->elements as $name => $bool) { + if (!isset($this->info[$name])) continue; $this->info[$name]->type = 'block'; } @@ -531,7 +536,7 @@ class HTMLPurifier_HTMLDefinition // protect against stdclasses floating around foreach ($this->info as $key => $obj) { - if (is_a($obj, 'stdclass')) { + if ($obj instanceof stdClass) { unset($this->info[$key]); } } diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php index 31d31d27..0232249e 100644 --- a/library/HTMLPurifier/Lexer.php +++ b/library/HTMLPurifier/Lexer.php @@ -145,7 +145,7 @@ class HTMLPurifier_Lexer * @param 
$prototype Optional prototype lexer. * @return Concrete lexer. */ - function create($prototype = null) { + static function create($prototype = null) { // we don't really care if it's a reference or a copy static $lexer = null; if ($prototype) { @@ -170,7 +170,7 @@ class HTMLPurifier_Lexer * @param $string HTML string to process. * @returns HTML with CDATA sections escaped. */ - function escapeCDATA($string) { + static function escapeCDATA($string) { return preg_replace_callback( '//', array('HTMLPurifier_Lexer', 'CDATACallback'), @@ -187,7 +187,7 @@ class HTMLPurifier_Lexer * and 1 the inside of the CDATA section. * @returns Escaped internals of the CDATA section. */ - function CDATACallback($matches) { + static function CDATACallback($matches) { // not exactly sure why the character set is needed, but whatever return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8'); } diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php index 57720457..dcf3caee 100644 --- a/library/HTMLPurifier/Lexer/DOMLex.php +++ b/library/HTMLPurifier/Lexer/DOMLex.php @@ -88,6 +88,11 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer } elseif ($node->nodeType === XML_COMMENT_NODE) { $tokens[] = $this->factory->createComment($node->data); return; + } elseif ( + // not-well tested: there may be other nodes we have to grab + $node->nodeType !== XML_ELEMENT_NODE + ) { + return; } $attr = $node->hasAttributes() ? 
diff --git a/library/HTMLPurifier/Lexer/PEARSax3.php b/library/HTMLPurifier/Lexer/PEARSax3.php index ccd201a7..01828e12 100644 --- a/library/HTMLPurifier/Lexer/PEARSax3.php +++ b/library/HTMLPurifier/Lexer/PEARSax3.php @@ -37,7 +37,7 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer $string = $this->normalize($string, $config, $context); - $parser=& new XML_HTMLSax3(); + $parser= new XML_HTMLSax3(); $parser->set_object($this); $parser->set_element_handler('openHandler','closeHandler'); $parser->set_data_handler('dataHandler'); diff --git a/library/HTMLPurifier/Printer/HTMLDefinition.php b/library/HTMLPurifier/Printer/HTMLDefinition.php index c85585fc..2ec297e7 100644 --- a/library/HTMLPurifier/Printer/HTMLDefinition.php +++ b/library/HTMLPurifier/Printer/HTMLDefinition.php @@ -10,10 +10,10 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer */ var $def; - function render(&$config) { + function render($config) { $ret = ''; $this->config =& $config; - $this->def =& $config->getHTMLDefinition(); + $this->def = $config->getHTMLDefinition(); $def =& $this->def; $ret .= $this->start('div', array('class' => 'HTMLPurifier_Printer')); diff --git a/library/HTMLPurifier/URISchemeRegistry.php b/library/HTMLPurifier/URISchemeRegistry.php index d9c25259..34bcc0af 100644 --- a/library/HTMLPurifier/URISchemeRegistry.php +++ b/library/HTMLPurifier/URISchemeRegistry.php @@ -37,7 +37,7 @@ class HTMLPurifier_URISchemeRegistry * @note Pass a registry object $prototype with a compatible interface and * the function will copy it and return it all further times. 
*/ - function &instance($prototype = null) { + static function &instance($prototype = null) { static $instance = null; if ($prototype !== null) { $instance = $prototype; diff --git a/smoketests/common.php b/smoketests/common.php index 13cc6e59..0cba96f6 100644 --- a/smoketests/common.php +++ b/smoketests/common.php @@ -3,6 +3,7 @@ header('Content-type: text/html; charset=UTF-8'); require_once '../library/HTMLPurifier.auto.php'; +error_reporting(E_ALL | E_STRICT); function escapeHTML($string) { $string = HTMLPurifier_Encoder::cleanUTF8($string); diff --git a/smoketests/printDefinition.php b/smoketests/printDefinition.php index 91c15d4c..17ed8bee 100644 --- a/smoketests/printDefinition.php +++ b/smoketests/printDefinition.php @@ -54,11 +54,15 @@ echo ''; +

HTML Purifier Printer Smoketest

-

This page will allow you to see precisely what HTML Purifier's internal + +

HTML Purifier claims to have a robust yet permissive whitelist: this +page will allow you to see precisely what HTML Purifier's internal whitelist is. You can also twiddle with the configuration settings to see how a directive influences the internal workings of the definition objects.

+

Modify configuration

You can specify an array by typing in a comma-separated diff --git a/smoketests/utf8.php b/smoketests/utf8.php index e5e57857..2d23330b 100644 --- a/smoketests/utf8.php +++ b/smoketests/utf8.php @@ -1,5 +1,7 @@ '; diff --git a/smoketests/xssAttacks.xml b/smoketests/xssAttacks.xml index dd8a5feb..5b833f8d 100644 --- a/smoketests/xssAttacks.xml +++ b/smoketests/xssAttacks.xml @@ -978,8 +978,6 @@ alert(a.source)</SCRIPT> -onErrorUpdate() (fires on a databound object when an error occurs while updating the associated data in the data source object) --onExit() (fires when someone clicks on a link or presses the back button) - -onFilterChange() (fires when a visual filter completes state change) -onFinish() (attacker could create the exploit when marquee is finished looping) diff --git a/tests/Debugger.php b/tests/Debugger.php index 3213af3c..0bde21bb 100644 --- a/tests/Debugger.php +++ b/tests/Debugger.php @@ -70,7 +70,7 @@ class Debugger $this->add_pre = !extension_loaded('xdebug'); } - function &instance() { + static function &instance() { static $soleInstance = false; if (!$soleInstance) $soleInstance = new Debugger(); return $soleInstance; diff --git a/tests/HTMLPurifier/AttrDef/CSSTest.php b/tests/HTMLPurifier/AttrDef/CSSTest.php index 7afa2172..cb5e8083 100644 --- a/tests/HTMLPurifier/AttrDef/CSSTest.php +++ b/tests/HTMLPurifier/AttrDef/CSSTest.php @@ -1,6 +1,7 @@ assertDef('vertical-align:12px;'); $this->assertDef('vertical-align:50%;'); $this->assertDef('table-layout:fixed;'); + $this->assertDef('list-style-image:url(nice.jpg);'); + $this->assertDef('list-style:disc url(nice.jpg) inside;'); + $this->assertDef('background-image:url(foo.jpg);'); + $this->assertDef('background-image:none;'); + $this->assertDef('background-repeat:repeat-y;'); + $this->assertDef('background-attachment:fixed;'); // duplicates $this->assertDef('text-align:right;text-align:left;', diff --git a/tests/HTMLPurifier/AttrDef/CSSURITest.php b/tests/HTMLPurifier/AttrDef/CSSURITest.php new file 
mode 100644 index 00000000..1fe1a3dc --- /dev/null +++ b/tests/HTMLPurifier/AttrDef/CSSURITest.php @@ -0,0 +1,37 @@ +def = new HTMLPurifier_AttrDef_CSSURI(); + + $this->assertDef('', false); + + // we could be nice but we won't be + $this->assertDef('http://www.example.com/', false); + + // no quotes are used, since that's the most widely supported + // syntax + $this->assertDef('url(', false); + $this->assertDef('url()', true); + $result = "url(http://www.example.com/)"; + $this->assertDef('url(http://www.example.com/)', $result); + $this->assertDef('url("http://www.example.com/")', $result); + $this->assertDef("url('http://www.example.com/')", $result); + $this->assertDef( + ' url( "http://www.example.com/" ) ', $result); + + // escaping + $this->assertDef("url(http://www.example.com/foo,bar\))", + "url(http://www.example.com/foo\,bar\))"); + + } + +} + +?> \ No newline at end of file diff --git a/tests/HTMLPurifier/AttrDef/CompositeTest.php b/tests/HTMLPurifier/AttrDef/CompositeTest.php index 9c49a289..a61db20c 100644 --- a/tests/HTMLPurifier/AttrDef/CompositeTest.php +++ b/tests/HTMLPurifier/AttrDef/CompositeTest.php @@ -28,10 +28,10 @@ class HTMLPurifier_AttrDef_CompositeTest extends HTMLPurifier_AttrDefHarness // first test: value properly validates on first definition // so second def is never called - $def1 =& new HTMLPurifier_AttrDefMock($this); - $def2 =& new HTMLPurifier_AttrDefMock($this); + $def1 = new HTMLPurifier_AttrDefMock($this); + $def2 = new HTMLPurifier_AttrDefMock($this); $defs = array(&$def1, &$def2); - $def =& new HTMLPurifier_AttrDef_Composite_Testable($defs); + $def = new HTMLPurifier_AttrDef_Composite_Testable($defs); $input = 'FOOBAR'; $output = 'foobar'; $def1_params = array($input, $config, $context); @@ -47,10 +47,10 @@ class HTMLPurifier_AttrDef_CompositeTest extends HTMLPurifier_AttrDefHarness // second test, first def fails, second def works - $def1 =& new HTMLPurifier_AttrDefMock($this); - $def2 =& new 
HTMLPurifier_AttrDefMock($this); + $def1 = new HTMLPurifier_AttrDefMock($this); + $def2 = new HTMLPurifier_AttrDefMock($this); $defs = array(&$def1, &$def2); - $def =& new HTMLPurifier_AttrDef_Composite_Testable($defs); + $def = new HTMLPurifier_AttrDef_Composite_Testable($defs); $input = 'BOOMA'; $output = 'booma'; $def_params = array($input, $config, $context); @@ -67,10 +67,10 @@ class HTMLPurifier_AttrDef_CompositeTest extends HTMLPurifier_AttrDefHarness // third test, all fail, so composite faiils - $def1 =& new HTMLPurifier_AttrDefMock($this); - $def2 =& new HTMLPurifier_AttrDefMock($this); + $def1 = new HTMLPurifier_AttrDefMock($this); + $def2 = new HTMLPurifier_AttrDefMock($this); $defs = array(&$def1, &$def2); - $def =& new HTMLPurifier_AttrDef_Composite_Testable($defs); + $def = new HTMLPurifier_AttrDef_Composite_Testable($defs); $input = 'BOOMA'; $output = false; $def_params = array($input, $config, $context); diff --git a/tests/HTMLPurifier/AttrDef/ListStyleTest.php b/tests/HTMLPurifier/AttrDef/ListStyleTest.php index a12080f8..95ef9444 100644 --- a/tests/HTMLPurifier/AttrDef/ListStyleTest.php +++ b/tests/HTMLPurifier/AttrDef/ListStyleTest.php @@ -15,9 +15,20 @@ class HTMLPurifier_AttrDef_ListStyleTest extends HTMLPurifier_AttrDefHarness $this->assertDef('circle outside'); $this->assertDef('inside'); $this->assertDef('none'); + $this->assertDef('url(foo.gif)'); + $this->assertDef('circle url(foo.gif) inside'); + // invalid values $this->assertDef('outside inside', 'outside'); + + // ordering + $this->assertDef('url(foo.gif) none', 'none url(foo.gif)'); $this->assertDef('circle lower-alpha', 'circle'); + // the spec is ambiguous about what happens in these + // cases, so we're going off the W3C CSS validator + $this->assertDef('disc none', 'disc'); + $this->assertDef('none disc', 'none'); + } diff --git a/tests/HTMLPurifier/AttrDef/URITest.php b/tests/HTMLPurifier/AttrDef/URITest.php index a80c436f..f9a9ab41 100644 --- 
a/tests/HTMLPurifier/AttrDef/URITest.php +++ b/tests/HTMLPurifier/AttrDef/URITest.php @@ -206,7 +206,7 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness $registry =& HTMLPurifier_URISchemeRegistry::instance($fake_registry); // now, let's add a pseudo-scheme to the registry - $this->scheme =& new HTMLPurifier_URISchemeMock($this); + $this->scheme = new HTMLPurifier_URISchemeMock($this); // here are the schemes we will support with overloaded mocks $registry->setReturnReference('getScheme', $this->scheme, array('http', $this->config, $this->context)); diff --git a/tests/HTMLPurifier/ContextTest.php b/tests/HTMLPurifier/ContextTest.php index 68604d5c..88c0f615 100644 --- a/tests/HTMLPurifier/ContextTest.php +++ b/tests/HTMLPurifier/ContextTest.php @@ -20,7 +20,7 @@ class HTMLPurifier_ContextTest extends UnitTestCase $this->assertFalse($this->context->exists('IDAccumulator')); - $accumulator =& new HTMLPurifier_IDAccumulatorMock($this); + $accumulator = new HTMLPurifier_IDAccumulatorMock($this); $this->context->register('IDAccumulator', $accumulator); $this->assertTrue($this->context->exists('IDAccumulator')); diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php index a690466b..26875181 100644 --- a/tests/HTMLPurifier/LexerTest.php +++ b/tests/HTMLPurifier/LexerTest.php @@ -16,7 +16,9 @@ class HTMLPurifier_LexerTest extends UnitTestCase $this->DirectLex = new HTMLPurifier_Lexer_DirectLex(); - if ( $GLOBALS['HTMLPurifierTest']['PEAR'] ) { + if ( $GLOBALS['HTMLPurifierTest']['PEAR'] && + ((error_reporting() & E_STRICT) != E_STRICT) + ) { $this->_has_pear = true; require_once 'HTMLPurifier/Lexer/PEARSax3.php'; $this->PEARSax3 = new HTMLPurifier_Lexer_PEARSax3(); diff --git a/tests/index.php b/tests/index.php index 92c845fe..3f9775aa 100644 --- a/tests/index.php +++ b/tests/index.php @@ -1,6 +1,6 @@