diff --git a/Doxyfile b/Doxyfile index 4ef1d4b4..c906929e 100644 --- a/Doxyfile +++ b/Doxyfile @@ -4,7 +4,7 @@ # Project related configuration options #--------------------------------------------------------------------------- PROJECT_NAME = HTML Purifier -PROJECT_NUMBER = 1.4.1 +PROJECT_NUMBER = 1.5.0 OUTPUT_DIRECTORY = "C:/Documents and Settings/Edward/My Documents/My Webs/htmlpurifier/docs/doxygen" CREATE_SUBDIRS = NO OUTPUT_LANGUAGE = English diff --git a/NEWS b/NEWS index 69e713c6..9bd45a99 100644 --- a/NEWS +++ b/NEWS @@ -9,6 +9,36 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier . Internal change ========================== +1.5.0, released 2007-03-23 +! Added a rudimentary I18N and L10N system modeled off MediaWiki. It + doesn't actually do anything yet, but keep your eyes peeled. +! docs/enduser-utf8.html explains how to use UTF-8 and HTML Purifier +! Newly structured HTMLDefinition modeled off of XHTML 1.1 modules. + I am loathe to release beta quality APIs, but this is exactly that; + don't use the internal interfaces if you're not willing to do migration + later on. +- Allow 'x' subtag in language codes +- Fixed buggy chameleon-support for ins and del +. Added support for IDREF attributes (i.e. for) +. Renamed HTMLPurifier_AttrDef_Class to HTMLPurifier_AttrDef_Nmtokens +. Removed context variable ParentType, replaced with IsInline, which + is false when you're not inline and an integer of the parent that + caused you to become inline when you are (so possibly zero) +. Removed ElementDef->type in favor of ElementDef->descendants_are_inline + and HTMLDefinition->content_sets +. StrictBlockquote now reports what elements its supposed to allow, + rather than what it does allow +. Removed HTMLDefinition->info_flow_elements in favor of + HTMLDefinition->content_sets['Flow'] +. Removed redundant "exclusionary" definitions from DTD roster +. 
StrictBlockquote now requires a construction parameter as if it + were an Required ChildDef, this is the "real" set of allowed elements +. AttrDef partitioned into HTML, CSS and URI segments +. Modify Youtube filter regexp to be multiline +. Require both PHP5 and DOM extension in order to use DOMLex, fixes + some edge cases where a DOMDocument class exists in a PHP4 environment + due to DOM XML extension. + 1.4.1, released 2007-01-21 ! docs/enduser-youtube.html updated according to new functionality - YouTube IDs can have underscores and dashes diff --git a/TODO b/TODO index df7b9184..436f4fd9 100644 --- a/TODO +++ b/TODO @@ -7,7 +7,7 @@ TODO List ? At-risk ========================== -1.5 release +1.6 release # Implement all non-essential attribute transforms, configurable # URI validation routines tighter (see docs/dev-code-quality.html) (COMPLEX) # Advanced URI filtering schemes (see docs/proposal-new-directives.txt) @@ -15,8 +15,9 @@ TODO List - Requires I18N facilities to be created first (COMPLEX) ? Configuration profiles: sets of directives that get set with one func call - XSS-attempt detection + - Implement IDREF support -1.6 release +1.7 release # Add pre-packaged "levels" of cleaning (custom behavior already done) - More fine-grained control over escaping behavior - Silently drop content inbetween SCRIPT tags (can be generalized to allow @@ -29,7 +30,7 @@ TODO List tag or attribute that is not supported - Parse TinyMCE whitelist into our %HTML.Allow* whitelists -1.7 release +1.8 release # Additional support for poorly written HTML - Microsoft Word HTML cleaning (i.e. MsoNormal, but research essential!) - Friendly strict handling of
(block ->
) @@ -76,7 +77,6 @@ Ongoing - more! (look for ones that use WYSIWYGs) Unknown release (on a scratch-an-itch basis) - - Upgrade SimpleTest testing code to newest versions - Have 'lang' attribute be checked against official lists ? Semi-lossy dumb alternate character encoding transformations, achieved by encoding all characters that have string entity equivalents diff --git a/benchmarks/Lexer.php b/benchmarks/Lexer.php index 9e13b54b..86df149b 100644 --- a/benchmarks/Lexer.php +++ b/benchmarks/Lexer.php @@ -7,6 +7,7 @@ set_include_path(get_include_path() . PATH_SEPARATOR . '../library/'); require_once 'HTMLPurifier/ConfigSchema.php'; require_once 'HTMLPurifier/Config.php'; +require_once 'HTMLPurifier/Context.php'; $LEXERS = array(); $RUNS = isset($GLOBALS['HTMLPurifierTest']['Runs']) @@ -93,11 +94,14 @@ function print_lexers() { function do_benchmark($name, $document) { global $LEXERS, $RUNS; + $config = HTMLPurifier_Config::createDefault(); + $context = new HTMLPurifier_Context(); + $timer = new RowTimer($name); $timer->start(); foreach($LEXERS as $key => $lexer) { - for ($i=0; $i<$RUNS; $i++) $tokens = $lexer->tokenizeHTML($document); + for ($i=0; $i<$RUNS; $i++) $tokens = $lexer->tokenizeHTML($document, $config, $context); $timer->setMarker($key); } diff --git a/benchmarks/ProfileDirectLex.php b/benchmarks/ProfileDirectLex.php index faf9bef5..20ff0159 100644 --- a/benchmarks/ProfileDirectLex.php +++ b/benchmarks/ProfileDirectLex.php @@ -5,12 +5,15 @@ set_include_path(get_include_path() . PATH_SEPARATOR . 
'../library/'); require_once 'HTMLPurifier/ConfigSchema.php'; require_once 'HTMLPurifier/Config.php'; require_once 'HTMLPurifier/Lexer/DirectLex.php'; +require_once 'HTMLPurifier/Context.php'; $input = file_get_contents('samples/Lexer/4.html'); $lexer = new HTMLPurifier_Lexer_DirectLex(); +$config = HTMLPurifier_Config::createDefault(); +$context = new HTMLPurifier_Context(); for ($i = 0; $i < 10; $i++) { - $tokens = $lexer->tokenizeHTML($input); + $tokens = $lexer->tokenizeHTML($input, $config, $context); } ?> \ No newline at end of file diff --git a/configdoc/generate.php b/configdoc/generate.php index 14335e98..d5966e2e 100644 --- a/configdoc/generate.php +++ b/configdoc/generate.php @@ -188,7 +188,7 @@ $xsl_processor->importStylesheet($xsl_dom_stylesheet); $html_output = $xsl_processor->transformToXML($dom_document); // some slight fudges to preserve backwards compatibility -$html_output = str_replace('/>', ' />', $html_output); //
not
+$html_output = str_replace('/>', ' />', $html_output); //
not
$html_output = str_replace(' xmlns=""', '', $html_output); // rm unnecessary xmlns if (class_exists('Tidy')) { diff --git a/docs/dev-advanced-api.html b/docs/dev-advanced-api.html new file mode 100644 index 00000000..731397f2 --- /dev/null +++ b/docs/dev-advanced-api.html @@ -0,0 +1,188 @@ + + + + + + + +Advanced API - HTML Purifier + + + +

Advanced API

+ +
Filed under Development
+
Return to the index.
+
HTML Purifier End-User Documentation
+ +

It makes no sense to adopt a one-size-fits-all approach to +filtersets: therefore, users must be able to define their own sets of +allowed elements, as well as switch in-between doctypes of HTML.

+ +

Our goals are to let the user:

+ +
+
Select
+
    +
  • Doctype
  • +
  • Filtersets: Rich / Plain / Full ...
  • +
  • Mode: Lenient / Correctional
  • +
  • Collections (?): Safe / Unsafe
  • +
  • Modules / Tags / Attributes
  • +
+
Customize
+
    +
  • Tags / Attributes / Attribute Types
  • +
  • Filtersets
  • +
  • Root Node
  • +
+
Create
+
    +
  • Modules / Tags / Attributes / Attribute Types
  • +
  • Filtersets
  • +
  • Doctype
  • +
+
+ +

Select

+ +

Selecting a Doctype

+ +

By default, users will use a doctype-based, permissive but secure +whitelist. They must define a doctype, and this serves +as the first method of determining a filterset.

+ +

This identifier is based +on the name the W3C has given to the document type and not +the DTD identifier.

+ +

This parameter is set via the configuration object:

+ +
$config->set('HTML', 'Doctype', 'XHTML 1.0 Transitional');
+ +

Selecting a Filterset

+ +

However, selecting this doctype doesn't mean much, because if we +adhered exactly to the definition we would be letting XSS and other +nasties through. HTML Purifier must therefore allow only a subset +of the doctype, which we shall call a filterset.

+ +

By default, HTML Purifier will use the Rich +filterset, which allows as many elements as possible with untrusted +sources. Other possible filtersets could be:

+ +
+
Full
+
Allows the full span of elements in the doctype, good if you want + HTML Purifier to work as a Tidy substitute but not to strip + anything out.
+
Plain
+
Provides a minimum set of tags for semantic markup of things + like blog comments.
+
+ +

Extension-authors would be able to define custom filtersets for +other users to use.

+ +

A possible call to select a filterset would be:

+ +
$config->set('HTML', 'Filterset', 'Rich');
+ +

Selecting Mode

+ +

Within filtersets, there are various modes of operation. +These indicate variant behaviors that, while not strictly changing the +allowed set of elements and attributes, will definitely affect the output. +Currently, we have two modes, which may be used together:

+ +
+
Lenient
+
Deprecated elements and attributes will be transformed into + standards-compliant alternatives when explicitly disallowed. For + example, in the XHTML 1.0 Strict doctype, a center + tag would be turned into a div with the CSS property + text-align:center;, but in XHTML 1.0 Transitional + the tag would be preserved. This mode is on by default.
+
Correctional
+
Deprecated elements and attributes will be transformed into + standards-compliant alternatives whenever possible. Referring + back to the previous example, the center tag would + be transformed in both cases. However, tags without a + reasonable standards-compliant alternative will be preserved + in their original form. This mode is on by default. It may have + various levels of operation.
+
+ +

A possible call to select modes would be:

+ +
$config->set('HTML', 'Mode', array('correctional', 'lenient'));
+ +

If modes have extra parameters, a hash might work well:

+ +
$config->set('HTML', 'Mode', array(
+    'correctional' => 9, // strongest level
+    'lenient' => true // this one's just boolean
+));
+ +

Modes may possibly be wrapped up with the filterset declaration:

+ +
$config->set('HTML', 'Filterset', 'Rich: correctional, lenient');
+ +

Further investigation in this field is necessary.

+ +

Selecting Modules / Tags / Attributes

+ +

If this cookie cutter approach doesn't appeal to a user, they may +decide to roll their own filterset by selecting modules, tags and +attributes to allow.

+ +

This would make use of the same facilities +as a filterset author would use, except that it would go under an +anonymous filterset that would be auto-selected if any of the +relevant module/tag/attribute selection configuration directives were +non-null.

+ +

On the highest level, a user will usually be most interested in +directly specifying which elements and attributes are desired. For +example:

+ +
$config->set('HTML', 'AllowedElements', 'a,b,em,p,blockquote,code,i');
+ +

Attribute declarations could be merged into this declaration as such:

+ +
$config->set('HTML', 'Allowed', 'a[href,title],b,em,p[class],blockquote[cite],code,i');
+ +

...or be kept separate:

+ +
$config->set('HTML', 'AllowedAttributes', 'a.href,a.title,p.class,blockquote.cite');
+ +

Considering that, internally speaking, as mandated by +the XHTML 1.1 Modularization specification, we have organized our +elements around modules, considerable gymnastics will be needed to +get this sort of functionality working.

+ +

A user may also specify a module to load a class of elements and attributes +into their filterset:

+ +
$config->set('HTML', 'Allowed', 'Hypertext,Core');
+ +

The granularity of these modules is too coarse for +the average user (for example, the core module loads everything from +the essential p tag to the not-so-safe h1 +tag). How do we make this still a viable solution?

+ +

Unified selector

+ +

Because selecting each and every one of these configuration options +is a chore, we may wish to offer a specialized configuration method +for selecting a filterset. Possibility:

+ +
function selectFilter($doctype, $filterset, $mode)
+ +

...which is simply a light wrapper over the individual configuration +calls. A custom config file format or text format could also be adopted.

+ +
$Id$
+ + \ No newline at end of file diff --git a/docs/enduser-overview.txt b/docs/enduser-overview.txt index 1e9f13c0..3ebccd21 100644 --- a/docs/enduser-overview.txt +++ b/docs/enduser-overview.txt @@ -36,7 +36,7 @@ forgiving lexer. You may also be interested in the unit tests located in the tests/ folder, which provide a living document on how exactly the filter deals with malformed input. -In summary: +In summary (see corresponding classes for more details): 1. Parse document into an array of tag and text tokens (Lexer) 2. Remove all elements not on whitelist and transform certain other elements diff --git a/docs/enduser-security.txt b/docs/enduser-security.txt index e7c9a8ce..d33f473c 100644 --- a/docs/enduser-security.txt +++ b/docs/enduser-security.txt @@ -6,45 +6,17 @@ through negligence of people. This class will do its job: no more, no less, and it's up to you to provide it the proper information and proper context to be effective. Things to remember: -1. Character Encoding: UTF-8. - This segment will soon be obsoleted by enduser-utf8.html -Currently, the parser runs under the assumption that it is dealing -with UTF-8. Not ISO-8859-1 or Windows-1252, UTF-8. And definitely not "no -character encoding explicitly stated" or UTF-7. If you're not using UTF-8 as -your character encoding, make sure you configure HTML Purifier or switch -to UTF-8. Now. Also, make sure any input is properly converted to UTF-8, or -the parser will mangle it badly (though it won't be a security risk if you're -outputting it as UTF-8 though). Character encoding is, in general, a knotty -issue, but do yourself a favor and learn about it: - +1. Character Encoding: see enduser-utf8.html for more info. -2. Doctype: XHTML 1.0 Transitional -This is what the parser is outputting. For the most -part, it's compatible with HTML 4.01, but XHTML enforces some very nice things -that all web developers should use. Regardless, NO DOCTYPE is a NO. 
Quirks mode -has waaaay too many quirks for a little parser to handle. We did not select -strict in order to prevent ourselves from being too draconic on users, but -this may be configurable in the future. Do you want standards compliance? -The doctype is a good place to start. +2. Doctype: document pending feature completion +Not strictly necessary, actually. More in-depth discussion once we figure +out how to get strict loose mode working. -3. IDs - This segment is obsoleted by enduser-id.html -They need to be unique, but without some knowledge of the -rest of the document, it's difficult to know what's unique. %Attr.IDBlacklist -needs to be set: we may want to consider disallowing IDs by default to -save lazy programmers. +3. IDs: see enduser-id.html for more info -4. [PROJECTED] Links -We're not going to try for spam protection (although -some hooks for such a module might be nice) but we may offer the ability to -only accept relative URLs. Pick the one that's right for you. +4. Links: document pending feature completion +Rudimentary blacklisting, we should also allow only relative URIs. We +need a doc to explain the stuff. -5. CSS -While we can prevent the most flagrant cases from affecting your -layout (such as absolutely positioned elements), no amount of code is going -to protect your pages from being attacked by garish colors and plain old -bad taste. A neat feature would be the ability to define acceptable colors -in a document, but that's not likely to be implemented for a while. In the -meantime, be sure to make sure that floated elements (permitted, since they -can be quite useful) can't mess up your layout. Once again, we may want to -disable this by default to protect lazy developers. +5. CSS: document pending +Explain which CSS styles we blocked and why. 
diff --git a/docs/enduser-utf8.html b/docs/enduser-utf8.html index e9a5bc88..d8187c5d 100644 --- a/docs/enduser-utf8.html +++ b/docs/enduser-utf8.html @@ -10,7 +10,7 @@ .minor td {font-style:italic;} -UTF-8 - HTML Purifier +UTF-8: The Secret of Character Encoding - HTML Purifier -

UTF-8

+

UTF-8: The Secret of Character Encoding

Filed under End-User
Return to the index.
HTML Purifier End-User Documentation
-

Character encoding and character sets, in truth, are not that -difficult to understand. But if you don't understand them, you are going -to be caught by surprise by some of HTML Purifier's behavior, namely -the fact that it operates UTF-8 or the limitations of the character -encoding transformations it does. This document will walk you through +

Character encoding and character sets are not that +difficult to understand, but so many people blithely stumble +through the worlds of programming without knowing what to actually +do about it, or say "Ah, it's a job for those internationalization +experts." No, it is not! This document will walk you through determining the encoding of your system and how you should handle this information. It will stay away from excessive discussion on -the internals of character encoding, but offer the information in -asides that can easily be skipped.

+the internals of character encoding.

+ +

This document is not designed to be read in its entirety: it will +slowly introduce concepts that build on each other: you need not get to +the bottom to have learned something new. However, I strongly +recommend you read all the way to Why UTF-8?, because at least +at that point you'd have made a conscious decision not to migrate, +which can be a rewarding (but difficult) task.

Asides
@@ -43,6 +49,50 @@ asides that can easily be skipped.

with a greater understanding of the underlying issues.

+

Table of Contents

+ +
    +
  1. Finding the real encoding
  2. +
  3. Finding the embedded encoding
  4. +
  5. Fixing the encoding
      +
    1. No embedded encoding
    2. +
    3. Embedded encoding disagrees
    4. +
    5. Changing the server encoding
        +
      1. PHP header() function
      2. +
      3. PHP ini directive
      4. +
      5. Non-PHP
      6. +
      7. .htaccess
      8. +
      9. File extensions
      10. +
    6. +
    7. XML
    8. +
    9. Inside the process
    10. +
  6. +
  7. Why UTF-8?
      +
    1. Internationalization
    2. +
    3. User-friendly
    4. +
    5. Forms
        +
      1. application/x-www-form-urlencoded
      2. +
      3. multipart/form-data
      4. +
    6. +
    7. Well supported
    8. +
    9. HTML Purifier
    10. +
  8. +
  9. Migrate to UTF-8
      +
    1. Configuring your database
        +
      1. Legit method
      2. +
      3. Binary
      4. +
    2. +
    3. Text editor
    4. +
    5. Byte Order Mark (headers already sent!)
    6. +
    7. Fonts
        +
      1. Obscure scripts
      2. +
      3. Occasional use
      4. +
    8. +
    9. Dealing with variable width in functions
    10. +
  10. +
  11. Further Reading
  12. +
+

Finding the real encoding

In the beginning, there was ASCII, and things were simple. But they @@ -275,7 +325,7 @@ your own php.ini file, ask your support for details. Use:

Non-PHP

-

You may, for whatever reason, may need to set the character encoding +

You may, for whatever reason, need to set the character encoding on non-PHP files, usually plain ol' HTML files. Doing this is more of a hit-or-miss process: depending on the software being used as a webserver and the configuration of that software, certain @@ -386,8 +436,8 @@ processing instructions. They look like:

For XHTML, this processing instruction theoretically overrides the META tag. In reality, this happens only when the -XHTML is actually served as legit XML and not HTML, which is almost -always never due to Internet Explorer's lack of support for +XHTML is actually served as legit XML and not HTML, which is almost always +never due to Internet Explorer's lack of support for application/xhtml+xml (even though doing so is often argued to be good practice).

@@ -398,10 +448,10 @@ for XML files is UTF-8, which often butts heads with more common ISO-8859-1 encoding (you see this in garbled RSS feeds).

In short, if you use XHTML and have gone through the -trouble of adding the XML header, be sure to make sure it jives +trouble of adding the XML header, make sure it jives with your META tags and HTTP headers.

-

Inside the process

+

Inside the process

This section is not required reading, but may answer some of your questions on what's going on in all @@ -572,7 +622,7 @@ Each method has deficiencies, especially the former.

the page, you still have the trouble of what to do with characters that are outside of the character encoding's range. The behavior, once again, varies: Firefox 2.0 entity-izes them while Internet Explorer -7.0 mangles them beyond intelligibility. For serious I18N purposes, +7.0 mangles them beyond intelligibility. For serious internationalization purposes, this is not an option.

The other possibility is to set Accept-Encoding to UTF-8, which @@ -604,22 +654,374 @@ hounding you about broken pages.

HTML Purifier

-

And finally, we get to HTML Purifier.

+

And finally, we get to HTML Purifier. HTML Purifier is built to +deal with UTF-8: any indications otherwise are the result of an +encoder that converts text from your preferred encoding to UTF-8, and +back again. HTML Purifier never touches anything else, and leaves +it up to the module iconv to do the dirty work.

+ +

This approach, however, is not perfect. iconv is blithely unaware +of HTML character entities. HTML Purifier, in order to +protect against sophisticated escaping schemes, normalizes all character +and numeric entities before processing the text. This leads to +one important ramification:

+ +

Any character that is not supported by the target character +set, regardless of whether or not it is in the form of a character +entity or a raw character, will be silently ignored.

+ +

Example of this principle at work: say you have &theta; +in your HTML, but the output is in Latin-1 (which, understandably, +does not understand Greek), the following process will occur (assuming you've +set the encoding correctly using %Core.Encoding):

+ + + +

This behaviour is quite unsatisfactory. It is a deal-breaker for +international applications, and it can be mildly annoying for the provincial +soul who occasionally needs a special character. Since 1.4.0, HTML +Purifier has provided a slightly more palatable workaround using +%Core.EscapeNonASCIICharacters. The process now looks like:

+ + + +

...which means that this is only good for an occasional foray into +the land of Unicode characters, and is totally unacceptable for Chinese +or Japanese texts. The even bigger kicker is that, supposing the +input encoding was actually ISO-8859-7, which does support +theta, the character would get entity-ized anyway! (The Encoder does +not discriminate).

+ +

The current functionality is about where HTML Purifier will be for +the rest of eternity. HTML Purifier could attempt to preserve the original +form of the entities so that they could be substituted back in, only the +DOM extension kills them off irreversibly. HTML Purifier could also attempt +to be smart and only convert non-ASCII characters that weren't supported +by the target encoding, but that would require reimplementing iconv +with HTML awareness, something I will not do.

+ +

So there: either it's UTF-8 or crippled international support. Your pick! (and I'm +not being sarcastic here: some people couldn't care less about other languages)

Migrate to UTF-8

-

Text editor

+

So, you've decided to bite the bullet, and want to migrate to UTF-8. +Note that this is not for the faint-hearted, and you should expect +the process to take longer than you think it will take.

+ +

The general idea is that you convert all existing text to UTF-8, +and then you set all the headers and META tags we discussed earlier +to UTF-8. There are many ways of going about this: you could +write a conversion script that runs through the database and re-encodes +everything as UTF-8 or you could do the conversion on the fly when someone +reads the page. The details depend on your system, but I will cover +some of the more subtle points of migration that may trip you up.

Configuring your database

-

Convert old text

+

Most modern databases, the most prominent open-source ones being MySQL +4.1+ and PostgreSQL, support character encodings. If you're switching +to UTF-8, logically speaking, you'd want to make sure your database +knows about the change too. There are some caveats though:

+ +

Legit method

+ +

Standardization in terms of SQL syntax for specifying character +encodings is notoriously spotty. Refer to your respective database's +documentation on how to do this properly.

+ +

For MySQL, ALTER will magically perform the +character encoding conversion for you. However, you have +to make sure that the text inside the column is what it says it is: +if you had put Shift-JIS in an ISO 8859-1 column, MySQL will irreversibly mangle +the text when you try to convert it to UTF-8. You'll have to convert +it to a binary field, convert it to a Shift-JIS field (the real encoding), +and then finally to UTF-8. Many a website had pages irreversibly mangled +because they didn't realize that they'd been deluding themselves about +the character encoding all along; don't become the next victim.

+ +

For PostgreSQL, there appears to be no direct way to change the +encoding of a database (as of 8.2). You will have to dump the data, and then reimport +it into a new table. Make sure that your client encoding is set properly: +this is how PostgreSQL knows to perform an encoding conversion.

+ +

Many times, you will also be asked about the "collation" of +the new column. Collation is how a DBMS sorts text, like ordering +B, C and A into A, B and C (the problem gets surprisingly complicated +when you get to languages like Thai and Japanese). If in doubt, +going with the default setting is usually a safe bet.

+ +

Once the conversion is all said and done, you still have to remember +to set the client encoding (your encoding) properly on each database +connection using SET NAMES (which is standard SQL and is +usually supported).

+ +

Binary

+ +

Due to the abovementioned compatibility issues, a more interoperable +way of storing UTF-8 text is to stuff it in a binary datatype. +CHAR becomes BINARY, VARCHAR becomes +VARBINARY and TEXT becomes BLOB. +Doing so can save you some huge headaches:

+ + + +

MediaWiki, a very prominent international application, uses binary fields +for storing their data because of point three.

+ +

There are drawbacks, of course:

+ + + +

Choose based on your circumstances.

+ +

Text editor

+ +

For more flat-file oriented systems, you will often be tasked with +converting reams of existing text and HTML files into UTF-8, as well as +making sure that all new files uploaded are properly encoded. Once again, +I can only point vaguely in the right direction for converting your +existing files: make sure you back up, make sure you use +iconv(), and +make sure you know what the original character encoding of the files +is (or are, depending on the tidiness of your system).

+ +

However, I can proffer more specific advice on the subject of +text editors. Many text editors have notoriously spotty Unicode support. +To find out how your editor is doing, you can check out this list +or Wikipedia's list. +I personally use Notepad++, which works like a charm when it comes to UTF-8. +Usually, you will have to explicitly tell the editor through some dialogue +(usually Save as or Format) what encoding you want it to use. An editor +will often offer "Unicode" as a method of saving, which is +ambiguous. Make sure you know whether or not they really mean UTF-8 +or UTF-16 (which is another flavor of Unicode).

+ +

The two things to look out for are whether or not the editor +supports font mixing (multiple +fonts in one document) and whether or not it adds a BOM. +Font mixing is important because fonts rarely have support for every +language known to mankind: in order to be flexible, an editor must +be able to take a little from here and a little from there, otherwise +all your Chinese characters will come as nice boxes. We'll discuss +BOM below.

Byte Order Mark (headers already sent!)

+

The BOM, or Byte +Order Mark, is a magical, invisible character placed at +the beginning of UTF-8 files to tell people what the encoding is and +what the endianness of the text is. It is also unnecessary.

+ +

Because it's invisible, it often +catches people by surprise when it starts doing things it shouldn't +be doing. For example, this PHP file:

+ +
BOM<?php
+header('Location: index.php');
+?>
+ +

...will fail with the all too familiar Headers already sent +PHP error. And because the BOM is invisible, this culprit will go unnoticed. +My suggestion is to only use ASCII in PHP pages, but if you must, make +sure the page is saved WITHOUT the BOM.

+ +
+

The headers the error is referring to are HTTP headers, + which are sent to the browser before any HTML to tell it various + information. The moment any regular text (and yes, a BOM counts as + ordinary text) is output, the headers must be sent, and you are + not allowed to send any more. Thus, the error.

+
+ +

If you are reading in text files to insert into the middle of another +page, it is strongly advised (but not strictly necessary) that you replace out the UTF-8 byte +sequence for BOM "\xEF\xBB\xBF" before inserting it in, +via:

+ +
$text = str_replace("\xEF\xBB\xBF", '', $text);
+ +

Fonts

+ +

Generally speaking, people who are having trouble with fonts fall +into two categories:

+ + + +

Yes, there's always a chance where an English user happens across +a Sinhalese website and doesn't have the right font. But an English user +who happens not to have the right fonts probably has no business reading Sinhalese +anyway. So we'll deal with the other two edge cases.

+ +

Obscure scripts

+ +

If you run a Bengali website, you may get comments from users who +would like to read your website but get heaps of question marks or +other meaningless characters. Fixing this problem requires the +installation of a font or language pack which is often highly +dependent on what the language is. Here is an example +of such a help file for the Bengali language, I am sure there are +others out there too. You just have to point users to the appropriate +help file.

+ +

Occasional use

+ +

A prime example of when you'll see some very obscure Unicode +characters embedded in what otherwise would be very bland ASCII are +letters of the +International +Phonetic Alphabet (IPA), used to designate pronunciations in a very standard +manner (you probably see them all the time in your dictionary). Your +average font probably won't have support for all of the IPA characters +like ʘ (bilabial click) or ʒ (voiced postalveolar fricative). +So what's a poor browser to do? Font mix! Smart browsers like Mozilla Firefox +and Internet Explorer 7 will borrow glyphs from other fonts in order +to make sure that all the characters display properly.

+ +

But what happens when the browser isn't smart and happens to be the +most widely used browser in the entire world? Microsoft IE 6 +is not smart enough to borrow from other fonts when a character isn't +present, so more often than not you'll be slapped with a nice big �. +To get things to work, MSIE 6 needs a little nudge. You could configure it +to use a different font to render the text, but you can achieve the same +effect by selectively changing the font for blocks of special characters +to known good Unicode fonts.

+ +

Fortunately, the folks over at Wikipedia have already done all the +heavy lifting for you. Get the CSS from the horse's mouth here: +Common.css, +and search for ".IPA". There is also a smattering of +other classes you can use for other purposes; check out +this page +for more details. For you lazy ones, this should work:

+ +
.Unicode {
+        font-family: Code2000, "TITUS Cyberbit Basic", "Doulos SIL",
+            "Chrysanthi Unicode", "Bitstream Cyberbit",
+            "Bitstream CyberBase", Thryomanes, Gentium, GentiumAlt,
+            "Lucida Grande", "Arial Unicode MS", "Microsoft Sans Serif",
+            "Lucida Sans Unicode";
+        font-family /**/:inherit; /* resets fonts for everyone but IE6 */
+}
+ +

The standard usage goes along the lines of <span class="Unicode">Crazy +Unicode stuff here</span>. Characters in the +Windows Glyph List +usually don't need to be fixed, but for anything else you probably +want to play it safe. Unless, of course, you don't care about IE6 +users.

+

Dealing with variable width in functions

+

When people claim that PHP6 will solve all our Unicode problems, they're +misinformed. It will not fix any of the abovementioned troubles. It will, +however, fix the problem we are about to discuss: processing UTF-8 text +in PHP.

+ +

PHP (as of PHP5) is blithely unaware of the existence of UTF-8 (with a few +notable exceptions). Sometimes, this will cause problems, other times, +this won't. So far, we've avoided discussing the architecture of +UTF-8, so, we must first ask, what is UTF-8? Yes, it supports Unicode, +and yes, it is variable width. Other traits:

+ + + +

Each of these traits affects different domains of text processing +in different ways. It is beyond the scope of this document to explain +what precisely these implications are. PHPWact provides +a very good reference document +on what to expect from each function, although coverage is spotty in +some areas. Their more general notes on +character sets +are also worth looking at for information on UTF-8. Some rules of thumb +when dealing with Unicode text:

+ + + +

...and always think in bytes, not characters. If you use strpos() +to find the position of a character, it will be in bytes, but this +usually won't matter since substr() also operates with byte indices!

+ +

You'll also need to make sure your UTF-8 is well-formed and will +probably need replacements for some of these functions. I recommend +using Harry Fuecks' PHP +UTF-8 library, rather than use mb_string directly. HTML Purifier +also defines a few useful UTF-8 compatible functions: check out +Encoder.php in the /library/HTMLPurifier/ +directory.

+ +

Well, that's it. Hopefully this document has served as a very +practical springboard into knowledge of how UTF-8 works. You may have +decided that you don't want to migrate yet: that's fine, just know +what will happen to your output and what bug reports you may receive.

+

Many other developers have already discussed the subject of Unicode, UTF-8 and internationalization, and I would like to defer to them for a more in-depth look into character sets and encodings.

diff --git a/docs/fixquotes.htc b/docs/fixquotes.htc new file mode 100644 index 00000000..bf2e7842 --- /dev/null +++ b/docs/fixquotes.htc @@ -0,0 +1,6 @@ + + diff --git a/docs/index.html b/docs/index.html index ea498147..0065c3d6 100644 --- a/docs/index.html +++ b/docs/index.html @@ -31,7 +31,7 @@ information for casual developers using HTML Purifier.

Speeding up HTML Purifier
Explains how to speed up HTML Purifier through caching or inbound filtering.
-
UTF-8
+
UTF-8: The Secret of Character Encoding
Describes the rationale for using UTF-8, the ramifications otherwise, and how to make the switch.
@@ -54,6 +54,10 @@ conventions.

Optimization
Discusses possible methods of optimizing HTML Purifier.
+
Advanced API
+
Functional specification for HTML Purifier's advanced API for defining +custom filtering behavior.
+

Proposals

diff --git a/docs/proposal-config.txt b/docs/proposal-config.txt index d291a3fb..93314122 100644 --- a/docs/proposal-config.txt +++ b/docs/proposal-config.txt @@ -7,7 +7,7 @@ value is used for. This means decentralized configuration declarations that are nevertheless error checking and a centralized configuration object. Directives are divided into namespaces, indicating the major portion of -functionality they cover (although there may be overlaps. Please consult +functionality they cover (although there may be overlaps). Please consult the documentation in ConfigDef for more information on these namespaces. Since configuration is dependant on context, internal classes require a @@ -36,4 +36,5 @@ the definition, you'd have to force reconstruction. In practice, the pulling directives from the config object are solely need-based, and the flex points are littered throughout the -setup() function. Some sort of refactoring is likely in order. +setup() function. Some sort of refactoring is likely in order. See +ref-xhtml-1.1.txt for more info. diff --git a/docs/proposal-language.txt b/docs/proposal-language.txt index 3bfa9943..d4e67e9c 100644 --- a/docs/proposal-language.txt +++ b/docs/proposal-language.txt @@ -1,42 +1,6 @@ We are going to model our I18N/L10N off of MediaWiki's system. Their's is obviously quite complicated, so we're going to simplify it a bit for our needs. -== Structure == - -First, you have a Language object. This object contains all the localisable -message strings, as well as other important language-specific settings and -custom behavior (uppercasing, lowercasing, printing dates, formatting -numbers, etc.) - -The object is constructed from two sources: subclassed versions of itself -(classes) and Message files (messages). - -== General use == - -You load a language object by calling the Language::factory() function. 
-This function the class file for the object (taking in account fallback -languages by using the fallback langauge's object but overloading the -language key) and returns that object. Nothing else happens. - -When a message/etc is requested, a lazy load initializor is called. Now the -real work starts. We're first going to take the scenario that the language -is not cached. The system loads the Messages file by: - - require( $filename ); - $cache = compact( self::$mLocalisationKeys ); - -...where self::$mLocalisationKeys is the name of variables that could be used -in the localization file. This lets you use things like: - - $fallback = false; - $rtl = false; - -...and easily siphon them into arrays. - -Then, we load the $fallback language (if not set, English) to fill in the gaps in -the messages. There is specialized behavior for certain keys, as they can be -mergeable maps, lists or alias lists (not sure what the last one is). - == Caching == MediaWiki has lots of caching mechanisms built in, which make the code somewhat diff --git a/docs/ref-loose-vs-strict.txt b/docs/ref-loose-vs-strict.txt index 3581178f..7828aa63 100644 --- a/docs/ref-loose-vs-strict.txt +++ b/docs/ref-loose-vs-strict.txt @@ -32,6 +32,6 @@ A tag's attribute 'target' (for selecting frames) cut current behavior: no substitute, just delete when in strict, allow in loose Attribute 'name' deprecated in favor of 'id' current behavior: dropped silently - projected behavior: create proper AttrTransform (currently not allowed at all) + projected behavior: create proper AttrTransform [done] PRE tag allows SUB/SUP? (strict dtd comment vs syntax, loose disallows) current behavior: disallow as usual diff --git a/docs/ref-xhtml-1.1.txt b/docs/ref-xhtml-1.1.txt index affe4f2a..b32db5a8 100644 --- a/docs/ref-xhtml-1.1.txt +++ b/docs/ref-xhtml-1.1.txt @@ -1,21 +1,187 @@ -Getting XHTML 1.1 Working - -It's quite simple, according to +XHTML 1.1 and HTML Purifier +Todo for XHTML 1.1 support 1. 
Scratch lang entirely in favor of xml:lang 2. Scratch name entirely in favor of id (partially-done) 3. Support Ruby -...but that's only an informative section. More things to do: +HTML Purifier uses the modularization of XHTML + to organize the internals +of HTMLDefinition into a more manageable and extensible fashion. Rather +than have one super-object, HTMLDefinition is split into HTMLModules, +each of which is responsible for defining elements, their attributes, +and other properties (for a more in-depth coverage, see +/library/HTMLPurifier/HTMLModule.php's docblock comments). -1. Scratch style attribute (it's deprecated) -2. Be module-aware (this might entail intelligent grouping in the definition - and allowing users to specifically remove certain modules (see 5)) -3. Cross-reference minimal content models with existing DTDs and determine - changes (todo) -4. Watch out for the Legacy Module - -5. Let users specify their own custom modules -6. Study Modularization document - +The modules that W3C defines and we support are: + + * 5.1. Attribute Collections (technically not a module) + * 5.2. Core Modules + o 5.2.2. Text Module + o 5.2.3. Hypertext Module + o 5.2.4. List Module + * 5.4. Text Extension Modules + o 5.4.1. Presentation Module + o 5.4.2. Edit Module + o 5.4.3. Bi-directional Text Module + * 5.6. Table Modules + o 5.6.2. Tables Module + * 5.7. Image Module + * 5.18. Style Attribute Module + +Modules that we don't support but could support are: + + * 5.6. Table Modules + o 5.6.1. Basic Tables Module [?] + * 5.8. Client-side Image Map Module [?] + * 5.9. Server-side Image Map Module [?] + * 5.12. Target Module [?] + * 5.21. Name Identification Module [deprecated] + * 5.22. Legacy Module [deprecated] + +These modules will not be implemented due to their dangerousness or +inapplicability as an XHTML fragment: + + * 5.2. Core Modules + o 5.2.1. Structure Module + * 5.3. Applet Module + * 5.5. Forms Modules + o 5.5.1. Basic Forms Module + o 5.5.2.
Forms Module + * 5.10. Object Module + * 5.11. Frames Module + * 5.13. Iframe Module + * 5.14. Intrinsic Events Module + * 5.15. Metainformation Module + * 5.16. Scripting Module + * 5.17. Style Sheet Module + * 5.19. Link Module + * 5.20. Base Module + +We will not be using W3C's XML Schemas or DTDs directly due to the lack +of robust tools for handling them (the main problem is that all the +current parsers are usually PHP 5 only and solely-validating, not +correcting). + +The abstraction of the HTMLDefinition creation process will also +contribute to a need for a caching system. Cache invalidation would be +difficult, but could be done by comparing the HTML and Attr config +namespaces with a copy that was packaged along with the serialized +HTMLDefinition object. + +== General Use-Case == + +The outwards API of HTMLDefinition has been largely preserved, not +only for backwards-compatibility but also by design. Instead, +HTMLDefinition can be retrieved "raw", in which case it loads a structure +that closely resembles the modules of XHTML 1.1. This structure is very +dynamic, making it easy to make cascading changes to global content +sets or remove elements in bulk. + +However, once HTML Purifier needs the actual definition, it retrieves +a finalized version of HTMLDefinition. The finalized definition involves +processing the modules into a form that is optimized for multiple +calls. This final version is immutable and, even if editable, would +be extremely hard to change. + +So, some code taking advantage of the XHTML modularization may look +like this: + +getHTMLDefinition(true); // reference to raw + unset($def->modules['Hypertext']); // rm ''a'' link + $purifier = new HTMLPurifier($config); + $purifier->purify($html); // now the definition is finalized +?> + +== Inclusions == + +One of the nice features of HTMLDefinition is that piggy-backing off +of global attribute and content sets is extremely easy to do.
+ +=== Attributes === + +HTMLModule->elements[$element]->attr stores attribute information for the +specific attributes of $element. This is quite close to the final +API that HTML Purifier interfaces with, but there's an important +extra feature: attr may also contain an array with a member index zero. + +elements[$element]->attr[0] = array('AttrSet'); +?> + +Rather than map the attribute key 0 to an array (which should be +an AttrDef), it defines a number of attribute collections that should +be merged into this element's attribute array. + +Furthermore, the value of an attribute key, attribute value pair need +not be a fully fledged AttrDef object. It can also be a string, which +signifies an AttrDef that is looked up from a centralized registry +AttrTypes. This allows more concise attribute definitions that look +more like W3C's declarations, as well as offering a centralized point +for modifying the behavior of one attribute type. And, of course, the +old method of manually instantiating an AttrDef still works. + +=== Attribute Collections === + +Attribute collections are stored and processed in the AttrCollections +object, which is responsible for performing the inclusions signified +by the 0 index. These attribute collections, too, are mutable, by +using HTMLModule->attr_collections. You may add new attributes +to a collection or define an entirely new collection for your module's +use. Inclusions can also be cumulative. + +Attribute collections allow us to get rid of so-called "global attributes" +(which actually aren't so global). + +=== Content Models and ChildDef === + +An implementation of the above-mentioned attributes and attribute +collections was applied to the ChildDef system. HTML Purifier uses +a proprietary system called ChildDef for performance and flexibility +reasons, but this does not line up very well with W3C's notion of +regexps for defining the allowed children of an element.
+ +HTMLPurifier->elements[$element]->content_model and +HTMLPurifier->elements[$element]->content_model_type store information +about the final ChildDef that will be stored in +HTMLPurifier->elements[$element]->child (we use a different variable +because the two forms are sufficiently different). + +$content_model is an abstract, string representation of the internal +state of ChildDef, while $content_model_type is a string identifier +of which ChildDef subclass to instantiate. $content_model is processed +by substituting all content set identifiers (capitalized element names) +with their contents. It is then parsed and passed into the appropriate +ChildDef class, as defined by the ContentSets->getChildDef() or the +custom fallback HTMLModule->getChildDef() for custom child definitions +not in the core. + +You'll need to use these facilities if you plan on referencing a content +set like "Inline" or "Block", and using them is recommended even if you're +not, due to their conciseness. + +A few notes on $content_model: its structure can be as complicated +as you want, but the pipe symbol (|) is reserved for defining possible +choices, due to the content sets implementation. For example, a content +model that looks like: + +"Inline -> Block -> a" + +...when the Inline content set is defined as "span | b" and the Block +content set is defined as "div | blockquote", will expand into: + +"span | b -> div | blockquote -> a" + +The custom HTMLModule->getChildDef() function will need to be able to +then feed this information to ChildDef in a usable manner. + +=== Content Sets === + +Content sets can be altered using HTMLModule->content_sets, an associative +array of content set names to content set contents. If the content set +already exists, your values are appended on to it (great for, say, +registering the font tag as an inline element), otherwise it is +created. They are substituted into content_model.
\ No newline at end of file diff --git a/docs/style.css b/docs/style.css index 03bf1702..75a3e2f7 100644 --- a/docs/style.css +++ b/docs/style.css @@ -42,3 +42,27 @@ blockquote .label {font-weight:bold; font-size:1em; margin:0 0 .1em; /* Contains, without exception, $Id$, for SVN version info. */ #version {text-align:right; font-style:italic; margin:2em 0;} + +#toc ol ol {list-style-type:lower-roman;} +#toc ol {list-style-type:decimal;} +#toc {list-style-type:upper-alpha;} + +q { + behavior: url(fixquotes.htc); /* IE fix */ + quotes: '\201C' '\201D' '\2018' '\2019'; +} +q:before { + content: open-quote; +} +q:after { + content: close-quote; +} + +/* Marks off implementation details interesting only to the person writing + the class described in the spec. */ +.technical {margin-left:2em; } +.technical:before {content:"Technical note: "; font-weight:bold; color:#061; } + +/* Marks off sections that are lacking. */ +.fixme {margin-left:2em; } +.fixme:before {content:"Fix me: "; font-weight:bold; color:#C00; } diff --git a/library/HTMLPurifier/AttrCollections.php b/library/HTMLPurifier/AttrCollections.php new file mode 100644 index 00000000..8318abb1 --- /dev/null +++ b/library/HTMLPurifier/AttrCollections.php @@ -0,0 +1,100 @@ +info; + // load extensions from the modules + foreach ($modules as $module) { + foreach ($module->attr_collections as $coll_i => $coll) { + foreach ($coll as $attr_i => $attr) { + if ($attr_i === 0 && isset($info[$coll_i][$attr_i])) { + // merge in includes + $info[$coll_i][$attr_i] = array_merge( + $info[$coll_i][$attr_i], $attr); + continue; + } + $info[$coll_i][$attr_i] = $attr; + } + } + } + // perform internal expansions and inclusions + foreach ($info as $name => $attr) { + // merge attribute collections that include others + $this->performInclusions($info[$name]); + // replace string identifiers with actual attribute objects + $this->expandIdentifiers($info[$name], $attr_types); + } + } + + /** + * Takes a reference to an attribute 
associative array and performs + * all inclusions specified by the zero index. + * @param &$attr Reference to attribute array + */ + function performInclusions(&$attr) { + if (!isset($attr[0])) return; + $merge = $attr[0]; + // loop through all the inclusions + for ($i = 0; isset($merge[$i]); $i++) { + // foreach attribute of the inclusion, copy it over + foreach ($this->info[$merge[$i]] as $key => $value) { + if (isset($attr[$key])) continue; // also catches more inclusions + $attr[$key] = $value; + } + if (isset($info[$merge[$i]][0])) { + // recursion + $merge = array_merge($merge, isset($info[$merge[$i]][0])); + } + } + unset($attr[0]); + } + + /** + * Expands all string identifiers in an attribute array by replacing + * them with the appropriate values inside HTMLPurifier_AttrTypes + * @param &$attr Reference to attribute array + * @param $attr_types HTMLPurifier_AttrTypes instance + */ + function expandIdentifiers(&$attr, $attr_types) { + foreach ($attr as $def_i => $def) { + if ($def_i === 0) continue; + if (!is_string($def)) continue; + if ($def === false) { + unset($attr[$def_i]); + continue; + } + if (isset($attr_types->info[$def])) { + $attr[$def_i] = $attr_types->info[$def]; + } else { + trigger_error('Attempted to reference undefined attribute type', E_USER_ERROR); + unset($attr[$def_i]); + } + } + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/AttrDef/Background.php b/library/HTMLPurifier/AttrDef/CSS/Background.php similarity index 95% rename from library/HTMLPurifier/AttrDef/Background.php rename to library/HTMLPurifier/AttrDef/CSS/Background.php index 1db3f88d..42d8bcf0 100644 --- a/library/HTMLPurifier/AttrDef/Background.php +++ b/library/HTMLPurifier/AttrDef/CSS/Background.php @@ -7,7 +7,7 @@ require_once 'HTMLPurifier/CSSDefinition.php'; * Validates shorthand CSS property background. * @warning Does not support url tokens that have internal spaces. 
*/ -class HTMLPurifier_AttrDef_Background extends HTMLPurifier_AttrDef +class HTMLPurifier_AttrDef_CSS_Background extends HTMLPurifier_AttrDef { /** @@ -16,7 +16,7 @@ class HTMLPurifier_AttrDef_Background extends HTMLPurifier_AttrDef */ var $info; - function HTMLPurifier_AttrDef_Background($config) { + function HTMLPurifier_AttrDef_CSS_Background($config) { $def = $config->getCSSDefinition(); $this->info['background-color'] = $def->info['background-color']; $this->info['background-image'] = $def->info['background-image']; diff --git a/library/HTMLPurifier/AttrDef/BackgroundPosition.php b/library/HTMLPurifier/AttrDef/CSS/BackgroundPosition.php similarity index 89% rename from library/HTMLPurifier/AttrDef/BackgroundPosition.php rename to library/HTMLPurifier/AttrDef/CSS/BackgroundPosition.php index 0b59c403..77a3ddd6 100644 --- a/library/HTMLPurifier/AttrDef/BackgroundPosition.php +++ b/library/HTMLPurifier/AttrDef/CSS/BackgroundPosition.php @@ -1,8 +1,8 @@ length = new HTMLPurifier_AttrDef_CSSLength(); - $this->percentage = new HTMLPurifier_AttrDef_Percentage(); + function HTMLPurifier_AttrDef_CSS_BackgroundPosition() { + $this->length = new HTMLPurifier_AttrDef_CSS_Length(); + $this->percentage = new HTMLPurifier_AttrDef_CSS_Percentage(); } function validate($string, $config, &$context) { diff --git a/library/HTMLPurifier/AttrDef/Border.php b/library/HTMLPurifier/AttrDef/CSS/Border.php similarity index 90% rename from library/HTMLPurifier/AttrDef/Border.php rename to library/HTMLPurifier/AttrDef/CSS/Border.php index ecd016a3..583f14fd 100644 --- a/library/HTMLPurifier/AttrDef/Border.php +++ b/library/HTMLPurifier/AttrDef/CSS/Border.php @@ -5,7 +5,7 @@ require_once 'HTMLPurifier/AttrDef.php'; /** * Validates the border property as defined by CSS. 
*/ -class HTMLPurifier_AttrDef_Border extends HTMLPurifier_AttrDef +class HTMLPurifier_AttrDef_CSS_Border extends HTMLPurifier_AttrDef { /** @@ -13,7 +13,7 @@ class HTMLPurifier_AttrDef_Border extends HTMLPurifier_AttrDef */ var $info = array(); - function HTMLPurifier_AttrDef_Border($config) { + function HTMLPurifier_AttrDef_CSS_Border($config) { $def = $config->getCSSDefinition(); $this->info['border-width'] = $def->info['border-width']; $this->info['border-style'] = $def->info['border-style']; diff --git a/library/HTMLPurifier/AttrDef/Color.php b/library/HTMLPurifier/AttrDef/CSS/Color.php similarity index 97% rename from library/HTMLPurifier/AttrDef/Color.php rename to library/HTMLPurifier/AttrDef/CSS/Color.php index 3948a19c..4e6a78ac 100644 --- a/library/HTMLPurifier/AttrDef/Color.php +++ b/library/HTMLPurifier/AttrDef/CSS/Color.php @@ -5,7 +5,7 @@ require_once 'HTMLPurifier/AttrDef.php'; /** * Validates Color as defined by CSS. */ -class HTMLPurifier_AttrDef_Color extends HTMLPurifier_AttrDef +class HTMLPurifier_AttrDef_CSS_Color extends HTMLPurifier_AttrDef { /** diff --git a/library/HTMLPurifier/AttrDef/Composite.php b/library/HTMLPurifier/AttrDef/CSS/Composite.php similarity index 88% rename from library/HTMLPurifier/AttrDef/Composite.php rename to library/HTMLPurifier/AttrDef/CSS/Composite.php index 7be0bd97..9d2803d2 100644 --- a/library/HTMLPurifier/AttrDef/Composite.php +++ b/library/HTMLPurifier/AttrDef/CSS/Composite.php @@ -9,7 +9,7 @@ * especially useful for CSS values, which often are a choice between * an enumerated set of predefined values or a flexible data type. 
*/ -class HTMLPurifier_AttrDef_Composite extends HTMLPurifier_AttrDef +class HTMLPurifier_AttrDef_CSS_Composite extends HTMLPurifier_AttrDef { /** @@ -21,7 +21,7 @@ class HTMLPurifier_AttrDef_Composite extends HTMLPurifier_AttrDef /** * @param $defs List of HTMLPurifier_AttrDef objects */ - function HTMLPurifier_AttrDef_Composite($defs) { + function HTMLPurifier_AttrDef_CSS_Composite($defs) { $this->defs = $defs; } diff --git a/library/HTMLPurifier/AttrDef/Font.php b/library/HTMLPurifier/AttrDef/CSS/Font.php similarity index 98% rename from library/HTMLPurifier/AttrDef/Font.php rename to library/HTMLPurifier/AttrDef/CSS/Font.php index 7357e282..1b3b0905 100644 --- a/library/HTMLPurifier/AttrDef/Font.php +++ b/library/HTMLPurifier/AttrDef/CSS/Font.php @@ -5,7 +5,7 @@ require_once 'HTMLPurifier/AttrDef.php'; /** * Validates shorthand CSS property font. */ -class HTMLPurifier_AttrDef_Font extends HTMLPurifier_AttrDef +class HTMLPurifier_AttrDef_CSS_Font extends HTMLPurifier_AttrDef { /** @@ -30,7 +30,7 @@ class HTMLPurifier_AttrDef_Font extends HTMLPurifier_AttrDef 'status-bar' => true ); - function HTMLPurifier_AttrDef_Font($config) { + function HTMLPurifier_AttrDef_CSS_Font($config) { $def = $config->getCSSDefinition(); $this->info['font-style'] = $def->info['font-style']; $this->info['font-variant'] = $def->info['font-variant']; diff --git a/library/HTMLPurifier/AttrDef/FontFamily.php b/library/HTMLPurifier/AttrDef/CSS/FontFamily.php similarity index 96% rename from library/HTMLPurifier/AttrDef/FontFamily.php rename to library/HTMLPurifier/AttrDef/CSS/FontFamily.php index 32da724e..15cbbf39 100644 --- a/library/HTMLPurifier/AttrDef/FontFamily.php +++ b/library/HTMLPurifier/AttrDef/CSS/FontFamily.php @@ -7,7 +7,7 @@ require_once 'HTMLPurifier/AttrDef.php'; /** * Validates a font family list according to CSS spec */ -class HTMLPurifier_AttrDef_FontFamily extends HTMLPurifier_AttrDef +class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef { /** diff 
--git a/library/HTMLPurifier/AttrDef/CSSLength.php b/library/HTMLPurifier/AttrDef/CSS/Length.php similarity index 81% rename from library/HTMLPurifier/AttrDef/CSSLength.php rename to library/HTMLPurifier/AttrDef/CSS/Length.php index 50613a39..7da26a8f 100644 --- a/library/HTMLPurifier/AttrDef/CSSLength.php +++ b/library/HTMLPurifier/AttrDef/CSS/Length.php @@ -1,13 +1,12 @@ number_def = new HTMLPurifier_AttrDef_Number($non_negative); + function HTMLPurifier_AttrDef_CSS_Length($non_negative = false) { + $this->number_def = new HTMLPurifier_AttrDef_CSS_Number($non_negative); } function validate($length, $config, &$context) { diff --git a/library/HTMLPurifier/AttrDef/ListStyle.php b/library/HTMLPurifier/AttrDef/CSS/ListStyle.php similarity index 91% rename from library/HTMLPurifier/AttrDef/ListStyle.php rename to library/HTMLPurifier/AttrDef/CSS/ListStyle.php index b09ee354..2d2ed12d 100644 --- a/library/HTMLPurifier/AttrDef/ListStyle.php +++ b/library/HTMLPurifier/AttrDef/CSS/ListStyle.php @@ -6,16 +6,16 @@ require_once 'HTMLPurifier/AttrDef.php'; * Validates shorthand CSS property list-style. * @warning Does not support url tokens that have internal spaces. */ -class HTMLPurifier_AttrDef_ListStyle extends HTMLPurifier_AttrDef +class HTMLPurifier_AttrDef_CSS_ListStyle extends HTMLPurifier_AttrDef { /** * Local copy of component validators. - * @note See HTMLPurifier_AttrDef_Font::$info for a similar impl. + * @note See HTMLPurifier_AttrDef_CSS_Font::$info for a similar impl. 
*/ var $info; - function HTMLPurifier_AttrDef_ListStyle($config) { + function HTMLPurifier_AttrDef_CSS_ListStyle($config) { $def = $config->getCSSDefinition(); $this->info['list-style-type'] = $def->info['list-style-type']; $this->info['list-style-position'] = $def->info['list-style-position']; diff --git a/library/HTMLPurifier/AttrDef/Multiple.php b/library/HTMLPurifier/AttrDef/CSS/Multiple.php similarity index 92% rename from library/HTMLPurifier/AttrDef/Multiple.php rename to library/HTMLPurifier/AttrDef/CSS/Multiple.php index ca053250..0d1c8406 100644 --- a/library/HTMLPurifier/AttrDef/Multiple.php +++ b/library/HTMLPurifier/AttrDef/CSS/Multiple.php @@ -13,7 +13,7 @@ require_once 'HTMLPurifier/AttrDef.php'; * can only be used alone: it will never manifest as part of a multi * shorthand declaration. Thus, this class does not allow inherit. */ -class HTMLPurifier_AttrDef_Multiple extends HTMLPurifier_AttrDef +class HTMLPurifier_AttrDef_CSS_Multiple extends HTMLPurifier_AttrDef { /** @@ -30,7 +30,7 @@ class HTMLPurifier_AttrDef_Multiple extends HTMLPurifier_AttrDef * @param $single HTMLPurifier_AttrDef to multiply * @param $max Max number of values allowed (usually four) */ - function HTMLPurifier_AttrDef_Multiple($single, $max = 4) { + function HTMLPurifier_AttrDef_CSS_Multiple($single, $max = 4) { $this->single = $single; $this->max = $max; } diff --git a/library/HTMLPurifier/AttrDef/Number.php b/library/HTMLPurifier/AttrDef/CSS/Number.php similarity index 90% rename from library/HTMLPurifier/AttrDef/Number.php rename to library/HTMLPurifier/AttrDef/CSS/Number.php index f28f80fc..48f1335a 100644 --- a/library/HTMLPurifier/AttrDef/Number.php +++ b/library/HTMLPurifier/AttrDef/CSS/Number.php @@ -3,7 +3,7 @@ /** * Validates a number as defined by the CSS spec. 
*/ -class HTMLPurifier_AttrDef_Number extends HTMLPurifier_AttrDef +class HTMLPurifier_AttrDef_CSS_Number extends HTMLPurifier_AttrDef { /** @@ -14,7 +14,7 @@ class HTMLPurifier_AttrDef_Number extends HTMLPurifier_AttrDef /** * @param $non_negative Bool indicating whether negatives are forbidden */ - function HTMLPurifier_AttrDef_Number($non_negative = false) { + function HTMLPurifier_AttrDef_CSS_Number($non_negative = false) { $this->non_negative = $non_negative; } diff --git a/library/HTMLPurifier/AttrDef/Percentage.php b/library/HTMLPurifier/AttrDef/CSS/Percentage.php similarity index 68% rename from library/HTMLPurifier/AttrDef/Percentage.php rename to library/HTMLPurifier/AttrDef/CSS/Percentage.php index fcab2868..cc96f15d 100644 --- a/library/HTMLPurifier/AttrDef/Percentage.php +++ b/library/HTMLPurifier/AttrDef/CSS/Percentage.php @@ -1,24 +1,24 @@ number_def = new HTMLPurifier_AttrDef_Number($non_negative); + function HTMLPurifier_AttrDef_CSS_Percentage($non_negative = false) { + $this->number_def = new HTMLPurifier_AttrDef_CSS_Number($non_negative); } function validate($string, $config, &$context) { diff --git a/library/HTMLPurifier/AttrDef/TextDecoration.php b/library/HTMLPurifier/AttrDef/CSS/TextDecoration.php similarity index 92% rename from library/HTMLPurifier/AttrDef/TextDecoration.php rename to library/HTMLPurifier/AttrDef/CSS/TextDecoration.php index 90d011e4..294dd830 100644 --- a/library/HTMLPurifier/AttrDef/TextDecoration.php +++ b/library/HTMLPurifier/AttrDef/CSS/TextDecoration.php @@ -7,7 +7,7 @@ require_once 'HTMLPurifier/AttrDef.php'; * @note This class could be generalized into a version that acts sort of * like Enum except you can compound the allowed values. 
*/ -class HTMLPurifier_AttrDef_TextDecoration extends HTMLPurifier_AttrDef +class HTMLPurifier_AttrDef_CSS_TextDecoration extends HTMLPurifier_AttrDef { /** diff --git a/library/HTMLPurifier/AttrDef/CSSURI.php b/library/HTMLPurifier/AttrDef/CSS/URI.php similarity index 91% rename from library/HTMLPurifier/AttrDef/CSSURI.php rename to library/HTMLPurifier/AttrDef/CSS/URI.php index a92b6263..b310907c 100644 --- a/library/HTMLPurifier/AttrDef/CSSURI.php +++ b/library/HTMLPurifier/AttrDef/CSS/URI.php @@ -4,17 +4,17 @@ require_once 'HTMLPurifier/AttrDef/URI.php'; /** * Validates a URI in CSS syntax, which uses url('http://example.com') - * @note While theoretically speaking we a URI in a CSS document could + * @note While theoretically speaking a URI in a CSS document could * be non-embedded, as of CSS2 there is no such usage so we're * generalizing it. This may need to be changed in the future. * @warning Since HTMLPurifier_AttrDef_CSS blindly uses semicolons as * the separator, you cannot put a literal semicolon in * in the URI. Try percent encoding it, in that case. 
*/ -class HTMLPurifier_AttrDef_CSSURI extends HTMLPurifier_AttrDef_URI +class HTMLPurifier_AttrDef_CSS_URI extends HTMLPurifier_AttrDef_URI { - function HTMLPurifier_AttrDef_CSSURI() { + function HTMLPurifier_AttrDef_CSS_URI() { $this->HTMLPurifier_AttrDef_URI(true); // always embedded } diff --git a/library/HTMLPurifier/AttrDef/Enum.php b/library/HTMLPurifier/AttrDef/Enum.php index a7da54cd..3246318f 100644 --- a/library/HTMLPurifier/AttrDef/Enum.php +++ b/library/HTMLPurifier/AttrDef/Enum.php @@ -25,8 +25,8 @@ class HTMLPurifier_AttrDef_Enum extends HTMLPurifier_AttrDef * @param $case_sensitive Bool indicating whether or not case sensitive */ function HTMLPurifier_AttrDef_Enum( - $valid_values = array(), $case_sensitive = false) { - + $valid_values = array(), $case_sensitive = false + ) { $this->valid_values = array_flip($valid_values); $this->case_sensitive = $case_sensitive; } diff --git a/library/HTMLPurifier/AttrDef/ID.php b/library/HTMLPurifier/AttrDef/HTML/ID.php similarity index 70% rename from library/HTMLPurifier/AttrDef/ID.php rename to library/HTMLPurifier/AttrDef/HTML/ID.php index 09c277ca..2a6d2c9a 100644 --- a/library/HTMLPurifier/AttrDef/ID.php +++ b/library/HTMLPurifier/AttrDef/HTML/ID.php @@ -3,6 +3,22 @@ require_once 'HTMLPurifier/AttrDef.php'; require_once 'HTMLPurifier/IDAccumulator.php'; +HTMLPurifier_ConfigSchema::define( + 'Attr', 'EnableID', false, 'bool', + 'Allows the ID attribute in HTML. This is disabled by default '. + 'due to the fact that without proper configuration user input can '. + 'easily break the validation of a webpage by specifying an ID that is '. + 'already on the surrounding HTML. If you don\'t mind throwing caution to '. + 'the wind, enable this directive, but I strongly recommend you also '. + 'consider blacklisting IDs you use (%Attr.IDBlacklist) or prefixing all '. + 'user supplied IDs (%Attr.IDPrefix). This directive has been available '. + 'since 1.2.0, and when set to true reverts to the behavior of pre-1.2.0 '. 
+ 'versions.' +); +HTMLPurifier_ConfigSchema::defineAlias( + 'HTML', 'EnableAttrID', 'Attr', 'EnableID' +); + HTMLPurifier_ConfigSchema::define( 'Attr', 'IDPrefix', '', 'string', 'String to prefix to IDs. If you have no idea what IDs your pages '. @@ -36,11 +52,16 @@ HTMLPurifier_ConfigSchema::define( * blacklist. If you're hacking around, make sure you use load()! */ -class HTMLPurifier_AttrDef_ID extends HTMLPurifier_AttrDef +class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef { + // ref functionality disabled, since we also have to verify + // whether or not the ID it refers to exists + function validate($id, $config, &$context) { + if (!$config->get('Attr', 'EnableID')) return false; + $id = trim($id); // trim it first if ($id === '') return false; @@ -55,8 +76,10 @@ class HTMLPurifier_AttrDef_ID extends HTMLPurifier_AttrDef '%Attr.IDPrefix is set', E_USER_WARNING); } - $id_accumulator =& $context->get('IDAccumulator'); - if (isset($id_accumulator->ids[$id])) return false; + //if (!$this->ref) { + $id_accumulator =& $context->get('IDAccumulator'); + if (isset($id_accumulator->ids[$id])) return false; + //} // we purposely avoid using regex, hopefully this is faster @@ -71,7 +94,7 @@ class HTMLPurifier_AttrDef_ID extends HTMLPurifier_AttrDef $result = ($trim === ''); } - if ($result) $id_accumulator->add($id); + if (/*!$this->ref && */$result) $id_accumulator->add($id); // if no change was made to the ID, return the result // else, return the new id if stripping whitespace made it diff --git a/library/HTMLPurifier/AttrDef/Length.php b/library/HTMLPurifier/AttrDef/HTML/Length.php similarity index 77% rename from library/HTMLPurifier/AttrDef/Length.php rename to library/HTMLPurifier/AttrDef/HTML/Length.php index 0f27a6c4..ac83295a 100644 --- a/library/HTMLPurifier/AttrDef/Length.php +++ b/library/HTMLPurifier/AttrDef/HTML/Length.php @@ -1,18 +1,16 @@ 8 || !ctype_alnum($subtags[1])) { + if ($length == 0 || ($length == 1 && $subtags[1] != 'x') || $length 
> 8 || !ctype_alnum($subtags[1])) { return $new_string; } if (!ctype_lower($subtags[1])) $subtags[1] = strtolower($subtags[1]); diff --git a/library/HTMLPurifier/AttrDef/URI.php b/library/HTMLPurifier/AttrDef/URI.php index a3ce6ded..71027181 100644 --- a/library/HTMLPurifier/AttrDef/URI.php +++ b/library/HTMLPurifier/AttrDef/URI.php @@ -3,7 +3,7 @@ require_once 'HTMLPurifier/AttrDef.php'; require_once 'HTMLPurifier/URIScheme.php'; require_once 'HTMLPurifier/URISchemeRegistry.php'; -require_once 'HTMLPurifier/AttrDef/Host.php'; +require_once 'HTMLPurifier/AttrDef/URI/Host.php'; require_once 'HTMLPurifier/PercentEncoder.php'; HTMLPurifier_ConfigSchema::define( @@ -77,6 +77,14 @@ HTMLPurifier_ConfigSchema::define( 'This directive has been available since 1.3.0.' ); +HTMLPurifier_ConfigSchema::define( + 'URI', 'Disable', false, 'bool', + 'Disables all URIs in all forms. Not sure why you\'d want to do that '. + '(after all, the Internet\'s founded on the notion of a hyperlink). '. + 'This directive has been available since 1.3.0.' +); +HTMLPurifier_ConfigSchema::defineAlias('Attr', 'DisableURI', 'URI', 'Disable'); + /** * Validates a URI as defined by RFC 3986. * @note Scheme-specific mechanics deferred to HTMLPurifier_URIScheme @@ -92,7 +100,7 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef * @param $embeds_resource_resource Does the URI here result in an extra HTTP request? 
*/ function HTMLPurifier_AttrDef_URI($embeds_resource = false) { - $this->host = new HTMLPurifier_AttrDef_Host(); + $this->host = new HTMLPurifier_AttrDef_URI_Host(); $this->PercentEncoder = new HTMLPurifier_PercentEncoder(); $this->embeds_resource = (bool) $embeds_resource; } @@ -102,6 +110,8 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef // We'll write stack-based parsers later, for now, use regexps to // get things working as fast as possible (irony) + if ($config->get('URI', 'Disable')) return false; + // parse as CDATA $uri = $this->parseCDATA($uri); diff --git a/library/HTMLPurifier/AttrDef/Email.php b/library/HTMLPurifier/AttrDef/URI/Email.php similarity index 75% rename from library/HTMLPurifier/AttrDef/Email.php rename to library/HTMLPurifier/AttrDef/URI/Email.php index 7a7ad6ab..80b8d367 100644 --- a/library/HTMLPurifier/AttrDef/Email.php +++ b/library/HTMLPurifier/AttrDef/URI/Email.php @@ -2,7 +2,7 @@ require_once 'HTMLPurifier/AttrDef.php'; -class HTMLPurifier_AttrDef_Email extends HTMLPurifier_AttrDef +class HTMLPurifier_AttrDef_URI_Email extends HTMLPurifier_AttrDef { /** diff --git a/library/HTMLPurifier/AttrDef/Email/SimpleCheck.php b/library/HTMLPurifier/AttrDef/URI/Email/SimpleCheck.php similarity index 79% rename from library/HTMLPurifier/AttrDef/Email/SimpleCheck.php rename to library/HTMLPurifier/AttrDef/URI/Email/SimpleCheck.php index 4b9fdf1a..e35b1b4b 100644 --- a/library/HTMLPurifier/AttrDef/Email/SimpleCheck.php +++ b/library/HTMLPurifier/AttrDef/URI/Email/SimpleCheck.php @@ -1,12 +1,12 @@ ipv4 = new HTMLPurifier_AttrDef_IPv4(); - $this->ipv6 = new HTMLPurifier_AttrDef_IPv6(); + function HTMLPurifier_AttrDef_URI_Host() { + $this->ipv4 = new HTMLPurifier_AttrDef_URI_IPv4(); + $this->ipv6 = new HTMLPurifier_AttrDef_URI_IPv6(); } function validate($string, $config, &$context) { diff --git a/library/HTMLPurifier/AttrDef/IPv4.php b/library/HTMLPurifier/AttrDef/URI/IPv4.php similarity index 84% rename from 
library/HTMLPurifier/AttrDef/IPv4.php rename to library/HTMLPurifier/AttrDef/URI/IPv4.php index a16305ad..0730bbc8 100644 --- a/library/HTMLPurifier/AttrDef/IPv4.php +++ b/library/HTMLPurifier/AttrDef/URI/IPv4.php @@ -6,7 +6,7 @@ require_once 'HTMLPurifier/AttrDef.php'; * Validates an IPv4 address * @author Feyd @ forums.devnetwork.net (public domain) */ -class HTMLPurifier_AttrDef_IPv4 extends HTMLPurifier_AttrDef +class HTMLPurifier_AttrDef_URI_IPv4 extends HTMLPurifier_AttrDef { /** @@ -15,7 +15,7 @@ class HTMLPurifier_AttrDef_IPv4 extends HTMLPurifier_AttrDef */ var $ip4; - function HTMLPurifier_AttrDef_IPv4() { + function HTMLPurifier_AttrDef_URI_IPv4() { $oct = '(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'; // 0-255 $this->ip4 = "(?:{$oct}\\.{$oct}\\.{$oct}\\.{$oct})"; } diff --git a/library/HTMLPurifier/AttrDef/IPv6.php b/library/HTMLPurifier/AttrDef/URI/IPv6.php similarity index 95% rename from library/HTMLPurifier/AttrDef/IPv6.php rename to library/HTMLPurifier/AttrDef/URI/IPv6.php index 21b1ed8f..73f085e5 100644 --- a/library/HTMLPurifier/AttrDef/IPv6.php +++ b/library/HTMLPurifier/AttrDef/URI/IPv6.php @@ -1,6 +1,6 @@ info['CDATA'] = new HTMLPurifier_AttrDef_Text(); + $this->info['ID'] = new HTMLPurifier_AttrDef_HTML_ID(); + $this->info['Length'] = new HTMLPurifier_AttrDef_HTML_Length(); + $this->info['MultiLength'] = new HTMLPurifier_AttrDef_HTML_MultiLength(); + $this->info['NMTOKENS'] = new HTMLPurifier_AttrDef_HTML_Nmtokens(); + $this->info['Pixels'] = new HTMLPurifier_AttrDef_HTML_Pixels(); + $this->info['Text'] = new HTMLPurifier_AttrDef_Text(); + $this->info['URI'] = new HTMLPurifier_AttrDef_URI(); + + // number is really a positive integer (one or more digits) + $this->info['Number'] = new HTMLPurifier_AttrDef_Integer(false, false, true); + } +} + +?> diff --git a/library/HTMLPurifier/CSSDefinition.php b/library/HTMLPurifier/CSSDefinition.php index 0bbe8af5..55f0adc9 100644 --- a/library/HTMLPurifier/CSSDefinition.php +++ 
b/library/HTMLPurifier/CSSDefinition.php @@ -1,19 +1,19 @@ info['border-style'] = new HTMLPurifier_AttrDef_Multiple($border_style); + $this->info['border-style'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_style); $this->info['clear'] = new HTMLPurifier_AttrDef_Enum( array('none', 'left', 'right', 'both'), false); @@ -54,10 +54,10 @@ class HTMLPurifier_CSSDefinition $this->info['font-variant'] = new HTMLPurifier_AttrDef_Enum( array('normal', 'small-caps'), false); - $uri_or_none = new HTMLPurifier_AttrDef_Composite( + $uri_or_none = new HTMLPurifier_AttrDef_CSS_Composite( array( new HTMLPurifier_AttrDef_Enum(array('none')), - new HTMLPurifier_AttrDef_CSSURI() + new HTMLPurifier_AttrDef_CSS_URI() ) ); @@ -68,11 +68,11 @@ class HTMLPurifier_CSSDefinition 'upper-roman', 'lower-alpha', 'upper-alpha', 'none'), false); $this->info['list-style-image'] = $uri_or_none; - $this->info['list-style'] = new HTMLPurifier_AttrDef_ListStyle($config); + $this->info['list-style'] = new HTMLPurifier_AttrDef_CSS_ListStyle($config); $this->info['text-transform'] = new HTMLPurifier_AttrDef_Enum( array('capitalize', 'uppercase', 'lowercase', 'none'), false); - $this->info['color'] = new HTMLPurifier_AttrDef_Color(); + $this->info['color'] = new HTMLPurifier_AttrDef_CSS_Color(); $this->info['background-image'] = $uri_or_none; $this->info['background-repeat'] = new HTMLPurifier_AttrDef_Enum( @@ -81,96 +81,96 @@ class HTMLPurifier_CSSDefinition $this->info['background-attachment'] = new HTMLPurifier_AttrDef_Enum( array('scroll', 'fixed') ); - $this->info['background-position'] = new HTMLPurifier_AttrDef_BackgroundPosition(); + $this->info['background-position'] = new HTMLPurifier_AttrDef_CSS_BackgroundPosition(); $border_color = $this->info['border-top-color'] = $this->info['border-bottom-color'] = $this->info['border-left-color'] = $this->info['border-right-color'] = - $this->info['background-color'] = new HTMLPurifier_AttrDef_Composite(array( + $this->info['background-color'] = new 
HTMLPurifier_AttrDef_CSS_Composite(array( new HTMLPurifier_AttrDef_Enum(array('transparent')), - new HTMLPurifier_AttrDef_Color() + new HTMLPurifier_AttrDef_CSS_Color() )); - $this->info['background'] = new HTMLPurifier_AttrDef_Background($config); + $this->info['background'] = new HTMLPurifier_AttrDef_CSS_Background($config); - $this->info['border-color'] = new HTMLPurifier_AttrDef_Multiple($border_color); + $this->info['border-color'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_color); $border_width = $this->info['border-top-width'] = $this->info['border-bottom-width'] = $this->info['border-left-width'] = - $this->info['border-right-width'] = new HTMLPurifier_AttrDef_Composite(array( + $this->info['border-right-width'] = new HTMLPurifier_AttrDef_CSS_Composite(array( new HTMLPurifier_AttrDef_Enum(array('thin', 'medium', 'thick')), - new HTMLPurifier_AttrDef_CSSLength(true) //disallow negative + new HTMLPurifier_AttrDef_CSS_Length(true) //disallow negative )); - $this->info['border-width'] = new HTMLPurifier_AttrDef_Multiple($border_width); + $this->info['border-width'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_width); - $this->info['letter-spacing'] = new HTMLPurifier_AttrDef_Composite(array( + $this->info['letter-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(array( new HTMLPurifier_AttrDef_Enum(array('normal')), - new HTMLPurifier_AttrDef_CSSLength() + new HTMLPurifier_AttrDef_CSS_Length() )); - $this->info['word-spacing'] = new HTMLPurifier_AttrDef_Composite(array( + $this->info['word-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(array( new HTMLPurifier_AttrDef_Enum(array('normal')), - new HTMLPurifier_AttrDef_CSSLength() + new HTMLPurifier_AttrDef_CSS_Length() )); - $this->info['font-size'] = new HTMLPurifier_AttrDef_Composite(array( + $this->info['font-size'] = new HTMLPurifier_AttrDef_CSS_Composite(array( new HTMLPurifier_AttrDef_Enum(array('xx-small', 'x-small', 'small', 'medium', 'large', 'x-large', 'xx-large', 'larger', 'smaller')), - 
new HTMLPurifier_AttrDef_Percentage(), - new HTMLPurifier_AttrDef_CSSLength() + new HTMLPurifier_AttrDef_CSS_Percentage(), + new HTMLPurifier_AttrDef_CSS_Length() )); - $this->info['line-height'] = new HTMLPurifier_AttrDef_Composite(array( + $this->info['line-height'] = new HTMLPurifier_AttrDef_CSS_Composite(array( new HTMLPurifier_AttrDef_Enum(array('normal')), - new HTMLPurifier_AttrDef_Number(true), // no negatives - new HTMLPurifier_AttrDef_CSSLength(true), - new HTMLPurifier_AttrDef_Percentage(true) + new HTMLPurifier_AttrDef_CSS_Number(true), // no negatives + new HTMLPurifier_AttrDef_CSS_Length(true), + new HTMLPurifier_AttrDef_CSS_Percentage(true) )); $margin = $this->info['margin-top'] = $this->info['margin-bottom'] = $this->info['margin-left'] = - $this->info['margin-right'] = new HTMLPurifier_AttrDef_Composite(array( - new HTMLPurifier_AttrDef_CSSLength(), - new HTMLPurifier_AttrDef_Percentage(), + $this->info['margin-right'] = new HTMLPurifier_AttrDef_CSS_Composite(array( + new HTMLPurifier_AttrDef_CSS_Length(), + new HTMLPurifier_AttrDef_CSS_Percentage(), new HTMLPurifier_AttrDef_Enum(array('auto')) )); - $this->info['margin'] = new HTMLPurifier_AttrDef_Multiple($margin); + $this->info['margin'] = new HTMLPurifier_AttrDef_CSS_Multiple($margin); // non-negative $padding = $this->info['padding-top'] = $this->info['padding-bottom'] = $this->info['padding-left'] = - $this->info['padding-right'] = new HTMLPurifier_AttrDef_Composite(array( - new HTMLPurifier_AttrDef_CSSLength(true), - new HTMLPurifier_AttrDef_Percentage(true) + $this->info['padding-right'] = new HTMLPurifier_AttrDef_CSS_Composite(array( + new HTMLPurifier_AttrDef_CSS_Length(true), + new HTMLPurifier_AttrDef_CSS_Percentage(true) )); - $this->info['padding'] = new HTMLPurifier_AttrDef_Multiple($padding); + $this->info['padding'] = new HTMLPurifier_AttrDef_CSS_Multiple($padding); - $this->info['text-indent'] = new HTMLPurifier_AttrDef_Composite(array( - new HTMLPurifier_AttrDef_CSSLength(), - 
new HTMLPurifier_AttrDef_Percentage() + $this->info['text-indent'] = new HTMLPurifier_AttrDef_CSS_Composite(array( + new HTMLPurifier_AttrDef_CSS_Length(), + new HTMLPurifier_AttrDef_CSS_Percentage() )); - $this->info['width'] = new HTMLPurifier_AttrDef_Composite(array( - new HTMLPurifier_AttrDef_CSSLength(true), - new HTMLPurifier_AttrDef_Percentage(true), + $this->info['width'] = new HTMLPurifier_AttrDef_CSS_Composite(array( + new HTMLPurifier_AttrDef_CSS_Length(true), + new HTMLPurifier_AttrDef_CSS_Percentage(true), new HTMLPurifier_AttrDef_Enum(array('auto')) )); - $this->info['text-decoration'] = new HTMLPurifier_AttrDef_TextDecoration(); + $this->info['text-decoration'] = new HTMLPurifier_AttrDef_CSS_TextDecoration(); - $this->info['font-family'] = new HTMLPurifier_AttrDef_FontFamily(); + $this->info['font-family'] = new HTMLPurifier_AttrDef_CSS_FontFamily(); // this could use specialized code $this->info['font-weight'] = new HTMLPurifier_AttrDef_Enum( @@ -179,14 +179,14 @@ class HTMLPurifier_CSSDefinition // MUST be called after other font properties, as it references // a CSSDefinition object - $this->info['font'] = new HTMLPurifier_AttrDef_Font($config); + $this->info['font'] = new HTMLPurifier_AttrDef_CSS_Font($config); // same here $this->info['border'] = $this->info['border-bottom'] = $this->info['border-top'] = $this->info['border-left'] = - $this->info['border-right'] = new HTMLPurifier_AttrDef_Border($config); + $this->info['border-right'] = new HTMLPurifier_AttrDef_CSS_Border($config); $this->info['border-collapse'] = new HTMLPurifier_AttrDef_Enum(array( 'collapse', 'seperate')); @@ -197,11 +197,11 @@ class HTMLPurifier_CSSDefinition $this->info['table-layout'] = new HTMLPurifier_AttrDef_Enum(array( 'auto', 'fixed')); - $this->info['vertical-align'] = new HTMLPurifier_AttrDef_Composite(array( + $this->info['vertical-align'] = new HTMLPurifier_AttrDef_CSS_Composite(array( new HTMLPurifier_AttrDef_Enum(array('baseline', 'sub', 'super', 'top', 
'text-top', 'middle', 'bottom', 'text-bottom')), - new HTMLPurifier_AttrDef_CSSLength(), - new HTMLPurifier_AttrDef_Percentage() + new HTMLPurifier_AttrDef_CSS_Length(), + new HTMLPurifier_AttrDef_CSS_Percentage() )); } diff --git a/library/HTMLPurifier/ChildDef/Chameleon.php b/library/HTMLPurifier/ChildDef/Chameleon.php index feb84a15..afe0299f 100644 --- a/library/HTMLPurifier/ChildDef/Chameleon.php +++ b/library/HTMLPurifier/ChildDef/Chameleon.php @@ -38,22 +38,13 @@ class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef } function validateChildren($tokens_of_children, $config, &$context) { - $parent_type = $context->get('ParentType'); - switch ($parent_type) { - case 'unknown': - case 'inline': - $result = $this->inline->validateChildren( - $tokens_of_children, $config, $context); - break; - case 'block': - $result = $this->block->validateChildren( - $tokens_of_children, $config, $context); - break; - default: - trigger_error('Invalid context', E_USER_ERROR); - return false; + if ($context->get('IsInline') === false) { + return $this->block->validateChildren( + $tokens_of_children, $config, $context); + } else { + return $this->inline->validateChildren( + $tokens_of_children, $config, $context); } - return $result; } } diff --git a/library/HTMLPurifier/ChildDef/Required.php b/library/HTMLPurifier/ChildDef/Required.php index 16ba5e95..c6f706e2 100644 --- a/library/HTMLPurifier/ChildDef/Required.php +++ b/library/HTMLPurifier/ChildDef/Required.php @@ -20,10 +20,13 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef $elements = str_replace(' ', '', $elements); $elements = explode('|', $elements); } - $elements = array_flip($elements); - foreach ($elements as $i => $x) { - $elements[$i] = true; - if (empty($i)) unset($elements[$i]); + $keys = array_keys($elements); + if ($keys == array_keys($keys)) { + $elements = array_flip($elements); + foreach ($elements as $i => $x) { + $elements[$i] = true; + if (empty($i)) unset($elements[$i]); 
+ } } $this->elements = $elements; $this->gen = new HTMLPurifier_Generator(); diff --git a/library/HTMLPurifier/ChildDef/StrictBlockquote.php b/library/HTMLPurifier/ChildDef/StrictBlockquote.php index 980acac3..9280a9f5 100644 --- a/library/HTMLPurifier/ChildDef/StrictBlockquote.php +++ b/library/HTMLPurifier/ChildDef/StrictBlockquote.php @@ -4,27 +4,31 @@ require_once 'HTMLPurifier/ChildDef/Required.php'; /** * Takes the contents of blockquote when in strict and reformats for validation. - * - * From XHTML 1.0 Transitional to Strict, there is a notable change where */ class HTMLPurifier_ChildDef_StrictBlockquote extends HTMLPurifier_ChildDef_Required { + var $real_elements; + var $fake_elements; var $allow_empty = true; var $type = 'strictblockquote'; var $init = false; - function HTMLPurifier_ChildDef_StrictBlockquote() {} function validateChildren($tokens_of_children, $config, &$context) { $def = $config->getHTMLDefinition(); if (!$this->init) { // allow all inline elements - $this->elements = $def->info_flow_elements; - $this->elements['#PCDATA'] = true; + $this->real_elements = $this->elements; + $this->fake_elements = $def->info_content_sets['Flow']; + $this->fake_elements['#PCDATA'] = true; $this->init = true; } + // trick the parent class into thinking it allows more + $this->elements = $this->fake_elements; $result = parent::validateChildren($tokens_of_children, $config, $context); + $this->elements = $this->real_elements; + if ($result === false) return array(); if ($result === true) $result = $tokens_of_children; @@ -40,8 +44,10 @@ extends HTMLPurifier_ChildDef_Required // ifs are nested for readability if (!$is_inline) { if (!$depth) { - if (($token->type == 'text') || - ($def->info[$token->name]->type == 'inline')) { + if ( + $token->type == 'text' || + !isset($this->elements[$token->name]) + ) { $is_inline = true; $ret[] = $block_wrap_start; } @@ -50,7 +56,7 @@ extends HTMLPurifier_ChildDef_Required if (!$depth) { // starting tokens have been inline 
text / empty if ($token->type == 'start' || $token->type == 'empty') { - if ($def->info[$token->name]->type == 'block') { + if (isset($this->elements[$token->name])) { // ended $ret[] = $block_wrap_end; $is_inline = false; diff --git a/library/HTMLPurifier/Config.php b/library/HTMLPurifier/Config.php index 252fef5a..620a3534 100644 --- a/library/HTMLPurifier/Config.php +++ b/library/HTMLPurifier/Config.php @@ -149,23 +149,36 @@ class HTMLPurifier_Config return; } $this->conf[$namespace][$key] = $value; + if ($namespace == 'HTML' || $namespace == 'Attr') { + // reset HTML definition if relevant attributes changed + $this->html_definition = null; + } + if ($namespace == 'CSS') { + $this->css_definition = null; + } } /** - * Retrieves a copy of the HTML definition. + * Retrieves reference to the HTML definition. + * @param $raw Return a copy that has not been setup yet. Must be + * called before it's been setup, otherwise won't work. */ - function getHTMLDefinition() { - if ($this->html_definition === null) { - $this->html_definition = new HTMLPurifier_HTMLDefinition(); - $this->html_definition->setup($this); + function &getHTMLDefinition($raw = false) { + if ( + empty($this->html_definition) || // hasn't ever been setup + ($raw && $this->html_definition->setup) // requesting new one + ) { + $this->html_definition = new HTMLPurifier_HTMLDefinition($this); + if ($raw) return $this->html_definition; // no setup! 
} + if (!$this->html_definition->setup) $this->html_definition->setup(); return $this->html_definition; } /** - * Retrieves a copy of the CSS definition + * Retrieves reference to the CSS definition */ - function getCSSDefinition() { + function &getCSSDefinition() { if ($this->css_definition === null) { $this->css_definition = new HTMLPurifier_CSSDefinition(); $this->css_definition->setup($this); diff --git a/library/HTMLPurifier/ConfigDef.php b/library/HTMLPurifier/ConfigDef.php new file mode 100644 index 00000000..b92640dc --- /dev/null +++ b/library/HTMLPurifier/ConfigDef.php @@ -0,0 +1,10 @@ + \ No newline at end of file diff --git a/library/HTMLPurifier/ConfigDef/Directive.php b/library/HTMLPurifier/ConfigDef/Directive.php new file mode 100644 index 00000000..39026540 --- /dev/null +++ b/library/HTMLPurifier/ConfigDef/Directive.php @@ -0,0 +1,74 @@ +type = $type; + if ($descriptions !== null) $this->descriptions = $descriptions; + if ( $allow_null !== null) $this->allow_null = $allow_null; + if ( $allowed !== null) $this->allowed = $allowed; + if ( $aliases !== null) $this->aliases = $aliases; + } + + /** + * Allowed type of the directive. Values are: + * - string + * - istring (case insensitive string) + * - int + * - float + * - bool + * - lookup (array of value => true) + * - list (regular numbered index array) + * - hash (array of key => value) + * - mixed (anything goes) + */ + var $type = 'mixed'; + + /** + * Plaintext descriptions of the configuration entity is. Organized by + * file and line number, so multiple descriptions are allowed. + */ + var $descriptions = array(); + + /** + * Is null allowed? Has no effect for mixed type. + * @bool + */ + var $allow_null = false; + + /** + * Lookup table of allowed values of the element, bool true if all allowed. + */ + var $allowed = true; + + /** + * Hash of value aliases, i.e. values that are equivalent. 
+ */ + var $aliases = array(); + + /** + * Adds a description to the array + */ + function addDescription($file, $line, $description) { + if (!isset($this->descriptions[$file])) $this->descriptions[$file] = array(); + $this->descriptions[$file][$line] = $description; + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/ConfigDef/DirectiveAlias.php b/library/HTMLPurifier/ConfigDef/DirectiveAlias.php new file mode 100644 index 00000000..81a44514 --- /dev/null +++ b/library/HTMLPurifier/ConfigDef/DirectiveAlias.php @@ -0,0 +1,27 @@ +namespace = $namespace; + $this->name = $name; + } +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/ConfigDef/Namespace.php b/library/HTMLPurifier/ConfigDef/Namespace.php new file mode 100644 index 00000000..f53892b4 --- /dev/null +++ b/library/HTMLPurifier/ConfigDef/Namespace.php @@ -0,0 +1,23 @@ +description = $description; + } + + var $class = 'namespace'; + + /** + * String description of what kinds of directives go in this namespace. 
+ */ + var $description; + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/ConfigSchema.php b/library/HTMLPurifier/ConfigSchema.php index 301f2d1b..76c5635e 100644 --- a/library/HTMLPurifier/ConfigSchema.php +++ b/library/HTMLPurifier/ConfigSchema.php @@ -1,6 +1,10 @@ info[$namespace][$name] = - new HTMLPurifier_ConfigEntity_Directive(); + new HTMLPurifier_ConfigDef_Directive(); $def->info[$namespace][$name]->type = $type; $def->info[$namespace][$name]->allow_null = $allow_null; $def->defaults[$namespace][$name] = $default; @@ -172,7 +176,7 @@ class HTMLPurifier_ConfigSchema { return; } $def->info[$namespace] = array(); - $def->info_namespace[$namespace] = new HTMLPurifier_ConfigEntity_Namespace(); + $def->info_namespace[$namespace] = new HTMLPurifier_ConfigDef_Namespace(); $def->info_namespace[$namespace]->description = $description; $def->defaults[$namespace] = array(); } @@ -284,7 +288,7 @@ class HTMLPurifier_ConfigSchema { return; } $def->info[$namespace][$name] = - new HTMLPurifier_ConfigEntity_DirectiveAlias( + new HTMLPurifier_ConfigDef_DirectiveAlias( $new_namespace, $new_name); } @@ -379,120 +383,4 @@ class HTMLPurifier_ConfigSchema { } } -/** - * Base class for configuration entity - */ -class HTMLPurifier_ConfigEntity { - var $class = false; -} - -/** - * Structure object describing of a namespace - */ -class HTMLPurifier_ConfigEntity_Namespace extends HTMLPurifier_ConfigEntity { - - function HTMLPurifier_ConfigEntity_Namespace($description = null) { - $this->description = $description; - } - - var $class = 'namespace'; - - /** - * String description of what kinds of directives go in this namespace. - */ - var $description; - -} - -/** - * Structure object containing definition of a directive. 
- * @note This structure does not contain default values - */ -class HTMLPurifier_ConfigEntity_Directive extends HTMLPurifier_ConfigEntity -{ - - var $class = 'directive'; - - function HTMLPurifier_ConfigEntity_Directive( - $type = null, - $descriptions = null, - $allow_null = null, - $allowed = null, - $aliases = null - ) { - if ( $type !== null) $this->type = $type; - if ($descriptions !== null) $this->descriptions = $descriptions; - if ( $allow_null !== null) $this->allow_null = $allow_null; - if ( $allowed !== null) $this->allowed = $allowed; - if ( $aliases !== null) $this->aliases = $aliases; - } - - /** - * Allowed type of the directive. Values are: - * - string - * - istring (case insensitive string) - * - int - * - float - * - bool - * - lookup (array of value => true) - * - list (regular numbered index array) - * - hash (array of key => value) - * - mixed (anything goes) - */ - var $type = 'mixed'; - - /** - * Plaintext descriptions of the configuration entity is. Organized by - * file and line number, so multiple descriptions are allowed. - */ - var $descriptions = array(); - - /** - * Is null allowed? Has no effect for mixed type. - * @bool - */ - var $allow_null = false; - - /** - * Lookup table of allowed values of the element, bool true if all allowed. - */ - var $allowed = true; - - /** - * Hash of value aliases, i.e. values that are equivalent. 
- */ - var $aliases = array(); - - /** - * Adds a description to the array - */ - function addDescription($file, $line, $description) { - if (!isset($this->descriptions[$file])) $this->descriptions[$file] = array(); - $this->descriptions[$file][$line] = $description; - } - -} - -/** - * Structure object describing a directive alias - */ -class HTMLPurifier_ConfigEntity_DirectiveAlias extends HTMLPurifier_ConfigEntity -{ - var $class = 'alias'; - - /** - * Namespace being aliased to - */ - var $namespace; - /** - * Directive being aliased to - */ - var $name; - - function HTMLPurifier_ConfigEntity_DirectiveAlias($namespace, $name) { - $this->namespace = $namespace; - $this->name = $name; - } -} - ?> diff --git a/library/HTMLPurifier/ContentSets.php b/library/HTMLPurifier/ContentSets.php new file mode 100644 index 00000000..de5c532e --- /dev/null +++ b/library/HTMLPurifier/ContentSets.php @@ -0,0 +1,148 @@ + true) indexed by name. + * @note This is in HTMLPurifier_HTMLDefinition->info_content_sets + * @public + */ + var $lookup = array(); + + /** + * Synchronized list of defined content sets (keys of info) + */ + var $keys = array(); + /** + * Synchronized list of defined content values (values of info) + */ + var $values = array(); + + /** + * Merges in module's content sets, expands identifiers in the content + * sets and populates the keys, values and lookup member variables. + * @param $modules List of HTMLPurifier_HTMLModule + */ + function HTMLPurifier_ContentSets($modules) { + if (!is_array($modules)) $modules = array($modules); + // populate content_sets based on module hints + // sorry, no way of overloading + foreach ($modules as $module_i => $module) { + foreach ($module->content_sets as $key => $value) { + if (isset($this->info[$key])) { + // add it into the existing content set + $this->info[$key] = $this->info[$key] . ' | ' . 
$value; + } else { + $this->info[$key] = $value; + } + } + } + // perform content_set expansions + $this->keys = array_keys($this->info); + foreach ($this->info as $i => $set) { + // only performed once, so infinite recursion is not + // a problem + $this->info[$i] = + str_replace( + $this->keys, + // must be recalculated each time due to + // changing substitutions + array_values($this->info), + $set); + } + $this->values = array_values($this->info); + + // generate lookup tables + foreach ($this->info as $name => $set) { + $this->lookup[$name] = $this->convertToLookup($set); + } + } + + /** + * Accepts a definition; generates and assigns a ChildDef for it + * @param $def HTMLPurifier_ElementDef reference + * @param $module Module that defined the ElementDef + */ + function generateChildDef(&$def, $module) { + if (!empty($def->child)) return; // already done! + $content_model = $def->content_model; + if (is_string($content_model)) { + $def->content_model = str_replace( + $this->keys, $this->values, $content_model); + } + $def->child = $this->getChildDef($def, $module); + } + + /** + * Instantiates a ChildDef based on content_model and content_model_type + * member variables in HTMLPurifier_ElementDef + * @note This will also defer to modules for custom HTMLPurifier_ChildDef + * subclasses that need content set expansion + * @param $def HTMLPurifier_ElementDef to have ChildDef extracted + * @return HTMLPurifier_ChildDef corresponding to ElementDef + */ + function getChildDef($def, $module) { + $value = $def->content_model; + if (is_object($value)) { + trigger_error( + 'Literal object child definitions should be stored in '. 
+ 'ElementDef->child not ElementDef->content_model', + E_USER_NOTICE + ); + return $value; + } + switch ($def->content_model_type) { + case 'required': + return new HTMLPurifier_ChildDef_Required($value); + case 'optional': + return new HTMLPurifier_ChildDef_Optional($value); + case 'empty': + return new HTMLPurifier_ChildDef_Empty(); + case 'custom': + return new HTMLPurifier_ChildDef_Custom($value); + } + // defer to its module + $return = false; + if ($module->defines_child_def) { // save a func call + $return = $module->getChildDef($def); + } + if ($return !== false) return $return; + // error-out + trigger_error( + 'Could not determine which ChildDef class to instantiate', + E_USER_ERROR + ); + return false; + } + + /** + * Converts a string list of elements separated by pipes into + * a lookup array. + * @param $string List of elements + * @return Lookup array of elements + */ + function convertToLookup($string) { + $array = explode('|', str_replace(' ', '', $string)); + $ret = array(); + foreach ($array as $i => $k) { + $ret[$k] = true; + } + return $ret; + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/ElementDef.php b/library/HTMLPurifier/ElementDef.php new file mode 100644 index 00000000..21bc5f36 --- /dev/null +++ b/library/HTMLPurifier/ElementDef.php @@ -0,0 +1,122 @@ +setup(), this array may also + * contain an array at index 0 that indicates which attribute + * collections to load into the full array. It may also + * contain string indentifiers in lieu of HTMLPurifier_AttrDef, + * see HTMLPurifier_AttrTypes on how they are expanded during + * HTMLPurifier_HTMLDefinition->setup() processing. 
+ * @public + */ + var $attr = array(); + + /** + * Indexed list of tag's HTMLPurifier_AttrTransform to be done before validation + * @public + */ + var $attr_transform_pre = array(); + + /** + * Indexed list of tag's HTMLPurifier_AttrTransform to be done after validation + * @public + */ + var $attr_transform_post = array(); + + + + /** + * HTMLPurifier_ChildDef of this tag. + * @public + */ + var $child; + + /** + * Abstract string representation of internal ChildDef rules. See + * HTMLPurifier_ContentSets for how this is parsed and then transformed + * into an HTMLPurifier_ChildDef. + * @public + */ + var $content_model; + + /** + * Value of $child->type, used to determine which ChildDef to use, + * used in combination with $content_model. + * @public + */ + var $content_model_type; + + + + /** + * Lookup table of tags that close this tag. Used during parsing + * to make sure we don't attempt to nest unclosed tags. + * @public + */ + var $auto_close = array(); + + /** + * Does the element have a content model (#PCDATA | Inline)*? This + * is important for chameleon ins and del processing in + * HTMLPurifier_ChildDef_Chameleon. Dynamically set: modules don't + * have to worry about this one. + * @public + */ + var $descendants_are_inline; + + /** + * Lookup table of tags excluded from all descendants of this tag. + * @public + */ + var $excludes = array(); + + /** + * Merges the values of another element definition into this one. + * Values from the new element def take precedence if a value is + * not mergeable. 
+     * @param $def HTMLPurifier_ElementDef to merge into this definition
+     */
+    function mergeIn($def) {
+
+        // later keys take precedence
+        foreach ($def->attr as $k => $v) {
+            // Identity check on purpose: in PHP 4 through 7 a loose == 0
+            // also holds for non-numeric string keys such as 'href', which
+            // would send every named attribute into the include branch
+            // instead of copying it.
+            if ($k === 0) {
+                // merge in the includes
+                // sorry, no way to override an include
+                foreach ($v as $v2) {
+                    // the includes have to land on this definition;
+                    // appending to $def leaves $this unchanged
+                    $this->attr[0][] = $v2;
+                }
+                continue;
+            }
+            $this->attr[$k] = $v;
+        }
+        foreach ($def->attr_transform_pre as $k => $v) $this->attr_transform_pre[$k] = $v;
+        foreach ($def->attr_transform_post as $k => $v) $this->attr_transform_post[$k] = $v;
+        foreach ($def->auto_close as $k => $v) $this->auto_close[$k] = $v;
+        foreach ($def->excludes as $k => $v) $this->excludes[$k] = $v;
+
+        if (!is_null($def->child)) $this->child = $def->child;
+        if (!empty($def->content_model)) {
+            if (empty($this->content_model)) {
+                // nothing to append to: avoid a dangling ' | ' prefix
+                $this->content_model = $def->content_model;
+            } else {
+                $this->content_model .= ' | ' . $def->content_model;
+            }
+        }
+        if (!empty($def->content_model_type)) $this->content_model_type = $def->content_model_type;
+        if (!is_null($def->descendants_are_inline)) $this->descendants_are_inline = $def->descendants_are_inline;
+
+    }
+
+}
+
+?>
diff --git a/library/HTMLPurifier/Filter/YouTube.php b/library/HTMLPurifier/Filter/YouTube.php index 1fd7eb08..433f17cf 100644 --- a/library/HTMLPurifier/Filter/YouTube.php +++ b/library/HTMLPurifier/Filter/YouTube.php @@ -9,7 +9,7 @@ class HTMLPurifier_Filter_YouTube extends HTMLPurifier_Filter function preFilter($html, $config, &$context) { $pre_regex = '#]+>.+?'. - 'http://www.youtube.com/v/([A-Za-z0-9\-_]+).+?#'; + 'http://www.youtube.com/v/([A-Za-z0-9\-_]+).+?#s'; $pre_replace = '\1'; return preg_replace($pre_regex, $pre_replace, $html); } diff --git a/library/HTMLPurifier/HTMLDefinition.php b/library/HTMLPurifier/HTMLDefinition.php index 058d40f4..6fca7fb7 100644 --- a/library/HTMLPurifier/HTMLDefinition.php +++ b/library/HTMLPurifier/HTMLDefinition.php @@ -1,656 +1,250 @@ -<blockquote>Foo</blockquote> '. - 'would become <blockquote><p>Foo</p></blockquote>. The '. - '<p> tags can be replaced '. - 'with whatever you desire, as long as it is a block level element. '. - 'This directive has been available since 1.3.0.'
-); - -HTMLPurifier_ConfigSchema::define( - 'HTML', 'Parent', 'div', 'string', - 'String name of element that HTML fragment passed to library will be '. - 'inserted in. An interesting variation would be using span as the '. - 'parent element, meaning that only inline tags would be allowed. '. - 'This directive has been available since 1.3.0.' -); - -HTMLPurifier_ConfigSchema::define( - 'HTML', 'AllowedElements', null, 'lookup/null', - 'If HTML Purifier\'s tag set is unsatisfactory for your needs, you '. - 'can overload it with your own list of tags to allow. Note that this '. - 'method is subtractive: it does its job by taking away from HTML Purifier '. - 'usual feature set, so you cannot add a tag that HTML Purifier never '. - 'supported in the first place (like embed, form or head). If you change this, you '. - 'probably also want to change %HTML.AllowedAttributes. '. - 'Warning: If another directive conflicts with the '. - 'elements here, that directive will win and override. '. - 'This directive has been available since 1.3.0.' -); - -HTMLPurifier_ConfigSchema::define( - 'HTML', 'AllowedAttributes', null, 'lookup/null', - 'IF HTML Purifier\'s attribute set is unsatisfactory, overload it! '. - 'The syntax is \'tag.attr\' or \'*.attr\' for the global attributes '. - '(style, id, class, dir, lang, xml:lang).'. - 'Warning: If another directive conflicts with the '. - 'elements here, that directive will win and override. For '. - 'example, %HTML.EnableAttrID will take precedence over *.id in this '. - 'directive. You must set that directive to true before you can use '. - 'IDs at all. This directive has been available since 1.3.0.' -); - -HTMLPurifier_ConfigSchema::define( - 'Attr', 'DisableURI', false, 'bool', - 'Disables all URIs in all forms. Not sure why you\'d want to do that '. - '(after all, the Internet\'s founded on the notion of a hyperlink). '. - 'This directive has been available since 1.3.0.' 
-); - -/** - * Defines the purified HTML type with large amounts of objects. - * - * The main function of this object is its $info array, which is an - * associative array of all the child and attribute definitions for - * each allowed element. It also contains special use information (always - * prefixed by info) for intelligent tag closing and global attributes. - * - * For optimization, the definition generation may be moved to - * a maintenance script and stipulate that definition be created - * by a factory method that unserializes a serialized version of Definition. - * Customization would entail copying the maintenance script, making the - * necessary changes, generating the serialized object, and then hooking it - * in via the factory method. We would also offer a LiveDefinition for - * automatic recompilation, suggesting that we would have a DefinitionGenerator. - */ - -class HTMLPurifier_HTMLDefinition -{ - - /** - * Associative array of element names to HTMLPurifier_ElementDef - * @public - */ - var $info = array(); - - /** - * Associative array of global attribute name to attribute definition. - * @public - */ - var $info_global_attr = array(); - - /** - * String name of parent element HTML will be going into. - * @public - */ - var $info_parent = 'div'; - - /** - * Definition for parent element, allows parent element to be a - * tag that's not allowed inside the HTML fragment. - * @public - */ - var $info_parent_def; - - /** - * String name of element used to wrap inline elements in block context - * @note This is rarely used except for BLOCKQUOTEs in strict mode - * @public - */ - var $info_block_wrapper = 'p'; - - /** - * Associative array of deprecated tag name to HTMLPurifier_TagTransform - * @public - */ - var $info_tag_transform = array(); - - /** - * List of HTMLPurifier_AttrTransform to be performed before validation. 
- * @public - */ - var $info_attr_transform_pre = array(); - - /** - * List of HTMLPurifier_AttrTransform to be performed after validation/ - * @public - */ - var $info_attr_transform_post = array(); - - /** - * Lookup table of flow elements - * @public - */ - var $info_flow_elements = array(); - - /** - * Boolean is a strict definition? - * @public - */ - var $strict; - - /** - * Initializes the definition, the meat of the class. - */ - function setup($config) { - - // some cached config values - $this->strict = $config->get('HTML', 'Strict'); - - ////////////////////////////////////////////////////////////////////// - // info[] : initializes the definition objects - - // if you attempt to define rules later on for a tag not in this array - // PHP will create an stdclass - - $allowed_tags = - array( - 'ins', 'del', 'blockquote', 'dd', 'li', 'div', 'em', 'strong', - 'dfn', 'code', 'samp', 'kbd', 'var', 'cite', 'abbr', 'acronym', - 'q', 'sub', 'tt', 'sup', 'i', 'b', 'big', 'small', - 'bdo', 'span', 'dt', 'p', 'h1', 'h2', 'h3', 'h4', - 'h5', 'h6', 'ol', 'ul', 'dl', 'address', 'img', 'br', 'hr', - 'pre', 'a', 'table', 'caption', 'thead', 'tfoot', 'tbody', - 'colgroup', 'col', 'td', 'th', 'tr' - ); - - if (!$this->strict) { - $allowed_tags[] = 'u'; - $allowed_tags[] = 's'; - $allowed_tags[] = 'strike'; - } - - foreach ($allowed_tags as $tag) { - $this->info[$tag] = new HTMLPurifier_ElementDef(); - } - - ////////////////////////////////////////////////////////////////////// - // info[]->child : defines allowed children for elements - - // emulates the structure of the DTD - // however, these are condensed, with bad stuff taken out - // screening process was done by hand - - // entities: prefixed with e_ and _ replaces . 
from DTD - // double underlines are entities we made up - - // we don't use an array because that complicates interpolation - // strings are used instead of arrays because if you use arrays, - // you have to do some hideous manipulation with array_merge() - - // todo: determine whether or not having allowed children - // that aren't allowed globally affects security (it shouldn't) - // if above works out, extend children definitions to include all - // possible elements (allowed elements will dictate which ones - // get dropped - - $e_special_extra = 'img'; - $e_special_basic = 'br | span | bdo'; - $e_special = "$e_special_basic | $e_special_extra"; - $e_fontstyle_extra = 'big | small'; - $e_fontstyle_basic = 'tt | i | b | u | s | strike'; - $e_fontstyle = "$e_fontstyle_basic | $e_fontstyle_extra"; - $e_phrase_extra = 'sub | sup'; - $e_phrase_basic = 'em | strong | dfn | code | q | samp | kbd | var'. - ' | cite | abbr | acronym'; - $e_phrase = "$e_phrase_basic | $e_phrase_extra"; - $e_misc_inline = 'ins | del'; - $e_misc = "$e_misc_inline"; - $e_inline = "a | $e_special | $e_fontstyle | $e_phrase"; - // pseudo-property we created for convenience, see later on - $e__inline = "#PCDATA | $e_inline | $e_misc_inline"; - // note the casing - $e_Inline = new HTMLPurifier_ChildDef_Optional($e__inline); - $e_heading = 'h1|h2|h3|h4|h5|h6'; - $e_lists = 'ul | ol | dl'; - $e_blocktext = 'pre | hr | blockquote | address'; - $e_block = "p | $e_heading | div | $e_lists | $e_blocktext | table"; - $e_Block = new HTMLPurifier_ChildDef_Optional($e_block); - $e__flow = "#PCDATA | $e_block | $e_inline | $e_misc"; - $e_Flow = new HTMLPurifier_ChildDef_Optional($e__flow); - $e_a_content = new HTMLPurifier_ChildDef_Optional("#PCDATA". - " | $e_special | $e_fontstyle | $e_phrase | $e_misc_inline"); - $e_pre_content = new HTMLPurifier_ChildDef_Optional("#PCDATA | a". - " | $e_special_basic | $e_fontstyle_basic | $e_phrase_basic". 
- " | $e_misc_inline"); - $e_form_content = new HTMLPurifier_ChildDef_Optional('');//unused - $e_form_button_content = new HTMLPurifier_ChildDef_Optional('');//unused - - $this->info['ins']->child = - $this->info['del']->child = - new HTMLPurifier_ChildDef_Chameleon($e__inline, $e__flow); - - $this->info['dd']->child = - $this->info['li']->child = - $this->info['div']->child = $e_Flow; - - if ($this->strict) { - $this->info['blockquote']->child = new HTMLPurifier_ChildDef_StrictBlockquote(); - } else { - $this->info['blockquote']->child = $e_Flow; - } - - $this->info['caption']->child = - $this->info['em']->child = - $this->info['strong']->child = - $this->info['dfn']->child = - $this->info['code']->child = - $this->info['samp']->child = - $this->info['kbd']->child = - $this->info['var']->child = - $this->info['cite']->child = - $this->info['abbr']->child = - $this->info['acronym']->child = - $this->info['q']->child = - $this->info['sub']->child = - $this->info['tt']->child = - $this->info['sup']->child = - $this->info['i']->child = - $this->info['b']->child = - $this->info['big']->child = - $this->info['small']->child= - $this->info['bdo']->child = - $this->info['span']->child = - $this->info['dt']->child = - $this->info['p']->child = - $this->info['h1']->child = - $this->info['h2']->child = - $this->info['h3']->child = - $this->info['h4']->child = - $this->info['h5']->child = - $this->info['h6']->child = $e_Inline; - - if (!$this->strict) { - $this->info['u']->child = - $this->info['s']->child = - $this->info['strike']->child = $e_Inline; - } - - // the only three required definitions, besides custom table code - $this->info['ol']->child = - $this->info['ul']->child = new HTMLPurifier_ChildDef_Required('li'); - - $this->info['dl']->child = new HTMLPurifier_ChildDef_Required('dt|dd'); - - if ($this->strict) { - $this->info['address']->child = $e_Inline; - } else { - $this->info['address']->child = - new HTMLPurifier_ChildDef_Optional("#PCDATA | p | $e_inline". 
- " | $e_misc_inline"); - } - - $this->info['img']->child = - $this->info['br']->child = - $this->info['hr']->child = new HTMLPurifier_ChildDef_Empty(); - - $this->info['pre']->child = $e_pre_content; - - $this->info['a']->child = $e_a_content; - - $this->info['table']->child = new HTMLPurifier_ChildDef_Table(); - - // not a real entity, watch the double underscore - $e__row = new HTMLPurifier_ChildDef_Required('tr'); - $this->info['thead']->child = $e__row; - $this->info['tfoot']->child = $e__row; - $this->info['tbody']->child = $e__row; - $this->info['colgroup']->child = new HTMLPurifier_ChildDef_Optional('col'); - $this->info['col']->child = new HTMLPurifier_ChildDef_Empty(); - $this->info['tr']->child = new HTMLPurifier_ChildDef_Required('th | td'); - $this->info['th']->child = $e_Flow; - $this->info['td']->child = $e_Flow; - - ////////////////////////////////////////////////////////////////////// - // info[]->type : defines the type of the element (block or inline) - - // reuses $e_Inline and $e_Block - foreach ($e_Inline->elements as $name => $bool) { - if ($name == '#PCDATA') continue; - if (!isset($this->info[$name])) continue; - $this->info[$name]->type = 'inline'; - } - - foreach ($e_Block->elements as $name => $bool) { - if (!isset($this->info[$name])) continue; - $this->info[$name]->type = 'block'; - } - - foreach ($e_Flow->elements as $name => $bool) { - $this->info_flow_elements[$name] = true; - } - - ////////////////////////////////////////////////////////////////////// - // info[]->excludes : defines elements that aren't allowed in here - - // make sure you test using isset() and not !empty() - - $this->info['a']->excludes = array('a' => true); - $this->info['pre']->excludes = array_flip(array('img', 'big', 'small', - // technically useless, but good to be indepth - 'object', 'applet', 'font', 'basefont')); - - ////////////////////////////////////////////////////////////////////// - // info[]->attr : defines allowed attributes for elements - - // 
this doesn't include REQUIRED declarations, those are handled - // by the transform classes. It will, however, do simple and slightly - // complex attribute value substitution - - // the question of varying allowed attributes is more entangling. - - $e_Text = new HTMLPurifier_AttrDef_Text(); - - // attrs, included in almost every single one except for a few, - // which manually override these in their local definitions - $this->info_global_attr = array( - // core attrs - 'class' => new HTMLPurifier_AttrDef_Class(), - 'title' => $e_Text, - 'style' => new HTMLPurifier_AttrDef_CSS(), - // i18n - 'dir' => new HTMLPurifier_AttrDef_Enum(array('ltr','rtl'), false), - 'lang' => new HTMLPurifier_AttrDef_Lang(), - 'xml:lang' => new HTMLPurifier_AttrDef_Lang(), - ); - - if ($config->get('HTML', 'EnableAttrID')) { - $this->info_global_attr['id'] = new HTMLPurifier_AttrDef_ID(); - } - - // required attribute stipulation handled in attribute transformation - $this->info['bdo']->attr = array(); // nothing else - - $this->info['br']->attr['dir'] = false; - $this->info['br']->attr['lang'] = false; - $this->info['br']->attr['xml:lang'] = false; - - $this->info['td']->attr['abbr'] = $e_Text; - $this->info['th']->attr['abbr'] = $e_Text; - - $this->setAttrForTableElements('align', new HTMLPurifier_AttrDef_Enum( - array('left', 'center', 'right', 'justify', 'char'), false)); - - $this->setAttrForTableElements('valign', new HTMLPurifier_AttrDef_Enum( - array('top', 'middle', 'bottom', 'baseline'), false)); - - $this->info['img']->attr['alt'] = $e_Text; - - $e_TFrame = new HTMLPurifier_AttrDef_Enum(array('void', 'above', - 'below', 'hsides', 'lhs', 'rhs', 'vsides', 'box', 'border'), false); - $this->info['table']->attr['frame'] = $e_TFrame; - - $e_TRules = new HTMLPurifier_AttrDef_Enum(array('none', 'groups', - 'rows', 'cols', 'all'), false); - $this->info['table']->attr['rules'] = $e_TRules; - - $this->info['table']->attr['summary'] = $e_Text; - - $this->info['table']->attr['border'] = - 
new HTMLPurifier_AttrDef_Pixels(); - - $e_Length = new HTMLPurifier_AttrDef_Length(); - $this->info['table']->attr['cellpadding'] = - $this->info['table']->attr['cellspacing'] = - $this->info['table']->attr['width'] = - $this->info['img']->attr['height'] = - $this->info['img']->attr['width'] = $e_Length; - $this->setAttrForTableElements('charoff', $e_Length); - - $e_MultiLength = new HTMLPurifier_AttrDef_MultiLength(); - $this->info['col']->attr['width'] = - $this->info['colgroup']->attr['width'] = $e_MultiLength; - - $e__NumberSpan = new HTMLPurifier_AttrDef_Integer(false, false, true); - $this->info['colgroup']->attr['span'] = - $this->info['col']->attr['span'] = - $this->info['td']->attr['rowspan'] = - $this->info['th']->attr['rowspan'] = - $this->info['td']->attr['colspan'] = - $this->info['th']->attr['colspan'] = $e__NumberSpan; - - if (!$config->get('Attr', 'DisableURI')) { - $e_URI = new HTMLPurifier_AttrDef_URI(); - $this->info['a']->attr['href'] = - $this->info['img']->attr['longdesc'] = - $this->info['del']->attr['cite'] = - $this->info['ins']->attr['cite'] = - $this->info['blockquote']->attr['cite'] = - $this->info['q']->attr['cite'] = $e_URI; - - // URI that causes HTTP request - $this->info['img']->attr['src'] = new HTMLPurifier_AttrDef_URI(true); - } - - if (!$this->strict) { - $this->info['li']->attr['value'] = new HTMLPurifier_AttrDef_Integer(); - $this->info['ol']->attr['start'] = new HTMLPurifier_AttrDef_Integer(); - } - - ////////////////////////////////////////////////////////////////////// - // info_tag_transform : transformations of tags - - $this->info_tag_transform['font'] = new HTMLPurifier_TagTransform_Font(); - $this->info_tag_transform['menu'] = new HTMLPurifier_TagTransform_Simple('ul'); - $this->info_tag_transform['dir'] = new HTMLPurifier_TagTransform_Simple('ul'); - $this->info_tag_transform['center'] = new HTMLPurifier_TagTransform_Center(); - - ////////////////////////////////////////////////////////////////////// - // 
info[]->auto_close : tags that automatically close another - - // todo: determine whether or not SGML-like modeling based on - // mandatory/optional end tags would be a better policy - - // make sure you test using isset() not !empty() - - // these are all block elements: blocks aren't allowed in P - $this->info['p']->auto_close = array_flip(array( - 'address', 'blockquote', 'dd', 'dir', 'div', 'dl', 'dt', - 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'ol', 'p', 'pre', - 'table', 'ul' - )); - - $this->info['li']->auto_close = array('li' => true); - - // we need TABLE and heading mismatch code - // we may need to make this more flexible for heading mismatch, - // or we can just create another info - - ////////////////////////////////////////////////////////////////////// - // info[]->attr_transform_* : attribute transformations in elements - // pre is applied before any validation is done, post is done after - - $this->info['h1']->attr_transform_pre[] = - $this->info['h2']->attr_transform_pre[] = - $this->info['h3']->attr_transform_pre[] = - $this->info['h4']->attr_transform_pre[] = - $this->info['h5']->attr_transform_pre[] = - $this->info['h6']->attr_transform_pre[] = - $this->info['p'] ->attr_transform_pre[] = - new HTMLPurifier_AttrTransform_TextAlign(); - - $this->info['bdo']->attr_transform_post[] = - new HTMLPurifier_AttrTransform_BdoDir(); - - $this->info['img']->attr_transform_post[] = - new HTMLPurifier_AttrTransform_ImgRequired(); - - ////////////////////////////////////////////////////////////////////// - // info_attr_transform_* : global attribute transformation that is - // unconditionally called. 
Good for transformations that have complex - // start conditions - // pre is applied before any validation is done, post is done after - - $this->info_attr_transform_post[] = new HTMLPurifier_AttrTransform_Lang(); - - // protect against stdclasses floating around - foreach ($this->info as $key => $obj) { - if ($obj instanceof stdClass) { - unset($this->info[$key]); - } - } - - ////////////////////////////////////////////////////////////////////// - // info_block_wrapper : wraps inline elements in block context - - $block_wrapper = $config->get('HTML', 'BlockWrapper'); - if (isset($e_Block->elements[$block_wrapper])) { - $this->info_block_wrapper = $block_wrapper; - } else { - trigger_error('Cannot use non-block element as block wrapper.', - E_USER_ERROR); - } - - ////////////////////////////////////////////////////////////////////// - // info_parent : parent element of the HTML fragment - - $parent = $config->get('HTML', 'Parent'); - if (isset($this->info[$parent])) { - $this->info_parent = $parent; - } else { - trigger_error('Cannot use unrecognized element as parent.', - E_USER_ERROR); - } - $this->info_parent_def = $this->info[$this->info_parent]; - - ////////////////////////////////////////////////////////////////////// - // %HTML.Allowed(Elements|Attributes) : cut non-allowed elements - - $allowed_elements = $config->get('HTML', 'AllowedElements'); - if (is_array($allowed_elements)) { - foreach ($this->info as $name => $d) { - if(!isset($allowed_elements[$name])) unset($this->info[$name]); - } - } - $allowed_attributes = $config->get('HTML', 'AllowedAttributes'); - if (is_array($allowed_attributes)) { - foreach ($this->info_global_attr as $attr_key => $info) { - if (!isset($allowed_attributes["*.$attr_key"])) { - unset($this->info_global_attr[$attr_key]); - } - } - foreach ($this->info as $tag => $info) { - foreach ($info->attr as $attr => $attr_info) { - if (!isset($allowed_attributes["$tag.$attr"])) { - unset($this->info[$tag]->attr[$attr]); - } - } - } - } 
- } - - function setAttrForTableElements($attr, $def) { - $this->info['col']->attr[$attr] = - $this->info['colgroup']->attr[$attr] = - $this->info['tbody']->attr[$attr] = - $this->info['td']->attr[$attr] = - $this->info['tfoot']->attr[$attr] = - $this->info['th']->attr[$attr] = - $this->info['thead']->attr[$attr] = - $this->info['tr']->attr[$attr] = $def; - } - -} - -/** - * Structure that stores an element definition. - */ -class HTMLPurifier_ElementDef -{ - - /** - * Associative array of attribute name to HTMLPurifier_AttrDef - * @public - */ - var $attr = array(); - - /** - * List of tag's HTMLPurifier_AttrTransform to be done before validation - * @public - */ - var $attr_transform_pre = array(); - - /** - * List of tag's HTMLPurifier_AttrTransform to be done after validation - * @public - */ - var $attr_transform_post = array(); - - /** - * Lookup table of tags that close this tag. - * @public - */ - var $auto_close = array(); - - /** - * HTMLPurifier_ChildDef of this tag. - * @public - */ - var $child; - - /** - * Type of the tag: inline or block or unknown? - * @public - */ - var $type = 'unknown'; - - /** - * Lookup table of tags excluded from all descendants of this tag. - * @public - */ - var $excludes = array(); - -} - -?> +<blockquote>Foo</blockquote> '. + 'would become <blockquote><p>Foo</p></blockquote>. The '. + '<p> tags can be replaced '. + 'with whatever you desire, as long as it is a block level element. '. + 'This directive has been available since 1.3.0.' +); + +HTMLPurifier_ConfigSchema::define( + 'HTML', 'Parent', 'div', 'string', + 'String name of element that HTML fragment passed to library will be '. + 'inserted in. An interesting variation would be using span as the '. + 'parent element, meaning that only inline tags would be allowed. '. + 'This directive has been available since 1.3.0.' 
+); + +HTMLPurifier_ConfigSchema::define( + 'HTML', 'AllowedElements', null, 'lookup/null', + 'If HTML Purifier\'s tag set is unsatisfactory for your needs, you '. + 'can overload it with your own list of tags to allow. Note that this '. + 'method is subtractive: it does its job by taking away from HTML Purifier '. + 'usual feature set, so you cannot add a tag that HTML Purifier never '. + 'supported in the first place (like embed, form or head). If you change this, you '. + 'probably also want to change %HTML.AllowedAttributes. '. + 'Warning: If another directive conflicts with the '. + 'elements here, that directive will win and override. '. + 'This directive has been available since 1.3.0.' +); + +HTMLPurifier_ConfigSchema::define( + 'HTML', 'AllowedAttributes', null, 'lookup/null', + 'IF HTML Purifier\'s attribute set is unsatisfactory, overload it! '. + 'The syntax is \'tag.attr\' or \'*.attr\' for the global attributes '. + '(style, id, class, dir, lang, xml:lang).'. + 'Warning: If another directive conflicts with the '. + 'elements here, that directive will win and override. For '. + 'example, %HTML.EnableAttrID will take precedence over *.id in this '. + 'directive. You must set that directive to true before you can use '. + 'IDs at all. This directive has been available since 1.3.0.' +); + +/** + * Definition of the purified HTML that describes allowed children, + * attributes, and many other things. + * + * Conventions: + * + * All member variables that are prefixed with info + * (including the main $info array) are used by HTML Purifier internals + * and should not be directly edited when customizing the HTMLDefinition. + * They can usually be set via configuration directives or custom + * modules. + * + * On the other hand, member variables without the info prefix are used + * internally by the HTMLDefinition and MUST NOT be used by other HTML + * Purifier internals. 
Many of them, however, are public, and may be + * edited by userspace code to tweak the behavior of HTMLDefinition. + * + * HTMLPurifier_Printer_HTMLDefinition is a notable exception to this + * rule: in the interest of comprehensiveness, it will sniff everything. + */ +class HTMLPurifier_HTMLDefinition +{ + + /** FULLY-PUBLIC VARIABLES */ + + /** + * Associative array of element names to HTMLPurifier_ElementDef + * @public + */ + var $info = array(); + + /** + * Associative array of global attribute name to attribute definition. + * @public + */ + var $info_global_attr = array(); + + /** + * String name of parent element HTML will be going into. + * @public + */ + var $info_parent = 'div'; + + /** + * Definition for parent element, allows parent element to be a + * tag that's not allowed inside the HTML fragment. + * @public + */ + var $info_parent_def; + + /** + * String name of element used to wrap inline elements in block context + * @note This is rarely used except for BLOCKQUOTEs in strict mode + * @public + */ + var $info_block_wrapper = 'p'; + + /** + * Associative array of deprecated tag name to HTMLPurifier_TagTransform + * @public + */ + var $info_tag_transform = array(); + + /** + * Indexed list of HTMLPurifier_AttrTransform to be performed before validation. + * @public + */ + var $info_attr_transform_pre = array(); + + /** + * Indexed list of HTMLPurifier_AttrTransform to be performed after validation. + * @public + */ + var $info_attr_transform_post = array(); + + /** + * Nested lookup array of content set name (Block, Inline) to + * element name to whether or not it belongs in that content set. + * @public + */ + var $info_content_sets = array(); + + + + /** PUBLIC BUT INTERNAL VARIABLES */ + + var $setup = false; /**< Has setup() been called yet? */ + var $config; /**< Temporary instance of HTMLPurifier_Config */ + + var $manager; /**< Instance of HTMLPurifier_HTMLModuleManager */ + + /** + * Performs low-cost, preliminary initialization. 
+ * @param $config Instance of HTMLPurifier_Config + */ + function HTMLPurifier_HTMLDefinition(&$config) { + $this->config =& $config; + $this->manager = new HTMLPurifier_HTMLModuleManager(); + } + + /** + * Processes internals into form usable by HTMLPurifier internals. + * Modifying the definition after calling this function should not + * be done. + */ + function setup() { + + // multiple call guard + if ($this->setup) {return;} else {$this->setup = true;} + + $this->processModules(); + $this->setupConfigStuff(); + + unset($this->config); + unset($this->manager); + + } + + /** + * Extract out the information from the manager + */ + function processModules() { + + $this->manager->setup($this->config); + + foreach ($this->manager->activeModules as $module) { + foreach($module->info_tag_transform as $k => $v) $this->info_tag_transform[$k] = $v; + foreach($module->info_attr_transform_pre as $k => $v) $this->info_attr_transform_pre[$k] = $v; + foreach($module->info_attr_transform_post as $k => $v) $this->info_attr_transform_post[$k]= $v; + } + + $this->info = $this->manager->getElements($this->config); + $this->info_content_sets = $this->manager->contentSets->lookup; + + } + + /** + * Sets up stuff based on config. We need a better way of doing this. 
+ */ + function setupConfigStuff() { + + $block_wrapper = $this->config->get('HTML', 'BlockWrapper'); + if (isset($this->info_content_sets['Block'][$block_wrapper])) { + $this->info_block_wrapper = $block_wrapper; + } else { + trigger_error('Cannot use non-block element as block wrapper.', + E_USER_ERROR); + } + + $parent = $this->config->get('HTML', 'Parent'); + $def = $this->manager->getElement($parent, $this->config); + if ($def) { + $this->info_parent = $parent; + $this->info_parent_def = $def; + } else { + trigger_error('Cannot use unrecognized element as parent.', + E_USER_ERROR); + $this->info_parent_def = $this->manager->getElement( + $this->info_parent, $this->config); + } + + // setup allowed elements, SubtractiveWhitelist module + $allowed_elements = $this->config->get('HTML', 'AllowedElements'); + if (is_array($allowed_elements)) { + foreach ($this->info as $name => $d) { + if(!isset($allowed_elements[$name])) unset($this->info[$name]); + } + } + $allowed_attributes = $this->config->get('HTML', 'AllowedAttributes'); + if (is_array($allowed_attributes)) { + foreach ($this->info_global_attr as $attr_key => $info) { + if (!isset($allowed_attributes["*.$attr_key"])) { + unset($this->info_global_attr[$attr_key]); + } + } + foreach ($this->info as $tag => $info) { + foreach ($info->attr as $attr => $attr_info) { + if (!isset($allowed_attributes["$tag.$attr"]) && + !isset($allowed_attributes["*.$attr"])) { + unset($this->info[$tag]->attr[$attr]); + } + } + } + } + + } + + +} + +?> diff --git a/library/HTMLPurifier/HTMLModule.php b/library/HTMLPurifier/HTMLModule.php new file mode 100644 index 00000000..930b605d --- /dev/null +++ b/library/HTMLPurifier/HTMLModule.php @@ -0,0 +1,125 @@ +info, since the object's data is only info, + * with extra behavior associated with it. 
+ * @public + */ + var $attr_collections = array(); + + /** + * Associative array of deprecated tag name to HTMLPurifier_TagTransform + * @public + */ + var $info_tag_transform = array(); + + /** + * List of HTMLPurifier_AttrTransform to be performed before validation. + * @public + */ + var $info_attr_transform_pre = array(); + + /** + * List of HTMLPurifier_AttrTransform to be performed after validation. + * @public + */ + var $info_attr_transform_post = array(); + + /** + * Boolean flag that indicates whether or not getChildDef is implemented. + * For optimization reasons: may save a call to a function. Be sure + * to set it if you do implement getChildDef(), otherwise it will have + * no effect! + * @public + */ + var $defines_child_def = false; + + /** + * Retrieves a proper HTMLPurifier_ChildDef subclass based on + * content_model and content_model_type member variables of + * the HTMLPurifier_ElementDef class. There is a similar function + * in HTMLPurifier_HTMLDefinition. + * @param $def HTMLPurifier_ElementDef instance + * @return HTMLPurifier_ChildDef subclass + * @public + */ + function getChildDef($def) {return false;} + + /** + * Hook method that lets module perform arbitrary operations on + * HTMLPurifier_HTMLDefinition before the module gets processed. + * @param $definition Reference to HTMLDefinition being setup + */ + function preProcess(&$definition) {} + + /** + * Hook method that lets module perform arbitrary operations + * on HTMLPurifier_HTMLDefinition after the module gets processed. + * @param $definition Reference to HTMLDefinition being setup + */ + function postProcess(&$definition) {} + + /** + * Hook method that is called when a module gets registered to + * the definition. 
+ * @param $definition Reference to HTMLDefinition being setup + */ + function setup(&$definition) {} + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/HTMLModule/Bdo.php b/library/HTMLPurifier/HTMLModule/Bdo.php new file mode 100644 index 00000000..17e5e987 --- /dev/null +++ b/library/HTMLPurifier/HTMLModule/Bdo.php @@ -0,0 +1,43 @@ + 'bdo'); + var $attr_collections = array( + 'I18N' => array('dir' => false) + ); + + function HTMLPurifier_HTMLModule_Bdo() { + $dir = new HTMLPurifier_AttrDef_Enum(array('ltr','rtl'), false); + $this->attr_collections['I18N']['dir'] = $dir; + $this->info['bdo'] = new HTMLPurifier_ElementDef(); + $this->info['bdo']->attr = array( + 0 => array('Core', 'Lang'), + 'dir' => $dir, // required + // The Abstract Module specification has the attribute + // inclusions wrong for bdo: bdo allows + // xml:lang too (and we'll toss in lang for good measure, + // though it is not allowed for XHTML 1.1, this will + // be managed with a global attribute transform) + ); + $this->info['bdo']->content_model = '#PCDATA | Inline'; + $this->info['bdo']->content_model_type = 'optional'; + // provides fallback behavior if dir's missing (dir is required) + $this->info['bdo']->attr_transform_post['required-dir'] = + new HTMLPurifier_AttrTransform_BdoDir(); + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/HTMLModule/CommonAttributes.php b/library/HTMLPurifier/HTMLModule/CommonAttributes.php new file mode 100644 index 00000000..8f17c2f0 --- /dev/null +++ b/library/HTMLPurifier/HTMLModule/CommonAttributes.php @@ -0,0 +1,31 @@ + array( + 0 => array('Style'), + // 'xml:space' => false, + 'class' => 'NMTOKENS', + 'id' => 'ID', + 'title' => 'CDATA', + ), + 'Lang' => array( + 'xml:lang' => false, // see constructor + ), + 'I18N' => array( + 0 => array('Lang'), // proprietary, for xml:lang/lang + ), + 'Common' => array( + 0 => array('Core', 'I18N') + ) + ); + + function HTMLPurifier_HTMLModule_CommonAttributes() { + 
$this->attr_collections['Lang']['xml:lang'] = new HTMLPurifier_AttrDef_Lang(); + } +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/HTMLModule/Edit.php b/library/HTMLPurifier/HTMLModule/Edit.php new file mode 100644 index 00000000..6a415906 --- /dev/null +++ b/library/HTMLPurifier/HTMLModule/Edit.php @@ -0,0 +1,46 @@ + 'del | ins'); + + function HTMLPurifier_HTMLModule_Edit() { + foreach ($this->elements as $element) { + $this->info[$element] = new HTMLPurifier_ElementDef(); + $this->info[$element]->attr = array( + 0 => array('Common'), + 'cite' => 'URI', + // 'datetime' => 'Datetime' // Datetime not implemented + ); + // Inline context ! Block context (exclamation mark is + // separator, see getChildDef for parsing) + $this->info[$element]->content_model = + '#PCDATA | Inline ! #PCDATA | Flow'; + // HTML 4.01 specifies that ins/del must not contain block + // elements when used in an inline context, chameleon is + // a complicated workaround to achieve this effect + $this->info[$element]->content_model_type = 'chameleon'; + } + } + + var $defines_child_def = true; + function getChildDef($def) { + if ($def->content_model_type != 'chameleon') return false; + $value = explode('!', $def->content_model); + return new HTMLPurifier_ChildDef_Chameleon($value[0], $value[1]); + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/HTMLModule/Hypertext.php b/library/HTMLPurifier/HTMLModule/Hypertext.php new file mode 100644 index 00000000..0b8a2e98 --- /dev/null +++ b/library/HTMLPurifier/HTMLModule/Hypertext.php @@ -0,0 +1,36 @@ + 'a'); + + function HTMLPurifier_HTMLModule_Hypertext() { + $this->info['a'] = new HTMLPurifier_ElementDef(); + $this->info['a']->attr = array( + 0 => array('Common'), + // 'accesskey' => 'Character', + // 'charset' => 'Charset', + 'href' => 'URI', + //'hreflang' => 'LanguageCode', + //'rel' => 'LinkTypes', + //'rev' => 'LinkTypes', + //'tabindex' => 'Number', + //'type' => 'ContentType', + ); +
$this->info['a']->content_model = '#PCDATA | Inline'; + $this->info['a']->content_model_type = 'optional'; + $this->info['a']->excludes = array('a' => true); + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/HTMLModule/Image.php b/library/HTMLPurifier/HTMLModule/Image.php new file mode 100644 index 00000000..3852836d --- /dev/null +++ b/library/HTMLPurifier/HTMLModule/Image.php @@ -0,0 +1,38 @@ + 'img'); + + function HTMLPurifier_HTMLModule_Image() { + $this->info['img'] = new HTMLPurifier_ElementDef(); + $this->info['img']->attr = array( + 0 => array('Common'), + 'alt' => 'Text', + 'height' => 'Length', + 'longdesc' => 'URI', + 'src' => new HTMLPurifier_AttrDef_URI(true), // embedded + 'width' => 'Length' + ); + $this->info['img']->content_model_type = 'empty'; + $this->info['img']->attr_transform_post[] = + new HTMLPurifier_AttrTransform_ImgRequired(); + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/HTMLModule/Legacy.php b/library/HTMLPurifier/HTMLModule/Legacy.php new file mode 100644 index 00000000..a0613a2f --- /dev/null +++ b/library/HTMLPurifier/HTMLModule/Legacy.php @@ -0,0 +1,60 @@ +elements as $name) { + $this->info[$name] = new HTMLPurifier_ElementDef(); + // for u, s, strike, as more elements get added, add + // conditionals as necessary + $this->info[$name]->content_model = 'Inline | #PCDATA'; + $this->info[$name]->content_model_type = 'optional'; + $this->info[$name]->attr[0] = array('Common'); + } + + // setup modifications to old elements + foreach ($this->non_standalone_elements as $name) { + $this->info[$name] = new HTMLPurifier_ElementDef(); + $this->info[$name]->standalone = false; + } + + $this->info['li']->attr['value'] = new HTMLPurifier_AttrDef_Integer(); + $this->info['ol']->attr['start'] = new HTMLPurifier_AttrDef_Integer(); + + $this->info['address']->content_model = 'Inline | #PCDATA | p'; + $this->info['address']->content_model_type = 'optional'; + $this->info['address']->child = 
false; + + $this->info['blockquote']->content_model = 'Flow | #PCDATA'; + $this->info['blockquote']->content_model_type = 'optional'; + $this->info['blockquote']->child = false; + + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/HTMLModule/List.php b/library/HTMLPurifier/HTMLModule/List.php new file mode 100644 index 00000000..c74982df --- /dev/null +++ b/library/HTMLPurifier/HTMLModule/List.php @@ -0,0 +1,46 @@ + 'dl | ol | ul', 'Flow' => 'List'); + + function HTMLPurifier_HTMLModule_List() { + foreach ($this->elements as $element) { + $this->info[$element] = new HTMLPurifier_ElementDef(); + $this->info[$element]->attr = array(0 => array('Common')); + if ($element == 'li' || $element == 'dd') { + $this->info[$element]->content_model = '#PCDATA | Flow'; + $this->info[$element]->content_model_type = 'optional'; + } elseif ($element == 'ol' || $element == 'ul') { + $this->info[$element]->content_model = 'li'; + $this->info[$element]->content_model_type = 'required'; + } + } + $this->info['dt']->content_model = '#PCDATA | Inline'; + $this->info['dt']->content_model_type = 'optional'; + $this->info['dl']->content_model = 'dt | dd'; + $this->info['dl']->content_model_type = 'required'; + // this could be a LOT more robust + $this->info['li']->auto_close = array('li' => true); + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/HTMLModule/Presentation.php b/library/HTMLPurifier/HTMLModule/Presentation.php new file mode 100644 index 00000000..42d9c11e --- /dev/null +++ b/library/HTMLPurifier/HTMLModule/Presentation.php @@ -0,0 +1,41 @@ + 'hr', + 'Inline' => 'b | big | i | small | sub | sup | tt' + ); + + function HTMLPurifier_HTMLModule_Presentation() { + foreach ($this->elements as $element) { + $this->info[$element] = new HTMLPurifier_ElementDef(); + $this->info[$element]->attr = array(0 => array('Common')); + if ($element == 'hr') { + $this->info[$element]->content_model_type = 'empty'; + } else { + 
$this->info[$element]->content_model = '#PCDATA | Inline'; + $this->info[$element]->content_model_type = 'optional'; + } + } + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/HTMLModule/StyleAttribute.php b/library/HTMLPurifier/HTMLModule/StyleAttribute.php new file mode 100644 index 00000000..5ee5d1cf --- /dev/null +++ b/library/HTMLPurifier/HTMLModule/StyleAttribute.php @@ -0,0 +1,27 @@ + array('style' => false), // see constructor + 'Core' => array(0 => array('Style')) + ); + + function HTMLPurifier_HTMLModule_StyleAttribute() { + $this->attr_collections['Style']['style'] = new HTMLPurifier_AttrDef_CSS(); + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/HTMLModule/Tables.php b/library/HTMLPurifier/HTMLModule/Tables.php new file mode 100644 index 00000000..ea41f5b1 --- /dev/null +++ b/library/HTMLPurifier/HTMLModule/Tables.php @@ -0,0 +1,88 @@ + 'table'); + + function HTMLPurifier_HTMLModule_Tables() { + foreach ($this->elements as $e) { + $this->info[$e] = new HTMLPurifier_ElementDef(); + $this->info[$e]->attr = array(0 => array('Common')); + $attr =& $this->info[$e]->attr; + if ($e == 'caption') continue; + if ($e == 'table'){ + $attr['border'] = 'Pixels'; + $attr['cellpadding'] = 'Length'; + $attr['cellspacing'] = 'Length'; + $attr['frame'] = new HTMLPurifier_AttrDef_Enum(array( + 'void', 'above', 'below', 'hsides', 'lhs', 'rhs', + 'vsides', 'box', 'border' + ), false); + $attr['rules'] = new HTMLPurifier_AttrDef_Enum(array( + 'none', 'groups', 'rows', 'cols', 'all' + ), false); + $attr['summary'] = 'Text'; + $attr['width'] = 'Length'; + continue; + } + if ($e == 'col' || $e == 'colgroup') { + $attr['span'] = 'Number'; + $attr['width'] = 'MultiLength'; + } + if ($e == 'td' || $e == 'th') { + $attr['abbr'] = 'Text'; + $attr['colspan'] = 'Number'; + $attr['rowspan'] = 'Number'; + } + $attr['align'] = new HTMLPurifier_AttrDef_Enum(array( + 'left', 'center', 'right', 'justify', 'char' + ), false); + 
$attr['valign'] = new HTMLPurifier_AttrDef_Enum(array( + 'top', 'middle', 'bottom', 'baseline' + ), false); + $attr['charoff'] = 'Length'; + } + $this->info['caption']->content_model = '#PCDATA | Inline'; + $this->info['caption']->content_model_type = 'optional'; + + // Is done directly because it doesn't leverage substitution + // mechanisms. True model is: + // 'caption?, ( col* | colgroup* ), (( thead?, tfoot?, tbody+ ) | ( tr+ ))' + $this->info['table']->child = new HTMLPurifier_ChildDef_Table(); + + $this->info['td']->content_model = + $this->info['th']->content_model = '#PCDATA | Flow'; + $this->info['td']->content_model_type = + $this->info['th']->content_model_type = 'optional'; + + $this->info['tr']->content_model = 'td | th'; + $this->info['tr']->content_model_type = 'required'; + + $this->info['col']->content_model_type = 'empty'; + + $this->info['colgroup']->content_model = 'col'; + $this->info['colgroup']->content_model_type = 'optional'; + + $this->info['tbody']->content_model = + $this->info['thead']->content_model = + $this->info['tfoot']->content_model = 'tr'; + $this->info['tbody']->content_model_type = + $this->info['thead']->content_model_type = + $this->info['tfoot']->content_model_type = 'required'; + + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/HTMLModule/Text.php b/library/HTMLPurifier/HTMLModule/Text.php new file mode 100644 index 00000000..56361a39 --- /dev/null +++ b/library/HTMLPurifier/HTMLModule/Text.php @@ -0,0 +1,78 @@ + 'h1 | h2 | h3 | h4 | h5 | h6', + 'Block' => 'address | blockquote | div | p | pre', + 'Inline' => 'abbr | acronym | br | cite | code | dfn | em | kbd | q | samp | span | strong | var', + 'Flow' => 'Heading | Block | Inline' + ); + + function HTMLPurifier_HTMLModule_Text() { + foreach ($this->elements as $element) { + $this->info[$element] = new HTMLPurifier_ElementDef(); + // attributes + if ($element == 'br') { + $this->info[$element]->attr = array(0 => array('Core')); + } elseif 
($element == 'blockquote' || $element == 'q') { + $this->info[$element]->attr = array(0 => array('Common'), 'cite' => 'URI'); + } else { + $this->info[$element]->attr = array(0 => array('Common')); + } + // content models + if ($element == 'br') { + $this->info[$element]->content_model_type = 'empty'; + } elseif ($element == 'blockquote') { + $this->info[$element]->content_model = 'Heading | Block | List'; + $this->info[$element]->content_model_type = 'optional'; + } elseif ($element == 'div') { + $this->info[$element]->content_model = '#PCDATA | Flow'; + $this->info[$element]->content_model_type = 'optional'; + } else { + $this->info[$element]->content_model = '#PCDATA | Inline'; + $this->info[$element]->content_model_type = 'optional'; + } + } + // SGML permits exclusions for all descendants, but this is + // not possible with DTDs or XML Schemas. W3C has elected to + // use complicated compositions of content_models to simulate + // exclusion for children, but we go the simpler, SGML-style + // route of flat-out exclusions. Note that the Abstract Module + // is blithely unaware of such distinctions. 
+ $this->info['pre']->excludes = array_flip(array( + 'img', 'big', 'small', + 'object', 'applet', 'font', 'basefont' // generally not allowed + )); + $this->info['p']->auto_close = array_flip(array( + 'address', 'blockquote', 'dd', 'dir', 'div', 'dl', 'dt', + 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'ol', 'p', 'pre', + 'table', 'ul' + )); + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/HTMLModule/TransformToStrict.php b/library/HTMLPurifier/HTMLModule/TransformToStrict.php new file mode 100644 index 00000000..d228f84f --- /dev/null +++ b/library/HTMLPurifier/HTMLModule/TransformToStrict.php @@ -0,0 +1,86 @@ + false, + 'menu' => false, + 'dir' => false, + 'center'=> false + ); + + var $attr_collections = array( + 'Lang' => array( + 'lang' => false // placeholder + ) + ); + + var $info_attr_transform_post = array( + 'lang' => false // placeholder + ); + + function HTMLPurifier_HTMLModule_TransformToStrict() { + + // deprecated tag transforms + $this->info_tag_transform['font'] = new HTMLPurifier_TagTransform_Font(); + $this->info_tag_transform['menu'] = new HTMLPurifier_TagTransform_Simple('ul'); + $this->info_tag_transform['dir'] = new HTMLPurifier_TagTransform_Simple('ul'); + $this->info_tag_transform['center'] = new HTMLPurifier_TagTransform_Center(); + + foreach ($this->elements as $name) { + $this->info[$name] = new HTMLPurifier_ElementDef(); + $this->info[$name]->standalone = false; + } + + // deprecated attribute transforms + $this->info['h1']->attr_transform_pre['align'] = + $this->info['h2']->attr_transform_pre['align'] = + $this->info['h3']->attr_transform_pre['align'] = + $this->info['h4']->attr_transform_pre['align'] = + $this->info['h5']->attr_transform_pre['align'] = + $this->info['h6']->attr_transform_pre['align'] = + $this->info['p'] ->attr_transform_pre['align'] = + new HTMLPurifier_AttrTransform_TextAlign(); + + // xml:lang <=> lang mirroring, implement in TransformToStrict, + // this is overridden in TransformToXHTML11 
+ $this->info_attr_transform_post['lang'] = new HTMLPurifier_AttrTransform_Lang(); + $this->attr_collections['Lang']['lang'] = new HTMLPurifier_AttrDef_Lang(); + + // this should not be applied to XHTML 1.0 Transitional, ONLY + // XHTML 1.0 Strict. We may need three classes + $this->info['blockquote']->content_model_type = 'strictblockquote'; + $this->info['blockquote']->child = false; // recalculate please! + + } + + var $defines_child_def = true; + function getChildDef($def) { + if ($def->content_model_type != 'strictblockquote') return false; + return new HTMLPurifier_ChildDef_StrictBlockquote($def->content_model); + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/HTMLModule/TransformToXHTML11.php b/library/HTMLPurifier/HTMLModule/TransformToXHTML11.php new file mode 100644 index 00000000..0915f5b6 --- /dev/null +++ b/library/HTMLPurifier/HTMLModule/TransformToXHTML11.php @@ -0,0 +1,30 @@ + array( + 'lang' => false // remove it + ) + ); + + var $info_attr_transform_post = array( + 'lang' => false // remove it + ); + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/HTMLModuleManager.php b/library/HTMLPurifier/HTMLModuleManager.php new file mode 100644 index 00000000..e0090472 --- /dev/null +++ b/library/HTMLPurifier/HTMLModuleManager.php @@ -0,0 +1,558 @@ +attrTypes = new HTMLPurifier_AttrTypes(); + + if (!$blank) $this->initialize(); + + } + + function initialize() { + $this->initialized = true; + + // load default modules to the recognized modules list (not active) + $modules = array( + // define + 'CommonAttributes', + 'Text', 'Hypertext', 'List', 'Presentation', + 'Edit', 'Bdo', 'Tables', 'Image', 'StyleAttribute', + // define-redefine + 'Legacy', + // redefine + 'TransformToStrict', 'TransformToXHTML11' + ); + foreach ($modules as $module) { + $this->addModule($module); + } + + // Safe modules for supported doctypes. 
These are included + // in the valid and active module lists by default + $this->collections['Safe'] = array( + '_Common' => array( // leading _ indicates private + 'CommonAttributes', 'Text', 'Hypertext', 'List', + 'Presentation', 'Edit', 'Bdo', 'Tables', 'Image', + 'StyleAttribute' + ), + // HTML definitions, defer to XHTML definitions + 'HTML 4.01 Transitional' => array(array('XHTML 1.0 Transitional')), + 'HTML 4.01 Strict' => array(array('XHTML 1.0 Strict')), + // XHTML definitions + 'XHTML 1.0 Transitional' => array( array('XHTML 1.0 Strict'), 'Legacy' ), + 'XHTML 1.0 Strict' => array(array('_Common')), + 'XHTML 1.1' => array(array('_Common')), + ); + + // Modules that specify elements that are unsafe from untrusted + // third-parties. These should be registered in $validModules but + // almost never $activeModules unless you really know what you're + // doing. + $this->collections['Unsafe'] = array(); + + // Modules to import if lenient mode (attempt to convert everything + // to a valid representation) is on. These must not be in $validModules + // unless specified so. + $this->collections['Lenient'] = array( + 'HTML 4.01 Strict' => array(array('XHTML 1.0 Strict')), + 'XHTML 1.0 Strict' => array('TransformToStrict'), + 'XHTML 1.1' => array(array('XHTML 1.0 Strict'), 'TransformToXHTML11') + ); + + // Modules to import if correctional mode (correct everything that + // is feasible to strict mode) is on. These must not be in $validModules + // unless specified so. + $this->collections['Correctional'] = array( + 'HTML 4.01 Transitional' => array(array('XHTML 1.0 Transitional')), + 'XHTML 1.0 Transitional' => array('TransformToStrict'), // probably want a different one + ); + + // User-space modules, custom code or whatever + $this->collections['Extension'] = array(); + + // setup active versus valid modules. ORDER IS IMPORTANT! 
+ // definition modules + $this->makeCollectionActive('Safe'); + $this->makeCollectionValid('Unsafe'); + // redefinition modules + $this->makeCollectionActive('Lenient'); + $this->makeCollectionActive('Correctional'); + + $this->autoDoctype = '*'; + $this->autoCollection = 'Extension'; + + } + + /** + * Adds a module to the recognized module list. This does not + * do anything else: the module must be added to a corresponding + * collection to be "activated". + * @param $module Mixed: string module name, with or without + * HTMLPurifier_HTMLModule prefix, or instance of + * subclass of HTMLPurifier_HTMLModule. + */ + function addModule($module) { + if (is_string($module)) { + $original_module = $module; + if (!class_exists($module)) { + foreach ($this->prefixes as $prefix) { + $module = $prefix . $original_module; + if (class_exists($module)) break; + } + } + if (!class_exists($module)) { + trigger_error($original_module . ' module does not exist', + E_USER_ERROR); + return; + } + $module = new $module(); + } + $module->order = $this->counter++; // assign then increment + $this->modules[$module->name] = $module; + if ($this->autoDoctype !== false && $this->autoCollection !== false) { + $this->collections[$this->autoCollection][$this->autoDoctype][] = $module->name; + } + } + + /** + * Makes a collection active, while also making it valid if not + * already done so. See $activeModules for the semantics of "active". + * @param $collection_name Name of collection to activate + */ + function makeCollectionActive($collection_name) { + if (!in_array($collection_name, $this->validCollections)) { + $this->makeCollectionValid($collection_name); + } + $this->activeCollections[] = $collection_name; + } + + /** + * Makes a collection valid. 
See $validModules for the semantics of "valid" + * @param $collection_name Name of collection to mark as valid + */ + function makeCollectionValid($collection_name) { + $this->validCollections[] = $collection_name; + } + + /** + * Adds a class prefix that addModule() will use to resolve a + * string name to a concrete class + * @param $prefix Prefix to register, cast to string before it is stored + */ + function addPrefix($prefix) { + $this->prefixes[] = (string) $prefix; + } + + /** + * Prepares the manager for the given configuration: activates the + * auto-collection (if one is set), resolves the doctype through + * getDoctype() and $doctypeAliases, runs processCollections() on + * every collection, assembles $validModules and $activeModules, + * records in $elementLookup which valid modules define each element, + * and creates $contentSets and $attrCollections. + * @param $config Configuration object handed to getDoctype() + */ + function setup($config) { + + // load up the autocollection + if ($this->autoCollection !== false) { + $this->makeCollectionActive($this->autoCollection); + } + + // retrieve the doctype + $this->doctype = $this->getDoctype($config); + if (isset($this->doctypeAliases[$this->doctype])) { + $this->doctype = $this->doctypeAliases[$this->doctype]; + } + + // process module collections to module name => module instance form + foreach ($this->collections as $col_i => $x) { + $this->processCollections($this->collections[$col_i]); + } + + $this->validModules = $this->assembleModules($this->validCollections); + $this->activeModules = $this->assembleModules($this->activeCollections); + + // setup lookup table based on all valid modules + foreach ($this->validModules as $module) { + foreach ($module->info as $name => $def) { + if (!isset($this->elementLookup[$name])) { + $this->elementLookup[$name] = array(); + } + $this->elementLookup[$name][] = $module->name; + } + } + + // note the different choice + $this->contentSets = new HTMLPurifier_ContentSets( + // content models that contain non-allowed elements are + // harmless because RemoveForeignElements will ensure + // they never get in anyway, and there is usually no + // reason why you should want to restrict a content + // model beyond what is mandated by the doctype. + // Note, however, that this means redefinitions of + // content models can't be tossed in validModules willy-nilly: + // that stuff still is regulated by configuration. 
+ $this->validModules + ); + $this->attrCollections = new HTMLPurifier_AttrCollections( + $this->attrTypes, + // only explicitly allowed modules are allowed to affect + // the global attribute collections. This means there's + // a distinction between loading the Bdo module, and the + // bdo element: Bdo will enable the dir attribute on all + // elements, while bdo will only define the bdo element, + // which will not have an editable directionality. This might + // catch people who are loading only elements by surprise, so + // we should consider loading an entire module if all the + // elements it defines are requested by the user, especially + // if it affects the global attribute collections. + $this->activeModules + ); + + } + + /** + * Takes a list of collections and merges together all the defined + * modules for the current doctype from those collections. + * @param $collections List of collection suffixes we should grab + * modules from (like 'Safe' or 'Lenient') + * @return Merged array of the modules found for the current doctype + */ + function assembleModules($collections) { + $modules = array(); + $numOfCollectionsUsed = 0; + foreach ($collections as $name) { + $disable_global = false; + if (!isset($this->collections[$name])) { + trigger_error("$name collection is undefined", E_USER_ERROR); + continue; + } + $cols = $this->collections[$name]; + if (isset($cols[$this->doctype])) { + if (isset($cols[$this->doctype]['*'])) { + unset($cols[$this->doctype]['*']); + $disable_global = true; + } + $modules += $cols[$this->doctype]; + $numOfCollectionsUsed++; + } + // accept catch-all doctype + if ( + $this->doctype !== '*' && + isset($cols['*']) && + !$disable_global + ) { + $modules += $cols['*']; + } + } + + if ($numOfCollectionsUsed < 1) { + // possible XSS injection if user-specified doctypes + // are allowed + trigger_error("Doctype {$this->doctype} does not exist, ". + "check for typos (if you desire a doctype that allows ". 
+ "no elements, use an empty array collection)", E_USER_ERROR); + } + return $modules; + } + + /** + * Takes a collection and performs inclusions and substitutions for it. + * @param $cols Reference to collections class member variable + */ + function processCollections(&$cols) { + + // $cols is the set of collections + // $col_i is the name (index) of a collection + // $col is a collection/list of modules + + // perform inclusions + foreach ($cols as $col_i => $col) { + $seen = array(); + if (!empty($col[0]) && is_array($col[0])) { + $seen[$col_i] = true; // recursion reporting + $includes = $col[0]; + unset($cols[$col_i][0]); // remove inclusions value, recursion guard + } else { + $includes = array(); + } + if (empty($includes)) continue; + for ($i = 0; isset($includes[$i]); $i++) { + $inc = $includes[$i]; + if (isset($seen[$inc])) { + trigger_error( + "Circular inclusion detected in $col_i collection", + E_USER_ERROR + ); + continue; + } else { + $seen[$inc] = true; + } + if (!isset($cols[$inc])) { + trigger_error( + "Collection $col_i tried to include undefined ". + "collection $inc", E_USER_ERROR); + continue; + } + foreach ($cols[$inc] as $module) { + if (is_array($module)) { // another inclusion! + foreach ($module as $inc2) $includes[] = $inc2; + continue; + } + $cols[$col_i][] = $module; // merge in the other modules + } + } + } + + // replace with real modules, invert module from list to + // assoc array of module name to module instance + foreach ($cols as $col_i => $col) { + $ignore_global = false; + $order = array(); + foreach ($col as $module_i => $module) { + unset($cols[$col_i][$module_i]); + if (is_array($module)) { + trigger_error("Illegal inclusion array at index". + " $module_i found collection $col_i, inclusion". 
+ " arrays must be at start of collection (index 0)", + E_USER_ERROR); + continue; + } + if ($module_i === '*' && $module === false) { + $ignore_global = true; + continue; + } + if (!isset($this->modules[$module])) { + trigger_error( + "Collection $col_i references undefined ". + "module $module", + E_USER_ERROR + ); + continue; + } + $module = $this->modules[$module]; + $cols[$col_i][$module->name] = $module; + $order[$module->name] = $module->order; + } + array_multisort( + $order, SORT_ASC, SORT_NUMERIC, $cols[$col_i] + ); + if ($ignore_global) $cols[$col_i]['*'] = false; + } + + // delete pseudo-collections + foreach ($cols as $col_i => $col) { + if ($col_i[0] == '_') unset($cols[$col_i]); + } + + } + + /** + * Retrieves the doctype from the configuration object + */ + function getDoctype($config) { + $doctype = $config->get('HTML', 'Doctype'); + if ($doctype !== null) { + return $doctype; + } + if (!$this->initialized) { + // don't do HTML-oriented backwards compatibility stuff + // use either the auto-doctype, or the catch-all doctype + return $this->autoDoctype ? $this->autoDoctype : '*'; + } + // this is backwards-compatibility stuff + if ($config->get('Core', 'XHTML')) { + $doctype = 'XHTML 1.0'; + } else { + $doctype = 'HTML 4.01'; + } + if ($config->get('HTML', 'Strict')) { + $doctype .= ' Strict'; + } else { + $doctype .= ' Transitional'; + } + return $doctype; + } + + /** + * Retrieves merged element definitions for all active elements. + * @note We may want to generate an elements array during setup + * and pass that on, because a specific combination of + * elements may trigger the loading of a module. + * @param $config Instance of HTMLPurifier_Config, for determining + * stray elements. 
+ */ + function getElements($config) { + + $elements = array(); + foreach ($this->activeModules as $module) { + foreach ($module->elements as $name) { + $elements[$name] = $this->getElement($name, $config); + } + } + + // standalone elements now loaded + + return $elements; + + } + + /** + * Retrieves a single merged element definition + * @param $name Name of element + * @param $config Instance of HTMLPurifier_Config, may not be necessary. + */ + function getElement($name, $config) { + + $def = false; + + $modules = $this->validModules; + + if (!isset($this->elementLookup[$name])) { + return false; + } + + foreach($this->elementLookup[$name] as $module_name) { + + $module = $modules[$module_name]; + $new_def = $module->info[$name]; + + if (!$def && $new_def->standalone) { + $def = $new_def; + } elseif ($def) { + $def->mergeIn($new_def); + } else { + // could "save it for another day": + // non-standalone definitions that don't have a standalone + // to merge into could be deferred to the end + continue; + } + + // attribute value expansions + $this->attrCollections->performInclusions($def->attr); + $this->attrCollections->expandIdentifiers($def->attr, $this->attrTypes); + + // descendants_are_inline, for ChildDef_Chameleon + if (is_string($def->content_model) && + strpos($def->content_model, 'Inline') !== false) { + if ($name != 'del' && $name != 'ins') { + // this is for you, ins/del + $def->descendants_are_inline = true; + } + } + + $this->contentSets->generateChildDef($def, $module); + } + + return $def; + + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/Language.php b/library/HTMLPurifier/Language.php new file mode 100644 index 00000000..ca6fe031 --- /dev/null +++ b/library/HTMLPurifier/Language.php @@ -0,0 +1,56 @@ +_loaded) return; + $factory = HTMLPurifier_LanguageFactory::instance(); + $factory->loadLanguage($this->code); + foreach ($factory->keys as $key) { + $this->$key = $factory->cache[$this->code][$key]; + } + 
$this->_loaded = true; + } + + /** + * Retrieves a localised message. Does not perform any operations. + * @param $key string identifier of message + * @return string localised message + */ + function getMessage($key) { + if (!$this->_loaded) $this->load(); + if (!isset($this->messages[$key])) return ''; + return $this->messages[$key]; + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/Language/classes/en-x-test.php b/library/HTMLPurifier/Language/classes/en-x-test.php new file mode 100644 index 00000000..303ba4ba --- /dev/null +++ b/library/HTMLPurifier/Language/classes/en-x-test.php @@ -0,0 +1,12 @@ + \ No newline at end of file diff --git a/library/HTMLPurifier/Language/messages/en-x-test.php b/library/HTMLPurifier/Language/messages/en-x-test.php new file mode 100644 index 00000000..115662bd --- /dev/null +++ b/library/HTMLPurifier/Language/messages/en-x-test.php @@ -0,0 +1,11 @@ + 'HTML Purifier X' +); + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/Language/messages/en.php b/library/HTMLPurifier/Language/messages/en.php new file mode 100644 index 00000000..7650b818 --- /dev/null +++ b/library/HTMLPurifier/Language/messages/en.php @@ -0,0 +1,12 @@ + 'HTML Purifier', +'pizza' => 'Pizza', // for unit testing purposes + +); + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/LanguageFactory.php b/library/HTMLPurifier/LanguageFactory.php new file mode 100644 index 00000000..5cdf1281 --- /dev/null +++ b/library/HTMLPurifier/LanguageFactory.php @@ -0,0 +1,196 @@ +cache[$language_code][$key] = $value + * @value array map + */ + var $cache; + + /** + * Valid keys in the HTMLPurifier_Language object. Designates which + * variables to slurp out of a message file. 
+ * @value array list + */ + var $keys = array('fallback', 'messages'); + + /** + * Instance of HTMLPurifier_AttrDef_Lang to validate language codes + * @value object HTMLPurifier_AttrDef_Lang + */ + var $validator; + + /** + * Cached copy of dirname(__FILE__), directory of current file without + * trailing slash + * @value string filename + */ + var $dir; + + /** + * Keys whose contents are a hash map and can be merged + * @value array lookup + */ + var $mergeable_keys_map = array('messages' => true); + + /** + * Keys whose contents are a list and can be merged + * @value array lookup + */ + var $mergeable_keys_list = array(); + + /** + * Retrieve sole instance of the factory. + * @static + * @param $prototype Optional prototype to overload sole instance with, + * or bool true to reset to default factory. + * @return Reference to the shared factory instance + */ + static function &instance($prototype = null) { + static $instance = null; + // true is the reset request, so it must never be stored as the instance + if ($prototype !== null && $prototype !== true) { + $instance = $prototype; + } elseif ($instance === null || $prototype === true) { + $instance = new HTMLPurifier_LanguageFactory(); + $instance->setup(); + } + return $instance; + } + + /** + * Sets up the singleton, much like a constructor + * @note Prevents people from getting this outside of the singleton + */ + function setup() { + $this->validator = new HTMLPurifier_AttrDef_Lang(); + $this->dir = dirname(__FILE__); + } + + /** + * Creates a language object, handles class fallbacks + * @param $code string language code + * @return Language object with ->code set to the validated $code; when + * no class exists for $code it is an instance of the fallback + * language's class. False (after an error) if even the base + * English class is unavailable. + */ + function create($code) { + + $config = $context = false; // hope it doesn't use these! + $code = $this->validator->validate($code, $config, $context); + if ($code === false) $code = 'en'; // malformed code becomes English + + $pcode = str_replace('-', '_', $code); // make valid PHP classname + static $depth = 0; // nesting depth of fallback lookups (tracked, not enforced) + + if ($code == 'en') { + $class = 'HTMLPurifier_Language'; + $file = $this->dir . '/Language.php'; + } else { + $class = 'HTMLPurifier_Language_' . $pcode; + $file = $this->dir . 
'/Language/classes/' . $code . '.php'; + // PHP5/APC deps bug workaround can go here + // you can bypass the conditional include by loading the + // file yourself + if (file_exists($file) && !class_exists($class)) { + include_once $file; + } + } + + if (!class_exists($class)) { + if ($code == 'en') { + // English is the last resort: nothing left to fall back to + trigger_error('Base class HTMLPurifier_Language is not loaded', E_USER_ERROR); + return false; + } + // go fallback: getFallbackFor() and create() belong to this + // factory, so call them on $this + $fallback = $this->getFallbackFor($code); + if (!$fallback) $fallback = 'en'; // nothing recorded: use English + $depth++; + $lang = $this->create($fallback); + $depth--; + } else { + $lang = new $class; + } + $lang->code = $code; + + return $lang; + + } + + /** + * Returns the fallback language for language + * @note Loads the original language into cache + * @param $code string language code + * @return Value of the 'fallback' key cached for $code + */ + function getFallbackFor($code) { + $this->loadLanguage($code); + return $this->cache[$code]['fallback']; + } + + /** + * Loads language into the cache, handles message file and fallbacks + * @note The result is stored in $this->cache[$code]; nothing is returned + * @param $code string language code + */ + function loadLanguage($code) { + static $languages_seen = array(); // recursion guard + + // abort if we've already loaded it + if (isset($this->cache[$code])) return; + + // generate filename + $filename = $this->dir . '/Language/messages/' . $code . '.php'; + + // default fallback : may be overwritten by the ensuing include + $fallback = ($code != 'en') ? 'en' : false; + + // load primary localisation + if (!file_exists($filename)) { + // skip the include: will rely solely on fallback + $filename = $this->dir . '/Language/messages/en.php'; + $cache = array(); + } else { + include $filename; + $cache = compact($this->keys); + } + + // load fallback localisation + if (!empty($fallback)) { + + // infinite recursion guard + if (isset($languages_seen[$code])) { + trigger_error('Circular fallback reference in language ' . 
+ $code, E_USER_ERROR); + $fallback = 'en'; + } + $languages_seen[$code] = true; + + // load the fallback recursively + $this->loadLanguage($fallback); + $fallback_cache = $this->cache[$fallback]; + + // done with this code: clear the marker so that it only flags + // loads that are still in progress (the static outlives both + // this call and any individual factory object) + unset($languages_seen[$code]); + + // merge fallback with current language + foreach ( $this->keys as $key ) { + if (isset($cache[$key]) && isset($fallback_cache[$key])) { + if (isset($this->mergeable_keys_map[$key])) { + $cache[$key] = $cache[$key] + $fallback_cache[$key]; + } elseif (isset($this->mergeable_keys_list[$key])) { + $cache[$key] = array_merge( $fallback_cache[$key], $cache[$key] ); + } + } else { + $cache[$key] = $fallback_cache[$key]; + } + } + + } + + // save to cache for later retrieval + $this->cache[$code] = $cache; + + return; + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php index ca5a5328..975fb65f 100644 --- a/library/HTMLPurifier/Lexer.php +++ b/library/HTMLPurifier/Lexer.php @@ -151,7 +151,8 @@ class HTMLPurifier_Lexer $lexer = $prototype; } if (empty($lexer)) { - if (class_exists('DOMDocument')) { // check for DOM support + if (version_compare(PHP_VERSION, "5", ">=") && // check for PHP5 + class_exists('DOMDocument')) { // check for DOM support require_once 'HTMLPurifier/Lexer/DOMLex.php'; $lexer = new HTMLPurifier_Lexer_DOMLex(); } else { diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php index dcf3caee..9286b023 100644 --- a/library/HTMLPurifier/Lexer/DOMLex.php +++ b/library/HTMLPurifier/Lexer/DOMLex.php @@ -21,7 +21,7 @@ require_once 'HTMLPurifier/TokenFactory.php'; * * @warning DOM tends to drop whitespace, which may wreak havoc on indenting. 
* If this is a huge problem, due to the fact that HTML is hand - * edited and youa re unable to get a parser cache that caches the + * edited and you are unable to get a parser cache that caches the * the output of HTML Purifier while keeping the original HTML lying * around, you may want to run Tidy on the resulting output or use * HTMLPurifier_DirectLex @@ -54,7 +54,13 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer $doc = new DOMDocument(); $doc->encoding = 'UTF-8'; // technically does nothing, but whatever - @$doc->loadHTML($string); // mute all errors, handle it transparently + + // DOM will toss errors if the HTML its parsing has really big + // problems, so we're going to mute them. This can cause problems + // if a custom error handler that doesn't implement error_reporting + // is set, as noted by a Drupal plugin of HTML Purifier. Consider + // making our own error reporter to temporarily load in + @$doc->loadHTML($string); $tokens = array(); $this->tokenizeDOM( diff --git a/library/HTMLPurifier/Printer/HTMLDefinition.php b/library/HTMLPurifier/Printer/HTMLDefinition.php index 2ec297e7..a677c58b 100644 --- a/library/HTMLPurifier/Printer/HTMLDefinition.php +++ b/library/HTMLPurifier/Printer/HTMLDefinition.php @@ -13,6 +13,7 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer function render($config) { $ret = ''; $this->config =& $config; + $this->def = $config->getHTMLDefinition(); $def =& $this->def; @@ -21,16 +22,14 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer $ret .= $this->element('caption', 'Environment'); $ret .= $this->row('Parent of fragment', $def->info_parent); - $ret .= $this->row('Strict mode', $def->strict); - if ($def->strict) $ret .= $this->row('Block wrap name', $def->info_block_wrapper); + $ret .= $this->renderChildren($def->info_parent_def->child); + $ret .= $this->row('Block wrap name', $def->info_block_wrapper); $ret .= $this->start('tr'); $ret .= $this->element('th', 
'Global attributes'); $ret .= $this->element('td', $this->listifyAttr($def->info_global_attr),0,0); $ret .= $this->end('tr'); - $ret .= $this->renderChildren($def->info_parent_def->child); - $ret .= $this->start('tr'); $ret .= $this->element('th', 'Tag transforms'); $list = array(); @@ -81,8 +80,8 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer $ret .= $this->element('th', "<$name>", array('class'=>'heavy', 'colspan' => 2)); $ret .= $this->end('tr'); $ret .= $this->start('tr'); - $ret .= $this->element('th', 'Type'); - $ret .= $this->element('td', ucfirst($def->type)); + $ret .= $this->element('th', 'Inline content'); + $ret .= $this->element('td', $def->descendants_are_inline ? 'Yes' : 'No'); $ret .= $this->end('tr'); if (!empty($def->excludes)) { $ret .= $this->start('tr'); @@ -130,15 +129,17 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer $elements = array(); $attr = array(); if (isset($def->elements)) { - if ($def->type == 'strictblockquote') $def->validateChildren(array(), $this->config, $context); + if ($def->type == 'strictblockquote') { + $def->validateChildren(array(), $this->config, $context); + } $elements = $def->elements; } elseif ($def->type == 'chameleon') { $attr['rowspan'] = 2; } elseif ($def->type == 'empty') { $elements = array(); } elseif ($def->type == 'table') { - $elements = array('col', 'caption', 'colgroup', 'thead', - 'tfoot', 'tbody', 'tr'); + $elements = array_flip(array('col', 'caption', 'colgroup', 'thead', + 'tfoot', 'tbody', 'tr')); } $ret .= $this->element('th', 'Allowed children', $attr); @@ -167,6 +168,7 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer * @param $array Tag lookup array in form of array('tagname' => true) */ function listifyTagLookup($array) { + ksort($array); $list = array(); foreach ($array as $name => $discard) { if ($name !== '#PCDATA' && !isset($this->def->info[$name])) continue; @@ -181,6 +183,7 @@ class 
HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer * @todo Also add information about internal state */ function listifyObjectList($array) { + ksort($array); $list = array(); foreach ($array as $discard => $obj) { $list[] = $this->getClass($obj, 'AttrTransform_'); @@ -193,6 +196,7 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer * @param $array Array hash in form of array('attrname' => HTMLPurifier_AttrDef) */ function listifyAttr($array) { + ksort($array); $list = array(); foreach ($array as $name => $obj) { if ($obj === false) continue; diff --git a/library/HTMLPurifier/Strategy/FixNesting.php b/library/HTMLPurifier/Strategy/FixNesting.php index dd5a920f..08f90756 100644 --- a/library/HTMLPurifier/Strategy/FixNesting.php +++ b/library/HTMLPurifier/Strategy/FixNesting.php @@ -49,8 +49,8 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy $tokens[] = new HTMLPurifier_Token_End($parent_name); // setup the context variables - $parent_type = 'unknown'; // reference var that we alter - $context->register('ParentType', $parent_type); + $is_inline = false; // reference var that we alter + $context->register('IsInline', $is_inline); //####################################################################// // Loop initialization @@ -115,11 +115,16 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy } // calculate context - if (isset($parent_def)) { - $parent_type = $parent_def->type; + if ($is_inline === false) { + // check if conditions make it inline + if (!empty($parent_def) && $parent_def->descendants_are_inline) { + $is_inline = $count - 1; + } } else { - // generally found in specialized elements like UL - $parent_type = 'unknown'; + // check if we're out of inline + if ($count === $is_inline) { + $is_inline = false; + } } //################################################################// @@ -273,7 +278,7 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy 
array_pop($tokens); // remove context variables - $context->destroy('ParentType'); + $context->destroy('IsInline'); //####################################################################// // Return diff --git a/library/HTMLPurifier/TagTransform.php b/library/HTMLPurifier/TagTransform.php index be0555a0..f5dc5c97 100644 --- a/library/HTMLPurifier/TagTransform.php +++ b/library/HTMLPurifier/TagTransform.php @@ -1,6 +1,6 @@ transform_to = $transform_to; - } - - function transform($tag, $config, &$context) { - $new_tag = $tag->copy(); - $new_tag->name = $this->transform_to; - return $new_tag; - } - -} - -/** - * Transforms CENTER tags into proper version (DIV with text-align CSS) - * - * Takes a CENTER tag, parses the align attribute, and then if it's valid - * assigns it to the CSS property text-align. - */ -class HTMLPurifier_TagTransform_Center extends HTMLPurifier_TagTransform -{ - var $transform_to = 'div'; - - function transform($tag, $config, &$context) { - if ($tag->type == 'end') { - $new_tag = new HTMLPurifier_Token_End($this->transform_to); - return $new_tag; - } - $attr = $tag->attr; - $prepend_css = 'text-align:center;'; - if (isset($attr['style'])) { - $attr['style'] = $prepend_css . $attr['style']; - } else { - $attr['style'] = $prepend_css; - } - $new_tag = $tag->copy(); - $new_tag->name = $this->transform_to; - $new_tag->attr = $attr; - return $new_tag; - } -} - -/** - * Transforms FONT tags to the proper form (SPAN with CSS styling) - * - * This transformation takes the three proprietary attributes of FONT and - * transforms them into their corresponding CSS attributes. These are color, - * face, and size. - * - * @note Size is an interesting case because it doesn't map cleanly to CSS. - * Thanks to - * http://style.cleverchimp.com/font_size_intervals/altintervals.html - * for reasonable mappings. 
- */ -class HTMLPurifier_TagTransform_Font extends HTMLPurifier_TagTransform -{ - - var $transform_to = 'span'; - - var $_size_lookup = array( - '1' => 'xx-small', - '2' => 'small', - '3' => 'medium', - '4' => 'large', - '5' => 'x-large', - '6' => 'xx-large', - '7' => '300%', - '-1' => 'smaller', - '+1' => 'larger', - '-2' => '60%', - '+2' => '150%', - '+4' => '300%' - ); - - function transform($tag, $config, &$context) { - - if ($tag->type == 'end') { - $new_tag = new HTMLPurifier_Token_End($this->transform_to); - return $new_tag; - } - - $attr = $tag->attr; - $prepend_style = ''; - - // handle color transform - if (isset($attr['color'])) { - $prepend_style .= 'color:' . $attr['color'] . ';'; - unset($attr['color']); - } - - // handle face transform - if (isset($attr['face'])) { - $prepend_style .= 'font-family:' . $attr['face'] . ';'; - unset($attr['face']); - } - - // handle size transform - if (isset($attr['size'])) { - if (isset($this->_size_lookup[$attr['size']])) { - $prepend_style .= 'font-size:' . - $this->_size_lookup[$attr['size']] . ';'; - } - unset($attr['size']); - } - - if ($prepend_style) { - $attr['style'] = isset($attr['style']) ? - $prepend_style . $attr['style'] : - $prepend_style; - } - - $new_tag = $tag->copy(); - $new_tag->name = $this->transform_to; - $new_tag->attr = $attr; - - return $new_tag; - - } -} - ?> \ No newline at end of file diff --git a/library/HTMLPurifier/TagTransform/Center.php b/library/HTMLPurifier/TagTransform/Center.php new file mode 100644 index 00000000..571bb9df --- /dev/null +++ b/library/HTMLPurifier/TagTransform/Center.php @@ -0,0 +1,34 @@ +type == 'end') { + $new_tag = new HTMLPurifier_Token_End($this->transform_to); + return $new_tag; + } + $attr = $tag->attr; + $prepend_css = 'text-align:center;'; + if (isset($attr['style'])) { + $attr['style'] = $prepend_css . 
$attr['style']; + } else { + $attr['style'] = $prepend_css; + } + $new_tag = $tag->copy(); + $new_tag->name = $this->transform_to; + $new_tag->attr = $attr; + return $new_tag; + } +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/TagTransform/Font.php b/library/HTMLPurifier/TagTransform/Font.php new file mode 100644 index 00000000..ae6d7838 --- /dev/null +++ b/library/HTMLPurifier/TagTransform/Font.php @@ -0,0 +1,83 @@ + 'xx-small', + '2' => 'small', + '3' => 'medium', + '4' => 'large', + '5' => 'x-large', + '6' => 'xx-large', + '7' => '300%', + '-1' => 'smaller', + '+1' => 'larger', + '-2' => '60%', + '+2' => '150%', + '+4' => '300%' + ); + + function transform($tag, $config, &$context) { + + if ($tag->type == 'end') { + $new_tag = new HTMLPurifier_Token_End($this->transform_to); + return $new_tag; + } + + $attr = $tag->attr; + $prepend_style = ''; + + // handle color transform + if (isset($attr['color'])) { + $prepend_style .= 'color:' . $attr['color'] . ';'; + unset($attr['color']); + } + + // handle face transform + if (isset($attr['face'])) { + $prepend_style .= 'font-family:' . $attr['face'] . ';'; + unset($attr['face']); + } + + // handle size transform + if (isset($attr['size'])) { + if (isset($this->_size_lookup[$attr['size']])) { + $prepend_style .= 'font-size:' . + $this->_size_lookup[$attr['size']] . ';'; + } + unset($attr['size']); + } + + if ($prepend_style) { + $attr['style'] = isset($attr['style']) ? + $prepend_style . 
$attr['style'] : + $prepend_style; + } + + $new_tag = $tag->copy(); + $new_tag->name = $this->transform_to; + $new_tag->attr = $attr; + + return $new_tag; + + } +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/TagTransform/Simple.php b/library/HTMLPurifier/TagTransform/Simple.php new file mode 100644 index 00000000..6ffd0eab --- /dev/null +++ b/library/HTMLPurifier/TagTransform/Simple.php @@ -0,0 +1,26 @@ +transform_to = $transform_to; + } + + function transform($tag, $config, &$context) { + $new_tag = $tag->copy(); + $new_tag->name = $this->transform_to; + return $new_tag; + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/URISchemeRegistry.php b/library/HTMLPurifier/URISchemeRegistry.php index 1d529ba5..1ece1a2a 100644 --- a/library/HTMLPurifier/URISchemeRegistry.php +++ b/library/HTMLPurifier/URISchemeRegistry.php @@ -10,7 +10,7 @@ HTMLPurifier_ConfigSchema::define( 'irc' => true, // "Internet Relay Chat", usually needs another app // for Usenet, these two are similar, but distinct 'nntp' => true, // individual Netnews articles - 'news' => true // newsgroup or individual Netnews articles), + 'news' => true // newsgroup or individual Netnews articles ), 'lookup', 'Whitelist that defines the schemes that a URI is allowed to have. This '. 'prevents XSS attacks from using pseudo-schemes like javascript or mocha.' 
diff --git a/smoketests/printDefinition.php b/smoketests/printDefinition.php index a616f0d5..8e02d47d 100644 --- a/smoketests/printDefinition.php +++ b/smoketests/printDefinition.php @@ -22,6 +22,17 @@ foreach ($_GET as $key => $value) { @$config->loadArray($get); +/* // sample local definition, obviously needs to be less clunky +$html_definition =& $config->getHTMLDefinition(true); +$module = new HTMLPurifier_HTMLModule(); +$module->name = 'Marquee'; +$module->info['marquee'] = new HTMLPurifier_ElementDef(); +$module->info['marquee']->content_model = '#PCDATA | Inline'; +$module->info['marquee']->content_model_type = 'optional'; +$module->content_sets = array('Inline' => 'marquee'); +$html_definition->manager->addModule($module); +*/ + $printer_html_definition = new HTMLPurifier_Printer_HTMLDefinition(); $printer_css_definition = new HTMLPurifier_Printer_CSSDefinition(); diff --git a/tests/HTMLPurifier/AttrDef/BackgroundPositionTest.php b/tests/HTMLPurifier/AttrDef/CSS/BackgroundPositionTest.php similarity index 90% rename from tests/HTMLPurifier/AttrDef/BackgroundPositionTest.php rename to tests/HTMLPurifier/AttrDef/CSS/BackgroundPositionTest.php index ce720841..911823f4 100644 --- a/tests/HTMLPurifier/AttrDef/BackgroundPositionTest.php +++ b/tests/HTMLPurifier/AttrDef/CSS/BackgroundPositionTest.php @@ -1,14 +1,14 @@ def = new HTMLPurifier_AttrDef_BackgroundPosition(); + $this->def = new HTMLPurifier_AttrDef_CSS_BackgroundPosition(); // explicitly cited in spec $this->assertDef('0% 0%'); diff --git a/tests/HTMLPurifier/AttrDef/BackgroundTest.php b/tests/HTMLPurifier/AttrDef/CSS/BackgroundTest.php similarity index 52% rename from tests/HTMLPurifier/AttrDef/BackgroundTest.php rename to tests/HTMLPurifier/AttrDef/CSS/BackgroundTest.php index 69b3c1ba..d4db8493 100644 --- a/tests/HTMLPurifier/AttrDef/BackgroundTest.php +++ b/tests/HTMLPurifier/AttrDef/CSS/BackgroundTest.php @@ -1,14 +1,15 @@ def = new 
HTMLPurifier_AttrDef_Background(HTMLPurifier_Config::createDefault()); + $config = HTMLPurifier_Config::createDefault(); + $this->def = new HTMLPurifier_AttrDef_CSS_Background($config); $valid = '#333 url(chess.png) repeat fixed 50% top'; $this->assertDef($valid); diff --git a/tests/HTMLPurifier/AttrDef/BorderTest.php b/tests/HTMLPurifier/AttrDef/CSS/BorderTest.php similarity index 50% rename from tests/HTMLPurifier/AttrDef/BorderTest.php rename to tests/HTMLPurifier/AttrDef/CSS/BorderTest.php index b18bfe70..521588db 100644 --- a/tests/HTMLPurifier/AttrDef/BorderTest.php +++ b/tests/HTMLPurifier/AttrDef/CSS/BorderTest.php @@ -1,14 +1,14 @@ def = new HTMLPurifier_AttrDef_Border(HTMLPurifier_Config::createDefault()); + $config = HTMLPurifier_Config::createDefault(); + $this->def = new HTMLPurifier_AttrDef_CSS_Border($config); $this->assertDef('thick solid red', 'thick solid #F00'); $this->assertDef('thick solid'); diff --git a/tests/HTMLPurifier/AttrDef/ColorTest.php b/tests/HTMLPurifier/AttrDef/CSS/ColorTest.php similarity index 83% rename from tests/HTMLPurifier/AttrDef/ColorTest.php rename to tests/HTMLPurifier/AttrDef/CSS/ColorTest.php index b44082c0..1c29ae68 100644 --- a/tests/HTMLPurifier/AttrDef/ColorTest.php +++ b/tests/HTMLPurifier/AttrDef/CSS/ColorTest.php @@ -1,14 +1,14 @@ def = new HTMLPurifier_AttrDef_Color(); + $this->def = new HTMLPurifier_AttrDef_CSS_Color(); $this->assertDef('#F00'); $this->assertDef('#808080'); diff --git a/tests/HTMLPurifier/AttrDef/CompositeTest.php b/tests/HTMLPurifier/AttrDef/CSS/CompositeTest.php similarity index 83% rename from tests/HTMLPurifier/AttrDef/CompositeTest.php rename to tests/HTMLPurifier/AttrDef/CSS/CompositeTest.php index 8ea7b5e8..3ec60e7d 100644 --- a/tests/HTMLPurifier/AttrDef/CompositeTest.php +++ b/tests/HTMLPurifier/AttrDef/CSS/CompositeTest.php @@ -1,20 +1,20 @@ defs =& $defs; } } -class HTMLPurifier_AttrDef_CompositeTest extends HTMLPurifier_AttrDefHarness +class HTMLPurifier_AttrDef_CSS_CompositeTest 
extends HTMLPurifier_AttrDefHarness { var $def1, $def2; @@ -32,7 +32,7 @@ class HTMLPurifier_AttrDef_CompositeTest extends HTMLPurifier_AttrDefHarness $def1 = new HTMLPurifier_AttrDefMock($this); $def2 = new HTMLPurifier_AttrDefMock($this); $defs = array(&$def1, &$def2); - $def = new HTMLPurifier_AttrDef_Composite_Testable($defs); + $def = new HTMLPurifier_AttrDef_CSS_Composite_Testable($defs); $input = 'FOOBAR'; $output = 'foobar'; $def1_params = array($input, $config, $context); @@ -51,7 +51,7 @@ class HTMLPurifier_AttrDef_CompositeTest extends HTMLPurifier_AttrDefHarness $def1 = new HTMLPurifier_AttrDefMock($this); $def2 = new HTMLPurifier_AttrDefMock($this); $defs = array(&$def1, &$def2); - $def = new HTMLPurifier_AttrDef_Composite_Testable($defs); + $def = new HTMLPurifier_AttrDef_CSS_Composite_Testable($defs); $input = 'BOOMA'; $output = 'booma'; $def_params = array($input, $config, $context); @@ -71,7 +71,7 @@ class HTMLPurifier_AttrDef_CompositeTest extends HTMLPurifier_AttrDefHarness $def1 = new HTMLPurifier_AttrDefMock($this); $def2 = new HTMLPurifier_AttrDefMock($this); $defs = array(&$def1, &$def2); - $def = new HTMLPurifier_AttrDef_Composite_Testable($defs); + $def = new HTMLPurifier_AttrDef_CSS_Composite_Testable($defs); $input = 'BOOMA'; $output = false; $def_params = array($input, $config, $context); diff --git a/tests/HTMLPurifier/AttrDef/FontFamilyTest.php b/tests/HTMLPurifier/AttrDef/CSS/FontFamilyTest.php similarity index 69% rename from tests/HTMLPurifier/AttrDef/FontFamilyTest.php rename to tests/HTMLPurifier/AttrDef/CSS/FontFamilyTest.php index 47c0e779..a802d45f 100644 --- a/tests/HTMLPurifier/AttrDef/FontFamilyTest.php +++ b/tests/HTMLPurifier/AttrDef/CSS/FontFamilyTest.php @@ -1,14 +1,14 @@ def = new HTMLPurifier_AttrDef_FontFamily(); + $this->def = new HTMLPurifier_AttrDef_CSS_FontFamily(); $this->assertDef('Gill, Helvetica, sans-serif'); $this->assertDef('\'Times New Roman\', serif'); diff --git a/tests/HTMLPurifier/AttrDef/FontTest.php 
b/tests/HTMLPurifier/AttrDef/CSS/FontTest.php similarity index 79% rename from tests/HTMLPurifier/AttrDef/FontTest.php rename to tests/HTMLPurifier/AttrDef/CSS/FontTest.php index 49b3652c..6bcb4fe2 100644 --- a/tests/HTMLPurifier/AttrDef/FontTest.php +++ b/tests/HTMLPurifier/AttrDef/CSS/FontTest.php @@ -1,14 +1,15 @@ def = new HTMLPurifier_AttrDef_Font(HTMLPurifier_Config::createDefault()); + $config = HTMLPurifier_Config::createDefault(); + $this->def = new HTMLPurifier_AttrDef_CSS_Font($config); // hodgepodge of usage cases from W3C spec, but " -> ' $this->assertDef('12px/14px sans-serif'); diff --git a/tests/HTMLPurifier/AttrDef/CSSLengthTest.php b/tests/HTMLPurifier/AttrDef/CSS/LengthTest.php similarity index 75% rename from tests/HTMLPurifier/AttrDef/CSSLengthTest.php rename to tests/HTMLPurifier/AttrDef/CSS/LengthTest.php index fabea20f..56129af2 100644 --- a/tests/HTMLPurifier/AttrDef/CSSLengthTest.php +++ b/tests/HTMLPurifier/AttrDef/CSS/LengthTest.php @@ -1,14 +1,14 @@ def = new HTMLPurifier_AttrDef_CSSLength(); + $this->def = new HTMLPurifier_AttrDef_CSS_Length(); $this->assertDef('0'); $this->assertDef('0px'); @@ -31,7 +31,7 @@ class HTMLPurifier_AttrDef_CSSLengthTest extends HTMLPurifier_AttrDefHarness function testNonNegative() { - $this->def = new HTMLPurifier_AttrDef_CSSLength(true); + $this->def = new HTMLPurifier_AttrDef_CSS_Length(true); $this->assertDef('3cm'); $this->assertDef('-3mm', false); diff --git a/tests/HTMLPurifier/AttrDef/ListStyleTest.php b/tests/HTMLPurifier/AttrDef/CSS/ListStyleTest.php similarity index 77% rename from tests/HTMLPurifier/AttrDef/ListStyleTest.php rename to tests/HTMLPurifier/AttrDef/CSS/ListStyleTest.php index 95ef9444..6863c489 100644 --- a/tests/HTMLPurifier/AttrDef/ListStyleTest.php +++ b/tests/HTMLPurifier/AttrDef/CSS/ListStyleTest.php @@ -1,14 +1,15 @@ def = new HTMLPurifier_AttrDef_ListStyle(HTMLPurifier_Config::createDefault()); + $config = HTMLPurifier_Config::createDefault(); + $this->def = new 
HTMLPurifier_AttrDef_CSS_ListStyle($config); $this->assertDef('lower-alpha'); $this->assertDef('upper-roman inside'); diff --git a/tests/HTMLPurifier/AttrDef/MultipleTest.php b/tests/HTMLPurifier/AttrDef/CSS/MultipleTest.php similarity index 77% rename from tests/HTMLPurifier/AttrDef/MultipleTest.php rename to tests/HTMLPurifier/AttrDef/CSS/MultipleTest.php index 8c102b39..075c56ad 100644 --- a/tests/HTMLPurifier/AttrDef/MultipleTest.php +++ b/tests/HTMLPurifier/AttrDef/CSS/MultipleTest.php @@ -1,16 +1,16 @@ def = new HTMLPurifier_AttrDef_Multiple( + $this->def = new HTMLPurifier_AttrDef_CSS_Multiple( new HTMLPurifier_AttrDef_Integer() ); diff --git a/tests/HTMLPurifier/AttrDef/NumberTest.php b/tests/HTMLPurifier/AttrDef/CSS/NumberTest.php similarity index 73% rename from tests/HTMLPurifier/AttrDef/NumberTest.php rename to tests/HTMLPurifier/AttrDef/CSS/NumberTest.php index 4ddea5e6..f8f714f6 100644 --- a/tests/HTMLPurifier/AttrDef/NumberTest.php +++ b/tests/HTMLPurifier/AttrDef/CSS/NumberTest.php @@ -1,14 +1,14 @@ def = new HTMLPurifier_AttrDef_Number(); + $this->def = new HTMLPurifier_AttrDef_CSS_Number(); $this->assertDef('0'); $this->assertDef('34'); @@ -29,7 +29,7 @@ class HTMLPurifier_AttrDef_NumberTest extends HTMLPurifier_AttrDefHarness function testNonNegative() { - $this->def = new HTMLPurifier_AttrDef_Number(true); + $this->def = new HTMLPurifier_AttrDef_CSS_Number(true); $this->assertDef('23'); $this->assertDef('-12', false); diff --git a/tests/HTMLPurifier/AttrDef/PercentageTest.php b/tests/HTMLPurifier/AttrDef/CSS/PercentageTest.php similarity index 66% rename from tests/HTMLPurifier/AttrDef/PercentageTest.php rename to tests/HTMLPurifier/AttrDef/CSS/PercentageTest.php index 6694296c..2aa0d401 100644 --- a/tests/HTMLPurifier/AttrDef/PercentageTest.php +++ b/tests/HTMLPurifier/AttrDef/CSS/PercentageTest.php @@ -1,14 +1,14 @@ def = new HTMLPurifier_AttrDef_Percentage(); + $this->def = new HTMLPurifier_AttrDef_CSS_Percentage(); $this->assertDef('10%'); 
$this->assertDef('1.607%'); diff --git a/tests/HTMLPurifier/AttrDef/TextDecorationTest.php b/tests/HTMLPurifier/AttrDef/CSS/TextDecorationTest.php similarity index 72% rename from tests/HTMLPurifier/AttrDef/TextDecorationTest.php rename to tests/HTMLPurifier/AttrDef/CSS/TextDecorationTest.php index f633177f..e5f3e0c7 100644 --- a/tests/HTMLPurifier/AttrDef/TextDecorationTest.php +++ b/tests/HTMLPurifier/AttrDef/CSS/TextDecorationTest.php @@ -1,14 +1,14 @@ def = new HTMLPurifier_AttrDef_TextDecoration(); + $this->def = new HTMLPurifier_AttrDef_CSS_TextDecoration(); $this->assertDef('underline'); $this->assertDef('overline'); diff --git a/tests/HTMLPurifier/AttrDef/CSSURITest.php b/tests/HTMLPurifier/AttrDef/CSS/URITest.php similarity index 83% rename from tests/HTMLPurifier/AttrDef/CSSURITest.php rename to tests/HTMLPurifier/AttrDef/CSS/URITest.php index 1fe1a3dc..2a238d22 100644 --- a/tests/HTMLPurifier/AttrDef/CSSURITest.php +++ b/tests/HTMLPurifier/AttrDef/CSS/URITest.php @@ -1,14 +1,14 @@ def = new HTMLPurifier_AttrDef_CSSURI(); + $this->def = new HTMLPurifier_AttrDef_CSS_URI(); $this->assertDef('', false); diff --git a/tests/HTMLPurifier/AttrDef/Email/SimpleCheckTest.php b/tests/HTMLPurifier/AttrDef/Email/SimpleCheckTest.php deleted file mode 100644 index 70a77f72..00000000 --- a/tests/HTMLPurifier/AttrDef/Email/SimpleCheckTest.php +++ /dev/null @@ -1,16 +0,0 @@ -def = new HTMLPurifier_AttrDef_Email_SimpleCheck(); - } - -} - -?> \ No newline at end of file diff --git a/tests/HTMLPurifier/AttrDef/IDTest.php b/tests/HTMLPurifier/AttrDef/HTML/IDTest.php similarity index 70% rename from tests/HTMLPurifier/AttrDef/IDTest.php rename to tests/HTMLPurifier/AttrDef/HTML/IDTest.php index e47ad9af..a604ca0c 100644 --- a/tests/HTMLPurifier/AttrDef/IDTest.php +++ b/tests/HTMLPurifier/AttrDef/HTML/IDTest.php @@ -1,10 +1,10 @@ context->register('IDAccumulator', $id_accumulator); - $this->def = new HTMLPurifier_AttrDef_ID(); + $this->config->set('Attr', 'EnableID', true); + 
$this->def = new HTMLPurifier_AttrDef_HTML_ID(); } @@ -74,6 +75,26 @@ class HTMLPurifier_AttrDef_IDTest extends HTMLPurifier_AttrDefHarness } + // reference functionality is disabled for now + function disabled_testIDReference() { + + $this->def = new HTMLPurifier_AttrDef_HTML_ID(true); + + $this->assertDef('good_id'); + $this->assertDef('good_id'); // duplicates okay + $this->assertDef('', false); + + $this->def = new HTMLPurifier_AttrDef_HTML_ID(); + + $this->assertDef('good_id'); + $this->assertDef('good_id', false); // duplicate now not okay + + $this->def = new HTMLPurifier_AttrDef_HTML_ID(true); + + $this->assertDef('good_id'); // reference still okay + + } + } ?> \ No newline at end of file diff --git a/tests/HTMLPurifier/AttrDef/LengthTest.php b/tests/HTMLPurifier/AttrDef/HTML/LengthTest.php similarity index 67% rename from tests/HTMLPurifier/AttrDef/LengthTest.php rename to tests/HTMLPurifier/AttrDef/HTML/LengthTest.php index f67c70b7..e5b89f22 100644 --- a/tests/HTMLPurifier/AttrDef/LengthTest.php +++ b/tests/HTMLPurifier/AttrDef/HTML/LengthTest.php @@ -1,13 +1,13 @@ def = new HTMLPurifier_AttrDef_Length(); + $this->def = new HTMLPurifier_AttrDef_HTML_Length(); } function test() { diff --git a/tests/HTMLPurifier/AttrDef/MultiLengthTest.php b/tests/HTMLPurifier/AttrDef/HTML/MultiLengthTest.php similarity index 56% rename from tests/HTMLPurifier/AttrDef/MultiLengthTest.php rename to tests/HTMLPurifier/AttrDef/HTML/MultiLengthTest.php index 6d9acd36..eaa34952 100644 --- a/tests/HTMLPurifier/AttrDef/MultiLengthTest.php +++ b/tests/HTMLPurifier/AttrDef/HTML/MultiLengthTest.php @@ -1,13 +1,13 @@ def = new HTMLPurifier_AttrDef_MultiLength(); + $this->def = new HTMLPurifier_AttrDef_HTML_MultiLength(); } function test() { @@ -16,7 +16,7 @@ class HTMLPurifier_AttrDef_MultiLengthTest extends HTMLPurifier_AttrDef_LengthTe parent::test(); $this->assertDef('*'); - $this->assertDef('1*'); + $this->assertDef('1*', '*'); $this->assertDef('56*'); $this->assertDef('**', 
false); // plain old bad diff --git a/tests/HTMLPurifier/AttrDef/ClassTest.php b/tests/HTMLPurifier/AttrDef/HTML/NmtokensTest.php similarity index 76% rename from tests/HTMLPurifier/AttrDef/ClassTest.php rename to tests/HTMLPurifier/AttrDef/HTML/NmtokensTest.php index 053e5134..00b55eec 100644 --- a/tests/HTMLPurifier/AttrDef/ClassTest.php +++ b/tests/HTMLPurifier/AttrDef/HTML/NmtokensTest.php @@ -1,15 +1,14 @@ def = new HTMLPurifier_AttrDef_Class(); + $this->def = new HTMLPurifier_AttrDef_HTML_Nmtokens(); $this->assertDef('valid'); $this->assertDef('a0-_'); diff --git a/tests/HTMLPurifier/AttrDef/PixelsTest.php b/tests/HTMLPurifier/AttrDef/HTML/PixelsTest.php similarity index 79% rename from tests/HTMLPurifier/AttrDef/PixelsTest.php rename to tests/HTMLPurifier/AttrDef/HTML/PixelsTest.php index cab43e86..414fa3ad 100644 --- a/tests/HTMLPurifier/AttrDef/PixelsTest.php +++ b/tests/HTMLPurifier/AttrDef/HTML/PixelsTest.php @@ -1,13 +1,13 @@ def = new HTMLPurifier_AttrDef_Pixels(); + $this->def = new HTMLPurifier_AttrDef_HTML_Pixels(); } function test() { diff --git a/tests/HTMLPurifier/AttrDef/LangTest.php b/tests/HTMLPurifier/AttrDef/LangTest.php index 7a0e4308..a5472e91 100644 --- a/tests/HTMLPurifier/AttrDef/LangTest.php +++ b/tests/HTMLPurifier/AttrDef/LangTest.php @@ -54,6 +54,8 @@ class HTMLPurifier_AttrDef_LangTest extends HTMLPurifier_AttrDefHarness // Also note that this test-case tests fix-behavior: chop // off subtags until you get a valid language code. $this->assertDef('en-a', 'en'); + // however, x is a reserved single-letter subtag that is allowed + $this->assertDef('en-x', 'en-x'); // 2-8 chars are permitted, but have special meaning that cannot // be checked without maintaining country code lookup tables (for // two characters) or special registration tables (for all above). 
diff --git a/tests/HTMLPurifier/AttrDef/URI/Email/SimpleCheckTest.php b/tests/HTMLPurifier/AttrDef/URI/Email/SimpleCheckTest.php new file mode 100644 index 00000000..edbde119 --- /dev/null +++ b/tests/HTMLPurifier/AttrDef/URI/Email/SimpleCheckTest.php @@ -0,0 +1,16 @@ +def = new HTMLPurifier_AttrDef_URI_Email_SimpleCheck(); + } + +} + +?> \ No newline at end of file diff --git a/tests/HTMLPurifier/AttrDef/EmailHarness.php b/tests/HTMLPurifier/AttrDef/URI/EmailHarness.php similarity index 87% rename from tests/HTMLPurifier/AttrDef/EmailHarness.php rename to tests/HTMLPurifier/AttrDef/URI/EmailHarness.php index 28bd06f1..b0398424 100644 --- a/tests/HTMLPurifier/AttrDef/EmailHarness.php +++ b/tests/HTMLPurifier/AttrDef/URI/EmailHarness.php @@ -1,9 +1,9 @@ def = new HTMLPurifier_AttrDef_Host(); + $this->def = new HTMLPurifier_AttrDef_URI_Host(); $this->assertDef('[2001:DB8:0:0:8:800:200C:417A]'); // IPv6 $this->assertDef('124.15.6.89'); // IPv4 diff --git a/tests/HTMLPurifier/AttrDef/IPv4Test.php b/tests/HTMLPurifier/AttrDef/URI/IPv4Test.php similarity index 78% rename from tests/HTMLPurifier/AttrDef/IPv4Test.php rename to tests/HTMLPurifier/AttrDef/URI/IPv4Test.php index 59f560d9..aa05159c 100644 --- a/tests/HTMLPurifier/AttrDef/IPv4Test.php +++ b/tests/HTMLPurifier/AttrDef/URI/IPv4Test.php @@ -1,17 +1,17 @@ def = new HTMLPurifier_AttrDef_IPv4(); + $this->def = new HTMLPurifier_AttrDef_URI_IPv4(); $this->assertDef('127.0.0.1'); // standard IPv4, loopback, non-routable $this->assertDef('0.0.0.0'); // standard IPv4, unspecified, non-routable diff --git a/tests/HTMLPurifier/AttrDef/IPv6Test.php b/tests/HTMLPurifier/AttrDef/URI/IPv6Test.php similarity index 91% rename from tests/HTMLPurifier/AttrDef/IPv6Test.php rename to tests/HTMLPurifier/AttrDef/URI/IPv6Test.php index 7ad3613f..8a6511b0 100644 --- a/tests/HTMLPurifier/AttrDef/IPv6Test.php +++ b/tests/HTMLPurifier/AttrDef/URI/IPv6Test.php @@ -1,17 +1,17 @@ def = new HTMLPurifier_AttrDef_IPv6(); + $this->def = new 
HTMLPurifier_AttrDef_URI_IPv6(); $this->assertDef('2001:DB8:0:0:8:800:200C:417A'); // unicast, full $this->assertDef('FF01:0:0:0:0:0:0:101'); // multicast, full diff --git a/tests/HTMLPurifier/ChildDef/ChameleonTest.php b/tests/HTMLPurifier/ChildDef/ChameleonTest.php index b4181196..529d9193 100644 --- a/tests/HTMLPurifier/ChildDef/ChameleonTest.php +++ b/tests/HTMLPurifier/ChildDef/ChameleonTest.php @@ -15,17 +15,17 @@ class HTMLPurifier_ChildDef_ChameleonTest extends HTMLPurifier_ChildDefHarness $this->assertResult( 'Allowed.', true, - array(), array('ParentType' => 'inline') + array(), array('IsInline' => true) ); $this->assertResult( '
Not allowed.
', '', - array(), array('ParentType' => 'inline') + array(), array('IsInline' => true) ); $this->assertResult( '
Allowed.
', true, - array(), array('ParentType' => 'block') + array(), array('IsInline' => false) ); } diff --git a/tests/HTMLPurifier/ChildDef/StrictBlockquoteTest.php b/tests/HTMLPurifier/ChildDef/StrictBlockquoteTest.php index 27aacc81..56405e91 100644 --- a/tests/HTMLPurifier/ChildDef/StrictBlockquoteTest.php +++ b/tests/HTMLPurifier/ChildDef/StrictBlockquoteTest.php @@ -9,7 +9,7 @@ extends HTMLPurifier_ChildDefHarness function test() { - $this->obj = new HTMLPurifier_ChildDef_StrictBlockquote(); + $this->obj = new HTMLPurifier_ChildDef_StrictBlockquote('div | p'); $this->assertResult(''); $this->assertResult('

Valid

'); diff --git a/tests/HTMLPurifier/ConfigSchemaTest.php b/tests/HTMLPurifier/ConfigSchemaTest.php index 075a552c..1f1f7034 100644 --- a/tests/HTMLPurifier/ConfigSchemaTest.php +++ b/tests/HTMLPurifier/ConfigSchemaTest.php @@ -41,14 +41,14 @@ class HTMLPurifier_ConfigSchemaTest extends UnitTestCase function tearDown() { // testing is done, restore the old copy HTMLPurifier_ConfigSchema::instance($this->old_copy); - tally_errors(); + tally_errors($this); } function test_defineNamespace() { CS::defineNamespace('http', $d = 'This is an internet protocol.'); $this->assertIdentical($this->our_copy->info_namespace, array( - 'http' => new HTMLPurifier_ConfigEntity_Namespace($d) + 'http' => new HTMLPurifier_ConfigDef_Namespace($d) )); $this->expectError('Cannot redefine namespace'); @@ -68,7 +68,7 @@ class HTMLPurifier_ConfigSchemaTest extends UnitTestCase $this->assertIdentical($this->our_copy->defaults['Car']['Seats'], 5); $this->assertIdentical($this->our_copy->info['Car']['Seats'], - new HTMLPurifier_ConfigEntity_Directive('int', + new HTMLPurifier_ConfigDef_Directive('int', array($this->file => array($l => $d)) ) ); @@ -77,7 +77,7 @@ class HTMLPurifier_ConfigSchemaTest extends UnitTestCase $this->assertIdentical($this->our_copy->defaults['Car']['Age'], null); $this->assertIdentical($this->our_copy->info['Car']['Age'], - new HTMLPurifier_ConfigEntity_Directive('int', + new HTMLPurifier_ConfigDef_Directive('int', array($this->file => array($l => $d)), true ) ); @@ -106,7 +106,7 @@ class HTMLPurifier_ConfigSchemaTest extends UnitTestCase $this->assertIdentical($this->our_copy->defaults['Cat']['Dead'], false); $this->assertIdentical($this->our_copy->info['Cat']['Dead'], - new HTMLPurifier_ConfigEntity_Directive('bool', + new HTMLPurifier_ConfigDef_Directive('bool', array($this->file => array($l1 => $d1, $l2 => $d2)) ) ); @@ -132,7 +132,7 @@ class HTMLPurifier_ConfigSchemaTest extends UnitTestCase 
$this->assertIdentical($this->our_copy->defaults['QuantumNumber']['Difficulty'], null); $this->assertIdentical($this->our_copy->info['QuantumNumber']['Difficulty'], - new HTMLPurifier_ConfigEntity_Directive( + new HTMLPurifier_ConfigDef_Directive( 'string', array($this->file => array($l => $d)), true, @@ -184,7 +184,7 @@ class HTMLPurifier_ConfigSchemaTest extends UnitTestCase $this->assertIdentical($this->our_copy->defaults['Abbrev']['HTH'], 'Happy to Help'); $this->assertIdentical($this->our_copy->info['Abbrev']['HTH'], - new HTMLPurifier_ConfigEntity_Directive( + new HTMLPurifier_ConfigDef_Directive( 'string', array($this->file => array($l => $d)), false, @@ -224,7 +224,7 @@ class HTMLPurifier_ConfigSchemaTest extends UnitTestCase $this->assertTrue(!isset($this->our_copy->defaults['Home']['Carpet'])); $this->assertIdentical($this->our_copy->info['Home']['Carpet'], - new HTMLPurifier_ConfigEntity_DirectiveAlias('Home', 'Rug') + new HTMLPurifier_ConfigDef_DirectiveAlias('Home', 'Rug') ); $this->expectError('Cannot define directive alias in undefined namespace'); diff --git a/tests/HTMLPurifier/ConfigTest.php b/tests/HTMLPurifier/ConfigTest.php index e04ac416..f368f8c0 100644 --- a/tests/HTMLPurifier/ConfigTest.php +++ b/tests/HTMLPurifier/ConfigTest.php @@ -20,7 +20,7 @@ class HTMLPurifier_ConfigTest extends UnitTestCase function tearDown() { HTMLPurifier_ConfigSchema::instance($this->old_copy); - tally_errors(); + tally_errors($this); } // test functionality based on ConfigSchema @@ -216,7 +216,7 @@ class HTMLPurifier_ConfigTest extends UnitTestCase } - function test_getDefinition() { + function test_getHTMLDefinition() { // we actually want to use the old copy, because the definition // generation routines have dependencies on configuration values @@ -224,12 +224,41 @@ class HTMLPurifier_ConfigTest extends UnitTestCase $this->old_copy = HTMLPurifier_ConfigSchema::instance($this->old_copy); $config = HTMLPurifier_Config::createDefault(); - $def = 
$config->getHTMLDefinition(); - $this->assertIsA($def, 'HTMLPurifier_HTMLDefinition'); $def = $config->getCSSDefinition(); $this->assertIsA($def, 'HTMLPurifier_CSSDefinition'); + $def = $config->getHTMLDefinition(); + $def2 = $config->getHTMLDefinition(); + $this->assertIsA($def, 'HTMLPurifier_HTMLDefinition'); + $this->assertEqual($def, $def2); + $this->assertTrue($def->setup); + + // test re-calculation if HTML changes + $config->set('HTML', 'Strict', true); + $def = $config->getHTMLDefinition(); + $this->assertIsA($def, 'HTMLPurifier_HTMLDefinition'); + $this->assertNotEqual($def, $def2); + $this->assertTrue($def->setup); + + // test retrieval of raw definition + $def =& $config->getHTMLDefinition(true); + $this->assertNotEqual($def, $def2); + $this->assertFalse($def->setup); + + // auto initialization + $config->getHTMLDefinition(); + $this->assertTrue($def->setup); + + } + + function test_getCSSDefinition() { + $this->old_copy = HTMLPurifier_ConfigSchema::instance($this->old_copy); + + $config = HTMLPurifier_Config::createDefault(); + + $def = $config->getCSSDefinition(); + $this->assertIsA($def, 'HTMLPurifier_CSSDefinition'); } function test_loadArray() { diff --git a/tests/HTMLPurifier/HTMLModuleManagerTest.php b/tests/HTMLPurifier/HTMLModuleManagerTest.php new file mode 100644 index 00000000..f3efa1c6 --- /dev/null +++ b/tests/HTMLPurifier/HTMLModuleManagerTest.php @@ -0,0 +1,274 @@ +manager = new HTMLPurifier_HTMLModuleManager(true); + } + + function teardown() { + tally_errors($this); + } + + function createModule($name) { + $module = new HTMLPurifier_HTMLModule(); + $module->name = $name; + return $module; + } + + function test_addModule_withAutoload() { + $this->manager->autoDoctype = 'Generic Document 0.1'; + $this->manager->autoCollection = 'Default'; + + $module = new HTMLPurifier_HTMLModule(); + $module->name = 'Module'; + + $module2 = new HTMLPurifier_HTMLModule(); + $module2->name = 'Module2'; + + // we need to grab the dynamically generated 
orders from + // the object since modules are not passed by reference + + $this->manager->addModule($module); + $module_order = $this->manager->modules['Module']->order; + $module->order = $module_order; + $this->assertEqual($module, $this->manager->modules['Module']); + + $this->manager->addModule($module2); + $module2_order = $this->manager->modules['Module2']->order; + $module2->order = $module2_order; + $this->assertEqual($module2, $this->manager->modules['Module2']); + $this->assertEqual($module_order + 1, $module2_order); + + $this->assertEqual( + $this->manager->collections['Default']['Generic Document 0.1'], + array('Module', 'Module2') + ); + + $this->manager->setup(HTMLPurifier_Config::createDefault()); + + $modules = array( + 'Module' => $this->manager->modules['Module'], + 'Module2' => $this->manager->modules['Module2'] + ); + + $this->assertIdentical( + $this->manager->collections['Default']['Generic Document 0.1'], + $modules + ); + $this->assertIdentical($this->manager->activeModules, $modules); + $this->assertIdentical($this->manager->activeCollections, array('Default')); + + } + + function test_addModule_undefinedClass() { + $this->expectError('TotallyCannotBeDefined module does not exist'); + $this->manager->addModule('TotallyCannotBeDefined'); + } + + function test_addModule_stringExpansion() { + $this->manager->addModule('ManagerTestModule'); + $this->assertIsA($this->manager->modules['ManagerTestModule'], + 'HTMLPurifier_HTMLModule_ManagerTestModule'); + } + + function test_addPrefix() { + $this->manager->addPrefix('HTMLPurifier_HTMLModuleManagerTest_'); + $this->manager->addModule('TestModule'); + $this->assertIsA($this->manager->modules['TestModule'], + 'HTMLPurifier_HTMLModuleManagerTest_TestModule'); + } + + function assertProcessCollections($input, $expect = false) { + if ($expect === false) $expect = $input; + $this->manager->processCollections($input); + // substitute in modules for $expect + foreach ($expect as $col_i => $col) { + 
$disable = false; + foreach ($col as $mod_i => $mod) { + unset($expect[$col_i][$mod_i]); + if ($mod_i === '*') { + $disable = true; + continue; + } + $expect[$col_i][$mod] = $this->manager->modules[$mod]; + } + if ($disable) $expect[$col_i]['*'] = false; + } + $this->assertIdentical($input, $expect); + } + + function testImpl_processCollections() { + $this->manager->initialize(); + $this->assertProcessCollections( + array() + ); + $this->assertProcessCollections( + array('HTML' => array('Text')) + ); + $this->assertProcessCollections( + array('HTML' => array('Text', 'Legacy')) + ); + $this->assertProcessCollections( // order is important! + array('HTML' => array('Legacy', 'Text')), + array('HTML' => array('Text', 'Legacy')) + ); + $this->assertProcessCollections( // privates removed after process + array('_Private' => array('Legacy', 'Text')), + array() + ); + $this->assertProcessCollections( // inclusions come first + array( + 'HTML' => array(array('XHTML'), 'Legacy'), + 'XHTML' => array('Text', 'Hypertext') + ), + array( + 'HTML' => array('Text', 'Hypertext', 'Legacy'), + 'XHTML' => array('Text', 'Hypertext') + ) + ); + $this->assertProcessCollections( + array( + 'HTML' => array(array('_Common'), 'Legacy'), + '_Common' => array('Text', 'Hypertext') + ), + array( + 'HTML' => array('Text', 'Hypertext', 'Legacy') + ) + ); + $this->assertProcessCollections( // nested inclusions + array( + 'Full' => array(array('Minimal'), 'Hypertext'), + 'Minimal' => array(array('Bare'), 'List'), + 'Bare' => array('Text') + ), + array( + 'Full' => array('Text', 'Hypertext', 'List'), + 'Minimal' => array('Text', 'List'), + 'Bare' => array('Text') + ) + ); + // strange but valid stuff that will be handled in assembleModules + $this->assertProcessCollections( + array( + 'Linky' => array('Hypertext'), + 'Listy' => array('List'), + '*' => array('Text') + ) + ); + $this->assertProcessCollections( + array( + 'Linky' => array('Hypertext'), + 'ListyOnly' => array('List', '*' => false), + '*' 
=> array('Text') + ) + ); + } + + function testImpl_processCollections_error() { + $this->manager->initialize(); + + $this->expectError( // active variables, watch out! + 'Illegal inclusion array at index 1 found collection HTML, '. + 'inclusion arrays must be at start of collection (index 0)'); + $c = array( + 'HTML' => array('Legacy', array('XHTML')), + 'XHTML' => array('Text', 'Hypertext') + ); + $this->manager->processCollections($c); + unset($c); + + $this->expectError('Collection HTML references undefined '. + 'module Foobar'); + $c = array( + 'HTML' => array('Foobar') + ); + $this->manager->processCollections($c); + unset($c); + + $this->expectError('Collection HTML tried to include undefined '. + 'collection _Common'); + $c = array( + 'HTML' => array(array('_Common'), 'Legacy') + ); + $this->manager->processCollections($c); + unset($c); + + // reports the first circular inclusion it runs across + $this->expectError('Circular inclusion detected in HTML collection'); + $c = array( + 'HTML' => array(array('XHTML')), + 'XHTML' => array(array('HTML')) + ); + $this->manager->processCollections($c); + unset($c); + + } + + function test_makeCollection() { + $config = HTMLPurifier_Config::create(array( + 'HTML.Doctype' => 'Custom Doctype' + )); + $this->manager->addModule($this->createModule('ActiveModule')); + $this->manager->addModule($this->createModule('DudModule')); + $this->manager->addModule($this->createModule('ValidModule')); + $ActiveModule = $this->manager->modules['ActiveModule']; + $DudModule = $this->manager->modules['DudModule']; + $ValidModule = $this->manager->modules['ValidModule']; + $this->manager->collections['ToBeValid']['Custom Doctype'] = array('ValidModule'); + $this->manager->collections['ToBeActive']['Custom Doctype'] = array('ActiveModule'); + $this->manager->makeCollectionValid('ToBeValid'); + $this->manager->makeCollectionActive('ToBeActive'); + $this->manager->setup($config); + $this->assertIdentical($this->manager->validModules, 
array( + 'ValidModule' => $ValidModule, + 'ActiveModule' => $ActiveModule + )); + $this->assertIdentical($this->manager->activeModules, array( + 'ActiveModule' => $ActiveModule + )); + } + + function test_makeCollection_undefinedCollection() { + $config = HTMLPurifier_Config::create(array( + 'HTML.Doctype' => 'Sweets Document 1.0' + )); + $this->manager->addModule($this->createModule('DonutsModule')); + $this->manager->addModule($this->createModule('ChocolateModule')); + $this->manager->collections['CocoaBased']['Sweets Document 1.0'] = array('ChocolateModule'); + // notice how BreadBased collection is missing + $this->manager->makeCollectionActive('CocoaBased'); // to prevent other errors + $this->manager->makeCollectionValid('BreadBased'); + $this->expectError('BreadBased collection is undefined'); + $this->manager->setup($config); + } + + function untest_soupStuff() { + $config = HTMLPurifier_Config::create(array( + 'HTML.Doctype' => 'The Soup Specification 8.0' + )); + $this->manager->addModule($this->createModule('VegetablesModule')); + $this->manager->addModule($this->createModule('MeatModule')); + + } + + +} + +?> \ No newline at end of file diff --git a/tests/HTMLPurifier/LanguageFactoryTest.php b/tests/HTMLPurifier/LanguageFactoryTest.php new file mode 100644 index 00000000..050d30d8 --- /dev/null +++ b/tests/HTMLPurifier/LanguageFactoryTest.php @@ -0,0 +1,47 @@ +create('en'); + + $this->assertIsA($language, 'HTMLPurifier_Language'); + $this->assertEqual($language->code, 'en'); + + // lazy loading test + $this->assertEqual(count($language->messages), 0); + $language->load(); + $this->assertNotEqual(count($language->messages), 0); + + // actual tests for content can be found in LanguageTest + + } + + function testFallback() { + + $factory = HTMLPurifier_LanguageFactory::instance(); + + $language = $factory->create('en-x-test'); + + $this->assertIsA($language, 'HTMLPurifier_Language_en_x_test'); + $this->assertEqual($language->code, 'en-x-test'); + + 
$language->load(); + + // test overloaded message + $this->assertEqual($language->getMessage('htmlpurifier'), 'HTML Purifier X'); + + // test inherited message + $this->assertEqual($language->getMessage('pizza'), 'Pizza'); + + } + +} + +?> \ No newline at end of file diff --git a/tests/HTMLPurifier/LanguageTest.php b/tests/HTMLPurifier/LanguageTest.php new file mode 100644 index 00000000..dd88c90f --- /dev/null +++ b/tests/HTMLPurifier/LanguageTest.php @@ -0,0 +1,22 @@ +lang = $factory->create('en'); + } + + function test_getMessage() { + $this->assertIdentical($this->lang->getMessage('htmlpurifier'), 'HTML Purifier'); + $this->assertIdentical($this->lang->getMessage('totally-non-existent-key'), ''); + } + +} + +?> \ No newline at end of file diff --git a/tests/HTMLPurifier/Strategy/FixNestingTest.php b/tests/HTMLPurifier/Strategy/FixNestingTest.php index 38bd996b..20636614 100644 --- a/tests/HTMLPurifier/Strategy/FixNestingTest.php +++ b/tests/HTMLPurifier/Strategy/FixNestingTest.php @@ -70,19 +70,33 @@ class HTMLPurifier_Strategy_FixNestingTest extends HTMLPurifier_StrategyHarness 'Not allowed!' ); - // block in inline ins not allowed - $this->assertResult( + $this->assertResult( // alt config '
Not allowed!
', '<div>Not allowed!</div>', array('Core.EscapeInvalidChildren' => true) ); + // test block element that has inline content + $this->assertResult( + '

Not allowed!

', + '

Not allowed!

' + ); + // test exclusions $this->assertResult( 'Not allowed', '' ); + // stacked ins/del + $this->assertResult( + '

Not allowed!

', + '

Not allowed!

' + ); + $this->assertResult( + '
Allowed!
' + ); + // test inline parent $this->assertResult( 'Bold', true, array('HTML.Parent' => 'span') diff --git a/tests/HTMLPurifier/Strategy/ValidateAttributesTest.php b/tests/HTMLPurifier/Strategy/ValidateAttributesTest.php index 750d972c..048369dd 100644 --- a/tests/HTMLPurifier/Strategy/ValidateAttributesTest.php +++ b/tests/HTMLPurifier/Strategy/ValidateAttributesTest.php @@ -100,7 +100,7 @@ class HTMLPurifier_Strategy_ValidateAttributesTest extends $this->assertResult( '- + diff --git a/tests/HTMLPurifier/TagTransformTest.php b/tests/HTMLPurifier/TagTransformTest.php index f2cd18a3..760d85af 100644 --- a/tests/HTMLPurifier/TagTransformTest.php +++ b/tests/HTMLPurifier/TagTransformTest.php @@ -2,6 +2,11 @@ require_once 'HTMLPurifier/TagTransform.php'; +// needs to be seperated into files +require_once 'HTMLPurifier/TagTransform/Center.php'; +require_once 'HTMLPurifier/TagTransform/Font.php'; +require_once 'HTMLPurifier/TagTransform/Simple.php'; + class HTMLPurifier_TagTransformTest extends UnitTestCase { diff --git a/tests/HTMLPurifier/Test.php b/tests/HTMLPurifier/Test.php index daa39f53..3fa54173 100644 --- a/tests/HTMLPurifier/Test.php +++ b/tests/HTMLPurifier/Test.php @@ -83,6 +83,20 @@ class HTMLPurifier_Test extends UnitTestCase } + function testEnableAttrID() { + + $this->purifier = new HTMLPurifier(); + + $this->assertPurification( + 'foobar', + 'foobar' + ); + + $this->purifier = new HTMLPurifier(array('HTML.EnableAttrID' => true)); + $this->assertPurification('foobar'); + + } + } ?> \ No newline at end of file diff --git a/tests/index.php b/tests/index.php index b034d4d8..bc2e2414 100644 --- a/tests/index.php +++ b/tests/index.php @@ -51,6 +51,9 @@ $test_file_lookup = array_flip($test_files); // determine test file if (isset($_GET['f']) && isset($test_file_lookup[$_GET['f']])) { $GLOBALS['HTMLPurifierTest']['File'] = $_GET['f']; +} elseif (isset($argv[1]) && isset($test_file_lookup[$argv[1]])) { + // command-line + $GLOBALS['HTMLPurifierTest']['File'] = 
$argv[1]; } else { $GLOBALS['HTMLPurifierTest']['File'] = false; } diff --git a/tests/tally_errors.func.php b/tests/tally_errors.func.php index 84aaef04..cd945c3d 100644 --- a/tests/tally_errors.func.php +++ b/tests/tally_errors.func.php @@ -1,6 +1,6 @@ get('SimpleErrorQueue'); @@ -9,7 +9,7 @@ function tally_errors() { if (count($e) != 2) return; // fut-compat if (!isset($e[0])) return; // fut-compat $e[0]->_dumper = new SimpleDumper(); - $this->fail('Error expectation not fulfilled: ' . + $test->fail('Error expectation not fulfilled: ' . $e[0]->testMessage(null)); } $queue->_expectation_queue = array(); diff --git a/tests/test_files.php b/tests/test_files.php index ab83ec47..9a612181 100644 --- a/tests/test_files.php +++ b/tests/test_files.php @@ -2,68 +2,72 @@ if (!defined('HTMLPurifierTest')) exit; -// define callable test files -$test_files[] = 'ConfigTest.php'; -$test_files[] = 'ConfigSchemaTest.php'; -$test_files[] = 'LexerTest.php'; -$test_files[] = 'Lexer/DirectLexTest.php'; -$test_files[] = 'TokenTest.php'; -$test_files[] = 'ChildDef/RequiredTest.php'; -$test_files[] = 'ChildDef/OptionalTest.php'; -$test_files[] = 'ChildDef/ChameleonTest.php'; -$test_files[] = 'ChildDef/CustomTest.php'; -$test_files[] = 'ChildDef/TableTest.php'; -$test_files[] = 'ChildDef/StrictBlockquoteTest.php'; -$test_files[] = 'GeneratorTest.php'; -$test_files[] = 'EntityLookupTest.php'; -$test_files[] = 'Strategy/RemoveForeignElementsTest.php'; -$test_files[] = 'Strategy/MakeWellFormedTest.php'; -$test_files[] = 'Strategy/FixNestingTest.php'; -$test_files[] = 'Strategy/CompositeTest.php'; -$test_files[] = 'Strategy/CoreTest.php'; -$test_files[] = 'Strategy/ValidateAttributesTest.php'; -$test_files[] = 'AttrDefTest.php'; -$test_files[] = 'AttrDef/EnumTest.php'; -$test_files[] = 'AttrDef/IDTest.php'; -$test_files[] = 'AttrDef/ClassTest.php'; -$test_files[] = 'AttrDef/TextTest.php'; -$test_files[] = 'AttrDef/LangTest.php'; -$test_files[] = 'AttrDef/PixelsTest.php'; -$test_files[] = 
'AttrDef/LengthTest.php'; -$test_files[] = 'AttrDef/URITest.php'; +// define callable test files (sorted alphabetically) +$test_files[] = 'AttrDef/CSS/BackgroundPositionTest.php'; +$test_files[] = 'AttrDef/CSS/BackgroundTest.php'; +$test_files[] = 'AttrDef/CSS/BorderTest.php'; +$test_files[] = 'AttrDef/CSS/ColorTest.php'; +$test_files[] = 'AttrDef/CSS/CompositeTest.php'; +$test_files[] = 'AttrDef/CSS/FontFamilyTest.php'; +$test_files[] = 'AttrDef/CSS/FontTest.php'; +$test_files[] = 'AttrDef/CSS/LengthTest.php'; +$test_files[] = 'AttrDef/CSS/ListStyleTest.php'; +$test_files[] = 'AttrDef/CSS/MultipleTest.php'; +$test_files[] = 'AttrDef/CSS/NumberTest.php'; +$test_files[] = 'AttrDef/CSS/PercentageTest.php'; +$test_files[] = 'AttrDef/CSS/TextDecorationTest.php'; +$test_files[] = 'AttrDef/CSS/URITest.php'; $test_files[] = 'AttrDef/CSSTest.php'; -$test_files[] = 'AttrDef/CompositeTest.php'; -$test_files[] = 'AttrDef/ColorTest.php'; +$test_files[] = 'AttrDef/EnumTest.php'; +$test_files[] = 'AttrDef/HTML/IDTest.php'; +$test_files[] = 'AttrDef/HTML/LengthTest.php'; +$test_files[] = 'AttrDef/HTML/MultiLengthTest.php'; +$test_files[] = 'AttrDef/HTML/NmtokensTest.php'; +$test_files[] = 'AttrDef/HTML/PixelsTest.php'; $test_files[] = 'AttrDef/IntegerTest.php'; -$test_files[] = 'AttrDef/NumberTest.php'; -$test_files[] = 'AttrDef/CSSLengthTest.php'; -$test_files[] = 'AttrDef/PercentageTest.php'; -$test_files[] = 'AttrDef/MultipleTest.php'; -$test_files[] = 'AttrDef/TextDecorationTest.php'; -$test_files[] = 'AttrDef/FontFamilyTest.php'; -$test_files[] = 'AttrDef/HostTest.php'; -$test_files[] = 'AttrDef/IPv4Test.php'; -$test_files[] = 'AttrDef/IPv6Test.php'; -$test_files[] = 'AttrDef/FontTest.php'; -$test_files[] = 'AttrDef/BorderTest.php'; -$test_files[] = 'AttrDef/ListStyleTest.php'; -$test_files[] = 'AttrDef/Email/SimpleCheckTest.php'; -$test_files[] = 'AttrDef/CSSURITest.php'; -$test_files[] = 'AttrDef/BackgroundPositionTest.php'; -$test_files[] = 'AttrDef/BackgroundTest.php'; 
-$test_files[] = 'IDAccumulatorTest.php'; -$test_files[] = 'TagTransformTest.php'; -$test_files[] = 'AttrTransform/LangTest.php'; -$test_files[] = 'AttrTransform/TextAlignTest.php'; +$test_files[] = 'AttrDef/LangTest.php'; +$test_files[] = 'AttrDef/TextTest.php'; +$test_files[] = 'AttrDef/URI/Email/SimpleCheckTest.php'; +$test_files[] = 'AttrDef/URI/HostTest.php'; +$test_files[] = 'AttrDef/URI/IPv4Test.php'; +$test_files[] = 'AttrDef/URI/IPv6Test.php'; +$test_files[] = 'AttrDef/URITest.php'; +$test_files[] = 'AttrDefTest.php'; $test_files[] = 'AttrTransform/BdoDirTest.php'; $test_files[] = 'AttrTransform/ImgRequiredTest.php'; +$test_files[] = 'AttrTransform/LangTest.php'; +$test_files[] = 'AttrTransform/TextAlignTest.php'; +$test_files[] = 'ChildDef/ChameleonTest.php'; +$test_files[] = 'ChildDef/CustomTest.php'; +$test_files[] = 'ChildDef/OptionalTest.php'; +$test_files[] = 'ChildDef/RequiredTest.php'; +$test_files[] = 'ChildDef/StrictBlockquoteTest.php'; +$test_files[] = 'ChildDef/TableTest.php'; +$test_files[] = 'ConfigSchemaTest.php'; +$test_files[] = 'ConfigTest.php'; +$test_files[] = 'ContextTest.php'; +$test_files[] = 'EncoderTest.php'; +$test_files[] = 'EntityLookupTest.php'; +$test_files[] = 'EntityParserTest.php'; +$test_files[] = 'GeneratorTest.php'; +$test_files[] = 'HTMLModuleManagerTest.php'; +$test_files[] = 'IDAccumulatorTest.php'; +$test_files[] = 'LanguageFactoryTest.php'; +$test_files[] = 'LanguageTest.php'; +$test_files[] = 'Lexer/DirectLexTest.php'; +$test_files[] = 'LexerTest.php'; +$test_files[] = 'PercentEncoderTest.php'; +$test_files[] = 'Strategy/CompositeTest.php'; +$test_files[] = 'Strategy/CoreTest.php'; +$test_files[] = 'Strategy/FixNestingTest.php'; +$test_files[] = 'Strategy/MakeWellFormedTest.php'; +$test_files[] = 'Strategy/RemoveForeignElementsTest.php'; +$test_files[] = 'Strategy/ValidateAttributesTest.php'; +$test_files[] = 'TagTransformTest.php'; +$test_files[] = 'Test.php'; +$test_files[] = 'TokenTest.php'; $test_files[] = 
'URISchemeRegistryTest.php'; $test_files[] = 'URISchemeTest.php'; -$test_files[] = 'EncoderTest.php'; -$test_files[] = 'EntityParserTest.php'; -$test_files[] = 'Test.php'; -$test_files[] = 'ContextTest.php'; -$test_files[] = 'PercentEncoderTest.php'; if (version_compare(PHP_VERSION, '5', '>=')) { $test_files[] = 'TokenFactoryTest.php';
Fiddly name Super-duper-price