0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-01-08 07:01:53 +00:00

Release 1.5.0, merged in r688-867.

- LanguageFactory::instance() declared static
- HTMLModuleManagerTest pass by reference bug fixed, merge back into trunk scheduled

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/strict@869 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang 2007-03-24 01:04:06 +00:00
parent cec7a1c087
commit dd2fd06591
130 changed files with 4324 additions and 1385 deletions

View File

@ -4,7 +4,7 @@
# Project related configuration options
#---------------------------------------------------------------------------
PROJECT_NAME = HTML Purifier
PROJECT_NUMBER = 1.4.1
PROJECT_NUMBER = 1.5.0
OUTPUT_DIRECTORY = "C:/Documents and Settings/Edward/My Documents/My Webs/htmlpurifier/docs/doxygen"
CREATE_SUBDIRS = NO
OUTPUT_LANGUAGE = English

30
NEWS
View File

@ -9,6 +9,36 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
. Internal change
==========================
1.5.0, released 2007-03-23
! Added a rudimentary I18N and L10N system modeled off MediaWiki. It
doesn't actually do anything yet, but keep your eyes peeled.
! docs/enduser-utf8.html explains how to use UTF-8 and HTML Purifier
! Newly structured HTMLDefinition modeled off of XHTML 1.1 modules.
I am loathe to release beta quality APIs, but this is exactly that;
don't use the internal interfaces if you're not willing to do migration
later on.
- Allow 'x' subtag in language codes
- Fixed buggy chameleon-support for ins and del
. Added support for IDREF attributes (i.e. for)
. Renamed HTMLPurifier_AttrDef_Class to HTMLPurifier_AttrDef_Nmtokens
. Removed context variable ParentType, replaced with IsInline, which
is false when you're not inline and an integer of the parent that
caused you to become inline when you are (so possibly zero)
. Removed ElementDef->type in favor of ElementDef->descendants_are_inline
and HTMLDefinition->content_sets
. StrictBlockquote now reports what elements its supposed to allow,
rather than what it does allow
. Removed HTMLDefinition->info_flow_elements in favor of
HTMLDefinition->content_sets['Flow']
. Removed redundant "exclusionary" definitions from DTD roster
. StrictBlockquote now requires a construction parameter as if it
were an Required ChildDef, this is the "real" set of allowed elements
. AttrDef partitioned into HTML, CSS and URI segments
. Modify Youtube filter regexp to be multiline
. Require both PHP5 and DOM extension in order to use DOMLex, fixes
some edge cases where a DOMDocument class exists in a PHP4 environment
due to DOM XML extension.
1.4.1, released 2007-01-21
! docs/enduser-youtube.html updated according to new functionality
- YouTube IDs can have underscores and dashes

8
TODO
View File

@ -7,7 +7,7 @@ TODO List
? At-risk
==========================
1.5 release
1.6 release
# Implement all non-essential attribute transforms, configurable
# URI validation routines tighter (see docs/dev-code-quality.html) (COMPLEX)
# Advanced URI filtering schemes (see docs/proposal-new-directives.txt)
@ -15,8 +15,9 @@ TODO List
- Requires I18N facilities to be created first (COMPLEX)
? Configuration profiles: sets of directives that get set with one func call
- XSS-attempt detection
- Implement IDREF support
1.6 release
1.7 release
# Add pre-packaged "levels" of cleaning (custom behavior already done)
- More fine-grained control over escaping behavior
- Silently drop content inbetween SCRIPT tags (can be generalized to allow
@ -29,7 +30,7 @@ TODO List
tag or attribute that is not supported
- Parse TinyMCE whitelist into our %HTML.Allow* whitelists
1.7 release
1.8 release
# Additional support for poorly written HTML
- Microsoft Word HTML cleaning (i.e. MsoNormal, but research essential!)
- Friendly strict handling of <address> (block -> <br>)
@ -76,7 +77,6 @@ Ongoing
- more! (look for ones that use WYSIWYGs)
Unknown release (on a scratch-an-itch basis)
- Upgrade SimpleTest testing code to newest versions
- Have 'lang' attribute be checked against official lists
? Semi-lossy dumb alternate character encoding transformations, achieved by
encoding all characters that have string entity equivalents

View File

@ -7,6 +7,7 @@ set_include_path(get_include_path() . PATH_SEPARATOR . '../library/');
require_once 'HTMLPurifier/ConfigSchema.php';
require_once 'HTMLPurifier/Config.php';
require_once 'HTMLPurifier/Context.php';
$LEXERS = array();
$RUNS = isset($GLOBALS['HTMLPurifierTest']['Runs'])
@ -93,11 +94,14 @@ function print_lexers() {
function do_benchmark($name, $document) {
global $LEXERS, $RUNS;
$config = HTMLPurifier_Config::createDefault();
$context = new HTMLPurifier_Context();
$timer = new RowTimer($name);
$timer->start();
foreach($LEXERS as $key => $lexer) {
for ($i=0; $i<$RUNS; $i++) $tokens = $lexer->tokenizeHTML($document);
for ($i=0; $i<$RUNS; $i++) $tokens = $lexer->tokenizeHTML($document, $config, $context);
$timer->setMarker($key);
}

View File

@ -5,12 +5,15 @@ set_include_path(get_include_path() . PATH_SEPARATOR . '../library/');
require_once 'HTMLPurifier/ConfigSchema.php';
require_once 'HTMLPurifier/Config.php';
require_once 'HTMLPurifier/Lexer/DirectLex.php';
require_once 'HTMLPurifier/Context.php';
$input = file_get_contents('samples/Lexer/4.html');
$lexer = new HTMLPurifier_Lexer_DirectLex();
$config = HTMLPurifier_Config::createDefault();
$context = new HTMLPurifier_Context();
for ($i = 0; $i < 10; $i++) {
$tokens = $lexer->tokenizeHTML($input);
$tokens = $lexer->tokenizeHTML($input, $config, $context);
}
?>

View File

@ -188,7 +188,7 @@ $xsl_processor->importStylesheet($xsl_dom_stylesheet);
$html_output = $xsl_processor->transformToXML($dom_document);
// some slight fudges to preserve backwards compatibility
$html_output = str_replace('/>', ' />', $html_output); // <br /> not <br>
$html_output = str_replace('/>', ' />', $html_output); // <br /> not <br/>
$html_output = str_replace(' xmlns=""', '', $html_output); // rm unnecessary xmlns
if (class_exists('Tidy')) {

188
docs/dev-advanced-api.html Normal file
View File

@ -0,0 +1,188 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta name="description" content="Functional specification for HTML Purifier's advanced API for defining custom filtering behavior." />
<link rel="stylesheet" type="text/css" href="style.css" />
<title>Advanced API - HTML Purifier</title>
</head><body>
<h1>Advanced API</h1>
<div id="filing">Filed under Development</div>
<div id="index">Return to the <a href="index.html">index</a>.</div>
<div id="home"><a href="http://hp.jpsband.org/">HTML Purifier</a> End-User Documentation</div>
<p>It makes no sense to adopt a <q>one-size-fits-all</q> approach to
filtersets: therefore, users must be able to define their own sets of
<q>allowed</q> elements, as well as switch in-between doctypes of HTML.</p>
<p>Our goals are to let the user:</p>
<dl>
<dt>Select</dt>
<dd><ul>
<li>Doctype</li>
<li>Filtersets: Rich / Plain / Full ...</li>
<li>Mode: Lenient / Correctional</li>
<li>Collections (?): Safe / Unsafe</li>
<li>Modules / Tags / Attributes</li>
</ul></dd>
<dt>Customize</dt>
<dd><ul>
<li>Tags / Attributes / Attribute Types</li>
<li>Filtersets</li>
<li>Root Node</li>
</ul></dd>
<dt>Create</dt>
<dd><ul>
<li>Modules / Tags / Attributes / Attribute Types</li>
<li>Filtersets</li>
<li>Doctype</li>
</ul></dd>
</dl>
<h2>Select</h2>
<h3>Selecting a Doctype</h3>
<p>By default, users will use a doctype-based, permissive but secure
whitelist. They must define a <strong>doctype</strong>, and this serves
as the first method of determining a filterset.</p>
<p class="technical">This identifier is based
on the name the W3C has given to the document type and <em>not</em>
the DTD identifier.</p>
<p>This parameter is set via the configuration object:</p>
<pre>$config->set('HTML', 'Doctype', 'XHTML 1.0 Transitional');</pre>
<h3>Selecting a Filterset</h3>
<p>However, selecting this doctype doesn't mean much, because if we
adhered exactly to the definition we would be letting XSS and other
nasties through. HTML Purifier must, in its filterset, allow a subset
of the doctype, which we shall call a <strong>filterset</strong>.</p>
<p>By default, HTML Purifier will use the <strong>Rich</strong>
filterset, which allows as many elements as possible with untrusted
sources. Other possible filtersets could be:</p>
<dl>
<dt>Full</dt>
<dd>Allows the full span of elements in the doctype, good if you want
HTML Purifier to work as a Tidy substitute but not to strip
anything out.</dd>
<dt>Plain</dt>
<dd>Provides a minimum set of tags for semantic markup of things
like blog comments.</dd>
</dl>
<p>Extension-authors would be able to define custom filtersets for
other users to use.</p>
<p>A possible call to select a filterset would be:</p>
<pre>$config->set('HTML', 'Filterset', 'Rich');</pre>
<h3>Selecting Mode</h3>
<p>Within filtersets, there are various <strong>modes</strong> of operation.
These indicate variant behaviors that, while not strictly changing the
allowed set of elements and attributes, will definitely affect the output.
Currently, we have two modes, which may be used together:</p>
<dl>
<dt>Lenient</dt>
<dd>Deprecated elements and attributes will be transformed into
standards-compliant alternatives when explicitly disallowed. For
example, in the XHTML 1.0 Strict doctype, a <code>center</code>
tag would be turned into a <code>div</code> with the CSS property
<code>text-align:center;</code>, but in XHTML 1.0 Transitional
the tag would be preserved. This mode is on by default.</dd>
<dt>Correctional</dt>
<dd>Deprecated elements and attributes will be transformed into
standards-compliant alternatives whenever possible. Referring
back to the previous example, the <code>center</code> tag would
be transformed in both cases. However, tags without a
reasonable standards-compliant alternative will be preserved
in their form. This mode is on by default. It may have
various levels of operation.</dd>
</dl>
<p>A possible call to select modes would be:</p>
<pre>$config->set('HTML', 'Mode', array('correctional', 'lenient'));</pre>
<p>If modes have extra parameters, a hash might work well:</p>
<pre>$config->set('HTML', 'Mode', array(
'correctional' => 9, // strongest level
'lenient' => true // this one's just boolean
));</pre>
<p>Modes may possibly be wrapped up with the filterset declaration:</p>
<pre>$config->set('HTML', 'Filterset', 'Rich: correctional, lenient');</pre>
<p>Further investigation in this field is necessary.</p>
<h3>Selecting Modules / Tags / Attributes</h3>
<p>If this cookie cutter approach doesn't appeal to a user, they may
decide to roll their own filterset by selecting modules, tags and
attributes to allow.</p>
<p class="technical">This would make use of the same facilities
as a filterset author would use, except that it would go under an
<q>anonymous</q> filterset that would be auto-selected if any of the
relevant module/tag/attribute selection configuration directives were
non-null.</p>
<p>On the highest level, a user will usually be most interested in
directly specifying which elements and attributes are desired. For
example:</p>
<pre>$config->set('HTML', 'AllowedElements', 'a,b,em,p,blockquote,code,i');</pre>
<p>Attribute declarations could be merged into this declaration as such:</p>
<pre>$config->set('HTML', 'Allowed', 'a[href,title],b,em,p[class],blockquote[cite],code,i');</pre>
<p>...or be kept separate:</p>
<pre>$config->set('HTML', 'AllowedAttributes', 'a.href,a.title,p.class,blockquote.cite');</pre>
<p class="technical">Considering that, internally speaking, as mandated by
the XHTML 1.1 Modularization specification, we have organized our
elements around modules, considerable gymnastics will be needed to
get this sort of functionality working.</p>
<p>A user may also specify a module to load a class of elements and attributes
into their filterest:</p>
<pre>$config->set('HTML', 'Allowed', 'Hypertext,Core');</pre>
<p class="fixme">The granularity of these modules is too coarse for
the average user (for example, the core module loads everything from
the essential <code>p</code> tag to the not-so-safe <code>h1</code>
tag). How do we make this still a viable solution?</p>
<h3>Unified selector</h3>
<p>Because selecting each and every one of these configuration options
is a chore, we may wish to offer a specialized configuration method
for selecting a filterset. Possibility:</p>
<pre>function selectFilter($doctype, $filterset, $mode)</pre>
<p>...which is simply a light wrapper over the individual configuration
calls. A custom config file format or text format could also be adopted.</p>
<div id="version">$Id$</div>
</body></html>

View File

@ -36,7 +36,7 @@ forgiving lexer. You may also be interested in the unit tests located in the
tests/ folder, which provide a living document on how exactly the filter deals
with malformed input.
In summary:
In summary (see corresponding classes for more details):
1. Parse document into an array of tag and text tokens (Lexer)
2. Remove all elements not on whitelist and transform certain other elements

View File

@ -6,45 +6,17 @@ through negligence of people. This class will do its job: no more, no less,
and it's up to you to provide it the proper information and proper context
to be effective. Things to remember:
1. Character Encoding: UTF-8.
This segment will soon be obsoleted by enduser-utf8.html
Currently, the parser runs under the assumption that it is dealing
with UTF-8. Not ISO-8859-1 or Windows-1252, UTF-8. And definitely not "no
character encoding explicitly stated" or UTF-7. If you're not using UTF-8 as
your character encoding, make sure you configure HTML Purifier or switch
to UTF-8. Now. Also, make sure any input is properly converted to UTF-8, or
the parser will mangle it badly (though it won't be a security risk if you're
outputting it as UTF-8 though). Character encoding is, in general, a knotty
issue, but do yourself a favor and learn about it:
<http://www.joelonsoftware.com/articles/Unicode.html>
1. Character Encoding: see enduser-utf8.html for more info.
2. Doctype: XHTML 1.0 Transitional
This is what the parser is outputting. For the most
part, it's compatible with HTML 4.01, but XHTML enforces some very nice things
that all web developers should use. Regardless, NO DOCTYPE is a NO. Quirks mode
has waaaay too many quirks for a little parser to handle. We did not select
strict in order to prevent ourselves from being too draconic on users, but
this may be configurable in the future. Do you want standards compliance?
The doctype is a good place to start.
2. Doctype: document pending feature completion
Not strictly necessary, actually. More in-depth discussion once we figure
out how to get strict loose mode working.
3. IDs
This segment is obsoleted by enduser-id.html
They need to be unique, but without some knowledge of the
rest of the document, it's difficult to know what's unique. %Attr.IDBlacklist
needs to be set: we may want to consider disallowing IDs by default to
save lazy programmers.
3. IDs: see enduser-id.html for more info
4. [PROJECTED] Links
We're not going to try for spam protection (although
some hooks for such a module might be nice) but we may offer the ability to
only accept relative URLs. Pick the one that's right for you.
4. Links: document pending feature completion
Rudimentary blacklisting, we should also allow only relative URIs. We
need a doc to explain the stuff.
5. CSS
While we can prevent the most flagrant cases from affecting your
layout (such as absolutely positioned elements), no amount of code is going
to protect your pages from being attacked by garish colors and plain old
bad taste. A neat feature would be the ability to define acceptable colors
in a document, but that's not likely to be implemented for a while. In the
meantime, be sure to make sure that floated elements (permitted, since they
can be quite useful) can't mess up your layout. Once again, we may want to
disable this by default to protect lazy developers.
5. CSS: document pending
Explain which CSS styles we blocked and why.

View File

@ -10,7 +10,7 @@
.minor td {font-style:italic;}
</style>
<title>UTF-8 - HTML Purifier</title>
<title>UTF-8: The Secret of Character Encoding - HTML Purifier</title>
<!-- Note to users: this document, though professing to be UTF-8, attempts
to use only ASCII characters, because most webservers are configured
@ -19,21 +19,27 @@ own advice for sake of portability. -->
</head><body>
<h1>UTF-8</h1>
<h1>UTF-8: The Secret of Character Encoding</h1>
<div id="filing">Filed under End-User</div>
<div id="index">Return to the <a href="index.html">index</a>.</div>
<div id="home"><a href="http://hp.jpsband.org/">HTML Purifier</a> End-User Documentation</div>
<p>Character encoding and character sets, in truth, are not that
difficult to understand. But if you don't understand them, you are going
to be caught by surprise by some of HTML Purifier's behavior, namely
the fact that it operates UTF-8 or the limitations of the character
encoding transformations it does. This document will walk you through
<p>Character encoding and character sets are not that
difficult to understand, but so many people blithely stumble
through the worlds of programming without knowing what to actually
do about it, or say &quot;Ah, it's a job for those <em>internationalization</em>
experts.&quot; No, it is not! This document will walk you through
determining the encoding of your system and how you should handle
this information. It will stay away from excessive discussion on
the internals of character encoding, but offer the information in
asides that can easily be skipped.</p>
the internals of character encoding.</p>
<p>This document is not designed to be read in its entirety: it will
slowly introduce concepts that build on each other: you need not get to
the bottom to have learned something new. However, I strongly
recommend you read all the way to <strong>Why UTF-8?</strong>, because at least
at that point you'd have made a conscious decision not to migrate,
which can be a rewarding (but difficult) task.</p>
<blockquote class="aside">
<div class="label">Asides</div>
@ -43,6 +49,50 @@ asides that can easily be skipped.</p>
with a greater understanding of the underlying issues.</p>
</blockquote>
<h2>Table of Contents</h2>
<ol id="toc">
<li><a href="#findcharset">Finding the real encoding</a></li>
<li><a href="#findmetacharset">Finding the embedded encoding</a></li>
<li><a href="#fixcharset">Fixing the encoding</a><ol>
<li><a href="#fixcharset-none">No embedded encoding</a></li>
<li><a href="#fixcharset-diff">Embedded encoding disagrees</a></li>
<li><a href="#fixcharset-server">Changing the server encoding</a><ol>
<li><a href="#fixcharset-server-php">PHP header() function</a></li>
<li><a href="#fixcharset-server-phpini">PHP ini directive</a></li>
<li><a href="#fixcharset-server-nophp">Non-PHP</a></li>
<li><a href="#fixcharset-server-htaccess">.htaccess</a></li>
<li><a href="#fixcharset-server-ext">File extensions</a></li>
</ol></li>
<li><a href="#fixcharset-xml">XML</a></li>
<li><a href="#fixcharset-internals">Inside the process</a></li>
</ol></li>
<li><a href="#whyutf8">Why UTF-8?</a><ol>
<li><a href="#whyutf8-i18n">Internationalization</a></li>
<li><a href="#whyutf8-user">User-friendly</a></li>
<li><a href="#whyutf8-forms">Forms</a><ol>
<li><a href="#whyutf8-forms-urlencoded">application/x-www-form-urlencoded</a></li>
<li><a href="#whyutf8-forms-multipart">multipart/form-data</a></li>
</ol></li>
<li><a href="#whyutf8-support">Well supported</a></li>
<li><a href="#whyutf8-htmlpurifier">HTML Purifiers</a></li>
</ol></li>
<li><a href="#migrate">Migrate to UTF-8</a><ol>
<li><a href="#migrate-db">Configuring your database</a><ol>
<li><a href="#migrate-db-legit">Legit method</a></li>
<li><a href="#migrate-db-binary">Binary</a></li>
</ol></li>
<li><a href="#migrate-editor">Text editor</a></li>
<li><a href="#migrate-bom">Byte Order Mark (headers already sent!)</a></li>
<li><a href="#migrate-fonts">Fonts</a><ol>
<li><a href="#migrate-fonts-obscure">Obscure scripts</a></li>
<li><a href="#migrate-fonts-occasional">Occasional use</a></li>
</ol></li>
<li><a href="#migrate-variablewidth">Dealing with variable width in functions</a></li>
</ol></li>
<li><a href="#externallinks">Further Reading</a></li>
</ol>
<h2 id="findcharset">Finding the real encoding</h2>
<p>In the beginning, there was ASCII, and things were simple. But they
@ -275,7 +325,7 @@ your own php.ini file, ask your support for details. Use:</p>
<h4 id="fixcharset-server-nophp">Non-PHP</h4>
<p>You may, for whatever reason, may need to set the character encoding
<p>You may, for whatever reason, need to set the character encoding
on non-PHP files, usually plain ol' HTML files. Doing this
is more of a hit-or-miss process: depending on the software being
used as a webserver and the configuration of that software, certain
@ -386,8 +436,8 @@ processing instructions. They look like:</p>
<p>For XHTML, this processing instruction theoretically
overrides the <code>META</code> tag. In reality, this happens only when the
XHTML is actually served as legit XML and not HTML, which is almost
always never due to Internet Explorer's lack of support for
XHTML is actually served as legit XML and not HTML, which is almost always
never due to Internet Explorer's lack of support for
<code>application/xhtml+xml</code> (even though doing so is often
argued to be <a href="http://www.hixie.ch/advocacy/xhtml">good practice</a>).</p>
@ -398,10 +448,10 @@ for XML files is UTF-8, which often butts heads with more common
ISO-8859-1 encoding (you see this in garbled RSS feeds).</p>
<p>In short, if you use XHTML and have gone through the
trouble of adding the XML header, be sure to make sure it jives
trouble of adding the XML header, make sure it jives
with your <code>META</code> tags and HTTP headers.</p>
<h3>Inside the process</h3>
<h3 id="fixcharset-internals">Inside the process</h3>
<p>This section is not required reading,
but may answer some of your questions on what's going on in all
@ -572,7 +622,7 @@ Each method has deficiencies, especially the former.</p>
the page, you still have the trouble of what to do with characters
that are outside of the character encoding's range. The behavior, once
again, varies: Firefox 2.0 entity-izes them while Internet Explorer
7.0 mangles them beyond intelligibility. For serious I18N purposes,
7.0 mangles them beyond intelligibility. For serious internationalization purposes,
this is not an option.</p>
<p>The other possibility is to set Accept-Encoding to UTF-8, which
@ -604,22 +654,374 @@ hounding you about broken pages.</p>
<h3 id="whyutf8-htmlpurifier">HTML Purifier</h3>
<p>And finally, we get to HTML Purifier.</p>
<p>And finally, we get to HTML Purifier. HTML Purifier is built to
deal with UTF-8: any indications otherwise are the result of an
encoder that converts text from your preferred encoding to UTF-8, and
back again. HTML Purifier never touches anything else, and leaves
it up to the module iconv to do the dirty work.</p>
<p>This approach, however, is not perfect. iconv is blithely unaware
of HTML character entities. HTML Purifier, in order to
protect against sophisticated escaping schemes, normalizes all character
and numeric entities before processing the text. This leads to
one important ramification:</p>
<p><strong>Any character that is not supported by the target character
set, regardless of whether or not it is in the form of a character
entity or a raw character, will be silently ignored.</strong></p>
<p>Example of this principle at work: say you have <code>&amp;theta;</code>
in your HTML, but the output is in Latin-1 (which, understandably,
does not understand Greek), the following process will occur (assuming you've
set the encoding correctly using %Core.Encoding):</p>
<ul>
<li>The <code>Encoder</code> will transform the text from ISO 8859-1 to UTF-8
(note that theta is preserved since it doesn't actually use
any non-ASCII characters): <code>&amp;theta;</code></li>
<li>The <code>EntityParser</code> will transform all named and numeric
character entities to their corresponding raw UTF-8 equivalents:
<code>&theta;</code></li>
<li>HTML Purifier processes the code: <code>&theta;</code></li>
<li>The <code>Encoder</code> now transforms the text back from UTF-8
to ISO 8859-1. Since Greek is not supported by ISO 8859-1, it
will be either ignored or replaced with a question mark:
<code>?</code></li>
</ul>
<p>This behaviour is quite unsatisfactory. It is a deal-breaker for
international applications, and it can be mildly annoying for the provincial
soul who occasionally needs a special character. Since 1.4.0, HTML
Purifier has provided a slightly more palatable workaround using
%Core.EscapeNonASCIICharacters. The process now looks like:</p>
<ul>
<li>The <code>Encoder</code> transforms encoding to UTF-8: <code>&amp;theta;</code></li>
<li>The <code>EntityParser</code> transforms entities: <code>&theta;</code></li>
<li>HTML Purifier processes the code: <code>&theta;</code></li>
<li>The <code>Encoder</code> replaces all non-ASCII characters
with numeric entities: <code>&amp;#952;</code></li>
<li>For good measure, <code>Encoder</code> transforms encoding back to
original (which is strictly unnecessary for 99% of encodings
out there): <code>&amp;#952;</code> (remember, it's all ASCII!)</li>
</ul>
<p>...which means that this is only good for an occasional foray into
the land of Unicode characters, and is totally unacceptable for Chinese
or Japanese texts. The even bigger kicker is that, supposing the
input encoding was actually ISO-8859-7, which <em>does</em> support
theta, the character would get entity-ized anyway! (The Encoder does
not discriminate).</p>
<p>The current functionality is about where HTML Purifier will be for
the rest of eternity. HTML Purifier could attempt to preserve the original
form of the entities so that they could be substituted back in, only the
DOM extension kills them off irreversibly. HTML Purifier could also attempt
to be smart and only convert non-ASCII characters that weren't supported
by the target encoding, but that would require reimplementing iconv
with HTML awareness, something I will not do.</p>
<p>So there: either it's UTF-8 or crippled international support. Your pick! (and I'm
not being sarcastic here: some people could care less about other languages)</p>
<h2 id="migrate">Migrate to UTF-8</h2>
<h3 id="migrate-editor">Text editor</h3>
<p>So, you've decided to bite the bullet, and want to migrate to UTF-8.
Note that this is not for the faint-hearted, and you should expect
the process to take longer than you think it will take.</p>
<p>The general idea is that you convert all existing text to UTF-8,
and then you set all the headers and META tags we discussed earlier
to UTF-8. There are many ways going about doing this: you could
write a conversion script that runs through the database and re-encodes
everything as UTF-8 or you could do the conversion on the fly when someone
reads the page. The details depend on your system, but I will cover
some of the more subtle points of migration that may trip you up.</p>
<h3 id="migrate-db">Configuring your database</h3>
<h3 id="migrate-convert">Convert old text</h3>
<p>Most modern databases, the most prominent open-source ones being MySQL
4.1+ and PostgreSQL, support character encodings. If you're switching
to UTF-8, logically speaking, you'd want to make sure your database
knows about the change too. There are some caveats though:</p>
<h4 id="migrate-db-legit">Legit method</h4>
<p>Standardization in terms of SQL syntax for specifying character
encodings is notoriously spotty. Refer to your respective database's
documentation on how to do this properly.</p>
<p>For <a href="http://dev.mysql.com/doc/refman/5.0/en/charset-conversion.html">MySQL</a>, <code>ALTER</code> will magically perform the
character encoding conversion for you. However, you have
to make sure that the text inside the column is what is says it is:
if you had put Shift-JIS in an ISO 8859-1 column, MySQL will irreversibly mangle
the text when you try to convert it to UTF-8. You'll have to convert
it to a binary field, convert it to a Shift-JIS field (the real encoding),
and then finally to UTF-8. Many a website had pages irreversibly mangled
because they didn't realize that they'd been deluding themselves about
the character encoding all along, don't become the next victim.</p>
<p>For <a href="http://www.postgresql.org/docs/8.2/static/multibyte.html">PostgreSQL</a>, there appears to be no direct way to change the
encoding of a database (as of 8.2). You will have to dump the data, and then reimport
it into a new table. Make sure that your client encoding is set properly:
this is how PostgreSQL knows to perform an encoding conversion.</p>
<p>Many times, you will be also asked about the &quot;collation&quot; of
the new column. Collation is how a DBMS sorts text, like ordering
B, C and A into A, B and C (the problem gets surprisingly complicated
when you get to languages like Thai and Japanese). If in doubt,
going with the default setting is usually a safe bet.</p>
<p>Once the conversion is all said and done, you still have to remember
to set the client encoding (your encoding) properly on each database
connection using <code>SET NAMES</code> (which is standard SQL and is
usually supported).</p>
<h4 id="migrate-db-binary">Binary</h4>
<p>Due to the abovementioned compatibility issues, a more interoperable
way of storing UTF-8 text is to stuff it in a binary datatype.
<code>CHAR</code> becomes <code>BINARY</code>, <code>VARCHAR</code> becomes
<code>VARBINARY</code> and <code>TEXT</code> becomes <code>BLOB</code>.
Doing so can save you some huge headaches:</p>
<ul>
<li>The syntax for binary data types is very portable,</li>
<li>MySQL 4.0 has <em>no</em> support for character encodings, so
if you want to support it you <em>have</em> to use binary,</li>
<li>MySQL, as of 5.1, has no support for four byte UTF-8 characters,
which represent characters beyond the basic multilingual
plane, and</li>
<li>You will never have to worry about your DBMS being too smart
and attempting to convert your text when you don't want it to.</li>
</ul>
<p>MediaWiki, a very prominent international application, uses binary fields
for storing their data because of point three.</p>
<p>There are drawbacks, of course:</p>
<ul>
<li>Database tools like PHPMyAdmin won't be able to offer you inline
text editing, since it is declared as binary,</li>
<li>It's not semantically correct: it's really text not binary
(lying to the database),</li>
<li>Unless you use the not-very-portable wizardry mentioned above,
you have to change the encoding yourself (usually, you'd do
it on the fly), and</li>
<li>You will not have collation.</li>
</ul>
<p>Choose based on your circumstances.</p>
<h3 id="migrate-editor">Text editor</h3>
<p>For more flat-file oriented systems, you will often be tasked with
converting reams of existing text and HTML files into UTF-8, as well as
making sure that all new files uploaded are properly encoded. Once again,
I can only point vaguely in the right direction for converting your
existing files: make sure you backup, make sure you use
<a href="http://php.net/ref.iconv">iconv</a>(), and
make sure you know what the original character encoding of the files
is (or are, depending on the tidiness of your system).</p>
<p>However, I can proffer more specific advice on the subject of
text editors. Many text editors have notoriously spotty Unicode support.
To find out how your editor is doing, you can check out <a
href="http://www.alanwood.net/unicode/utilities_editors.html">this list</a>
or <a href="http://en.wikipedia.org/wiki/Comparison_of_text_editors#Encoding_support">Wikipedia's list.</a>
I personally use Notepad++, which works like a charm when it comes to UTF-8.
Usually, you will have to <strong>explicitly</strong> tell the editor through some dialogue
(usually Save as or Format) what encoding you want it to use. An editor
will often offer &quot;Unicode&quot; as a method of saving, which is
ambiguous. Make sure you know whether or not they really mean UTF-8
or UTF-16 (which is another flavor of Unicode).</p>
<p>The two things to look out for are whether or not the editor
supports <strong>font mixing</strong> (multiple
fonts in one document) and whether or not it adds a <strong>BOM</strong>.
Font mixing is important because fonts rarely have support for every
language known to mankind: in order to be flexible, an editor must
be able to take a little from here and a little from there, otherwise
all your Chinese characters will come as nice boxes. We'll discuss
BOM below.</p>
<h3 id="migrate-bom">Byte Order Mark (headers already sent!)</h3>
<p>The BOM, or <a href="http://en.wikipedia.org/wiki/Byte_Order_Mark">Byte
Order Mark</a>, is a magical, invisible character placed at
the beginning of UTF-8 files to tell people what the encoding is and
what the endianness of the text is. It is also unnecessary.</p>
<p>Because it's invisible, it often
catches people by surprise when it starts doing things it shouldn't
be doing. For example, this PHP file:</p>
<pre><strong>BOM</strong>&lt;?php
header('Location: index.php');
?&gt;</pre>
<p>...will fail with the all too familiar <strong>Headers already sent</strong>
PHP error. And because the BOM is invisible, this culprit will go unnoticed.
My suggestion is to only use ASCII in PHP pages, but if you must, make
sure the page is saved WITHOUT the BOM.</p>
<blockquote class="aside">
<p>The headers the error is referring to are <strong>HTTP headers</strong>,
which are sent to the browser before any HTML to tell it various
information. The moment any regular text (and yes, a BOM counts as
ordinary text) is output, the headers must be sent, and you are
not allowed to send anymore. Thus, the error.</p>
</blockquote>
<p>If you are reading in text files to insert into the middle of another
page, it is strongly advised (but not strictly necessary) that you replace out the UTF-8 byte
sequence for BOM <code>&quot;\xEF\xBB\xBF&quot;</code> before inserting it in,
via:</p>
<pre>$text = str_replace(&quot;\xEF\xBB\xBF&quot;, '', $text);</pre>
<h3 id="migrate-fonts">Fonts</h3>
<p>Generally speaking, people who are having trouble with fonts fall
into two categories:</p>
<ul>
<li>Those who want to
use an extremely obscure language for which there is very little
support even among native speakers of the language, and</li>
<li>Those where the primary language of the text is
well-supported but there are occasional characters
that aren't supported.</li>
</ul>
<p>Yes, there's always a chance where an English user happens across
a Sinhalese website and doesn't have the right font. But an English user
who happens not to have the right fonts probably has no business reading Sinhalese
anyway. So we'll deal with the other two edge cases.</p>
<h4 id="migrate-fonts-obscure">Obscure scripts</h4>
<p>If you run a Bengali website, you may get comments from users who
would like to read your website but get heaps of question marks or
other meaningless characters. Fixing this problem requires the
installation of a font or language pack which is often highly
dependent on what the language is. <a href="http://bn.wikipedia.org/wiki/%E0%A6%89%E0%A6%87%E0%A6%95%E0%A6%BF%E0%A6%AA%E0%A7%87%E0%A6%A1%E0%A6%BF%E0%A6%AF%E0%A6%BC%E0%A6%BE:Bangla_script_display_help">Here is an example</a>
of such a help file for the Bengali language, I am sure there are
others out there too. You just have to point users to the appropriate
help file.</p>
<h4 id="migrate-fonts-occasional">Occasional use</h4>
<p>A prime example of when you'll see some very obscure Unicode
characters embedded in what otherwise would be very bland ASCII are
letters of the
<a href="http://en.wikipedia.org/wiki/International_Phonetic_Alphabet">International
Phonetic Alphabet (IPA)</a>, use to designate pronounciations in a very standard
manner (you probably see them all the time in your dictionary). Your
average font probably won't have support for all of the IPA characters
like &#664; (bilabial click) or &#658; (voiced postalveolar fricative).
So what's a poor browser to do? Font mix! Smart browsers like Mozilla Firefox
and Internet Explorer 7 will borrow glyphs from other fonts in order
to make sure that all the characters display properly.</p>
<p>But what happens when the browser isn't smart and happens to be the
most widely used browser in the entire world? Microsoft IE 6
is not smart enough to borrow from other fonts when a character isn't
present, so more often than not you'll be slapped with a nice big &#65533;.
To get things to work, MSIE 6 needs a little nudge. You could configure it
to use a different font to render the text, but you can acheive the same
effect by selectively changing the font for blocks of special characters
to known good Unicode fonts.</p>
<p>Fortunantely, the folks over at Wikipedia have already done all the
heavy lifting for you. Get the CSS from the horses mouth here:
<a href="http://en.wikipedia.org/wiki/MediaWiki:Common.css">Common.css</a>,
and search for &quot;.IPA&quot; There are also a smattering of
other classes you can use for other purposes, check out
<a href="http://meta.wikimedia.org/wiki/Help:Special_characters#Displaying_Special_Characters">this page</a>
for more details. For you lazy ones, this should work:</p>
<pre>.Unicode {
font-family: Code2000, &quot;TITUS Cyberbit Basic&quot;, &quot;Doulos SIL&quot;,
&quot;Chrysanthi Unicode&quot;, &quot;Bitstream Cyberbit&quot;,
&quot;Bitstream CyberBase&quot;, Thryomanes, Gentium, GentiumAlt,
&quot;Lucida Grande&quot;, &quot;Arial Unicode MS&quot;, &quot;Microsoft Sans Serif&quot;,
&quot;Lucida Sans Unicode&quot;;
font-family /**/:inherit; /* resets fonts for everyone but IE6 */
}</pre>
<p>The standard usage goes along the lines of <code>&lt;span class=&quot;Unicode&quot;&gt;Crazy
Unicode stuff here&lt;/span&gt;</code>. Characters in the
<a href="http://en.wikipedia.org/wiki/Windows_Glyph_List_4">Windows Glyph List</a>
usually don't need to be fixed, but for anything else you probably
want to play it safe. Unless, of course, you don't care about IE6
users.</p>
<h3 id="migrate-variablewidth">Dealing with variable width in functions</h3>
<p>When people claim that PHP6 will solve all our Unicode problems, they're
misinformed. It will not fix any of the abovementioned troubles. It will,
however, fix the problem we are about to discuss: processing UTF-8 text
in PHP.</p>
<p>PHP (as of PHP5) is blithely unaware of the existence of UTF-8 (with a few
notable exceptions). Sometimes, this will cause problems, other times,
this won't. So far, we've avoided discussing the architecture of
UTF-8, so, we must first ask, what is UTF-8? Yes, it supports Unicode,
and yes, it is variable width. Other traits:</p>
<ul>
<li>Every character's byte sequence is unique and will never be found
inside the byte sequence of another character,</li>
<li>UTF-8 may use up to four bytes to encode a character,</li>
<li>UTF-8 text must be checked for well-formedness,</li>
<li>Pure ASCII is also valid UTF-8, and</li>
<li>Binary sorting will sort UTF-8 in the same order as Unicode.</li>
</ul>
<p>Each of these traits affect different domains of text processing
in different ways. It is beyond the scope of this document to explain
what precisely these implications are. PHPWact provides
a very good <a href="http://www.phpwact.org/php/i18n/utf-8">reference document</a>
on what to expect from each functions, although coverage is spotty in
some areas. Their more general notes on
<a href="http://www.phpwact.org/php/i18n/charsets">character sets</a>
are also worth looking at for information on UTF-8. Some rules of thumb
when dealing with Unicode text:</p>
<ul>
<li>Do not EVER use functions that:<ul>
<li>...convert case (strtolower, strtoupper, ucfirst, ucwords)</li>
<li>...claim to be case-insensitive (str_ireplace, stristr, strcasecmp)</li>
</ul></li>
<li>Think twice before using functions that:<ul>
<li>...count characters (strlen will return bytes, not characters;
str_split and word_wrap may corrupt)</li>
<li>...entity-ize things (UTF-8 doesn't need entities)</li>
<li>...do very complex string processing (*printf)</li>
</ul></li>
</ul>
<p>...and always think in bytes, not characters. If you use strpos()
to find the position of a character, it will be in bytes, but this
usually won't matter since substr() also operates with byte indices!</p>
<p>You'll also need to make sure your UTF-8 is well-formed and will
probably need replacements for some of these functions. I recommend
using Harry Fuecks' <a href="http://phputf8.sourceforge.net/">PHP
UTF-8</a> library, rather than use mb_string directly. HTML Purifier
also defines a few useful UTF-8 compatible functions: check out
<code>Encoder.php</code> in the <code>/library/HTMLPurifier/</code>
directory.</p>
<h2 id="externallinks">Further Reading</h2>
<p>Well, that's it. Hopefully this document has served as a very
practical springboard into knowledge of how UTF-8 works. You may have
decided that you don't want to migrate yet: that's fine, just know
what will happen to your output and what bug reports you may recieve.</p>
<p>Many other developers have already discussed the subject of Unicode,
UTF-8 and internationalization, and I would like to defer to them for
a more in-depth look into character sets and encodings.</p>

6
docs/fixquotes.htc Normal file
View File

@ -0,0 +1,6 @@
<public:attach event="oncontentready" onevent="init();" />
<script>
function init() {
element.innerHTML = '&#8220;'+element.innerHTML+'&#8221;';
}
</script>

View File

@ -31,7 +31,7 @@ information for casual developers using HTML Purifier.</p>
<dt><a href="enduser-slow.html">Speeding up HTML Purifier</a></dt>
<dd>Explains how to speed up HTML Purifier through caching or inbound filtering.</dd>
<dt><a href="enduser-utf8.html">UTF-8</a></dt>
<dt><a href="enduser-utf8.html">UTF-8: The Secret of Character Encoding</a></dt>
<dd>Describes the rationale for using UTF-8, the ramifications otherwise, and how to make the switch.</dd>
</dl>
@ -54,6 +54,10 @@ conventions.</p>
<dt><a href="dev-optimization.html">Optimization</a></dt>
<dd>Discusses possible methods of optimizing HTML Purifier.</dd>
<dt><a href="dev-advanced-api.html">Advanced API</a></dt>
<dd>Functional specification for HTML Purifier's advanced API for defining
custom filtering behavior.</dd>
</dl>
<h2>Proposals</h2>

View File

@ -7,7 +7,7 @@ value is used for. This means decentralized configuration declarations that
are nevertheless error checking and a centralized configuration object.
Directives are divided into namespaces, indicating the major portion of
functionality they cover (although there may be overlaps. Please consult
functionality they cover (although there may be overlaps). Please consult
the documentation in ConfigDef for more information on these namespaces.
Since configuration is dependant on context, internal classes require a
@ -36,4 +36,5 @@ the definition, you'd have to force reconstruction.
In practice, the pulling directives from the config object are
solely need-based, and the flex points are littered throughout the
setup() function. Some sort of refactoring is likely in order.
setup() function. Some sort of refactoring is likely in order. See
ref-xhtml-1.1.txt for more info.

View File

@ -1,42 +1,6 @@
We are going to model our I18N/L10N off of MediaWiki's system. Their's is
obviously quite complicated, so we're going to simplify it a bit for our needs.
== Structure ==
First, you have a Language object. This object contains all the localisable
message strings, as well as other important language-specific settings and
custom behavior (uppercasing, lowercasing, printing dates, formatting
numbers, etc.)
The object is constructed from two sources: subclassed versions of itself
(classes) and Message files (messages).
== General use ==
You load a language object by calling the Language::factory() function.
This function the class file for the object (taking in account fallback
languages by using the fallback langauge's object but overloading the
language key) and returns that object. Nothing else happens.
When a message/etc is requested, a lazy load initializor is called. Now the
real work starts. We're first going to take the scenario that the language
is not cached. The system loads the Messages file by:
require( $filename );
$cache = compact( self::$mLocalisationKeys );
...where self::$mLocalisationKeys is the name of variables that could be used
in the localization file. This lets you use things like:
$fallback = false;
$rtl = false;
...and easily siphon them into arrays.
Then, we load the $fallback language (if not set, English) to fill in the gaps in
the messages. There is specialized behavior for certain keys, as they can be
mergeable maps, lists or alias lists (not sure what the last one is).
== Caching ==
MediaWiki has lots of caching mechanisms built in, which make the code somewhat

View File

@ -32,6 +32,6 @@ A tag's attribute 'target' (for selecting frames) cut
current behavior: no substitute, just delete when in strict, allow in loose
Attribute 'name' deprecated in favor of 'id'
current behavior: dropped silently
projected behavior: create proper AttrTransform (currently not allowed at all)
projected behavior: create proper AttrTransform
[done] PRE tag allows SUB/SUP? (strict dtd comment vs syntax, loose disallows)
current behavior: disallow as usual

View File

@ -1,21 +1,187 @@
Getting XHTML 1.1 Working
It's quite simple, according to <http://www.w3.org/TR/xhtml11/changes.html>
XHTML 1.1 and HTML Purifier
Todo for XHTML 1.1 support <http://www.w3.org/TR/xhtml11/changes.html>
1. Scratch lang entirely in favor of xml:lang
2. Scratch name entirely in favor of id (partially-done)
3. Support Ruby <http://www.w3.org/TR/2001/REC-ruby-20010531/>
...but that's only an informative section. More things to do:
HTML Purifier uses the modularization of XHTML
<http://www.w3.org/TR/xhtml-modularization/> to organize the internals
of HTMLDefinition into a more manageable and extensible fashion. Rather
than have one super-object, HTMLDefinition is split into HTMLModules,
each of which are responsible for defining elements, their attributes,
and other properties (for a more indepth coverage, see
/library/HTMLPurifier/HTMLModule.php's docblock comments).
1. Scratch style attribute (it's deprecated)
2. Be module-aware (this might entail intelligent grouping in the definition
and allowing users to specifically remove certain modules (see 5))
3. Cross-reference minimal content models with existing DTDs and determine
changes (todo)
4. Watch out for the Legacy Module
<http://www.w3.org/TR/2001/REC-xhtml-modularization-20010410/abstract_modules.html#s_legacymodule>
5. Let users specify their own custom modules
6. Study Modularization document
<http://www.w3.org/TR/2001/REC-xhtml-modularization-20010410/>
The modules that W3C defines and we support are:
* 5.1. Attribute Collections (technically not a module
* 5.2. Core Modules
o 5.2.2. Text Module
o 5.2.3. Hypertext Module
o 5.2.4. List Module
* 5.4. Text Extension Modules
o 5.4.1. Presentation Module
o 5.4.2. Edit Module
o 5.4.3. Bi-directional Text Module
* 5.6. Table Modules
o 5.6.2. Tables Module
* 5.7. Image Module
* 5.18. Style Attribute Module
Modules that we don't support but coul support are:
* 5.6. Table Modules
o 5.6.1. Basic Tables Module [?]
* 5.8. Client-side Image Map Module [?]
* 5.9. Server-side Image Map Module [?]
* 5.12. Target Module [?]
* 5.21. Name Identification Module [deprecated]
* 5.22. Legacy Module [deprecated]
These modules will not be implemented due to their dangerousness or
inapplicability as an XHTML fragment:
* 5.2. Core Modules
o 5.2.1. Structure Module
* 5.3. Applet Module
* 5.5. Forms Modules
o 5.5.1. Basic Forms Module
o 5.5.2. Forms Module
* 5.10. Object Module
* 5.11. Frames Module
* 5.13. Iframe Module
* 5.14. Intrinsic Events Module
* 5.15. Metainformation Module
* 5.16. Scripting Module
* 5.17. Style Sheet Module
* 5.19. Link Module
* 5.20. Base Module
We will not be using W3C's XML Schemas or DTDs directly due to the lack
of robust tools for handling them (the main problem is that all the
current parsers are usually PHP 5 only and solely-validating, not
correcting).
The abstraction of the HTMLDefinition creation process will also
contribute to a need for a caching system. Cache invalidation would be
difficult, but could be done by comparing the HTML and Attr config
namespaces with a copy that was packaged along with the serialized
HTMLDefinition object.
== General Use-Case ==
The outwards API of HTMLDefinition has been largely preserved, not
only for backwards-compatibility but also by design. Instead,
HTMLDefinition can be retrieved "raw", in which it loads a structure
that closely resembles the modules of XHTML 1.1. This structure is very
dynamic, making it easy to make cascading changes to global content
sets or remove elements in bulk.
However, once HTML Purifier needs the actual definition, it retrieves
a finalized version of HTMLDefinition. The finalized definition involves
processing the modules into a form that it is optimized for multiple
calls. This final version is immutable and, even if editable, would
be extremely hard to change.
So, some code taking advantage of the XHTML modularization may look
like this:
<?php
$config = HTMLPurifier_Config::createDefault();
$def =& $config->getHTMLDefinition(true); // reference to raw
unset($def->modules['Hypertext']); // rm ''a'' link
$purifier = new HTMLPurifier($config);
$purifier->purify($html); // now the definition is finalized
?>
== Inclusions ==
One of the nice features of HTMLDefinition is that piggy-backing off
of global attribute and content sets is extremely easy to do.
=== Attributes ===
HTMLModule->elements[$element]->attr stores attribute information for the
specific attributes of $element. This is quite close to the final
API that HTML Purifier interfaces with, but there's an important
extra feature: attr may also contain a array with a member index zero.
<?php
HTMLModule->elements[$element]->attr[0] = array('AttrSet');
?>
Rather than map the attribute key 0 to an array (which should be
an AttrDef), it defines a number of attribute collections that should
be merged into this elements attribute array.
Furthermore, the value of an attribute key, attribute value pair need
not be a fully fledged AttrDef object. They can also be a string, which
signifies a AttrDef that is looked up from a centralized registry
AttrTypes. This allows more concise attribute definitions that look
more like W3C's declarations, as well as offering a centralized point
for modifying the behavior of one attribute type. And, of course, the
old method of manually instantiating an AttrDef still works.
=== Attribute Collections ===
Attribute collections are stored and processed in the AttrCollections
object, which is responsible for performing the inclusions signified
by the 0 index. These attribute collections, too, are mutable, by
using HTMLModule->attr_collections. You may add new attributes
to a collection or define an entirely new collection for your module's
use. Inclusions can also be cumulative.
Attribute collections allow us to get rid of so called "global attributes"
(which actually aren't so global).
=== Content Models and ChildDef ===
An implementation of the above-mentioned attributes and attribute
collections was applied to the ChildDef system. HTML Purifier uses
a proprietary system called ChildDef for performance and flexibility
reasons, but this does not line up very well with W3C's notion of
regexps for defining the allowed children of an element.
HTMLPurifier->elements[$element]->content_model and
HTMLPurifier->elements[$element]->content_model_type store information
about the final ChildDef that will be stored in
HTMLPurifier->elements[$element]->child (we use a different variable
because the two forms are sufficiently different).
$content_model is an abstract, string representation of the internal
state of ChildDef, while $content_model_type is a string identifier
of which ChildDef subclass to instantiate. $content_model is processed
by substituting all content set identifiers (capitalized element names)
with their contents. It is then parsed and passed into the appropriate
ChildDef class, as defined by the ContentSets->getChildDef() or the
custom fallback HTMLModule->getChildDef() for custom child definitions
not in the core.
You'll need to use these facilities if you plan on referencing a content
set like "Inline" or "Block", and using them is recommended even if you're
not due to their conciseness.
A few notes on $content_model: it's structure can be as complicated
as you want, but the pipe symbol (|) is reserved for defining possible
choices, due to the content sets implementation. For example, a content
model that looks like:
"Inline -> Block -> a"
...when the Inline content set is defined as "span | b" and the Block
content set is defined as "div | blockquote", will expand into:
"span | b -> div | blockquote -> a"
The custom HTMLModule->getChildDef() function will need to be able to
then feed this information to ChildDef in a usable manner.
=== Content Sets ===
Content sets can be altered using HTMLModule->content_sets, an associative
array of content set names to content set contents. If the content set
already exists, your values are appended on to it (great for, say,
registering the font tag as an inline element), otherwise it is
created. They are substituted into content_model.

View File

@ -42,3 +42,27 @@ blockquote .label {font-weight:bold; font-size:1em; margin:0 0 .1em;
/* Contains, without exception, $Id$, for SVN version info. */
#version {text-align:right; font-style:italic; margin:2em 0;}
#toc ol ol {list-style-type:lower-roman;}
#toc ol {list-style-type:decimal;}
#toc {list-style-type:upper-alpha;}
q {
behavior: url(fixquotes.htc); /* IE fix */
quotes: '\201C' '\201D' '\2018' '\2019';
}
q:before {
content: open-quote;
}
q:after {
content: close-quote;
}
/* Marks off implementation details interesting only to the person writing
the class described in the spec. */
.technical {margin-left:2em; }
.technical:before {content:"Technical note: "; font-weight:bold; color:#061; }
/* Marks off sections that are lacking. */
.fixme {margin-left:2em; }
.fixme:before {content:"Fix me: "; font-weight:bold; color:#C00; }

View File

@ -0,0 +1,100 @@
<?php
require_once 'HTMLPurifier/AttrTypes.php';
require_once 'HTMLPurifier/AttrDef/Lang.php';
/**
* Defines common attribute collections that modules reference
*/
class HTMLPurifier_AttrCollections
{
/**
* Associative array of attribute collections, indexed by name
* @note Technically, the composition of these is more complicated,
* but we bypass it using our own excludes property
*/
var $info = array();
/**
* Performs all expansions on internal data for use by other inclusions
* It also collects all attribute collection extensions from
* modules
* @param $attr_types HTMLPurifier_AttrTypes instance
* @param $modules Hash array of HTMLPurifier_HTMLModule members
*/
function HTMLPurifier_AttrCollections($attr_types, $modules) {
$info =& $this->info;
// load extensions from the modules
foreach ($modules as $module) {
foreach ($module->attr_collections as $coll_i => $coll) {
foreach ($coll as $attr_i => $attr) {
if ($attr_i === 0 && isset($info[$coll_i][$attr_i])) {
// merge in includes
$info[$coll_i][$attr_i] = array_merge(
$info[$coll_i][$attr_i], $attr);
continue;
}
$info[$coll_i][$attr_i] = $attr;
}
}
}
// perform internal expansions and inclusions
foreach ($info as $name => $attr) {
// merge attribute collections that include others
$this->performInclusions($info[$name]);
// replace string identifiers with actual attribute objects
$this->expandIdentifiers($info[$name], $attr_types);
}
}
/**
* Takes a reference to an attribute associative array and performs
* all inclusions specified by the zero index.
* @param &$attr Reference to attribute array
*/
function performInclusions(&$attr) {
if (!isset($attr[0])) return;
$merge = $attr[0];
// loop through all the inclusions
for ($i = 0; isset($merge[$i]); $i++) {
// foreach attribute of the inclusion, copy it over
foreach ($this->info[$merge[$i]] as $key => $value) {
if (isset($attr[$key])) continue; // also catches more inclusions
$attr[$key] = $value;
}
if (isset($info[$merge[$i]][0])) {
// recursion
$merge = array_merge($merge, isset($info[$merge[$i]][0]));
}
}
unset($attr[0]);
}
/**
* Expands all string identifiers in an attribute array by replacing
* them with the appropriate values inside HTMLPurifier_AttrTypes
* @param &$attr Reference to attribute array
* @param $attr_types HTMLPurifier_AttrTypes instance
*/
function expandIdentifiers(&$attr, $attr_types) {
foreach ($attr as $def_i => $def) {
if ($def_i === 0) continue;
if (!is_string($def)) continue;
if ($def === false) {
unset($attr[$def_i]);
continue;
}
if (isset($attr_types->info[$def])) {
$attr[$def_i] = $attr_types->info[$def];
} else {
trigger_error('Attempted to reference undefined attribute type', E_USER_ERROR);
unset($attr[$def_i]);
}
}
}
}
?>

View File

@ -7,7 +7,7 @@ require_once 'HTMLPurifier/CSSDefinition.php';
* Validates shorthand CSS property background.
* @warning Does not support url tokens that have internal spaces.
*/
class HTMLPurifier_AttrDef_Background extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_Background extends HTMLPurifier_AttrDef
{
/**
@ -16,7 +16,7 @@ class HTMLPurifier_AttrDef_Background extends HTMLPurifier_AttrDef
*/
var $info;
function HTMLPurifier_AttrDef_Background($config) {
function HTMLPurifier_AttrDef_CSS_Background($config) {
$def = $config->getCSSDefinition();
$this->info['background-color'] = $def->info['background-color'];
$this->info['background-image'] = $def->info['background-image'];

View File

@ -1,8 +1,8 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/AttrDef/CSSLength.php';
require_once 'HTMLPurifier/AttrDef/Percentage.php';
require_once 'HTMLPurifier/AttrDef/CSS/Length.php';
require_once 'HTMLPurifier/AttrDef/CSS/Percentage.php';
/* W3C says:
[ // adjective and number must be in correct order, even if
@ -45,15 +45,15 @@ require_once 'HTMLPurifier/AttrDef/Percentage.php';
/**
* Validates the value of background-position.
*/
class HTMLPurifier_AttrDef_BackgroundPosition extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_BackgroundPosition extends HTMLPurifier_AttrDef
{
var $length;
var $percentage;
function HTMLPurifier_AttrDef_BackgroundPosition() {
$this->length = new HTMLPurifier_AttrDef_CSSLength();
$this->percentage = new HTMLPurifier_AttrDef_Percentage();
function HTMLPurifier_AttrDef_CSS_BackgroundPosition() {
$this->length = new HTMLPurifier_AttrDef_CSS_Length();
$this->percentage = new HTMLPurifier_AttrDef_CSS_Percentage();
}
function validate($string, $config, &$context) {

View File

@ -5,7 +5,7 @@ require_once 'HTMLPurifier/AttrDef.php';
/**
* Validates the border property as defined by CSS.
*/
class HTMLPurifier_AttrDef_Border extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_Border extends HTMLPurifier_AttrDef
{
/**
@ -13,7 +13,7 @@ class HTMLPurifier_AttrDef_Border extends HTMLPurifier_AttrDef
*/
var $info = array();
function HTMLPurifier_AttrDef_Border($config) {
function HTMLPurifier_AttrDef_CSS_Border($config) {
$def = $config->getCSSDefinition();
$this->info['border-width'] = $def->info['border-width'];
$this->info['border-style'] = $def->info['border-style'];

View File

@ -5,7 +5,7 @@ require_once 'HTMLPurifier/AttrDef.php';
/**
* Validates Color as defined by CSS.
*/
class HTMLPurifier_AttrDef_Color extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_Color extends HTMLPurifier_AttrDef
{
/**

View File

@ -9,7 +9,7 @@
* especially useful for CSS values, which often are a choice between
* an enumerated set of predefined values or a flexible data type.
*/
class HTMLPurifier_AttrDef_Composite extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_Composite extends HTMLPurifier_AttrDef
{
/**
@ -21,7 +21,7 @@ class HTMLPurifier_AttrDef_Composite extends HTMLPurifier_AttrDef
/**
* @param $defs List of HTMLPurifier_AttrDef objects
*/
function HTMLPurifier_AttrDef_Composite($defs) {
function HTMLPurifier_AttrDef_CSS_Composite($defs) {
$this->defs = $defs;
}

View File

@ -5,7 +5,7 @@ require_once 'HTMLPurifier/AttrDef.php';
/**
* Validates shorthand CSS property font.
*/
class HTMLPurifier_AttrDef_Font extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_Font extends HTMLPurifier_AttrDef
{
/**
@ -30,7 +30,7 @@ class HTMLPurifier_AttrDef_Font extends HTMLPurifier_AttrDef
'status-bar' => true
);
function HTMLPurifier_AttrDef_Font($config) {
function HTMLPurifier_AttrDef_CSS_Font($config) {
$def = $config->getCSSDefinition();
$this->info['font-style'] = $def->info['font-style'];
$this->info['font-variant'] = $def->info['font-variant'];

View File

@ -7,7 +7,7 @@ require_once 'HTMLPurifier/AttrDef.php';
/**
* Validates a font family list according to CSS spec
*/
class HTMLPurifier_AttrDef_FontFamily extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
{
/**

View File

@ -1,13 +1,12 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/AttrDef/Number.php';
require_once 'HTMLPurifier/AttrDef/CSS/Number.php';
/**
* Represents a Length as defined by CSS.
* @warning Be sure not to confuse this with HTMLPurifier_AttrDef_Length!
*/
class HTMLPurifier_AttrDef_CSSLength extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_Length extends HTMLPurifier_AttrDef
{
/**
@ -26,8 +25,8 @@ class HTMLPurifier_AttrDef_CSSLength extends HTMLPurifier_AttrDef
* @param $non_negative Bool indication whether or not negative values are
* allowed.
*/
function HTMLPurifier_AttrDef_CSSLength($non_negative = false) {
$this->number_def = new HTMLPurifier_AttrDef_Number($non_negative);
function HTMLPurifier_AttrDef_CSS_Length($non_negative = false) {
$this->number_def = new HTMLPurifier_AttrDef_CSS_Number($non_negative);
}
function validate($length, $config, &$context) {

View File

@ -6,16 +6,16 @@ require_once 'HTMLPurifier/AttrDef.php';
* Validates shorthand CSS property list-style.
* @warning Does not support url tokens that have internal spaces.
*/
class HTMLPurifier_AttrDef_ListStyle extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_ListStyle extends HTMLPurifier_AttrDef
{
/**
* Local copy of component validators.
* @note See HTMLPurifier_AttrDef_Font::$info for a similar impl.
* @note See HTMLPurifier_AttrDef_CSS_Font::$info for a similar impl.
*/
var $info;
function HTMLPurifier_AttrDef_ListStyle($config) {
function HTMLPurifier_AttrDef_CSS_ListStyle($config) {
$def = $config->getCSSDefinition();
$this->info['list-style-type'] = $def->info['list-style-type'];
$this->info['list-style-position'] = $def->info['list-style-position'];

View File

@ -13,7 +13,7 @@ require_once 'HTMLPurifier/AttrDef.php';
* can only be used alone: it will never manifest as part of a multi
* shorthand declaration. Thus, this class does not allow inherit.
*/
class HTMLPurifier_AttrDef_Multiple extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_Multiple extends HTMLPurifier_AttrDef
{
/**
@ -30,7 +30,7 @@ class HTMLPurifier_AttrDef_Multiple extends HTMLPurifier_AttrDef
* @param $single HTMLPurifier_AttrDef to multiply
* @param $max Max number of values allowed (usually four)
*/
function HTMLPurifier_AttrDef_Multiple($single, $max = 4) {
function HTMLPurifier_AttrDef_CSS_Multiple($single, $max = 4) {
$this->single = $single;
$this->max = $max;
}

View File

@ -3,7 +3,7 @@
/**
* Validates a number as defined by the CSS spec.
*/
class HTMLPurifier_AttrDef_Number extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_Number extends HTMLPurifier_AttrDef
{
/**
@ -14,7 +14,7 @@ class HTMLPurifier_AttrDef_Number extends HTMLPurifier_AttrDef
/**
* @param $non_negative Bool indicating whether negatives are forbidden
*/
function HTMLPurifier_AttrDef_Number($non_negative = false) {
function HTMLPurifier_AttrDef_CSS_Number($non_negative = false) {
$this->non_negative = $non_negative;
}

View File

@ -1,24 +1,24 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/AttrDef/Number.php';
require_once 'HTMLPurifier/AttrDef/CSS/Number.php';
/**
* Validates a Percentage as defined by the CSS spec.
*/
class HTMLPurifier_AttrDef_Percentage extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_Percentage extends HTMLPurifier_AttrDef
{
/**
* Instance of HTMLPurifier_AttrDef_Number to defer number validation
* Instance of HTMLPurifier_AttrDef_CSS_Number to defer number validation
*/
var $number_def;
/**
* @param Bool indicating whether to forbid negative values
*/
function HTMLPurifier_AttrDef_Percentage($non_negative = false) {
$this->number_def = new HTMLPurifier_AttrDef_Number($non_negative);
function HTMLPurifier_AttrDef_CSS_Percentage($non_negative = false) {
$this->number_def = new HTMLPurifier_AttrDef_CSS_Number($non_negative);
}
function validate($string, $config, &$context) {

View File

@ -7,7 +7,7 @@ require_once 'HTMLPurifier/AttrDef.php';
* @note This class could be generalized into a version that acts sort of
* like Enum except you can compound the allowed values.
*/
class HTMLPurifier_AttrDef_TextDecoration extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_TextDecoration extends HTMLPurifier_AttrDef
{
/**

View File

@ -4,17 +4,17 @@ require_once 'HTMLPurifier/AttrDef/URI.php';
/**
* Validates a URI in CSS syntax, which uses url('http://example.com')
* @note While theoretically speaking we a URI in a CSS document could
* @note While theoretically speaking a URI in a CSS document could
* be non-embedded, as of CSS2 there is no such usage so we're
* generalizing it. This may need to be changed in the future.
* @warning Since HTMLPurifier_AttrDef_CSS blindly uses semicolons as
* the separator, you cannot put a literal semicolon in
* in the URI. Try percent encoding it, in that case.
*/
class HTMLPurifier_AttrDef_CSSURI extends HTMLPurifier_AttrDef_URI
class HTMLPurifier_AttrDef_CSS_URI extends HTMLPurifier_AttrDef_URI
{
function HTMLPurifier_AttrDef_CSSURI() {
function HTMLPurifier_AttrDef_CSS_URI() {
$this->HTMLPurifier_AttrDef_URI(true); // always embedded
}

View File

@ -25,8 +25,8 @@ class HTMLPurifier_AttrDef_Enum extends HTMLPurifier_AttrDef
* @param $case_sensitive Bool indicating whether or not case sensitive
*/
function HTMLPurifier_AttrDef_Enum(
$valid_values = array(), $case_sensitive = false) {
$valid_values = array(), $case_sensitive = false
) {
$this->valid_values = array_flip($valid_values);
$this->case_sensitive = $case_sensitive;
}

View File

@ -3,6 +3,22 @@
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/IDAccumulator.php';
HTMLPurifier_ConfigSchema::define(
'Attr', 'EnableID', false, 'bool',
'Allows the ID attribute in HTML. This is disabled by default '.
'due to the fact that without proper configuration user input can '.
'easily break the validation of a webpage by specifying an ID that is '.
'already on the surrounding HTML. If you don\'t mind throwing caution to '.
'the wind, enable this directive, but I strongly recommend you also '.
'consider blacklisting IDs you use (%Attr.IDBlacklist) or prefixing all '.
'user supplied IDs (%Attr.IDPrefix). This directive has been available '.
'since 1.2.0, and when set to true reverts to the behavior of pre-1.2.0 '.
'versions.'
);
HTMLPurifier_ConfigSchema::defineAlias(
'HTML', 'EnableAttrID', 'Attr', 'EnableID'
);
HTMLPurifier_ConfigSchema::define(
'Attr', 'IDPrefix', '', 'string',
'String to prefix to IDs. If you have no idea what IDs your pages '.
@ -36,11 +52,16 @@ HTMLPurifier_ConfigSchema::define(
* blacklist. If you're hacking around, make sure you use load()!
*/
class HTMLPurifier_AttrDef_ID extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef
{
// ref functionality disabled, since we also have to verify
// whether or not the ID it refers to exists
function validate($id, $config, &$context) {
if (!$config->get('Attr', 'EnableID')) return false;
$id = trim($id); // trim it first
if ($id === '') return false;
@ -55,8 +76,10 @@ class HTMLPurifier_AttrDef_ID extends HTMLPurifier_AttrDef
'%Attr.IDPrefix is set', E_USER_WARNING);
}
$id_accumulator =& $context->get('IDAccumulator');
if (isset($id_accumulator->ids[$id])) return false;
//if (!$this->ref) {
$id_accumulator =& $context->get('IDAccumulator');
if (isset($id_accumulator->ids[$id])) return false;
//}
// we purposely avoid using regex, hopefully this is faster
@ -71,7 +94,7 @@ class HTMLPurifier_AttrDef_ID extends HTMLPurifier_AttrDef
$result = ($trim === '');
}
if ($result) $id_accumulator->add($id);
if (/*!$this->ref && */$result) $id_accumulator->add($id);
// if no change was made to the ID, return the result
// else, return the new id if stripping whitespace made it

View File

@ -1,18 +1,16 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/AttrDef/Pixels.php';
require_once 'HTMLPurifier/AttrDef/HTML/Pixels.php';
/**
* Validates the HTML type length (not to be confused with CSS's length).
*
* This accepts integer pixels or percentages as lengths for certain
* HTML attributes. Don't use this for CSS: that's
* HTMLPurifier_AttrDef_CSSLength which requires prefixes and allows a lot
* more different types.
* HTML attributes.
*/
class HTMLPurifier_AttrDef_Length extends HTMLPurifier_AttrDef_Pixels
class HTMLPurifier_AttrDef_HTML_Length extends HTMLPurifier_AttrDef_HTML_Pixels
{
function validate($string, $config, &$context) {

View File

@ -1,7 +1,7 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/AttrDef/Length.php';
require_once 'HTMLPurifier/AttrDef/HTML/Length.php';
/**
* Validates a MultiLength as defined by the HTML spec.
@ -9,7 +9,7 @@ require_once 'HTMLPurifier/AttrDef/Length.php';
* A multilength is either a integer (pixel count), a percentage, or
* a relative number.
*/
class HTMLPurifier_AttrDef_MultiLength extends HTMLPurifier_AttrDef_Length
class HTMLPurifier_AttrDef_HTML_MultiLength extends HTMLPurifier_AttrDef_HTML_Length
{
function validate($string, $config, &$context) {
@ -27,12 +27,14 @@ class HTMLPurifier_AttrDef_MultiLength extends HTMLPurifier_AttrDef_Length
$int = substr($string, 0, $length - 1);
if ($int == '') return '*';
if (!is_numeric($int)) return false;
$int = (int) $int;
if ($int < 0) return '0*';
if ($int < 0) return false;
if ($int == 0) return '0';
if ($int == 1) return '*';
return ((string) $int) . '*';
}

View File

@ -4,9 +4,13 @@ require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/Config.php';
/**
* Validates the contents of the global HTML attribute class.
* Validates contents based on NMTOKENS attribute type.
* @note The only current use for this is the class attribute in HTML
* @note Could have some functionality factored out into Nmtoken class
* @warning We cannot assume this class will be used only for 'class'
* attributes. Not sure how to hook in magic behavior, then.
*/
class HTMLPurifier_AttrDef_Class extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_HTML_Nmtokens extends HTMLPurifier_AttrDef
{
function validate($string, $config, &$context) {
@ -31,10 +35,10 @@ class HTMLPurifier_AttrDef_Class extends HTMLPurifier_AttrDef
if (empty($matches[1])) return false;
// reconstruct class string
// reconstruct string
$new_string = '';
foreach ($matches[1] as $class_names) {
$new_string .= $class_names . ' ';
foreach ($matches[1] as $token) {
$new_string .= $token . ' ';
}
$new_string = rtrim($new_string);

View File

@ -5,7 +5,7 @@ require_once 'HTMLPurifier/AttrDef.php';
/**
* Validates an integer representation of pixels according to the HTML spec.
*/
class HTMLPurifier_AttrDef_Pixels extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_HTML_Pixels extends HTMLPurifier_AttrDef
{
function validate($string, $config, &$context) {

View File

@ -46,7 +46,7 @@ class HTMLPurifier_AttrDef_Lang extends HTMLPurifier_AttrDef
// process second subtag : $subtags[1]
$length = strlen($subtags[1]);
if ($length == 0 || $length == 1 || $length > 8 || !ctype_alnum($subtags[1])) {
if ($length == 0 || ($length == 1 && $subtags[1] != 'x') || $length > 8 || !ctype_alnum($subtags[1])) {
return $new_string;
}
if (!ctype_lower($subtags[1])) $subtags[1] = strtolower($subtags[1]);

View File

@ -3,7 +3,7 @@
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/URIScheme.php';
require_once 'HTMLPurifier/URISchemeRegistry.php';
require_once 'HTMLPurifier/AttrDef/Host.php';
require_once 'HTMLPurifier/AttrDef/URI/Host.php';
require_once 'HTMLPurifier/PercentEncoder.php';
HTMLPurifier_ConfigSchema::define(
@ -77,6 +77,14 @@ HTMLPurifier_ConfigSchema::define(
'This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'URI', 'Disable', false, 'bool',
'Disables all URIs in all forms. Not sure why you\'d want to do that '.
'(after all, the Internet\'s founded on the notion of a hyperlink). '.
'This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::defineAlias('Attr', 'DisableURI', 'URI', 'Disable');
/**
* Validates a URI as defined by RFC 3986.
* @note Scheme-specific mechanics deferred to HTMLPurifier_URIScheme
@ -92,7 +100,7 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
* @param $embeds_resource_resource Does the URI here result in an extra HTTP request?
*/
function HTMLPurifier_AttrDef_URI($embeds_resource = false) {
$this->host = new HTMLPurifier_AttrDef_Host();
$this->host = new HTMLPurifier_AttrDef_URI_Host();
$this->PercentEncoder = new HTMLPurifier_PercentEncoder();
$this->embeds_resource = (bool) $embeds_resource;
}
@ -102,6 +110,8 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
// We'll write stack-based parsers later, for now, use regexps to
// get things working as fast as possible (irony)
if ($config->get('URI', 'Disable')) return false;
// parse as CDATA
$uri = $this->parseCDATA($uri);

View File

@ -2,7 +2,7 @@
require_once 'HTMLPurifier/AttrDef.php';
class HTMLPurifier_AttrDef_Email extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_URI_Email extends HTMLPurifier_AttrDef
{
/**

View File

@ -1,12 +1,12 @@
<?php
require_once 'HTMLPurifier/AttrDef/Email.php';
require_once 'HTMLPurifier/AttrDef/URI/Email.php';
/**
* Primitive email validation class based on the regexp found at
* http://www.regular-expressions.info/email.html
*/
class HTMLPurifier_AttrDef_Email_SimpleCheck extends HTMLPurifier_AttrDef_Email
class HTMLPurifier_AttrDef_URI_Email_SimpleCheck extends HTMLPurifier_AttrDef_URI_Email
{
function validate($string, $config, &$context) {

View File

@ -1,28 +1,28 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/AttrDef/IPv4.php';
require_once 'HTMLPurifier/AttrDef/IPv6.php';
require_once 'HTMLPurifier/AttrDef/URI/IPv4.php';
require_once 'HTMLPurifier/AttrDef/URI/IPv6.php';
/**
* Validates a host according to the IPv4, IPv6 and DNS (future) specifications.
*/
class HTMLPurifier_AttrDef_Host extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
{
/**
* Instance of HTMLPurifier_AttrDef_IPv4 sub-validator
* Instance of HTMLPurifier_AttrDef_URI_IPv4 sub-validator
*/
var $ipv4;
/**
* Instance of HTMLPurifier_AttrDef_IPv6 sub-validator
* Instance of HTMLPurifier_AttrDef_URI_IPv6 sub-validator
*/
var $ipv6;
function HTMLPurifier_AttrDef_Host() {
$this->ipv4 = new HTMLPurifier_AttrDef_IPv4();
$this->ipv6 = new HTMLPurifier_AttrDef_IPv6();
function HTMLPurifier_AttrDef_URI_Host() {
$this->ipv4 = new HTMLPurifier_AttrDef_URI_IPv4();
$this->ipv6 = new HTMLPurifier_AttrDef_URI_IPv6();
}
function validate($string, $config, &$context) {

View File

@ -6,7 +6,7 @@ require_once 'HTMLPurifier/AttrDef.php';
* Validates an IPv4 address
* @author Feyd @ forums.devnetwork.net (public domain)
*/
class HTMLPurifier_AttrDef_IPv4 extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_URI_IPv4 extends HTMLPurifier_AttrDef
{
/**
@ -15,7 +15,7 @@ class HTMLPurifier_AttrDef_IPv4 extends HTMLPurifier_AttrDef
*/
var $ip4;
function HTMLPurifier_AttrDef_IPv4() {
function HTMLPurifier_AttrDef_URI_IPv4() {
$oct = '(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'; // 0-255
$this->ip4 = "(?:{$oct}\\.{$oct}\\.{$oct}\\.{$oct})";
}

View File

@ -1,6 +1,6 @@
<?php
require_once 'HTMLPurifier/AttrDef/IPv4.php';
require_once 'HTMLPurifier/AttrDef/URI/IPv4.php';
/**
* Validates an IPv6 address.
@ -8,7 +8,7 @@ require_once 'HTMLPurifier/AttrDef/IPv4.php';
* @note This function requires brackets to have been removed from address
* in URI.
*/
class HTMLPurifier_AttrDef_IPv6 extends HTMLPurifier_AttrDef_IPv4
class HTMLPurifier_AttrDef_URI_IPv6 extends HTMLPurifier_AttrDef_URI_IPv4
{
function validate($aIP, $config, &$context) {

View File

@ -0,0 +1,41 @@
<?php
require_once 'HTMLPurifier/AttrDef/HTML/ID.php';
require_once 'HTMLPurifier/AttrDef/HTML/Length.php';
require_once 'HTMLPurifier/AttrDef/HTML/MultiLength.php';
require_once 'HTMLPurifier/AttrDef/HTML/Nmtokens.php';
require_once 'HTMLPurifier/AttrDef/HTML/Pixels.php';
require_once 'HTMLPurifier/AttrDef/Integer.php';
require_once 'HTMLPurifier/AttrDef/Text.php';
require_once 'HTMLPurifier/AttrDef/URI.php';
/**
* Provides lookup array of attribute types to HTMLPurifier_AttrDef objects
*/
class HTMLPurifier_AttrTypes
{
/**
* Lookup array of attribute string identifiers to concrete implementations
* @public
*/
var $info = array();
/**
* Constructs the info array
*/
function HTMLPurifier_AttrTypes() {
$this->info['CDATA'] = new HTMLPurifier_AttrDef_Text();
$this->info['ID'] = new HTMLPurifier_AttrDef_HTML_ID();
$this->info['Length'] = new HTMLPurifier_AttrDef_HTML_Length();
$this->info['MultiLength'] = new HTMLPurifier_AttrDef_HTML_MultiLength();
$this->info['NMTOKENS'] = new HTMLPurifier_AttrDef_HTML_Nmtokens();
$this->info['Pixels'] = new HTMLPurifier_AttrDef_HTML_Pixels();
$this->info['Text'] = new HTMLPurifier_AttrDef_Text();
$this->info['URI'] = new HTMLPurifier_AttrDef_URI();
// number is really a positive integer (one or more digits)
$this->info['Number'] = new HTMLPurifier_AttrDef_Integer(false, false, true);
}
}
?>

View File

@ -1,19 +1,19 @@
<?php
require_once 'HTMLPurifier/AttrDef/CSS/Background.php';
require_once 'HTMLPurifier/AttrDef/CSS/BackgroundPosition.php';
require_once 'HTMLPurifier/AttrDef/CSS/Border.php';
require_once 'HTMLPurifier/AttrDef/CSS/Color.php';
require_once 'HTMLPurifier/AttrDef/CSS/Composite.php';
require_once 'HTMLPurifier/AttrDef/CSS/Font.php';
require_once 'HTMLPurifier/AttrDef/CSS/FontFamily.php';
require_once 'HTMLPurifier/AttrDef/CSS/Length.php';
require_once 'HTMLPurifier/AttrDef/CSS/ListStyle.php';
require_once 'HTMLPurifier/AttrDef/CSS/Multiple.php';
require_once 'HTMLPurifier/AttrDef/CSS/Percentage.php';
require_once 'HTMLPurifier/AttrDef/CSS/TextDecoration.php';
require_once 'HTMLPurifier/AttrDef/CSS/URI.php';
require_once 'HTMLPurifier/AttrDef/Enum.php';
require_once 'HTMLPurifier/AttrDef/Color.php';
require_once 'HTMLPurifier/AttrDef/Composite.php';
require_once 'HTMLPurifier/AttrDef/CSSLength.php';
require_once 'HTMLPurifier/AttrDef/Percentage.php';
require_once 'HTMLPurifier/AttrDef/Multiple.php';
require_once 'HTMLPurifier/AttrDef/TextDecoration.php';
require_once 'HTMLPurifier/AttrDef/FontFamily.php';
require_once 'HTMLPurifier/AttrDef/Font.php';
require_once 'HTMLPurifier/AttrDef/Border.php';
require_once 'HTMLPurifier/AttrDef/ListStyle.php';
require_once 'HTMLPurifier/AttrDef/CSSURI.php';
require_once 'HTMLPurifier/AttrDef/BackgroundPosition.php';
require_once 'HTMLPurifier/AttrDef/Background.php';
/**
* Defines allowed CSS attributes and what their values are.
@ -43,7 +43,7 @@ class HTMLPurifier_CSSDefinition
array('none', 'hidden', 'dotted', 'dashed', 'solid', 'double',
'groove', 'ridge', 'inset', 'outset'), false);
$this->info['border-style'] = new HTMLPurifier_AttrDef_Multiple($border_style);
$this->info['border-style'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_style);
$this->info['clear'] = new HTMLPurifier_AttrDef_Enum(
array('none', 'left', 'right', 'both'), false);
@ -54,10 +54,10 @@ class HTMLPurifier_CSSDefinition
$this->info['font-variant'] = new HTMLPurifier_AttrDef_Enum(
array('normal', 'small-caps'), false);
$uri_or_none = new HTMLPurifier_AttrDef_Composite(
$uri_or_none = new HTMLPurifier_AttrDef_CSS_Composite(
array(
new HTMLPurifier_AttrDef_Enum(array('none')),
new HTMLPurifier_AttrDef_CSSURI()
new HTMLPurifier_AttrDef_CSS_URI()
)
);
@ -68,11 +68,11 @@ class HTMLPurifier_CSSDefinition
'upper-roman', 'lower-alpha', 'upper-alpha', 'none'), false);
$this->info['list-style-image'] = $uri_or_none;
$this->info['list-style'] = new HTMLPurifier_AttrDef_ListStyle($config);
$this->info['list-style'] = new HTMLPurifier_AttrDef_CSS_ListStyle($config);
$this->info['text-transform'] = new HTMLPurifier_AttrDef_Enum(
array('capitalize', 'uppercase', 'lowercase', 'none'), false);
$this->info['color'] = new HTMLPurifier_AttrDef_Color();
$this->info['color'] = new HTMLPurifier_AttrDef_CSS_Color();
$this->info['background-image'] = $uri_or_none;
$this->info['background-repeat'] = new HTMLPurifier_AttrDef_Enum(
@ -81,96 +81,96 @@ class HTMLPurifier_CSSDefinition
$this->info['background-attachment'] = new HTMLPurifier_AttrDef_Enum(
array('scroll', 'fixed')
);
$this->info['background-position'] = new HTMLPurifier_AttrDef_BackgroundPosition();
$this->info['background-position'] = new HTMLPurifier_AttrDef_CSS_BackgroundPosition();
$border_color =
$this->info['border-top-color'] =
$this->info['border-bottom-color'] =
$this->info['border-left-color'] =
$this->info['border-right-color'] =
$this->info['background-color'] = new HTMLPurifier_AttrDef_Composite(array(
$this->info['background-color'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
new HTMLPurifier_AttrDef_Enum(array('transparent')),
new HTMLPurifier_AttrDef_Color()
new HTMLPurifier_AttrDef_CSS_Color()
));
$this->info['background'] = new HTMLPurifier_AttrDef_Background($config);
$this->info['background'] = new HTMLPurifier_AttrDef_CSS_Background($config);
$this->info['border-color'] = new HTMLPurifier_AttrDef_Multiple($border_color);
$this->info['border-color'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_color);
$border_width =
$this->info['border-top-width'] =
$this->info['border-bottom-width'] =
$this->info['border-left-width'] =
$this->info['border-right-width'] = new HTMLPurifier_AttrDef_Composite(array(
$this->info['border-right-width'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
new HTMLPurifier_AttrDef_Enum(array('thin', 'medium', 'thick')),
new HTMLPurifier_AttrDef_CSSLength(true) //disallow negative
new HTMLPurifier_AttrDef_CSS_Length(true) //disallow negative
));
$this->info['border-width'] = new HTMLPurifier_AttrDef_Multiple($border_width);
$this->info['border-width'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_width);
$this->info['letter-spacing'] = new HTMLPurifier_AttrDef_Composite(array(
$this->info['letter-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
new HTMLPurifier_AttrDef_Enum(array('normal')),
new HTMLPurifier_AttrDef_CSSLength()
new HTMLPurifier_AttrDef_CSS_Length()
));
$this->info['word-spacing'] = new HTMLPurifier_AttrDef_Composite(array(
$this->info['word-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
new HTMLPurifier_AttrDef_Enum(array('normal')),
new HTMLPurifier_AttrDef_CSSLength()
new HTMLPurifier_AttrDef_CSS_Length()
));
$this->info['font-size'] = new HTMLPurifier_AttrDef_Composite(array(
$this->info['font-size'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
new HTMLPurifier_AttrDef_Enum(array('xx-small', 'x-small',
'small', 'medium', 'large', 'x-large', 'xx-large',
'larger', 'smaller')),
new HTMLPurifier_AttrDef_Percentage(),
new HTMLPurifier_AttrDef_CSSLength()
new HTMLPurifier_AttrDef_CSS_Percentage(),
new HTMLPurifier_AttrDef_CSS_Length()
));
$this->info['line-height'] = new HTMLPurifier_AttrDef_Composite(array(
$this->info['line-height'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
new HTMLPurifier_AttrDef_Enum(array('normal')),
new HTMLPurifier_AttrDef_Number(true), // no negatives
new HTMLPurifier_AttrDef_CSSLength(true),
new HTMLPurifier_AttrDef_Percentage(true)
new HTMLPurifier_AttrDef_CSS_Number(true), // no negatives
new HTMLPurifier_AttrDef_CSS_Length(true),
new HTMLPurifier_AttrDef_CSS_Percentage(true)
));
$margin =
$this->info['margin-top'] =
$this->info['margin-bottom'] =
$this->info['margin-left'] =
$this->info['margin-right'] = new HTMLPurifier_AttrDef_Composite(array(
new HTMLPurifier_AttrDef_CSSLength(),
new HTMLPurifier_AttrDef_Percentage(),
$this->info['margin-right'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
new HTMLPurifier_AttrDef_CSS_Length(),
new HTMLPurifier_AttrDef_CSS_Percentage(),
new HTMLPurifier_AttrDef_Enum(array('auto'))
));
$this->info['margin'] = new HTMLPurifier_AttrDef_Multiple($margin);
$this->info['margin'] = new HTMLPurifier_AttrDef_CSS_Multiple($margin);
// non-negative
$padding =
$this->info['padding-top'] =
$this->info['padding-bottom'] =
$this->info['padding-left'] =
$this->info['padding-right'] = new HTMLPurifier_AttrDef_Composite(array(
new HTMLPurifier_AttrDef_CSSLength(true),
new HTMLPurifier_AttrDef_Percentage(true)
$this->info['padding-right'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
new HTMLPurifier_AttrDef_CSS_Length(true),
new HTMLPurifier_AttrDef_CSS_Percentage(true)
));
$this->info['padding'] = new HTMLPurifier_AttrDef_Multiple($padding);
$this->info['padding'] = new HTMLPurifier_AttrDef_CSS_Multiple($padding);
$this->info['text-indent'] = new HTMLPurifier_AttrDef_Composite(array(
new HTMLPurifier_AttrDef_CSSLength(),
new HTMLPurifier_AttrDef_Percentage()
$this->info['text-indent'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
new HTMLPurifier_AttrDef_CSS_Length(),
new HTMLPurifier_AttrDef_CSS_Percentage()
));
$this->info['width'] = new HTMLPurifier_AttrDef_Composite(array(
new HTMLPurifier_AttrDef_CSSLength(true),
new HTMLPurifier_AttrDef_Percentage(true),
$this->info['width'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
new HTMLPurifier_AttrDef_CSS_Length(true),
new HTMLPurifier_AttrDef_CSS_Percentage(true),
new HTMLPurifier_AttrDef_Enum(array('auto'))
));
$this->info['text-decoration'] = new HTMLPurifier_AttrDef_TextDecoration();
$this->info['text-decoration'] = new HTMLPurifier_AttrDef_CSS_TextDecoration();
$this->info['font-family'] = new HTMLPurifier_AttrDef_FontFamily();
$this->info['font-family'] = new HTMLPurifier_AttrDef_CSS_FontFamily();
// this could use specialized code
$this->info['font-weight'] = new HTMLPurifier_AttrDef_Enum(
@ -179,14 +179,14 @@ class HTMLPurifier_CSSDefinition
// MUST be called after other font properties, as it references
// a CSSDefinition object
$this->info['font'] = new HTMLPurifier_AttrDef_Font($config);
$this->info['font'] = new HTMLPurifier_AttrDef_CSS_Font($config);
// same here
$this->info['border'] =
$this->info['border-bottom'] =
$this->info['border-top'] =
$this->info['border-left'] =
$this->info['border-right'] = new HTMLPurifier_AttrDef_Border($config);
$this->info['border-right'] = new HTMLPurifier_AttrDef_CSS_Border($config);
$this->info['border-collapse'] = new HTMLPurifier_AttrDef_Enum(array(
'collapse', 'seperate'));
@ -197,11 +197,11 @@ class HTMLPurifier_CSSDefinition
$this->info['table-layout'] = new HTMLPurifier_AttrDef_Enum(array(
'auto', 'fixed'));
$this->info['vertical-align'] = new HTMLPurifier_AttrDef_Composite(array(
$this->info['vertical-align'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
new HTMLPurifier_AttrDef_Enum(array('baseline', 'sub', 'super',
'top', 'text-top', 'middle', 'bottom', 'text-bottom')),
new HTMLPurifier_AttrDef_CSSLength(),
new HTMLPurifier_AttrDef_Percentage()
new HTMLPurifier_AttrDef_CSS_Length(),
new HTMLPurifier_AttrDef_CSS_Percentage()
));
}

View File

@ -38,22 +38,13 @@ class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef
}
function validateChildren($tokens_of_children, $config, &$context) {
$parent_type = $context->get('ParentType');
switch ($parent_type) {
case 'unknown':
case 'inline':
$result = $this->inline->validateChildren(
$tokens_of_children, $config, $context);
break;
case 'block':
$result = $this->block->validateChildren(
$tokens_of_children, $config, $context);
break;
default:
trigger_error('Invalid context', E_USER_ERROR);
return false;
if ($context->get('IsInline') === false) {
return $this->block->validateChildren(
$tokens_of_children, $config, $context);
} else {
return $this->inline->validateChildren(
$tokens_of_children, $config, $context);
}
return $result;
}
}

View File

@ -20,10 +20,13 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
$elements = str_replace(' ', '', $elements);
$elements = explode('|', $elements);
}
$elements = array_flip($elements);
foreach ($elements as $i => $x) {
$elements[$i] = true;
if (empty($i)) unset($elements[$i]);
$keys = array_keys($elements);
if ($keys == array_keys($keys)) {
$elements = array_flip($elements);
foreach ($elements as $i => $x) {
$elements[$i] = true;
if (empty($i)) unset($elements[$i]);
}
}
$this->elements = $elements;
$this->gen = new HTMLPurifier_Generator();

View File

@ -4,27 +4,31 @@ require_once 'HTMLPurifier/ChildDef/Required.php';
/**
* Takes the contents of blockquote when in strict and reformats for validation.
*
* From XHTML 1.0 Transitional to Strict, there is a notable change where
*/
class HTMLPurifier_ChildDef_StrictBlockquote
extends HTMLPurifier_ChildDef_Required
{
var $real_elements;
var $fake_elements;
var $allow_empty = true;
var $type = 'strictblockquote';
var $init = false;
function HTMLPurifier_ChildDef_StrictBlockquote() {}
function validateChildren($tokens_of_children, $config, &$context) {
$def = $config->getHTMLDefinition();
if (!$this->init) {
// allow all inline elements
$this->elements = $def->info_flow_elements;
$this->elements['#PCDATA'] = true;
$this->real_elements = $this->elements;
$this->fake_elements = $def->info_content_sets['Flow'];
$this->fake_elements['#PCDATA'] = true;
$this->init = true;
}
// trick the parent class into thinking it allows more
$this->elements = $this->fake_elements;
$result = parent::validateChildren($tokens_of_children, $config, $context);
$this->elements = $this->real_elements;
if ($result === false) return array();
if ($result === true) $result = $tokens_of_children;
@ -40,8 +44,10 @@ extends HTMLPurifier_ChildDef_Required
// ifs are nested for readability
if (!$is_inline) {
if (!$depth) {
if (($token->type == 'text') ||
($def->info[$token->name]->type == 'inline')) {
if (
$token->type == 'text' ||
!isset($this->elements[$token->name])
) {
$is_inline = true;
$ret[] = $block_wrap_start;
}
@ -50,7 +56,7 @@ extends HTMLPurifier_ChildDef_Required
if (!$depth) {
// starting tokens have been inline text / empty
if ($token->type == 'start' || $token->type == 'empty') {
if ($def->info[$token->name]->type == 'block') {
if (isset($this->elements[$token->name])) {
// ended
$ret[] = $block_wrap_end;
$is_inline = false;

View File

@ -149,23 +149,36 @@ class HTMLPurifier_Config
return;
}
$this->conf[$namespace][$key] = $value;
if ($namespace == 'HTML' || $namespace == 'Attr') {
// reset HTML definition if relevant attributes changed
$this->html_definition = null;
}
if ($namespace == 'CSS') {
$this->css_definition = null;
}
}
/**
* Retrieves a copy of the HTML definition.
* Retrieves reference to the HTML definition.
* @param $raw Return a copy that has not been setup yet. Must be
* called before it's been setup, otherwise won't work.
*/
function getHTMLDefinition() {
if ($this->html_definition === null) {
$this->html_definition = new HTMLPurifier_HTMLDefinition();
$this->html_definition->setup($this);
function &getHTMLDefinition($raw = false) {
if (
empty($this->html_definition) || // hasn't ever been setup
($raw && $this->html_definition->setup) // requesting new one
) {
$this->html_definition = new HTMLPurifier_HTMLDefinition($this);
if ($raw) return $this->html_definition; // no setup!
}
if (!$this->html_definition->setup) $this->html_definition->setup();
return $this->html_definition;
}
/**
* Retrieves a copy of the CSS definition
* Retrieves reference to the CSS definition
*/
function getCSSDefinition() {
function &getCSSDefinition() {
if ($this->css_definition === null) {
$this->css_definition = new HTMLPurifier_CSSDefinition();
$this->css_definition->setup($this);

View File

@ -0,0 +1,10 @@
<?php
/**
* Base class for configuration entity
*/
class HTMLPurifier_ConfigDef {
var $class = false;
}
?>

View File

@ -0,0 +1,74 @@
<?php
require_once 'HTMLPurifier/ConfigDef.php';
/**
* Structure object containing definition of a directive.
* @note This structure does not contain default values
*/
class HTMLPurifier_ConfigDef_Directive extends HTMLPurifier_ConfigDef
{
var $class = 'directive';
function HTMLPurifier_ConfigDef_Directive(
$type = null,
$descriptions = null,
$allow_null = null,
$allowed = null,
$aliases = null
) {
if ( $type !== null) $this->type = $type;
if ($descriptions !== null) $this->descriptions = $descriptions;
if ( $allow_null !== null) $this->allow_null = $allow_null;
if ( $allowed !== null) $this->allowed = $allowed;
if ( $aliases !== null) $this->aliases = $aliases;
}
/**
* Allowed type of the directive. Values are:
* - string
* - istring (case insensitive string)
* - int
* - float
* - bool
* - lookup (array of value => true)
* - list (regular numbered index array)
* - hash (array of key => value)
* - mixed (anything goes)
*/
var $type = 'mixed';
/**
* Plaintext descriptions of the configuration entity is. Organized by
* file and line number, so multiple descriptions are allowed.
*/
var $descriptions = array();
/**
* Is null allowed? Has no effect for mixed type.
* @bool
*/
var $allow_null = false;
/**
* Lookup table of allowed values of the element, bool true if all allowed.
*/
var $allowed = true;
/**
* Hash of value aliases, i.e. values that are equivalent.
*/
var $aliases = array();
/**
* Adds a description to the array
*/
function addDescription($file, $line, $description) {
if (!isset($this->descriptions[$file])) $this->descriptions[$file] = array();
$this->descriptions[$file][$line] = $description;
}
}
?>

View File

@ -0,0 +1,27 @@
<?php
require_once 'HTMLPurifier/ConfigDef.php';
/**
* Structure object describing a directive alias
*/
class HTMLPurifier_ConfigDef_DirectiveAlias extends HTMLPurifier_ConfigDef
{
var $class = 'alias';
/**
* Namespace being aliased to
*/
var $namespace;
/**
* Directive being aliased to
*/
var $name;
function HTMLPurifier_ConfigDef_DirectiveAlias($namespace, $name) {
$this->namespace = $namespace;
$this->name = $name;
}
}
?>

View File

@ -0,0 +1,23 @@
<?php
require_once 'HTMLPurifier/ConfigDef.php';
/**
* Structure object describing of a namespace
*/
class HTMLPurifier_ConfigDef_Namespace extends HTMLPurifier_ConfigDef {
function HTMLPurifier_ConfigDef_Namespace($description = null) {
$this->description = $description;
}
var $class = 'namespace';
/**
* String description of what kinds of directives go in this namespace.
*/
var $description;
}
?>

View File

@ -1,6 +1,10 @@
<?php
require_once 'HTMLPurifier/Error.php';
require_once 'HTMLPurifier/ConfigDef.php';
require_once 'HTMLPurifier/ConfigDef/Namespace.php';
require_once 'HTMLPurifier/ConfigDef/Directive.php';
require_once 'HTMLPurifier/ConfigDef/DirectiveAlias.php';
/**
* Configuration definition, defines directives and their defaults.
@ -138,7 +142,7 @@ class HTMLPurifier_ConfigSchema {
return;
}
$def->info[$namespace][$name] =
new HTMLPurifier_ConfigEntity_Directive();
new HTMLPurifier_ConfigDef_Directive();
$def->info[$namespace][$name]->type = $type;
$def->info[$namespace][$name]->allow_null = $allow_null;
$def->defaults[$namespace][$name] = $default;
@ -172,7 +176,7 @@ class HTMLPurifier_ConfigSchema {
return;
}
$def->info[$namespace] = array();
$def->info_namespace[$namespace] = new HTMLPurifier_ConfigEntity_Namespace();
$def->info_namespace[$namespace] = new HTMLPurifier_ConfigDef_Namespace();
$def->info_namespace[$namespace]->description = $description;
$def->defaults[$namespace] = array();
}
@ -284,7 +288,7 @@ class HTMLPurifier_ConfigSchema {
return;
}
$def->info[$namespace][$name] =
new HTMLPurifier_ConfigEntity_DirectiveAlias(
new HTMLPurifier_ConfigDef_DirectiveAlias(
$new_namespace, $new_name);
}
@ -379,120 +383,4 @@ class HTMLPurifier_ConfigSchema {
}
}
/**
* Base class for configuration entity
*/
class HTMLPurifier_ConfigEntity {
var $class = false;
}
/**
* Structure object describing of a namespace
*/
class HTMLPurifier_ConfigEntity_Namespace extends HTMLPurifier_ConfigEntity {
function HTMLPurifier_ConfigEntity_Namespace($description = null) {
$this->description = $description;
}
var $class = 'namespace';
/**
* String description of what kinds of directives go in this namespace.
*/
var $description;
}
/**
* Structure object containing definition of a directive.
* @note This structure does not contain default values
*/
class HTMLPurifier_ConfigEntity_Directive extends HTMLPurifier_ConfigEntity
{
var $class = 'directive';
function HTMLPurifier_ConfigEntity_Directive(
$type = null,
$descriptions = null,
$allow_null = null,
$allowed = null,
$aliases = null
) {
if ( $type !== null) $this->type = $type;
if ($descriptions !== null) $this->descriptions = $descriptions;
if ( $allow_null !== null) $this->allow_null = $allow_null;
if ( $allowed !== null) $this->allowed = $allowed;
if ( $aliases !== null) $this->aliases = $aliases;
}
/**
* Allowed type of the directive. Values are:
* - string
* - istring (case insensitive string)
* - int
* - float
* - bool
* - lookup (array of value => true)
* - list (regular numbered index array)
* - hash (array of key => value)
* - mixed (anything goes)
*/
var $type = 'mixed';
/**
* Plaintext descriptions of the configuration entity is. Organized by
* file and line number, so multiple descriptions are allowed.
*/
var $descriptions = array();
/**
* Is null allowed? Has no effect for mixed type.
* @bool
*/
var $allow_null = false;
/**
* Lookup table of allowed values of the element, bool true if all allowed.
*/
var $allowed = true;
/**
* Hash of value aliases, i.e. values that are equivalent.
*/
var $aliases = array();
/**
* Adds a description to the array
*/
function addDescription($file, $line, $description) {
if (!isset($this->descriptions[$file])) $this->descriptions[$file] = array();
$this->descriptions[$file][$line] = $description;
}
}
/**
* Structure object describing a directive alias
*/
class HTMLPurifier_ConfigEntity_DirectiveAlias extends HTMLPurifier_ConfigEntity
{
var $class = 'alias';
/**
* Namespace being aliased to
*/
var $namespace;
/**
* Directive being aliased to
*/
var $name;
function HTMLPurifier_ConfigEntity_DirectiveAlias($namespace, $name) {
$this->namespace = $namespace;
$this->name = $name;
}
}
?>

View File

@ -0,0 +1,148 @@
<?php
// common defs that we'll support by default
require_once 'HTMLPurifier/ChildDef.php';
require_once 'HTMLPurifier/ChildDef/Empty.php';
require_once 'HTMLPurifier/ChildDef/Required.php';
require_once 'HTMLPurifier/ChildDef/Optional.php';
class HTMLPurifier_ContentSets
{
/**
* List of content set strings (pipe seperators) indexed by name.
* @public
*/
var $info = array();
/**
* List of content set lookups (element => true) indexed by name.
* @note This is in HTMLPurifier_HTMLDefinition->info_content_sets
* @public
*/
var $lookup = array();
/**
* Synchronized list of defined content sets (keys of info)
*/
var $keys = array();
/**
* Synchronized list of defined content values (values of info)
*/
var $values = array();
/**
* Merges in module's content sets, expands identifiers in the content
* sets and populates the keys, values and lookup member variables.
* @param $modules List of HTMLPurifier_HTMLModule
*/
function HTMLPurifier_ContentSets($modules) {
if (!is_array($modules)) $modules = array($modules);
// populate content_sets based on module hints
// sorry, no way of overloading
foreach ($modules as $module_i => $module) {
foreach ($module->content_sets as $key => $value) {
if (isset($this->info[$key])) {
// add it into the existing content set
$this->info[$key] = $this->info[$key] . ' | ' . $value;
} else {
$this->info[$key] = $value;
}
}
}
// perform content_set expansions
$this->keys = array_keys($this->info);
foreach ($this->info as $i => $set) {
// only performed once, so infinite recursion is not
// a problem
$this->info[$i] =
str_replace(
$this->keys,
// must be recalculated each time due to
// changing substitutions
array_values($this->info),
$set);
}
$this->values = array_values($this->info);
// generate lookup tables
foreach ($this->info as $name => $set) {
$this->lookup[$name] = $this->convertToLookup($set);
}
}
/**
* Accepts a definition; generates and assigns a ChildDef for it
* @param $def HTMLPurifier_ElementDef reference
* @param $module Module that defined the ElementDef
*/
function generateChildDef(&$def, $module) {
if (!empty($def->child)) return; // already done!
$content_model = $def->content_model;
if (is_string($content_model)) {
$def->content_model = str_replace(
$this->keys, $this->values, $content_model);
}
$def->child = $this->getChildDef($def, $module);
}
/**
* Instantiates a ChildDef based on content_model and content_model_type
* member variables in HTMLPurifier_ElementDef
* @note This will also defer to modules for custom HTMLPurifier_ChildDef
* subclasses that need content set expansion
* @param $def HTMLPurifier_ElementDef to have ChildDef extracted
* @return HTMLPurifier_ChildDef corresponding to ElementDef
*/
function getChildDef($def, $module) {
$value = $def->content_model;
if (is_object($value)) {
trigger_error(
'Literal object child definitions should be stored in '.
'ElementDef->child not ElementDef->content_model',
E_USER_NOTICE
);
return $value;
}
switch ($def->content_model_type) {
case 'required':
return new HTMLPurifier_ChildDef_Required($value);
case 'optional':
return new HTMLPurifier_ChildDef_Optional($value);
case 'empty':
return new HTMLPurifier_ChildDef_Empty();
case 'custom':
return new HTMLPurifier_ChildDef_Custom($value);
}
// defer to its module
$return = false;
if ($module->defines_child_def) { // save a func call
$return = $module->getChildDef($def);
}
if ($return !== false) return $return;
// error-out
trigger_error(
'Could not determine which ChildDef class to instantiate',
E_USER_ERROR
);
return false;
}
/**
* Converts a string list of elements separated by pipes into
* a lookup array.
* @param $string List of elements
* @return Lookup array of elements
*/
function convertToLookup($string) {
$array = explode('|', str_replace(' ', '', $string));
$ret = array();
foreach ($array as $i => $k) {
$ret[$k] = true;
}
return $ret;
}
}
?>

View File

@ -0,0 +1,122 @@
<?php
/**
* Structure that stores an HTML element definition. Used by
* HTMLPurifier_HTMLDefinition and HTMLPurifier_HTMLModule.
*/
class HTMLPurifier_ElementDef
{
/**
* Does the definition work by itself, or is it created solely
* for the purpose of merging into another definition?
*/
var $standalone = true;
/**
* Associative array of attribute name to HTMLPurifier_AttrDef
* @note Before being processed by HTMLPurifier_AttrCollections
* when modules are finalized during
* HTMLPurifier_HTMLDefinition->setup(), this array may also
* contain an array at index 0 that indicates which attribute
* collections to load into the full array. It may also
* contain string indentifiers in lieu of HTMLPurifier_AttrDef,
* see HTMLPurifier_AttrTypes on how they are expanded during
* HTMLPurifier_HTMLDefinition->setup() processing.
* @public
*/
var $attr = array();
/**
* Indexed list of tag's HTMLPurifier_AttrTransform to be done before validation
* @public
*/
var $attr_transform_pre = array();
/**
* Indexed list of tag's HTMLPurifier_AttrTransform to be done after validation
* @public
*/
var $attr_transform_post = array();
/**
* HTMLPurifier_ChildDef of this tag.
* @public
*/
var $child;
/**
* Abstract string representation of internal ChildDef rules. See
* HTMLPurifier_ContentSets for how this is parsed and then transformed
* into an HTMLPurifier_ChildDef.
* @public
*/
var $content_model;
/**
* Value of $child->type, used to determine which ChildDef to use,
* used in combination with $content_model.
* @public
*/
var $content_model_type;
/**
* Lookup table of tags that close this tag. Used during parsing
* to make sure we don't attempt to nest unclosed tags.
* @public
*/
var $auto_close = array();
/**
* Does the element have a content model (#PCDATA | Inline)*? This
* is important for chameleon ins and del processing in
* HTMLPurifier_ChildDef_Chameleon. Dynamically set: modules don't
* have to worry about this one.
* @public
*/
var $descendants_are_inline;
/**
* Lookup table of tags excluded from all descendants of this tag.
* @public
*/
var $excludes = array();
/**
* Merges the values of another element definition into this one.
* Values from the new element def take precedence if a value is
* not mergeable.
*/
function mergeIn($def) {
// later keys takes precedence
foreach($def->attr as $k => $v) {
if ($k == 0) {
// merge in the includes
// sorry, no way to override an include
foreach ($v as $v2) {
$def->attr[0][] = $v2;
}
continue;
}
$this->attr[$k] = $v;
}
foreach($def->attr_transform_pre as $k => $v) $this->attr_transform_pre[$k] = $v;
foreach($def->attr_transform_post as $k => $v) $this->attr_transform_post[$k] = $v;
foreach($def->auto_close as $k => $v) $this->auto_close[$k] = $v;
foreach($def->excludes as $k => $v) $this->excludes[$k] = $v;
if(!is_null($def->child)) $this->child = $def->child;
if(!empty($def->content_model)) $this->content_model .= ' | ' . $def->content_model;
if(!empty($def->content_model_type)) $this->content_model_type = $def->content_model_type;
if(!is_null($def->descendants_are_inline)) $this->descendants_are_inline = $def->descendants_are_inline;
}
}
?>

View File

@ -9,7 +9,7 @@ class HTMLPurifier_Filter_YouTube extends HTMLPurifier_Filter
function preFilter($html, $config, &$context) {
$pre_regex = '#<object[^>]+>.+?'.
'http://www.youtube.com/v/([A-Za-z0-9\-_]+).+?</object>#';
'http://www.youtube.com/v/([A-Za-z0-9\-_]+).+?</object>#s';
$pre_replace = '<span class="youtube-embed">\1</span>';
return preg_replace($pre_regex, $pre_replace, $html);
}

View File

@ -1,656 +1,250 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/AttrDef/Enum.php';
require_once 'HTMLPurifier/AttrDef/ID.php';
require_once 'HTMLPurifier/AttrDef/Class.php';
require_once 'HTMLPurifier/AttrDef/Text.php';
require_once 'HTMLPurifier/AttrDef/Lang.php';
require_once 'HTMLPurifier/AttrDef/Pixels.php';
require_once 'HTMLPurifier/AttrDef/Length.php';
require_once 'HTMLPurifier/AttrDef/MultiLength.php';
require_once 'HTMLPurifier/AttrDef/Integer.php';
require_once 'HTMLPurifier/AttrDef/URI.php';
require_once 'HTMLPurifier/AttrDef/CSS.php';
require_once 'HTMLPurifier/AttrTransform.php';
require_once 'HTMLPurifier/AttrTransform/Lang.php';
require_once 'HTMLPurifier/AttrTransform/TextAlign.php';
require_once 'HTMLPurifier/AttrTransform/BdoDir.php';
require_once 'HTMLPurifier/AttrTransform/ImgRequired.php';
require_once 'HTMLPurifier/ChildDef.php';
require_once 'HTMLPurifier/ChildDef/Chameleon.php';
require_once 'HTMLPurifier/ChildDef/Empty.php';
require_once 'HTMLPurifier/ChildDef/Required.php';
require_once 'HTMLPurifier/ChildDef/Optional.php';
require_once 'HTMLPurifier/ChildDef/Table.php';
require_once 'HTMLPurifier/ChildDef/StrictBlockquote.php';
require_once 'HTMLPurifier/Generator.php';
require_once 'HTMLPurifier/Token.php';
require_once 'HTMLPurifier/TagTransform.php';
HTMLPurifier_ConfigSchema::define(
'HTML', 'EnableAttrID', false, 'bool',
'Allows the ID attribute in HTML. This is disabled by default '.
'due to the fact that without proper configuration user input can '.
'easily break the validation of a webpage by specifying an ID that is '.
'already on the surrounding HTML. If you don\'t mind throwing caution to '.
'the wind, enable this directive, but I strongly recommend you also '.
'consider blacklisting IDs you use (%Attr.IDBlacklist) or prefixing all '.
'user supplied IDs (%Attr.IDPrefix). This directive has been available '.
'since 1.2.0, and when set to true reverts to the behavior of pre-1.2.0 '.
'versions.'
);
HTMLPurifier_ConfigSchema::define(
'HTML', 'Strict', false, 'bool',
'Determines whether or not to use Transitional (loose) or Strict rulesets. '.
'This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'HTML', 'BlockWrapper', 'p', 'string',
'String name of element to wrap inline elements that are inside a block '.
'context. This only occurs in the children of blockquote in strict mode. '.
'Example: by default value, <code>&lt;blockquote&gt;Foo&lt;/blockquote&gt;</code> '.
'would become <code>&lt;blockquote&gt;&lt;p&gt;Foo&lt;/p&gt;&lt;/blockquote&gt;</code>. The '.
'<code>&lt;p&gt;</code> tags can be replaced '.
'with whatever you desire, as long as it is a block level element. '.
'This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'HTML', 'Parent', 'div', 'string',
'String name of element that HTML fragment passed to library will be '.
'inserted in. An interesting variation would be using span as the '.
'parent element, meaning that only inline tags would be allowed. '.
'This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'HTML', 'AllowedElements', null, 'lookup/null',
'If HTML Purifier\'s tag set is unsatisfactory for your needs, you '.
'can overload it with your own list of tags to allow. Note that this '.
'method is subtractive: it does its job by taking away from HTML Purifier '.
'usual feature set, so you cannot add a tag that HTML Purifier never '.
'supported in the first place (like embed, form or head). If you change this, you '.
'probably also want to change %HTML.AllowedAttributes. '.
'<strong>Warning:</strong> If another directive conflicts with the '.
'elements here, <em>that</em> directive will win and override. '.
'This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'HTML', 'AllowedAttributes', null, 'lookup/null',
'IF HTML Purifier\'s attribute set is unsatisfactory, overload it! '.
'The syntax is \'tag.attr\' or \'*.attr\' for the global attributes '.
'(style, id, class, dir, lang, xml:lang).'.
'<strong>Warning:</strong> If another directive conflicts with the '.
'elements here, <em>that</em> directive will win and override. For '.
'example, %HTML.EnableAttrID will take precedence over *.id in this '.
'directive. You must set that directive to true before you can use '.
'IDs at all. This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'Attr', 'DisableURI', false, 'bool',
'Disables all URIs in all forms. Not sure why you\'d want to do that '.
'(after all, the Internet\'s founded on the notion of a hyperlink). '.
'This directive has been available since 1.3.0.'
);
/**
* Defines the purified HTML type with large amounts of objects.
*
* The main function of this object is its $info array, which is an
* associative array of all the child and attribute definitions for
* each allowed element. It also contains special use information (always
* prefixed by info) for intelligent tag closing and global attributes.
*
* For optimization, the definition generation may be moved to
* a maintenance script and stipulate that definition be created
* by a factory method that unserializes a serialized version of Definition.
* Customization would entail copying the maintenance script, making the
* necessary changes, generating the serialized object, and then hooking it
* in via the factory method. We would also offer a LiveDefinition for
* automatic recompilation, suggesting that we would have a DefinitionGenerator.
*/
class HTMLPurifier_HTMLDefinition
{
/**
* Associative array of element names to HTMLPurifier_ElementDef
* @public
*/
var $info = array();
/**
* Associative array of global attribute name to attribute definition.
* @public
*/
var $info_global_attr = array();
/**
* String name of parent element HTML will be going into.
* @public
*/
var $info_parent = 'div';
/**
* Definition for parent element, allows parent element to be a
* tag that's not allowed inside the HTML fragment.
* @public
*/
var $info_parent_def;
/**
* String name of element used to wrap inline elements in block context
* @note This is rarely used except for BLOCKQUOTEs in strict mode
* @public
*/
var $info_block_wrapper = 'p';
/**
* Associative array of deprecated tag name to HTMLPurifier_TagTransform
* @public
*/
var $info_tag_transform = array();
/**
* List of HTMLPurifier_AttrTransform to be performed before validation.
* @public
*/
var $info_attr_transform_pre = array();
/**
* List of HTMLPurifier_AttrTransform to be performed after validation/
* @public
*/
var $info_attr_transform_post = array();
/**
* Lookup table of flow elements
* @public
*/
var $info_flow_elements = array();
/**
* Boolean is a strict definition?
* @public
*/
var $strict;
/**
* Initializes the definition, the meat of the class.
*/
function setup($config) {
// some cached config values
$this->strict = $config->get('HTML', 'Strict');
//////////////////////////////////////////////////////////////////////
// info[] : initializes the definition objects
// if you attempt to define rules later on for a tag not in this array
// PHP will create an stdclass
$allowed_tags =
array(
'ins', 'del', 'blockquote', 'dd', 'li', 'div', 'em', 'strong',
'dfn', 'code', 'samp', 'kbd', 'var', 'cite', 'abbr', 'acronym',
'q', 'sub', 'tt', 'sup', 'i', 'b', 'big', 'small',
'bdo', 'span', 'dt', 'p', 'h1', 'h2', 'h3', 'h4',
'h5', 'h6', 'ol', 'ul', 'dl', 'address', 'img', 'br', 'hr',
'pre', 'a', 'table', 'caption', 'thead', 'tfoot', 'tbody',
'colgroup', 'col', 'td', 'th', 'tr'
);
if (!$this->strict) {
$allowed_tags[] = 'u';
$allowed_tags[] = 's';
$allowed_tags[] = 'strike';
}
foreach ($allowed_tags as $tag) {
$this->info[$tag] = new HTMLPurifier_ElementDef();
}
//////////////////////////////////////////////////////////////////////
// info[]->child : defines allowed children for elements
// emulates the structure of the DTD
// however, these are condensed, with bad stuff taken out
// screening process was done by hand
// entities: prefixed with e_ and _ replaces . from DTD
// double underlines are entities we made up
// we don't use an array because that complicates interpolation
// strings are used instead of arrays because if you use arrays,
// you have to do some hideous manipulation with array_merge()
// todo: determine whether or not having allowed children
// that aren't allowed globally affects security (it shouldn't)
// if above works out, extend children definitions to include all
// possible elements (allowed elements will dictate which ones
// get dropped
$e_special_extra = 'img';
$e_special_basic = 'br | span | bdo';
$e_special = "$e_special_basic | $e_special_extra";
$e_fontstyle_extra = 'big | small';
$e_fontstyle_basic = 'tt | i | b | u | s | strike';
$e_fontstyle = "$e_fontstyle_basic | $e_fontstyle_extra";
$e_phrase_extra = 'sub | sup';
$e_phrase_basic = 'em | strong | dfn | code | q | samp | kbd | var'.
' | cite | abbr | acronym';
$e_phrase = "$e_phrase_basic | $e_phrase_extra";
$e_misc_inline = 'ins | del';
$e_misc = "$e_misc_inline";
$e_inline = "a | $e_special | $e_fontstyle | $e_phrase";
// pseudo-property we created for convenience, see later on
$e__inline = "#PCDATA | $e_inline | $e_misc_inline";
// note the casing
$e_Inline = new HTMLPurifier_ChildDef_Optional($e__inline);
$e_heading = 'h1|h2|h3|h4|h5|h6';
$e_lists = 'ul | ol | dl';
$e_blocktext = 'pre | hr | blockquote | address';
$e_block = "p | $e_heading | div | $e_lists | $e_blocktext | table";
$e_Block = new HTMLPurifier_ChildDef_Optional($e_block);
$e__flow = "#PCDATA | $e_block | $e_inline | $e_misc";
$e_Flow = new HTMLPurifier_ChildDef_Optional($e__flow);
$e_a_content = new HTMLPurifier_ChildDef_Optional("#PCDATA".
" | $e_special | $e_fontstyle | $e_phrase | $e_misc_inline");
$e_pre_content = new HTMLPurifier_ChildDef_Optional("#PCDATA | a".
" | $e_special_basic | $e_fontstyle_basic | $e_phrase_basic".
" | $e_misc_inline");
$e_form_content = new HTMLPurifier_ChildDef_Optional('');//unused
$e_form_button_content = new HTMLPurifier_ChildDef_Optional('');//unused
$this->info['ins']->child =
$this->info['del']->child =
new HTMLPurifier_ChildDef_Chameleon($e__inline, $e__flow);
$this->info['dd']->child =
$this->info['li']->child =
$this->info['div']->child = $e_Flow;
if ($this->strict) {
$this->info['blockquote']->child = new HTMLPurifier_ChildDef_StrictBlockquote();
} else {
$this->info['blockquote']->child = $e_Flow;
}
$this->info['caption']->child =
$this->info['em']->child =
$this->info['strong']->child =
$this->info['dfn']->child =
$this->info['code']->child =
$this->info['samp']->child =
$this->info['kbd']->child =
$this->info['var']->child =
$this->info['cite']->child =
$this->info['abbr']->child =
$this->info['acronym']->child =
$this->info['q']->child =
$this->info['sub']->child =
$this->info['tt']->child =
$this->info['sup']->child =
$this->info['i']->child =
$this->info['b']->child =
$this->info['big']->child =
$this->info['small']->child=
$this->info['bdo']->child =
$this->info['span']->child =
$this->info['dt']->child =
$this->info['p']->child =
$this->info['h1']->child =
$this->info['h2']->child =
$this->info['h3']->child =
$this->info['h4']->child =
$this->info['h5']->child =
$this->info['h6']->child = $e_Inline;
if (!$this->strict) {
$this->info['u']->child =
$this->info['s']->child =
$this->info['strike']->child = $e_Inline;
}
// the only three required definitions, besides custom table code
$this->info['ol']->child =
$this->info['ul']->child = new HTMLPurifier_ChildDef_Required('li');
$this->info['dl']->child = new HTMLPurifier_ChildDef_Required('dt|dd');
if ($this->strict) {
$this->info['address']->child = $e_Inline;
} else {
$this->info['address']->child =
new HTMLPurifier_ChildDef_Optional("#PCDATA | p | $e_inline".
" | $e_misc_inline");
}
$this->info['img']->child =
$this->info['br']->child =
$this->info['hr']->child = new HTMLPurifier_ChildDef_Empty();
$this->info['pre']->child = $e_pre_content;
$this->info['a']->child = $e_a_content;
$this->info['table']->child = new HTMLPurifier_ChildDef_Table();
// not a real entity, watch the double underscore
$e__row = new HTMLPurifier_ChildDef_Required('tr');
$this->info['thead']->child = $e__row;
$this->info['tfoot']->child = $e__row;
$this->info['tbody']->child = $e__row;
$this->info['colgroup']->child = new HTMLPurifier_ChildDef_Optional('col');
$this->info['col']->child = new HTMLPurifier_ChildDef_Empty();
$this->info['tr']->child = new HTMLPurifier_ChildDef_Required('th | td');
$this->info['th']->child = $e_Flow;
$this->info['td']->child = $e_Flow;
//////////////////////////////////////////////////////////////////////
// info[]->type : defines the type of the element (block or inline)
// reuses $e_Inline and $e_Block
foreach ($e_Inline->elements as $name => $bool) {
if ($name == '#PCDATA') continue;
if (!isset($this->info[$name])) continue;
$this->info[$name]->type = 'inline';
}
foreach ($e_Block->elements as $name => $bool) {
if (!isset($this->info[$name])) continue;
$this->info[$name]->type = 'block';
}
foreach ($e_Flow->elements as $name => $bool) {
$this->info_flow_elements[$name] = true;
}
//////////////////////////////////////////////////////////////////////
// info[]->excludes : defines elements that aren't allowed in here
// make sure you test using isset() and not !empty()
$this->info['a']->excludes = array('a' => true);
$this->info['pre']->excludes = array_flip(array('img', 'big', 'small',
// technically useless, but good to be indepth
'object', 'applet', 'font', 'basefont'));
//////////////////////////////////////////////////////////////////////
// info[]->attr : defines allowed attributes for elements
// this doesn't include REQUIRED declarations, those are handled
// by the transform classes. It will, however, do simple and slightly
// complex attribute value substitution
// the question of varying allowed attributes is more entangling.
$e_Text = new HTMLPurifier_AttrDef_Text();
// attrs, included in almost every single one except for a few,
// which manually override these in their local definitions
$this->info_global_attr = array(
// core attrs
'class' => new HTMLPurifier_AttrDef_Class(),
'title' => $e_Text,
'style' => new HTMLPurifier_AttrDef_CSS(),
// i18n
'dir' => new HTMLPurifier_AttrDef_Enum(array('ltr','rtl'), false),
'lang' => new HTMLPurifier_AttrDef_Lang(),
'xml:lang' => new HTMLPurifier_AttrDef_Lang(),
);
if ($config->get('HTML', 'EnableAttrID')) {
$this->info_global_attr['id'] = new HTMLPurifier_AttrDef_ID();
}
// required attribute stipulation handled in attribute transformation
$this->info['bdo']->attr = array(); // nothing else
$this->info['br']->attr['dir'] = false;
$this->info['br']->attr['lang'] = false;
$this->info['br']->attr['xml:lang'] = false;
$this->info['td']->attr['abbr'] = $e_Text;
$this->info['th']->attr['abbr'] = $e_Text;
$this->setAttrForTableElements('align', new HTMLPurifier_AttrDef_Enum(
array('left', 'center', 'right', 'justify', 'char'), false));
$this->setAttrForTableElements('valign', new HTMLPurifier_AttrDef_Enum(
array('top', 'middle', 'bottom', 'baseline'), false));
$this->info['img']->attr['alt'] = $e_Text;
$e_TFrame = new HTMLPurifier_AttrDef_Enum(array('void', 'above',
'below', 'hsides', 'lhs', 'rhs', 'vsides', 'box', 'border'), false);
$this->info['table']->attr['frame'] = $e_TFrame;
$e_TRules = new HTMLPurifier_AttrDef_Enum(array('none', 'groups',
'rows', 'cols', 'all'), false);
$this->info['table']->attr['rules'] = $e_TRules;
$this->info['table']->attr['summary'] = $e_Text;
$this->info['table']->attr['border'] =
new HTMLPurifier_AttrDef_Pixels();
$e_Length = new HTMLPurifier_AttrDef_Length();
$this->info['table']->attr['cellpadding'] =
$this->info['table']->attr['cellspacing'] =
$this->info['table']->attr['width'] =
$this->info['img']->attr['height'] =
$this->info['img']->attr['width'] = $e_Length;
$this->setAttrForTableElements('charoff', $e_Length);
$e_MultiLength = new HTMLPurifier_AttrDef_MultiLength();
$this->info['col']->attr['width'] =
$this->info['colgroup']->attr['width'] = $e_MultiLength;
$e__NumberSpan = new HTMLPurifier_AttrDef_Integer(false, false, true);
$this->info['colgroup']->attr['span'] =
$this->info['col']->attr['span'] =
$this->info['td']->attr['rowspan'] =
$this->info['th']->attr['rowspan'] =
$this->info['td']->attr['colspan'] =
$this->info['th']->attr['colspan'] = $e__NumberSpan;
if (!$config->get('Attr', 'DisableURI')) {
$e_URI = new HTMLPurifier_AttrDef_URI();
$this->info['a']->attr['href'] =
$this->info['img']->attr['longdesc'] =
$this->info['del']->attr['cite'] =
$this->info['ins']->attr['cite'] =
$this->info['blockquote']->attr['cite'] =
$this->info['q']->attr['cite'] = $e_URI;
// URI that causes HTTP request
$this->info['img']->attr['src'] = new HTMLPurifier_AttrDef_URI(true);
}
if (!$this->strict) {
$this->info['li']->attr['value'] = new HTMLPurifier_AttrDef_Integer();
$this->info['ol']->attr['start'] = new HTMLPurifier_AttrDef_Integer();
}
//////////////////////////////////////////////////////////////////////
// info_tag_transform : transformations of tags
$this->info_tag_transform['font'] = new HTMLPurifier_TagTransform_Font();
$this->info_tag_transform['menu'] = new HTMLPurifier_TagTransform_Simple('ul');
$this->info_tag_transform['dir'] = new HTMLPurifier_TagTransform_Simple('ul');
$this->info_tag_transform['center'] = new HTMLPurifier_TagTransform_Center();
//////////////////////////////////////////////////////////////////////
// info[]->auto_close : tags that automatically close another
// todo: determine whether or not SGML-like modeling based on
// mandatory/optional end tags would be a better policy
// make sure you test using isset() not !empty()
// these are all block elements: blocks aren't allowed in P
$this->info['p']->auto_close = array_flip(array(
'address', 'blockquote', 'dd', 'dir', 'div', 'dl', 'dt',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'ol', 'p', 'pre',
'table', 'ul'
));
$this->info['li']->auto_close = array('li' => true);
// we need TABLE and heading mismatch code
// we may need to make this more flexible for heading mismatch,
// or we can just create another info
//////////////////////////////////////////////////////////////////////
// info[]->attr_transform_* : attribute transformations in elements
// pre is applied before any validation is done, post is done after
$this->info['h1']->attr_transform_pre[] =
$this->info['h2']->attr_transform_pre[] =
$this->info['h3']->attr_transform_pre[] =
$this->info['h4']->attr_transform_pre[] =
$this->info['h5']->attr_transform_pre[] =
$this->info['h6']->attr_transform_pre[] =
$this->info['p'] ->attr_transform_pre[] =
new HTMLPurifier_AttrTransform_TextAlign();
$this->info['bdo']->attr_transform_post[] =
new HTMLPurifier_AttrTransform_BdoDir();
$this->info['img']->attr_transform_post[] =
new HTMLPurifier_AttrTransform_ImgRequired();
//////////////////////////////////////////////////////////////////////
// info_attr_transform_* : global attribute transformation that is
// unconditionally called. Good for transformations that have complex
// start conditions
// pre is applied before any validation is done, post is done after
$this->info_attr_transform_post[] = new HTMLPurifier_AttrTransform_Lang();
// protect against stdclasses floating around
foreach ($this->info as $key => $obj) {
if ($obj instanceof stdClass) {
unset($this->info[$key]);
}
}
//////////////////////////////////////////////////////////////////////
// info_block_wrapper : wraps inline elements in block context
$block_wrapper = $config->get('HTML', 'BlockWrapper');
if (isset($e_Block->elements[$block_wrapper])) {
$this->info_block_wrapper = $block_wrapper;
} else {
trigger_error('Cannot use non-block element as block wrapper.',
E_USER_ERROR);
}
//////////////////////////////////////////////////////////////////////
// info_parent : parent element of the HTML fragment
$parent = $config->get('HTML', 'Parent');
if (isset($this->info[$parent])) {
$this->info_parent = $parent;
} else {
trigger_error('Cannot use unrecognized element as parent.',
E_USER_ERROR);
}
$this->info_parent_def = $this->info[$this->info_parent];
//////////////////////////////////////////////////////////////////////
// %HTML.Allowed(Elements|Attributes) : cut non-allowed elements
$allowed_elements = $config->get('HTML', 'AllowedElements');
if (is_array($allowed_elements)) {
foreach ($this->info as $name => $d) {
if(!isset($allowed_elements[$name])) unset($this->info[$name]);
}
}
$allowed_attributes = $config->get('HTML', 'AllowedAttributes');
if (is_array($allowed_attributes)) {
foreach ($this->info_global_attr as $attr_key => $info) {
if (!isset($allowed_attributes["*.$attr_key"])) {
unset($this->info_global_attr[$attr_key]);
}
}
foreach ($this->info as $tag => $info) {
foreach ($info->attr as $attr => $attr_info) {
if (!isset($allowed_attributes["$tag.$attr"])) {
unset($this->info[$tag]->attr[$attr]);
}
}
}
}
}
function setAttrForTableElements($attr, $def) {
$this->info['col']->attr[$attr] =
$this->info['colgroup']->attr[$attr] =
$this->info['tbody']->attr[$attr] =
$this->info['td']->attr[$attr] =
$this->info['tfoot']->attr[$attr] =
$this->info['th']->attr[$attr] =
$this->info['thead']->attr[$attr] =
$this->info['tr']->attr[$attr] = $def;
}
}
/**
* Structure that stores an element definition.
*/
class HTMLPurifier_ElementDef
{
/**
* Associative array of attribute name to HTMLPurifier_AttrDef
* @public
*/
var $attr = array();
/**
* List of tag's HTMLPurifier_AttrTransform to be done before validation
* @public
*/
var $attr_transform_pre = array();
/**
* List of tag's HTMLPurifier_AttrTransform to be done after validation
* @public
*/
var $attr_transform_post = array();
/**
* Lookup table of tags that close this tag.
* @public
*/
var $auto_close = array();
/**
* HTMLPurifier_ChildDef of this tag.
* @public
*/
var $child;
/**
* Type of the tag: inline or block or unknown?
* @public
*/
var $type = 'unknown';
/**
* Lookup table of tags excluded from all descendants of this tag.
* @public
*/
var $excludes = array();
}
?>
<?php
// components
require_once 'HTMLPurifier/HTMLModuleManager.php';
// this definition and its modules MUST NOT define configuration directives
// outside of the HTML or Attr namespaces
// will be superceded by more accurate doctype declaration schemes
HTMLPurifier_ConfigSchema::define(
'HTML', 'Strict', false, 'bool',
'Determines whether or not to use Transitional (loose) or Strict rulesets. '.
'This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'HTML', 'BlockWrapper', 'p', 'string',
'String name of element to wrap inline elements that are inside a block '.
'context. This only occurs in the children of blockquote in strict mode. '.
'Example: by default value, <code>&lt;blockquote&gt;Foo&lt;/blockquote&gt;</code> '.
'would become <code>&lt;blockquote&gt;&lt;p&gt;Foo&lt;/p&gt;&lt;/blockquote&gt;</code>. The '.
'<code>&lt;p&gt;</code> tags can be replaced '.
'with whatever you desire, as long as it is a block level element. '.
'This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'HTML', 'Parent', 'div', 'string',
'String name of element that HTML fragment passed to library will be '.
'inserted in. An interesting variation would be using span as the '.
'parent element, meaning that only inline tags would be allowed. '.
'This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'HTML', 'AllowedElements', null, 'lookup/null',
'If HTML Purifier\'s tag set is unsatisfactory for your needs, you '.
'can overload it with your own list of tags to allow. Note that this '.
'method is subtractive: it does its job by taking away from HTML Purifier '.
'usual feature set, so you cannot add a tag that HTML Purifier never '.
'supported in the first place (like embed, form or head). If you change this, you '.
'probably also want to change %HTML.AllowedAttributes. '.
'<strong>Warning:</strong> If another directive conflicts with the '.
'elements here, <em>that</em> directive will win and override. '.
'This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'HTML', 'AllowedAttributes', null, 'lookup/null',
'IF HTML Purifier\'s attribute set is unsatisfactory, overload it! '.
'The syntax is \'tag.attr\' or \'*.attr\' for the global attributes '.
'(style, id, class, dir, lang, xml:lang).'.
'<strong>Warning:</strong> If another directive conflicts with the '.
'elements here, <em>that</em> directive will win and override. For '.
'example, %HTML.EnableAttrID will take precedence over *.id in this '.
'directive. You must set that directive to true before you can use '.
'IDs at all. This directive has been available since 1.3.0.'
);
/**
* Definition of the purified HTML that describes allowed children,
* attributes, and many other things.
*
* Conventions:
*
* All member variables that are prefixed with info
* (including the main $info array) are used by HTML Purifier internals
* and should not be directly edited when customizing the HTMLDefinition.
* They can usually be set via configuration directives or custom
* modules.
*
* On the other hand, member variables without the info prefix are used
* internally by the HTMLDefinition and MUST NOT be used by other HTML
* Purifier internals. Many of them, however, are public, and may be
* edited by userspace code to tweak the behavior of HTMLDefinition.
*
* HTMLPurifier_Printer_HTMLDefinition is a notable exception to this
* rule: in the interest of comprehensiveness, it will sniff everything.
*/
class HTMLPurifier_HTMLDefinition
{
/** FULLY-PUBLIC VARIABLES */
/**
* Associative array of element names to HTMLPurifier_ElementDef
* @public
*/
var $info = array();
/**
* Associative array of global attribute name to attribute definition.
* @public
*/
var $info_global_attr = array();
/**
* String name of parent element HTML will be going into.
* @public
*/
var $info_parent = 'div';
/**
* Definition for parent element, allows parent element to be a
* tag that's not allowed inside the HTML fragment.
* @public
*/
var $info_parent_def;
/**
* String name of element used to wrap inline elements in block context
* @note This is rarely used except for BLOCKQUOTEs in strict mode
* @public
*/
var $info_block_wrapper = 'p';
/**
* Associative array of deprecated tag name to HTMLPurifier_TagTransform
* @public
*/
var $info_tag_transform = array();
/**
* Indexed list of HTMLPurifier_AttrTransform to be performed before validation.
* @public
*/
var $info_attr_transform_pre = array();
/**
* Indexed list of HTMLPurifier_AttrTransform to be performed after validation.
* @public
*/
var $info_attr_transform_post = array();
/**
* Nested lookup array of content set name (Block, Inline) to
* element name to whether or not it belongs in that content set.
* @public
*/
var $info_content_sets = array();
/** PUBLIC BUT INTERNAL VARIABLES */
var $setup = false; /**< Has setup() been called yet? */
var $config; /**< Temporary instance of HTMLPurifier_Config */
var $manager; /**< Instance of HTMLPurifier_HTMLModuleManager */
/**
* Performs low-cost, preliminary initialization.
* @param $config Instance of HTMLPurifier_Config
*/
function HTMLPurifier_HTMLDefinition(&$config) {
$this->config =& $config;
$this->manager = new HTMLPurifier_HTMLModuleManager();
}
/**
* Processes internals into form usable by HTMLPurifier internals.
* Modifying the definition after calling this function should not
* be done.
*/
function setup() {
// multiple call guard
if ($this->setup) {return;} else {$this->setup = true;}
$this->processModules();
$this->setupConfigStuff();
unset($this->config);
unset($this->manager);
}
/**
* Extract out the information from the manager
*/
function processModules() {
$this->manager->setup($this->config);
foreach ($this->manager->activeModules as $module) {
foreach($module->info_tag_transform as $k => $v) $this->info_tag_transform[$k] = $v;
foreach($module->info_attr_transform_pre as $k => $v) $this->info_attr_transform_pre[$k] = $v;
foreach($module->info_attr_transform_post as $k => $v) $this->info_attr_transform_post[$k]= $v;
}
$this->info = $this->manager->getElements($this->config);
$this->info_content_sets = $this->manager->contentSets->lookup;
}
/**
* Sets up stuff based on config. We need a better way of doing this.
*/
function setupConfigStuff() {
$block_wrapper = $this->config->get('HTML', 'BlockWrapper');
if (isset($this->info_content_sets['Block'][$block_wrapper])) {
$this->info_block_wrapper = $block_wrapper;
} else {
trigger_error('Cannot use non-block element as block wrapper.',
E_USER_ERROR);
}
$parent = $this->config->get('HTML', 'Parent');
$def = $this->manager->getElement($parent, $this->config);
if ($def) {
$this->info_parent = $parent;
$this->info_parent_def = $def;
} else {
trigger_error('Cannot use unrecognized element as parent.',
E_USER_ERROR);
$this->info_parent_def = $this->manager->getElement(
$this->info_parent, $this->config);
}
// setup allowed elements, SubtractiveWhitelist module
$allowed_elements = $this->config->get('HTML', 'AllowedElements');
if (is_array($allowed_elements)) {
foreach ($this->info as $name => $d) {
if(!isset($allowed_elements[$name])) unset($this->info[$name]);
}
}
$allowed_attributes = $this->config->get('HTML', 'AllowedAttributes');
if (is_array($allowed_attributes)) {
foreach ($this->info_global_attr as $attr_key => $info) {
if (!isset($allowed_attributes["*.$attr_key"])) {
unset($this->info_global_attr[$attr_key]);
}
}
foreach ($this->info as $tag => $info) {
foreach ($info->attr as $attr => $attr_info) {
if (!isset($allowed_attributes["$tag.$attr"]) &&
!isset($allowed_attributes["*.$attr"])) {
unset($this->info[$tag]->attr[$attr]);
}
}
}
}
}
}
?>

View File

@ -0,0 +1,125 @@
<?php
/**
* Represents an XHTML 1.1 module, with information on elements, tags
* and attributes.
* @note Even though this is technically XHTML 1.1, it is also used for
* regular HTML parsing. We are using modulization as a convenient
* way to represent the internals of HTMLDefinition, and our
* implementation is by no means conforming and does not directly
* use the normative DTDs or XML schemas.
* @note The public variables in a module should almost directly
* correspond to the variables in HTMLPurifier_HTMLDefinition.
* However, the prefix info carries no special meaning in these
* objects (include it anyway if that's the correspondence though).
*/
class HTMLPurifier_HTMLModule
{
/**
* Short unique string identifier of the module
*/
var $name;
/**
* Dynamically set integer that specifies when the module was loaded in.
*/
var $order;
/**
* Informally, a list of elements this module changes. Not used in
* any significant way.
* @protected
*/
var $elements = array();
/**
* Associative array of element names to element definitions.
* Some definitions may be incomplete, to be merged in later
* with the full definition.
* @public
*/
var $info = array();
/**
* Associative array of content set names to content set additions.
* This is commonly used to, say, add an A element to the Inline
* content set. This corresponds to an internal variable $content_sets
* and NOT info_content_sets member variable of HTMLDefinition.
* @public
*/
var $content_sets = array();
/**
* Associative array of attribute collection names to attribute
* collection additions. More rarely used for adding attributes to
* the global collections. Example is the StyleAttribute module adding
* the style attribute to the Core. Corresponds to HTMLDefinition's
* attr_collections->info, since the object's data is only info,
* with extra behavior associated with it.
* @public
*/
var $attr_collections = array();
/**
* Associative array of deprecated tag name to HTMLPurifier_TagTransform
* @public
*/
var $info_tag_transform = array();
/**
* List of HTMLPurifier_AttrTransform to be performed before validation.
* @public
*/
var $info_attr_transform_pre = array();
/**
* List of HTMLPurifier_AttrTransform to be performed after validation.
* @public
*/
var $info_attr_transform_post = array();
/**
* Boolean flag that indicates whether or not getChildDef is implemented.
* For optimization reasons: may save a call to a function. Be sure
* to set it if you do implement getChildDef(), otherwise it will have
* no effect!
* @public
*/
var $defines_child_def = false;
/**
* Retrieves a proper HTMLPurifier_ChildDef subclass based on
* content_model and content_model_type member variables of
* the HTMLPurifier_ElementDef class. There is a similar function
* in HTMLPurifier_HTMLDefinition.
* @param $def HTMLPurifier_ElementDef instance
* @return HTMLPurifier_ChildDef subclass
* @public
*/
function getChildDef($def) {return false;}
/**
* Hook method that lets module perform arbitrary operations on
* HTMLPurifier_HTMLDefinition before the module gets processed.
* @param $definition Reference to HTMLDefinition being setup
*/
function preProcess(&$definition) {}
/**
* Hook method that lets module perform arbitrary operations
* on HTMLPurifier_HTMLDefinition after the module gets processed.
* @param $definition Reference to HTMLDefinition being setup
*/
function postProcess(&$definition) {}
/**
* Hook method that is called when a module gets registered to
* the definition.
* @param $definition Reference to HTMLDefinition being setup
*/
function setup(&$definition) {}
}
?>

View File

@ -0,0 +1,43 @@
<?php
require_once 'HTMLPurifier/HTMLModule.php';
require_once 'HTMLPurifier/AttrTransform/BdoDir.php';
/**
* XHTML 1.1 Bi-directional Text Module, defines elements that
* declare directionality of content. Text Extension Module.
*/
class HTMLPurifier_HTMLModule_Bdo extends HTMLPurifier_HTMLModule
{
var $name = 'Bdo';
var $elements = array('bdo');
var $info = array();
var $content_sets = array('Inline' => 'bdo');
var $attr_collections = array(
'I18N' => array('dir' => false)
);
function HTMLPurifier_HTMLModule_Bdo() {
$dir = new HTMLPurifier_AttrDef_Enum(array('ltr','rtl'), false);
$this->attr_collections['I18N']['dir'] = $dir;
$this->info['bdo'] = new HTMLPurifier_ElementDef();
$this->info['bdo']->attr = array(
0 => array('Core', 'Lang'),
'dir' => $dir, // required
// The Abstract Module specification has the attribute
// inclusions wrong for bdo: bdo allows
// xml:lang too (and we'll toss in lang for good measure,
// though it is not allowed for XHTML 1.1, this will
// be managed with a global attribute transform)
);
$this->info['bdo']->content_model = '#PCDATA | Inline';
$this->info['bdo']->content_model_type = 'optional';
// provides fallback behavior if dir's missing (dir is required)
$this->info['bdo']->attr_transform_post['required-dir'] =
new HTMLPurifier_AttrTransform_BdoDir();
}
}
?>

View File

@ -0,0 +1,31 @@
<?php
class HTMLPurifier_HTMLModule_CommonAttributes extends HTMLPurifier_HTMLModule
{
var $name = 'CommonAttributes';
var $attr_collections = array(
'Core' => array(
0 => array('Style'),
// 'xml:space' => false,
'class' => 'NMTOKENS',
'id' => 'ID',
'title' => 'CDATA',
),
'Lang' => array(
'xml:lang' => false, // see constructor
),
'I18N' => array(
0 => array('Lang'), // proprietary, for xml:lang/lang
),
'Common' => array(
0 => array('Core', 'I18N')
)
);
function HTMLPurifier_HTMLModule_CommonAttributes() {
$this->attr_collections['Lang']['xml:lang'] = new HTMLPurifier_AttrDef_Lang();
}
}
?>

View File

@ -0,0 +1,46 @@
<?php
require_once 'HTMLPurifier/HTMLModule.php';
require_once 'HTMLPurifier/ChildDef/Chameleon.php';
/**
* XHTML 1.1 Edit Module, defines editing-related elements. Text Extension
* Module.
*/
class HTMLPurifier_HTMLModule_Edit extends HTMLPurifier_HTMLModule
{
var $name = 'Edit';
var $elements = array('del', 'ins');
var $info = array();
var $content_sets = array('Inline' => 'del | ins');
function HTMLPurifier_HTMLModule_Edit() {
foreach ($this->elements as $element) {
$this->info[$element] = new HTMLPurifier_ElementDef();
$this->info[$element]->attr = array(
0 => array('Common'),
'cite' => 'URI',
// 'datetime' => 'Datetime' // Datetime not implemented
);
// Inline context ! Block context (exclamation mark is
// separator, see getChildDef for parsing)
$this->info[$element]->content_model =
'#PCDATA | Inline ! #PCDATA | Flow';
// HTML 4.01 specifies that ins/del must not contain block
// elements when used in an inline context, chameleon is
// a complicated workaround to acheive this effect
$this->info[$element]->content_model_type = 'chameleon';
}
}
var $defines_child_def = true;
function getChildDef($def) {
if ($def->content_model_type != 'chameleon') return false;
$value = explode('!', $def->content_model);
return new HTMLPurifier_ChildDef_Chameleon($value[0], $value[1]);
}
}
?>

View File

@ -0,0 +1,36 @@
<?php
require_once 'HTMLPurifier/HTMLModule.php';
/**
* XHTML 1.1 Hypertext Module, defines hypertext links. Core Module.
*/
class HTMLPurifier_HTMLModule_Hypertext extends HTMLPurifier_HTMLModule
{
var $name = 'Hypertext';
var $elements = array('a');
var $info = array();
var $content_sets = array('Inline' => 'a');
function HTMLPurifier_HTMLModule_Hypertext() {
$this->info['a'] = new HTMLPurifier_ElementDef();
$this->info['a']->attr = array(
0 => array('Common'),
// 'accesskey' => 'Character',
// 'charset' => 'Charset',
'href' => 'URI',
//'hreflang' => 'LanguageCode',
//'rel' => 'LinkTypes',
//'rev' => 'LinkTypes',
//'tabindex' => 'Number',
//'type' => 'ContentType',
);
$this->info['a']->content_model = '#PCDATA | Inline';
$this->info['a']->content_model_type = 'optional';
$this->info['a']->excludes = array('a' => true);
}
}
?>

View File

@ -0,0 +1,38 @@
<?php
require_once 'HTMLPurifier/HTMLModule.php';
require_once 'HTMLPurifier/AttrDef/URI.php';
require_once 'HTMLPurifier/AttrTransform/ImgRequired.php';
/**
* XHTML 1.1 Image Module provides basic image embedding.
* @note There is specialized code for removing empty images in
* HTMLPurifier_Strategy_RemoveForeignElements
*/
class HTMLPurifier_HTMLModule_Image extends HTMLPurifier_HTMLModule
{
var $name = 'Image';
var $elements = array('img');
var $info = array();
var $content_sets = array('Inline' => 'img');
function HTMLPurifier_HTMLModule_Image() {
$this->info['img'] = new HTMLPurifier_ElementDef();
$this->info['img']->attr = array(
0 => array('Common'),
'alt' => 'Text',
'height' => 'Length',
'longdesc' => 'URI',
'src' => new HTMLPurifier_AttrDef_URI(true), // embedded
'width' => 'Length'
);
$this->info['img']->content_model_type = 'empty';
$this->info['img']->attr_transform_post[] =
new HTMLPurifier_AttrTransform_ImgRequired();
}
}
?>

View File

@ -0,0 +1,60 @@
<?php
/**
* XHTML 1.1 Legacy module defines elements that were previously
* deprecated.
*
* @note Not all legacy elements have been implemented yet, which
* is a bit of a reverse problem as compared to browsers! In
* addition, this legacy module may implement a bit more than
* mandated by XHTML 1.1.
*
* This module can be used in combination with TransformToStrict in order
* to transform as many deprecated elements as possible, but retain
* questionably deprecated elements that do not have good alternatives
* as well as transform elements that don't have an implementation.
* See docs/ref-strictness.txt for more details.
*/
class HTMLPurifier_HTMLModule_Legacy extends HTMLPurifier_HTMLModule
{
// incomplete
var $name = 'Legacy';
var $elements = array('u', 's', 'strike');
var $non_standalone_elements = array('li', 'ol', 'address', 'blockquote');
function HTMLPurifier_HTMLModule_Legacy() {
// setup new elements
foreach ($this->elements as $name) {
$this->info[$name] = new HTMLPurifier_ElementDef();
// for u, s, strike, as more elements get added, add
// conditionals as necessary
$this->info[$name]->content_model = 'Inline | #PCDATA';
$this->info[$name]->content_model_type = 'optional';
$this->info[$name]->attr[0] = array('Common');
}
// setup modifications to old elements
foreach ($this->non_standalone_elements as $name) {
$this->info[$name] = new HTMLPurifier_ElementDef();
$this->info[$name]->standalone = false;
}
$this->info['li']->attr['value'] = new HTMLPurifier_AttrDef_Integer();
$this->info['ol']->attr['start'] = new HTMLPurifier_AttrDef_Integer();
$this->info['address']->content_model = 'Inline | #PCDATA | p';
$this->info['address']->content_model_type = 'optional';
$this->info['address']->child = false;
$this->info['blockquote']->content_model = 'Flow | #PCDATA';
$this->info['blockquote']->content_model_type = 'optional';
$this->info['blockquote']->child = false;
}
}
?>

View File

@ -0,0 +1,46 @@
<?php
require_once 'HTMLPurifier/HTMLModule.php';
/**
* XHTML 1.1 List Module, defines list-oriented elements. Core Module.
*/
class HTMLPurifier_HTMLModule_List extends HTMLPurifier_HTMLModule
{
var $name = 'List';
var $elements = array('dl', 'dt', 'dd', 'ol', 'ul', 'li');
var $info = array();
// According to the abstract schema, the List content set is a fully formed
// one or more expr, but it invariably occurs in an optional declaration
// so we're not going to do that subtlety. It might cause trouble
// if a user defines "List" and expects that multiple lists are
// allowed to be specified, but then again, that's not very intuitive.
// Furthermore, the actual XML Schema may disagree. Regardless,
// we don't have support for such nested expressions without using
// the incredibly inefficient and draconic Custom ChildDef.
var $content_sets = array('List' => 'dl | ol | ul', 'Flow' => 'List');
function HTMLPurifier_HTMLModule_List() {
foreach ($this->elements as $element) {
$this->info[$element] = new HTMLPurifier_ElementDef();
$this->info[$element]->attr = array(0 => array('Common'));
if ($element == 'li' || $element == 'dd') {
$this->info[$element]->content_model = '#PCDATA | Flow';
$this->info[$element]->content_model_type = 'optional';
} elseif ($element == 'ol' || $element == 'ul') {
$this->info[$element]->content_model = 'li';
$this->info[$element]->content_model_type = 'required';
}
}
$this->info['dt']->content_model = '#PCDATA | Inline';
$this->info['dt']->content_model_type = 'optional';
$this->info['dl']->content_model = 'dt | dd';
$this->info['dl']->content_model_type = 'required';
// this could be a LOT more robust
$this->info['li']->auto_close = array('li' => true);
}
}
?>

View File

@ -0,0 +1,41 @@
<?php
require_once 'HTMLPurifier/HTMLModule.php';
/**
* XHTML 1.1 Presentation Module, defines simple presentation-related
* markup. Text Extension Module.
* @note The official XML Schema and DTD specs further divide this into
* two modules:
* - Block Presentation (hr)
* - Inline Presentation (b, big, i, small, sub, sup, tt)
* We have chosen not to heed this distinction, as content_sets
* provides satisfactory disambiguation.
*/
class HTMLPurifier_HTMLModule_Presentation extends HTMLPurifier_HTMLModule
{
var $name = 'Presentation';
var $elements = array('b', 'big', 'hr', 'i', 'small', 'sub', 'sup', 'tt');
var $info = array();
var $content_sets = array(
'Block' => 'hr',
'Inline' => 'b | big | i | small | sub | sup | tt'
);
function HTMLPurifier_HTMLModule_Presentation() {
foreach ($this->elements as $element) {
$this->info[$element] = new HTMLPurifier_ElementDef();
$this->info[$element]->attr = array(0 => array('Common'));
if ($element == 'hr') {
$this->info[$element]->content_model_type = 'empty';
} else {
$this->info[$element]->content_model = '#PCDATA | Inline';
$this->info[$element]->content_model_type = 'optional';
}
}
}
}
?>

View File

@ -0,0 +1,27 @@
<?php
require_once 'HTMLPurifier/HTMLModule.php';
require_once 'HTMLPurifier/AttrDef/CSS.php';
/**
* XHTML 1.1 Edit Module, defines editing-related elements. Text Extension
* Module.
*/
class HTMLPurifier_HTMLModule_StyleAttribute extends HTMLPurifier_HTMLModule
{
var $name = 'StyleAttribute';
var $attr_collections = array(
// The inclusion routine differs from the Abstract Modules but
// is in line with the DTD and XML Schemas.
'Style' => array('style' => false), // see constructor
'Core' => array(0 => array('Style'))
);
function HTMLPurifier_HTMLModule_StyleAttribute() {
$this->attr_collections['Style']['style'] = new HTMLPurifier_AttrDef_CSS();
}
}
?>

View File

@ -0,0 +1,88 @@
<?php
require_once 'HTMLPurifier/HTMLModule.php';
require_once 'HTMLPurifier/ChildDef/Table.php';
/**
* XHTML 1.1 Tables Module, fully defines accessible table elements.
*/
class HTMLPurifier_HTMLModule_Tables extends HTMLPurifier_HTMLModule
{
var $name = 'Tables';
var $elements = array('caption', 'table', 'td', 'th', 'tr', 'col',
'colgroup', 'tbody', 'thead', 'tfoot');
var $info = array();
var $content_sets = array('Block' => 'table');
function HTMLPurifier_HTMLModule_Tables() {
foreach ($this->elements as $e) {
$this->info[$e] = new HTMLPurifier_ElementDef();
$this->info[$e]->attr = array(0 => array('Common'));
$attr =& $this->info[$e]->attr;
if ($e == 'caption') continue;
if ($e == 'table'){
$attr['border'] = 'Pixels';
$attr['cellpadding'] = 'Length';
$attr['cellspacing'] = 'Length';
$attr['frame'] = new HTMLPurifier_AttrDef_Enum(array(
'void', 'above', 'below', 'hsides', 'lhs', 'rhs',
'vsides', 'box', 'border'
), false);
$attr['rules'] = new HTMLPurifier_AttrDef_Enum(array(
'none', 'groups', 'rows', 'cols', 'all'
), false);
$attr['summary'] = 'Text';
$attr['width'] = 'Length';
continue;
}
if ($e == 'col' || $e == 'colgroup') {
$attr['span'] = 'Number';
$attr['width'] = 'MultiLength';
}
if ($e == 'td' || $e == 'th') {
$attr['abbr'] = 'Text';
$attr['colspan'] = 'Number';
$attr['rowspan'] = 'Number';
}
$attr['align'] = new HTMLPurifier_AttrDef_Enum(array(
'left', 'center', 'right', 'justify', 'char'
), false);
$attr['valign'] = new HTMLPurifier_AttrDef_Enum(array(
'top', 'middle', 'bottom', 'baseline'
), false);
$attr['charoff'] = 'Length';
}
$this->info['caption']->content_model = '#PCDATA | Inline';
$this->info['caption']->content_model_type = 'optional';
// Is done directly because it doesn't leverage substitution
// mechanisms. True model is:
// 'caption?, ( col* | colgroup* ), (( thead?, tfoot?, tbody+ ) | ( tr+ ))'
$this->info['table']->child = new HTMLPurifier_ChildDef_Table();
$this->info['td']->content_model =
$this->info['th']->content_model = '#PCDATA | Flow';
$this->info['td']->content_model_type =
$this->info['th']->content_model_type = 'optional';
$this->info['tr']->content_model = 'td | th';
$this->info['tr']->content_model_type = 'required';
$this->info['col']->content_model_type = 'empty';
$this->info['colgroup']->content_model = 'col';
$this->info['colgroup']->content_model_type = 'optional';
$this->info['tbody']->content_model =
$this->info['thead']->content_model =
$this->info['tfoot']->content_model = 'tr';
$this->info['tbody']->content_model_type =
$this->info['thead']->content_model_type =
$this->info['tfoot']->content_model_type = 'required';
}
}
?>

View File

@ -0,0 +1,78 @@
<?php
require_once 'HTMLPurifier/HTMLModule.php';
/**
* XHTML 1.1 Text Module, defines basic text containers. Core Module.
* @note In the normative XML Schema specification, this module
* is further abstracted into the following modules:
* - Block Phrasal (address, blockquote, pre, h1, h2, h3, h4, h5, h6)
* - Block Structural (div, p)
* - Inline Phrasal (abbr, acronym, cite, code, dfn, em, kbd, q, samp, strong, var)
* - Inline Structural (br, span)
* We have elected not to follow suite, but this may change.
*/
class HTMLPurifier_HTMLModule_Text extends HTMLPurifier_HTMLModule
{
var $name = 'Text';
var $elements = array('abbr', 'acronym', 'address', 'blockquote',
'br', 'cite', 'code', 'dfn', 'div', 'em', 'h1', 'h2', 'h3',
'h4', 'h5', 'h6', 'kbd', 'p', 'pre', 'q', 'samp', 'span', 'strong',
'var');
var $info = array();
var $content_sets = array(
'Heading' => 'h1 | h2 | h3 | h4 | h5 | h6',
'Block' => 'address | blockquote | div | p | pre',
'Inline' => 'abbr | acronym | br | cite | code | dfn | em | kbd | q | samp | span | strong | var',
'Flow' => 'Heading | Block | Inline'
);
function HTMLPurifier_HTMLModule_Text() {
foreach ($this->elements as $element) {
$this->info[$element] = new HTMLPurifier_ElementDef();
// attributes
if ($element == 'br') {
$this->info[$element]->attr = array(0 => array('Core'));
} elseif ($element == 'blockquote' || $element == 'q') {
$this->info[$element]->attr = array(0 => array('Common'), 'cite' => 'URI');
} else {
$this->info[$element]->attr = array(0 => array('Common'));
}
// content models
if ($element == 'br') {
$this->info[$element]->content_model_type = 'empty';
} elseif ($element == 'blockquote') {
$this->info[$element]->content_model = 'Heading | Block | List';
$this->info[$element]->content_model_type = 'optional';
} elseif ($element == 'div') {
$this->info[$element]->content_model = '#PCDATA | Flow';
$this->info[$element]->content_model_type = 'optional';
} else {
$this->info[$element]->content_model = '#PCDATA | Inline';
$this->info[$element]->content_model_type = 'optional';
}
}
// SGML permits exclusions for all descendants, but this is
// not possible with DTDs or XML Schemas. W3C has elected to
// use complicated compositions of content_models to simulate
// exclusion for children, but we go the simpler, SGML-style
// route of flat-out exclusions. Note that the Abstract Module
// is blithely unaware of such distinctions.
$this->info['pre']->excludes = array_flip(array(
'img', 'big', 'small',
'object', 'applet', 'font', 'basefont' // generally not allowed
));
$this->info['p']->auto_close = array_flip(array(
'address', 'blockquote', 'dd', 'dir', 'div', 'dl', 'dt',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'ol', 'p', 'pre',
'table', 'ul'
));
}
}
?>

View File

@ -0,0 +1,86 @@
<?php
require_once 'HTMLPurifier/ChildDef/StrictBlockquote.php';
require_once 'HTMLPurifier/TagTransform/Simple.php';
require_once 'HTMLPurifier/TagTransform/Center.php';
require_once 'HTMLPurifier/TagTransform/Font.php';
require_once 'HTMLPurifier/AttrTransform/Lang.php';
require_once 'HTMLPurifier/AttrTransform/TextAlign.php';
/**
* Proprietary module that transforms deprecated elements into Strict
* HTML (see HTML 4.01 and XHTML 1.0) when possible.
*/
class HTMLPurifier_HTMLModule_TransformToStrict extends HTMLPurifier_HTMLModule
{
var $name = 'TransformToStrict';
// we're actually modifying these elements, not defining them
var $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'blockquote');
var $info_tag_transform = array(
// placeholders, see constructor for definitions
'font' => false,
'menu' => false,
'dir' => false,
'center'=> false
);
var $attr_collections = array(
'Lang' => array(
'lang' => false // placeholder
)
);
var $info_attr_transform_post = array(
'lang' => false // placeholder
);
function HTMLPurifier_HTMLModule_TransformToStrict() {
// deprecated tag transforms
$this->info_tag_transform['font'] = new HTMLPurifier_TagTransform_Font();
$this->info_tag_transform['menu'] = new HTMLPurifier_TagTransform_Simple('ul');
$this->info_tag_transform['dir'] = new HTMLPurifier_TagTransform_Simple('ul');
$this->info_tag_transform['center'] = new HTMLPurifier_TagTransform_Center();
foreach ($this->elements as $name) {
$this->info[$name] = new HTMLPurifier_ElementDef();
$this->info[$name]->standalone = false;
}
// deprecated attribute transforms
$this->info['h1']->attr_transform_pre['align'] =
$this->info['h2']->attr_transform_pre['align'] =
$this->info['h3']->attr_transform_pre['align'] =
$this->info['h4']->attr_transform_pre['align'] =
$this->info['h5']->attr_transform_pre['align'] =
$this->info['h6']->attr_transform_pre['align'] =
$this->info['p'] ->attr_transform_pre['align'] =
new HTMLPurifier_AttrTransform_TextAlign();
// xml:lang <=> lang mirroring, implement in TransformToStrict,
// this is overridden in TransformToXHTML11
$this->info_attr_transform_post['lang'] = new HTMLPurifier_AttrTransform_Lang();
$this->attr_collections['Lang']['lang'] = new HTMLPurifier_AttrDef_Lang();
// this should not be applied to XHTML 1.0 Transitional, ONLY
// XHTML 1.0 Strict. We may need three classes
$this->info['blockquote']->content_model_type = 'strictblockquote';
$this->info['blockquote']->child = false; // recalculate please!
}
var $defines_child_def = true;
function getChildDef($def) {
if ($def->content_model_type != 'strictblockquote') return false;
return new HTMLPurifier_ChildDef_StrictBlockquote($def->content_model);
}
}
?>

View File

@ -0,0 +1,30 @@
<?php
/**
* Proprietary module that transforms XHTML 1.0 deprecated aspects into
* XHTML 1.1 compliant ones, when possible. For maximum effectiveness,
* HTMLPurifier_HTMLModule_TransformToStrict must also be loaded
* (otherwise, elements that were deprecated from Transitional to Strict
* will not be transformed).
*
* XHTML 1.1 compliant document are automatically XHTML 1.0 compliant too,
* although they may not be as friendly to legacy browsers.
*/
class HTMLPurifier_HTMLModule_TransformToXHTML11 extends HTMLPurifier_HTMLModule
{
var $name = 'TransformToXHTML11';
var $attr_collections = array(
'Lang' => array(
'lang' => false // remove it
)
);
var $info_attr_transform_post = array(
'lang' => false // remove it
);
}
?>

View File

@ -0,0 +1,558 @@
<?php
require_once 'HTMLPurifier/HTMLModule.php';
require_once 'HTMLPurifier/ElementDef.php';
require_once 'HTMLPurifier/ContentSets.php';
require_once 'HTMLPurifier/AttrTypes.php';
require_once 'HTMLPurifier/AttrCollections.php';
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/AttrDef/Enum.php';
// W3C modules
require_once 'HTMLPurifier/HTMLModule/CommonAttributes.php';
require_once 'HTMLPurifier/HTMLModule/Text.php';
require_once 'HTMLPurifier/HTMLModule/Hypertext.php';
require_once 'HTMLPurifier/HTMLModule/List.php';
require_once 'HTMLPurifier/HTMLModule/Presentation.php';
require_once 'HTMLPurifier/HTMLModule/Edit.php';
require_once 'HTMLPurifier/HTMLModule/Bdo.php';
require_once 'HTMLPurifier/HTMLModule/Tables.php';
require_once 'HTMLPurifier/HTMLModule/Image.php';
require_once 'HTMLPurifier/HTMLModule/StyleAttribute.php';
require_once 'HTMLPurifier/HTMLModule/Legacy.php';
// proprietary modules
require_once 'HTMLPurifier/HTMLModule/TransformToStrict.php';
require_once 'HTMLPurifier/HTMLModule/TransformToXHTML11.php';
HTMLPurifier_ConfigSchema::define(
'HTML', 'Doctype', null, 'string/null',
'Doctype to use, valid values are HTML 4.01 Transitional, HTML 4.01 '.
'Strict, XHTML 1.0 Transitional, XHTML 1.0 Strict, XHTML 1.1. '.
'Technically speaking this is not actually a doctype (as it does '.
'not identify a corresponding DTD), but we are using this name '.
'for sake of simplicity. This will override any older directives '.
'like %Core.XHTML or %HTML.Strict.'
);
class HTMLPurifier_HTMLModuleManager
{
/**
* Array of HTMLPurifier_Module instances, indexed by module's class name.
* All known modules, regardless of use, are in this array.
*/
var $modules = array();
/**
* String doctype we will validate against. See $validModules for use.
*
* @note
* There is a special doctype '*' that acts both as the "default"
* doctype if a customized system only defines one doctype and
* also a catch-all doctype that gets merged into all the other
* module collections. When possible, use a private collection to
* share modules between doctypes: this special doctype is to
* make life more convenient for users.
*/
var $doctype;
var $doctypeAliases = array(); /**< Lookup array of strings to real doctypes */
/**
* Associative array: $collections[$type][$doctype] = list of modules.
* This is used to logically separate types of functionality so that
* based on the doctype and other configuration settings they may
* be easily switched and on and off. Custom setups may not need
* to use this abstraction, opting to have only one big collection
* with one valid doctype.
*/
var $collections = array();
/**
* Modules that may be used in a valid doctype of this kind.
* Correctional and leniency modules should not be placed in this
* array unless the user said so: don't stuff every possible lenient
* module for this doctype in here.
*/
var $validModules = array();
var $validCollections = array(); /**< Collections to merge into $validModules */
/**
* Modules that we will allow in input, subset of $validModules. Single
* element definitions may result in us consulting validModules.
*/
var $activeModules = array();
var $activeCollections = array(); /**< Collections to merge into $activeModules */
var $counter = 0; /**< Designates next available integer order for modules. */
var $initialized = false; /**< Says whether initialize() was called */
/**
* Specifies what doctype to siphon new modules from addModule() to,
* or false to disable the functionality. Must be used in conjunction
* with $autoCollection.
*/
var $autoDoctype = false;
/**
* Specifies what collection to siphon new modules from addModule() to,
* or false to disable the functionality. Must be used in conjunction
* with $autoCollection.
*/
var $autoCollection = false;
/** Associative array of element name to defining modules (always array) */
var $elementLookup = array();
/** List of prefixes we should use for resolving small names */
var $prefixes = array('HTMLPurifier_HTMLModule_');
var $contentSets; /**< Instance of HTMLPurifier_ContentSets */
var $attrTypes; /**< Instance of HTMLPurifier_AttrTypes */
var $attrCollections; /**< Instance of HTMLPurifier_AttrCollections */
/**
* @param $blank If true, don't do any initializing
*/
function HTMLPurifier_HTMLModuleManager($blank = false) {
// the only editable internal object. The rest need to
// be manipulated through modules
$this->attrTypes = new HTMLPurifier_AttrTypes();
if (!$blank) $this->initialize();
}
function initialize() {
$this->initialized = true;
// load default modules to the recognized modules list (not active)
$modules = array(
// define
'CommonAttributes',
'Text', 'Hypertext', 'List', 'Presentation',
'Edit', 'Bdo', 'Tables', 'Image', 'StyleAttribute',
// define-redefine
'Legacy',
// redefine
'TransformToStrict', 'TransformToXHTML11'
);
foreach ($modules as $module) {
$this->addModule($module);
}
// Safe modules for supported doctypes. These are included
// in the valid and active module lists by default
$this->collections['Safe'] = array(
'_Common' => array( // leading _ indicates private
'CommonAttributes', 'Text', 'Hypertext', 'List',
'Presentation', 'Edit', 'Bdo', 'Tables', 'Image',
'StyleAttribute'
),
// HTML definitions, defer to XHTML definitions
'HTML 4.01 Transitional' => array(array('XHTML 1.0 Transitional')),
'HTML 4.01 Strict' => array(array('XHTML 1.0 Strict')),
// XHTML definitions
'XHTML 1.0 Transitional' => array( array('XHTML 1.0 Strict'), 'Legacy' ),
'XHTML 1.0 Strict' => array(array('_Common')),
'XHTML 1.1' => array(array('_Common')),
);
// Modules that specify elements that are unsafe from untrusted
// third-parties. These should be registered in $validModules but
// almost never $activeModules unless you really know what you're
// doing.
$this->collections['Unsafe'] = array();
// Modules to import if lenient mode (attempt to convert everything
// to a valid representation) is on. These must not be in $validModules
// unless specified so.
$this->collections['Lenient'] = array(
'HTML 4.01 Strict' => array(array('XHTML 1.0 Strict')),
'XHTML 1.0 Strict' => array('TransformToStrict'),
'XHTML 1.1' => array(array('XHTML 1.0 Strict'), 'TransformToXHTML11')
);
// Modules to import if correctional mode (correct everything that
// is feasible to strict mode) is on. These must not be in $validModules
// unless specified so.
$this->collections['Correctional'] = array(
'HTML 4.01 Transitional' => array(array('XHTML 1.0 Transitional')),
'XHTML 1.0 Transitional' => array('TransformToStrict'), // probably want a different one
);
// User-space modules, custom code or whatever
$this->collections['Extension'] = array();
// setup active versus valid modules. ORDER IS IMPORTANT!
// definition modules
$this->makeCollectionActive('Safe');
$this->makeCollectionValid('Unsafe');
// redefinition modules
$this->makeCollectionActive('Lenient');
$this->makeCollectionActive('Correctional');
$this->autoDoctype = '*';
$this->autoCollection = 'Extension';
}
/**
* Adds a module to the recognized module list. This does not
* do anything else: the module must be added to a corresponding
* collection to be "activated".
* @param $module Mixed: string module name, with or without
* HTMLPurifier_HTMLModule prefix, or instance of
* subclass of HTMLPurifier_HTMLModule.
*/
function addModule($module) {
if (is_string($module)) {
$original_module = $module;
if (!class_exists($module)) {
foreach ($this->prefixes as $prefix) {
$module = $prefix . $original_module;
if (class_exists($module)) break;
}
}
if (!class_exists($module)) {
trigger_error($original_module . ' module does not exist',
E_USER_ERROR);
return;
}
$module = new $module();
}
$module->order = $this->counter++; // assign then increment
$this->modules[$module->name] = $module;
if ($this->autoDoctype !== false && $this->autoCollection !== false) {
$this->collections[$this->autoCollection][$this->autoDoctype][] = $module->name;
}
}
/**
* Makes a collection active, while also making it valid if not
* already done so. See $activeModules for the semantics of "active".
* @param $collection_name Name of collection to activate
*/
function makeCollectionActive($collection_name) {
if (!in_array($collection_name, $this->validCollections)) {
$this->makeCollectionValid($collection_name);
}
$this->activeCollections[] = $collection_name;
}
/**
* Makes a collection valid. See $validModules for the semantics of "valid"
*/
function makeCollectionValid($collection_name) {
$this->validCollections[] = $collection_name;
}
/**
* Adds a class prefix that addModule() will use to resolve a
* string name to a concrete class
*/
function addPrefix($prefix) {
$this->prefixes[] = (string) $prefix;
}
function setup($config) {
// load up the autocollection
if ($this->autoCollection !== false) {
$this->makeCollectionActive($this->autoCollection);
}
// retrieve the doctype
$this->doctype = $this->getDoctype($config);
if (isset($this->doctypeAliases[$this->doctype])) {
$this->doctype = $this->doctypeAliases[$this->doctype];
}
// process module collections to module name => module instance form
foreach ($this->collections as $col_i => $x) {
$this->processCollections($this->collections[$col_i]);
}
$this->validModules = $this->assembleModules($this->validCollections);
$this->activeModules = $this->assembleModules($this->activeCollections);
// setup lookup table based on all valid modules
foreach ($this->validModules as $module) {
foreach ($module->info as $name => $def) {
if (!isset($this->elementLookup[$name])) {
$this->elementLookup[$name] = array();
}
$this->elementLookup[$name][] = $module->name;
}
}
// note the different choice
$this->contentSets = new HTMLPurifier_ContentSets(
// content models that contain non-allowed elements are
// harmless because RemoveForeignElements will ensure
// they never get in anyway, and there is usually no
// reason why you should want to restrict a content
// model beyond what is mandated by the doctype.
// Note, however, that this means redefinitions of
// content models can't be tossed in validModels willy-nilly:
// that stuff still is regulated by configuration.
$this->validModules
);
$this->attrCollections = new HTMLPurifier_AttrCollections(
$this->attrTypes,
// only explicitly allowed modules are allowed to affect
// the global attribute collections. This mean's there's
// a distinction between loading the Bdo module, and the
// bdo element: Bdo will enable the dir attribute on all
// elements, while bdo will only define the bdo element,
// which will not have an editable directionality. This might
// catch people who are loading only elements by surprise, so
// we should consider loading an entire module if all the
// elements it defines are requested by the user, especially
// if it affects the global attribute collections.
$this->activeModules
);
}
/**
* Takes a list of collections and merges together all the defined
* modules for the current doctype from those collections.
* @param $collections List of collection suffixes we should grab
* modules from (like 'Safe' or 'Lenient')
*/
function assembleModules($collections) {
$modules = array();
$numOfCollectionsUsed = 0;
foreach ($collections as $name) {
$disable_global = false;
if (!isset($this->collections[$name])) {
trigger_error("$name collection is undefined", E_USER_ERROR);
continue;
}
$cols = $this->collections[$name];
if (isset($cols[$this->doctype])) {
if (isset($cols[$this->doctype]['*'])) {
unset($cols[$this->doctype]['*']);
$disable_global = true;
}
$modules += $cols[$this->doctype];
$numOfCollectionsUsed++;
}
// accept catch-all doctype
if (
$this->doctype !== '*' &&
isset($cols['*']) &&
!$disable_global
) {
$modules += $cols['*'];
}
}
if ($numOfCollectionsUsed < 1) {
// possible XSS injection if user-specified doctypes
// are allowed
trigger_error("Doctype {$this->doctype} does not exist, ".
"check for typos (if you desire a doctype that allows ".
"no elements, use an empty array collection)", E_USER_ERROR);
}
return $modules;
}
/**
* Takes a collection and performs inclusions and substitutions for it.
* @param $cols Reference to collections class member variable
*/
function processCollections(&$cols) {
// $cols is the set of collections
// $col_i is the name (index) of a collection
// $col is a collection/list of modules
// perform inclusions
foreach ($cols as $col_i => $col) {
$seen = array();
if (!empty($col[0]) && is_array($col[0])) {
$seen[$col_i] = true; // recursion reporting
$includes = $col[0];
unset($cols[$col_i][0]); // remove inclusions value, recursion guard
} else {
$includes = array();
}
if (empty($includes)) continue;
for ($i = 0; isset($includes[$i]); $i++) {
$inc = $includes[$i];
if (isset($seen[$inc])) {
trigger_error(
"Circular inclusion detected in $col_i collection",
E_USER_ERROR
);
continue;
} else {
$seen[$inc] = true;
}
if (!isset($cols[$inc])) {
trigger_error(
"Collection $col_i tried to include undefined ".
"collection $inc", E_USER_ERROR);
continue;
}
foreach ($cols[$inc] as $module) {
if (is_array($module)) { // another inclusion!
foreach ($module as $inc2) $includes[] = $inc2;
continue;
}
$cols[$col_i][] = $module; // merge in the other modules
}
}
}
// replace with real modules, invert module from list to
// assoc array of module name to module instance
foreach ($cols as $col_i => $col) {
$ignore_global = false;
$order = array();
foreach ($col as $module_i => $module) {
unset($cols[$col_i][$module_i]);
if (is_array($module)) {
trigger_error("Illegal inclusion array at index".
" $module_i found collection $col_i, inclusion".
" arrays must be at start of collection (index 0)",
E_USER_ERROR);
continue;
}
if ($module_i === '*' && $module === false) {
$ignore_global = true;
continue;
}
if (!isset($this->modules[$module])) {
trigger_error(
"Collection $col_i references undefined ".
"module $module",
E_USER_ERROR
);
continue;
}
$module = $this->modules[$module];
$cols[$col_i][$module->name] = $module;
$order[$module->name] = $module->order;
}
array_multisort(
$order, SORT_ASC, SORT_NUMERIC, $cols[$col_i]
);
if ($ignore_global) $cols[$col_i]['*'] = false;
}
// delete pseudo-collections
foreach ($cols as $col_i => $col) {
if ($col_i[0] == '_') unset($cols[$col_i]);
}
}
/**
* Retrieves the doctype from the configuration object
*/
function getDoctype($config) {
$doctype = $config->get('HTML', 'Doctype');
if ($doctype !== null) {
return $doctype;
}
if (!$this->initialized) {
// don't do HTML-oriented backwards compatibility stuff
// use either the auto-doctype, or the catch-all doctype
return $this->autoDoctype ? $this->autoDoctype : '*';
}
// this is backwards-compatibility stuff
if ($config->get('Core', 'XHTML')) {
$doctype = 'XHTML 1.0';
} else {
$doctype = 'HTML 4.01';
}
if ($config->get('HTML', 'Strict')) {
$doctype .= ' Strict';
} else {
$doctype .= ' Transitional';
}
return $doctype;
}
/**
* Retrieves merged element definitions for all active elements.
* @note We may want to generate an elements array during setup
* and pass that on, because a specific combination of
* elements may trigger the loading of a module.
* @param $config Instance of HTMLPurifier_Config, for determining
* stray elements.
*/
function getElements($config) {
$elements = array();
foreach ($this->activeModules as $module) {
foreach ($module->elements as $name) {
$elements[$name] = $this->getElement($name, $config);
}
}
// standalone elements now loaded
return $elements;
}
/**
* Retrieves a single merged element definition
* @param $name Name of element
* @param $config Instance of HTMLPurifier_Config, may not be necessary.
*/
function getElement($name, $config) {
$def = false;
$modules = $this->validModules;
if (!isset($this->elementLookup[$name])) {
return false;
}
foreach($this->elementLookup[$name] as $module_name) {
$module = $modules[$module_name];
$new_def = $module->info[$name];
if (!$def && $new_def->standalone) {
$def = $new_def;
} elseif ($def) {
$def->mergeIn($new_def);
} else {
// could "save it for another day":
// non-standalone definitions that don't have a standalone
// to merge into could be deferred to the end
continue;
}
// attribute value expansions
$this->attrCollections->performInclusions($def->attr);
$this->attrCollections->expandIdentifiers($def->attr, $this->attrTypes);
// descendants_are_inline, for ChildDef_Chameleon
if (is_string($def->content_model) &&
strpos($def->content_model, 'Inline') !== false) {
if ($name != 'del' && $name != 'ins') {
// this is for you, ins/del
$def->descendants_are_inline = true;
}
}
$this->contentSets->generateChildDef($def, $module);
}
return $def;
}
}
?>

View File

@ -0,0 +1,56 @@
<?php
require_once 'HTMLPurifier/LanguageFactory.php';
class HTMLPurifier_Language
{
/**
* ISO 639 language code of language. Prefers shortest possible version
*/
var $code = 'en';
/**
* Fallback language code
*/
var $fallback = false;
/**
* Array of localizable messages
*/
var $messages = array();
/**
* Has the language object been loaded yet?
* @private
*/
var $_loaded = false;
/**
* Loads language object with necessary info from factory cache
* @note This is a lazy loader
*/
function load() {
if ($this->_loaded) return;
$factory = HTMLPurifier_LanguageFactory::instance();
$factory->loadLanguage($this->code);
foreach ($factory->keys as $key) {
$this->$key = $factory->cache[$this->code][$key];
}
$this->_loaded = true;
}
/**
* Retrieves a localised message. Does not perform any operations.
* @param $key string identifier of message
* @return string localised message
*/
function getMessage($key) {
if (!$this->_loaded) $this->load();
if (!isset($this->messages[$key])) return '';
return $this->messages[$key];
}
}
?>

View File

@ -0,0 +1,12 @@
<?php
// private class for unit testing
class HTMLPurifier_Language_en_x_test extends HTMLPurifier_Language
{
}
?>

View File

@ -0,0 +1,11 @@
<?php
// private language message file for unit testing purposes
$fallback = 'en';
$messages = array(
'htmlpurifier' => 'HTML Purifier X'
);
?>

View File

@ -0,0 +1,12 @@
<?php
$fallback = false;
$messages = array(
'htmlpurifier' => 'HTML Purifier',
'pizza' => 'Pizza', // for unit testing purposes
);
?>

View File

@ -0,0 +1,196 @@
<?php
require_once 'HTMLPurifier/Language.php';
require_once 'HTMLPurifier/AttrDef/Lang.php';
/**
* Class responsible for generating HTMLPurifier_Language objects, managing
* caching and fallbacks.
* @note Thanks to MediaWiki for the general logic, although this version
* has been entirely rewritten
*/
class HTMLPurifier_LanguageFactory
{
/**
* Cache of language code information used to load HTMLPurifier_Language objects
* Structure is: $factory->cache[$language_code][$key] = $value
* @value array map
*/
var $cache;
/**
* Valid keys in the HTMLPurifier_Language object. Designates which
* variables to slurp out of a message file.
* @value array list
*/
var $keys = array('fallback', 'messages');
/**
* Instance of HTMLPurifier_AttrDef_Lang to validate language codes
* @value object HTMLPurifier_AttrDef_Lang
*/
var $validator;
/**
* Cached copy of dirname(__FILE__), directory of current file without
* trailing slash
* @value string filename
*/
var $dir;
/**
* Keys whose contents are a hash map and can be merged
* @value array lookup
*/
var $mergeable_keys_map = array('messages' => true);
/**
* Keys whose contents are a list and can be merged
* @value array lookup
*/
var $mergeable_keys_list = array();
/**
* Retrieve sole instance of the factory.
* @static
* @param $prototype Optional prototype to overload sole instance with,
* or bool true to reset to default factory.
*/
static function &instance($prototype = null) {
static $instance = null;
if ($prototype !== null) {
$instance = $prototype;
} elseif ($instance === null || $prototype == true) {
$instance = new HTMLPurifier_LanguageFactory();
$instance->setup();
}
return $instance;
}
/**
* Sets up the singleton, much like a constructor
* @note Prevents people from getting this outside of the singleton
*/
function setup() {
$this->validator = new HTMLPurifier_AttrDef_Lang();
$this->dir = dirname(__FILE__);
}
/**
* Creates a language object, handles class fallbacks
* @param $code string language code
*/
function create($code) {
$config = $context = false; // hope it doesn't use these!
$code = $this->validator->validate($code, $config, $context);
if ($code === false) $code = 'en'; // malformed code becomes English
$pcode = str_replace('-', '_', $code); // make valid PHP classname
static $depth = 0; // recursion protection
if ($code == 'en') {
$class = 'HTMLPurifier_Language';
$file = $this->dir . '/Language.php';
} else {
$class = 'HTMLPurifier_Language_' . $pcode;
$file = $this->dir . '/Language/classes/' . $code . '.php';
// PHP5/APC deps bug workaround can go here
// you can bypass the conditional include by loading the
// file yourself
if (file_exists($file) && !class_exists($class)) {
include_once $file;
}
}
if (!class_exists($class)) {
// go fallback
$fallback = HTMLPurifier_Language::getFallbackFor($code);
$depth++;
$lang = Language::factory( $fallback );
$depth--;
} else {
$lang = new $class;
}
$lang->code = $code;
return $lang;
}
/**
* Returns the fallback language for language
* @note Loads the original language into cache
* @param $code string language code
*/
function getFallbackFor($code) {
$this->loadLanguage($code);
return $this->cache[$code]['fallback'];
}
/**
* Loads language into the cache, handles message file and fallbacks
* @param $code string language code
*/
function loadLanguage($code) {
static $languages_seen = array(); // recursion guard
// abort if we've already loaded it
if (isset($this->cache[$code])) return;
// generate filename
$filename = $this->dir . '/Language/messages/' . $code . '.php';
// default fallback : may be overwritten by the ensuing include
$fallback = ($code != 'en') ? 'en' : false;
// load primary localisation
if (!file_exists($filename)) {
// skip the include: will rely solely on fallback
$filename = $this->dir . '/Language/messages/en.php';
$cache = array();
} else {
include $filename;
$cache = compact($this->keys);
}
// load fallback localisation
if (!empty($fallback)) {
// infinite recursion guard
if (isset($languages_seen[$code])) {
trigger_error('Circular fallback reference in language ' .
$code, E_USER_ERROR);
$fallback = 'en';
}
$language_seen[$code] = true;
// load the fallback recursively
$this->loadLanguage($fallback);
$fallback_cache = $this->cache[$fallback];
// merge fallback with current language
foreach ( $this->keys as $key ) {
if (isset($cache[$key]) && isset($fallback_cache[$key])) {
if (isset($this->mergeable_keys_map[$key])) {
$cache[$key] = $cache[$key] + $fallback_cache[$key];
} elseif (isset($this->mergeable_keys_list[$key])) {
$cache[$key] = array_merge( $fallback_cache[$key], $cache[$key] );
}
} else {
$cache[$key] = $fallback_cache[$key];
}
}
}
// save to cache for later retrieval
$this->cache[$code] = $cache;
return;
}
}
?>

View File

@ -151,7 +151,8 @@ class HTMLPurifier_Lexer
$lexer = $prototype;
}
if (empty($lexer)) {
if (class_exists('DOMDocument')) { // check for DOM support
if (version_compare(PHP_VERSION, "5", ">=") && // check for PHP5
class_exists('DOMDocument')) { // check for DOM support
require_once 'HTMLPurifier/Lexer/DOMLex.php';
$lexer = new HTMLPurifier_Lexer_DOMLex();
} else {

View File

@ -21,7 +21,7 @@ require_once 'HTMLPurifier/TokenFactory.php';
*
* @warning DOM tends to drop whitespace, which may wreak havoc on indenting.
* If this is a huge problem, due to the fact that HTML is hand
* edited and youa re unable to get a parser cache that caches the
* edited and you are unable to get a parser cache that caches the
* the output of HTML Purifier while keeping the original HTML lying
* around, you may want to run Tidy on the resulting output or use
* HTMLPurifier_DirectLex
@ -54,7 +54,13 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
$doc = new DOMDocument();
$doc->encoding = 'UTF-8'; // technically does nothing, but whatever
@$doc->loadHTML($string); // mute all errors, handle it transparently
// DOM will toss errors if the HTML its parsing has really big
// problems, so we're going to mute them. This can cause problems
// if a custom error handler that doesn't implement error_reporting
// is set, as noted by a Drupal plugin of HTML Purifier. Consider
// making our own error reporter to temporarily load in
@$doc->loadHTML($string);
$tokens = array();
$this->tokenizeDOM(

View File

@ -13,6 +13,7 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer
function render($config) {
$ret = '';
$this->config =& $config;
$this->def = $config->getHTMLDefinition();
$def =& $this->def;
@ -21,16 +22,14 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer
$ret .= $this->element('caption', 'Environment');
$ret .= $this->row('Parent of fragment', $def->info_parent);
$ret .= $this->row('Strict mode', $def->strict);
if ($def->strict) $ret .= $this->row('Block wrap name', $def->info_block_wrapper);
$ret .= $this->renderChildren($def->info_parent_def->child);
$ret .= $this->row('Block wrap name', $def->info_block_wrapper);
$ret .= $this->start('tr');
$ret .= $this->element('th', 'Global attributes');
$ret .= $this->element('td', $this->listifyAttr($def->info_global_attr),0,0);
$ret .= $this->end('tr');
$ret .= $this->renderChildren($def->info_parent_def->child);
$ret .= $this->start('tr');
$ret .= $this->element('th', 'Tag transforms');
$list = array();
@ -81,8 +80,8 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer
$ret .= $this->element('th', "<$name>", array('class'=>'heavy', 'colspan' => 2));
$ret .= $this->end('tr');
$ret .= $this->start('tr');
$ret .= $this->element('th', 'Type');
$ret .= $this->element('td', ucfirst($def->type));
$ret .= $this->element('th', 'Inline content');
$ret .= $this->element('td', $def->descendants_are_inline ? 'Yes' : 'No');
$ret .= $this->end('tr');
if (!empty($def->excludes)) {
$ret .= $this->start('tr');
@ -130,15 +129,17 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer
$elements = array();
$attr = array();
if (isset($def->elements)) {
if ($def->type == 'strictblockquote') $def->validateChildren(array(), $this->config, $context);
if ($def->type == 'strictblockquote') {
$def->validateChildren(array(), $this->config, $context);
}
$elements = $def->elements;
} elseif ($def->type == 'chameleon') {
$attr['rowspan'] = 2;
} elseif ($def->type == 'empty') {
$elements = array();
} elseif ($def->type == 'table') {
$elements = array('col', 'caption', 'colgroup', 'thead',
'tfoot', 'tbody', 'tr');
$elements = array_flip(array('col', 'caption', 'colgroup', 'thead',
'tfoot', 'tbody', 'tr'));
}
$ret .= $this->element('th', 'Allowed children', $attr);
@ -167,6 +168,7 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer
* @param $array Tag lookup array in form of array('tagname' => true)
*/
function listifyTagLookup($array) {
ksort($array);
$list = array();
foreach ($array as $name => $discard) {
if ($name !== '#PCDATA' && !isset($this->def->info[$name])) continue;
@ -181,6 +183,7 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer
* @todo Also add information about internal state
*/
function listifyObjectList($array) {
ksort($array);
$list = array();
foreach ($array as $discard => $obj) {
$list[] = $this->getClass($obj, 'AttrTransform_');
@ -193,6 +196,7 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer
* @param $array Array hash in form of array('attrname' => HTMLPurifier_AttrDef)
*/
function listifyAttr($array) {
ksort($array);
$list = array();
foreach ($array as $name => $obj) {
if ($obj === false) continue;

View File

@ -49,8 +49,8 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
$tokens[] = new HTMLPurifier_Token_End($parent_name);
// setup the context variables
$parent_type = 'unknown'; // reference var that we alter
$context->register('ParentType', $parent_type);
$is_inline = false; // reference var that we alter
$context->register('IsInline', $is_inline);
//####################################################################//
// Loop initialization
@ -115,11 +115,16 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
}
// calculate context
if (isset($parent_def)) {
$parent_type = $parent_def->type;
if ($is_inline === false) {
// check if conditions make it inline
if (!empty($parent_def) && $parent_def->descendants_are_inline) {
$is_inline = $count - 1;
}
} else {
// generally found in specialized elements like UL
$parent_type = 'unknown';
// check if we're out of inline
if ($count === $is_inline) {
$is_inline = false;
}
}
//################################################################//
@ -273,7 +278,7 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
array_pop($tokens);
// remove context variables
$context->destroy('ParentType');
$context->destroy('IsInline');
//####################################################################//
// Return

View File

@ -1,6 +1,6 @@
<?php
require_once('HTMLPurifier/Token.php');
require_once 'HTMLPurifier/Token.php';
/**
* Defines a mutation of an obsolete tag into a valid tag.
@ -26,132 +26,4 @@ class HTMLPurifier_TagTransform
}
/**
* Simple transformation, just change tag name to something else.
*/
class HTMLPurifier_TagTransform_Simple extends HTMLPurifier_TagTransform
{
/**
* @param $transform_to Tag name to transform to.
*/
function HTMLPurifier_TagTransform_Simple($transform_to) {
$this->transform_to = $transform_to;
}
function transform($tag, $config, &$context) {
$new_tag = $tag->copy();
$new_tag->name = $this->transform_to;
return $new_tag;
}
}
/**
* Transforms CENTER tags into proper version (DIV with text-align CSS)
*
* Takes a CENTER tag, parses the align attribute, and then if it's valid
* assigns it to the CSS property text-align.
*/
class HTMLPurifier_TagTransform_Center extends HTMLPurifier_TagTransform
{
var $transform_to = 'div';
function transform($tag, $config, &$context) {
if ($tag->type == 'end') {
$new_tag = new HTMLPurifier_Token_End($this->transform_to);
return $new_tag;
}
$attr = $tag->attr;
$prepend_css = 'text-align:center;';
if (isset($attr['style'])) {
$attr['style'] = $prepend_css . $attr['style'];
} else {
$attr['style'] = $prepend_css;
}
$new_tag = $tag->copy();
$new_tag->name = $this->transform_to;
$new_tag->attr = $attr;
return $new_tag;
}
}
/**
* Transforms FONT tags to the proper form (SPAN with CSS styling)
*
* This transformation takes the three proprietary attributes of FONT and
* transforms them into their corresponding CSS attributes. These are color,
* face, and size.
*
* @note Size is an interesting case because it doesn't map cleanly to CSS.
* Thanks to
* http://style.cleverchimp.com/font_size_intervals/altintervals.html
* for reasonable mappings.
*/
class HTMLPurifier_TagTransform_Font extends HTMLPurifier_TagTransform
{
var $transform_to = 'span';
var $_size_lookup = array(
'1' => 'xx-small',
'2' => 'small',
'3' => 'medium',
'4' => 'large',
'5' => 'x-large',
'6' => 'xx-large',
'7' => '300%',
'-1' => 'smaller',
'+1' => 'larger',
'-2' => '60%',
'+2' => '150%',
'+4' => '300%'
);
function transform($tag, $config, &$context) {
if ($tag->type == 'end') {
$new_tag = new HTMLPurifier_Token_End($this->transform_to);
return $new_tag;
}
$attr = $tag->attr;
$prepend_style = '';
// handle color transform
if (isset($attr['color'])) {
$prepend_style .= 'color:' . $attr['color'] . ';';
unset($attr['color']);
}
// handle face transform
if (isset($attr['face'])) {
$prepend_style .= 'font-family:' . $attr['face'] . ';';
unset($attr['face']);
}
// handle size transform
if (isset($attr['size'])) {
if (isset($this->_size_lookup[$attr['size']])) {
$prepend_style .= 'font-size:' .
$this->_size_lookup[$attr['size']] . ';';
}
unset($attr['size']);
}
if ($prepend_style) {
$attr['style'] = isset($attr['style']) ?
$prepend_style . $attr['style'] :
$prepend_style;
}
$new_tag = $tag->copy();
$new_tag->name = $this->transform_to;
$new_tag->attr = $attr;
return $new_tag;
}
}
?>

View File

@ -0,0 +1,34 @@
<?php
require_once 'HTMLPurifier/TagTransform.php';
/**
* Transforms CENTER tags into proper version (DIV with text-align CSS)
*
* Takes a CENTER tag, parses the align attribute, and then if it's valid
* assigns it to the CSS property text-align.
*/
class HTMLPurifier_TagTransform_Center extends HTMLPurifier_TagTransform
{
var $transform_to = 'div';
function transform($tag, $config, &$context) {
if ($tag->type == 'end') {
$new_tag = new HTMLPurifier_Token_End($this->transform_to);
return $new_tag;
}
$attr = $tag->attr;
$prepend_css = 'text-align:center;';
if (isset($attr['style'])) {
$attr['style'] = $prepend_css . $attr['style'];
} else {
$attr['style'] = $prepend_css;
}
$new_tag = $tag->copy();
$new_tag->name = $this->transform_to;
$new_tag->attr = $attr;
return $new_tag;
}
}
?>

View File

@ -0,0 +1,83 @@
<?php
require_once 'HTMLPurifier/TagTransform.php';
/**
* Transforms FONT tags to the proper form (SPAN with CSS styling)
*
* This transformation takes the three proprietary attributes of FONT and
* transforms them into their corresponding CSS attributes. These are color,
* face, and size.
*
* @note Size is an interesting case because it doesn't map cleanly to CSS.
* Thanks to
* http://style.cleverchimp.com/font_size_intervals/altintervals.html
* for reasonable mappings.
*/
class HTMLPurifier_TagTransform_Font extends HTMLPurifier_TagTransform
{
var $transform_to = 'span';
var $_size_lookup = array(
'1' => 'xx-small',
'2' => 'small',
'3' => 'medium',
'4' => 'large',
'5' => 'x-large',
'6' => 'xx-large',
'7' => '300%',
'-1' => 'smaller',
'+1' => 'larger',
'-2' => '60%',
'+2' => '150%',
'+4' => '300%'
);
function transform($tag, $config, &$context) {
if ($tag->type == 'end') {
$new_tag = new HTMLPurifier_Token_End($this->transform_to);
return $new_tag;
}
$attr = $tag->attr;
$prepend_style = '';
// handle color transform
if (isset($attr['color'])) {
$prepend_style .= 'color:' . $attr['color'] . ';';
unset($attr['color']);
}
// handle face transform
if (isset($attr['face'])) {
$prepend_style .= 'font-family:' . $attr['face'] . ';';
unset($attr['face']);
}
// handle size transform
if (isset($attr['size'])) {
if (isset($this->_size_lookup[$attr['size']])) {
$prepend_style .= 'font-size:' .
$this->_size_lookup[$attr['size']] . ';';
}
unset($attr['size']);
}
if ($prepend_style) {
$attr['style'] = isset($attr['style']) ?
$prepend_style . $attr['style'] :
$prepend_style;
}
$new_tag = $tag->copy();
$new_tag->name = $this->transform_to;
$new_tag->attr = $attr;
return $new_tag;
}
}
?>

View File

@ -0,0 +1,26 @@
<?php
require_once 'HTMLPurifier/TagTransform.php';
/**
* Simple transformation, just change tag name to something else.
*/
class HTMLPurifier_TagTransform_Simple extends HTMLPurifier_TagTransform
{
/**
* @param $transform_to Tag name to transform to.
*/
function HTMLPurifier_TagTransform_Simple($transform_to) {
$this->transform_to = $transform_to;
}
function transform($tag, $config, &$context) {
$new_tag = $tag->copy();
$new_tag->name = $this->transform_to;
return $new_tag;
}
}
?>

View File

@ -10,7 +10,7 @@ HTMLPurifier_ConfigSchema::define(
'irc' => true, // "Internet Relay Chat", usually needs another app
// for Usenet, these two are similar, but distinct
'nntp' => true, // individual Netnews articles
'news' => true // newsgroup or individual Netnews articles),
'news' => true // newsgroup or individual Netnews articles
), 'lookup',
'Whitelist that defines the schemes that a URI is allowed to have. This '.
'prevents XSS attacks from using pseudo-schemes like javascript or mocha.'

View File

@ -22,6 +22,17 @@ foreach ($_GET as $key => $value) {
@$config->loadArray($get);
/* // sample local definition, obviously needs to be less clunky
$html_definition =& $config->getHTMLDefinition(true);
$module = new HTMLPurifier_HTMLModule();
$module->name = 'Marquee';
$module->info['marquee'] = new HTMLPurifier_ElementDef();
$module->info['marquee']->content_model = '#PCDATA | Inline';
$module->info['marquee']->content_model_type = 'optional';
$module->content_sets = array('Inline' => 'marquee');
$html_definition->manager->addModule($module);
*/
$printer_html_definition = new HTMLPurifier_Printer_HTMLDefinition();
$printer_css_definition = new HTMLPurifier_Printer_CSSDefinition();

View File

@ -1,14 +1,14 @@
<?php
require_once 'HTMLPurifier/AttrDefHarness.php';
require_once 'HTMLPurifier/AttrDef/BackgroundPosition.php';
require_once 'HTMLPurifier/AttrDef/CSS/BackgroundPosition.php';
class HTMLPurifier_AttrDef_BackgroundPositionTest extends HTMLPurifier_AttrDefHarness
class HTMLPurifier_AttrDef_CSS_BackgroundPositionTest extends HTMLPurifier_AttrDefHarness
{
function test() {
$this->def = new HTMLPurifier_AttrDef_BackgroundPosition();
$this->def = new HTMLPurifier_AttrDef_CSS_BackgroundPosition();
// explicitly cited in spec
$this->assertDef('0% 0%');

View File

@ -1,14 +1,15 @@
<?php
require_once 'HTMLPurifier/AttrDefHarness.php';
require_once 'HTMLPurifier/AttrDef/Background.php';
require_once 'HTMLPurifier/AttrDef/CSS/Background.php';
class HTMLPurifier_AttrDef_BackgroundTest extends HTMLPurifier_AttrDefHarness
class HTMLPurifier_AttrDef_CSS_BackgroundTest extends HTMLPurifier_AttrDefHarness
{
function test() {
$this->def = new HTMLPurifier_AttrDef_Background(HTMLPurifier_Config::createDefault());
$config = HTMLPurifier_Config::createDefault();
$this->def = new HTMLPurifier_AttrDef_CSS_Background($config);
$valid = '#333 url(chess.png) repeat fixed 50% top';
$this->assertDef($valid);

View File

@ -1,14 +1,14 @@
<?php
require_once 'HTMLPurifier/AttrDef/Border.php';
require_once 'HTMLPurifier/AttrDef/PixelsTest.php';
require_once 'HTMLPurifier/AttrDef/CSS/Border.php';
class HTMLPurifier_AttrDef_BorderTest extends HTMLPurifier_AttrDef_PixelsTest
class HTMLPurifier_AttrDef_CSS_BorderTest extends HTMLPurifier_AttrDefHarness
{
function test() {
$this->def = new HTMLPurifier_AttrDef_Border(HTMLPurifier_Config::createDefault());
$config = HTMLPurifier_Config::createDefault();
$this->def = new HTMLPurifier_AttrDef_CSS_Border($config);
$this->assertDef('thick solid red', 'thick solid #F00');
$this->assertDef('thick solid');

View File

@ -1,14 +1,14 @@
<?php
require_once 'HTMLPurifier/AttrDef/Color.php';
require_once 'HTMLPurifier/AttrDef/CSS/Color.php';
require_once 'HTMLPurifier/AttrDefHarness.php';
class HTMLPurifier_AttrDef_ColorTest extends HTMLPurifier_AttrDefHarness
class HTMLPurifier_AttrDef_CSS_ColorTest extends HTMLPurifier_AttrDefHarness
{
function test() {
$this->def = new HTMLPurifier_AttrDef_Color();
$this->def = new HTMLPurifier_AttrDef_CSS_Color();
$this->assertDef('#F00');
$this->assertDef('#808080');

View File

@ -1,20 +1,20 @@
<?php
require_once 'HTMLPurifier/AttrDef/Composite.php';
require_once 'HTMLPurifier/AttrDef/CSS/Composite.php';
require_once 'HTMLPurifier/AttrDefHarness.php';
class HTMLPurifier_AttrDef_Composite_Testable extends
HTMLPurifier_AttrDef_Composite
class HTMLPurifier_AttrDef_CSS_Composite_Testable extends
HTMLPurifier_AttrDef_CSS_Composite
{
// we need to pass by ref to get the mocks in
function HTMLPurifier_AttrDef_Composite_Testable(&$defs) {
function HTMLPurifier_AttrDef_CSS_Composite_Testable(&$defs) {
$this->defs =& $defs;
}
}
class HTMLPurifier_AttrDef_CompositeTest extends HTMLPurifier_AttrDefHarness
class HTMLPurifier_AttrDef_CSS_CompositeTest extends HTMLPurifier_AttrDefHarness
{
var $def1, $def2;
@ -32,7 +32,7 @@ class HTMLPurifier_AttrDef_CompositeTest extends HTMLPurifier_AttrDefHarness
$def1 = new HTMLPurifier_AttrDefMock($this);
$def2 = new HTMLPurifier_AttrDefMock($this);
$defs = array(&$def1, &$def2);
$def = new HTMLPurifier_AttrDef_Composite_Testable($defs);
$def = new HTMLPurifier_AttrDef_CSS_Composite_Testable($defs);
$input = 'FOOBAR';
$output = 'foobar';
$def1_params = array($input, $config, $context);
@ -51,7 +51,7 @@ class HTMLPurifier_AttrDef_CompositeTest extends HTMLPurifier_AttrDefHarness
$def1 = new HTMLPurifier_AttrDefMock($this);
$def2 = new HTMLPurifier_AttrDefMock($this);
$defs = array(&$def1, &$def2);
$def = new HTMLPurifier_AttrDef_Composite_Testable($defs);
$def = new HTMLPurifier_AttrDef_CSS_Composite_Testable($defs);
$input = 'BOOMA';
$output = 'booma';
$def_params = array($input, $config, $context);
@ -71,7 +71,7 @@ class HTMLPurifier_AttrDef_CompositeTest extends HTMLPurifier_AttrDefHarness
$def1 = new HTMLPurifier_AttrDefMock($this);
$def2 = new HTMLPurifier_AttrDefMock($this);
$defs = array(&$def1, &$def2);
$def = new HTMLPurifier_AttrDef_Composite_Testable($defs);
$def = new HTMLPurifier_AttrDef_CSS_Composite_Testable($defs);
$input = 'BOOMA';
$output = false;
$def_params = array($input, $config, $context);

View File

@ -1,14 +1,14 @@
<?php
require_once 'HTMLPurifier/AttrDefHarness.php';
require_once 'HTMLPurifier/AttrDef/FontFamily.php';
require_once 'HTMLPurifier/AttrDef/CSS/FontFamily.php';
class HTMLPurifier_AttrDef_FontFamilyTest extends HTMLPurifier_AttrDefHarness
class HTMLPurifier_AttrDef_CSS_FontFamilyTest extends HTMLPurifier_AttrDefHarness
{
function test() {
$this->def = new HTMLPurifier_AttrDef_FontFamily();
$this->def = new HTMLPurifier_AttrDef_CSS_FontFamily();
$this->assertDef('Gill, Helvetica, sans-serif');
$this->assertDef('\'Times New Roman\', serif');

View File

@ -1,14 +1,15 @@
<?php
require_once 'HTMLPurifier/AttrDefHarness.php';
require_once 'HTMLPurifier/AttrDef/Font.php';
require_once 'HTMLPurifier/AttrDef/CSS/Font.php';
class HTMLPurifier_AttrDef_FontTest extends HTMLPurifier_AttrDefHarness
class HTMLPurifier_AttrDef_CSS_FontTest extends HTMLPurifier_AttrDefHarness
{
function test() {
$this->def = new HTMLPurifier_AttrDef_Font(HTMLPurifier_Config::createDefault());
$config = HTMLPurifier_Config::createDefault();
$this->def = new HTMLPurifier_AttrDef_CSS_Font($config);
// hodgepodge of usage cases from W3C spec, but " -> '
$this->assertDef('12px/14px sans-serif');

View File

@ -1,14 +1,14 @@
<?php
require_once 'HTMLPurifier/AttrDef/CSSLength.php';
require_once 'HTMLPurifier/AttrDef/CSS/Length.php';
require_once 'HTMLPurifier/AttrDefHarness.php';
class HTMLPurifier_AttrDef_CSSLengthTest extends HTMLPurifier_AttrDefHarness
class HTMLPurifier_AttrDef_CSS_LengthTest extends HTMLPurifier_AttrDefHarness
{
function test() {
$this->def = new HTMLPurifier_AttrDef_CSSLength();
$this->def = new HTMLPurifier_AttrDef_CSS_Length();
$this->assertDef('0');
$this->assertDef('0px');
@ -31,7 +31,7 @@ class HTMLPurifier_AttrDef_CSSLengthTest extends HTMLPurifier_AttrDefHarness
function testNonNegative() {
$this->def = new HTMLPurifier_AttrDef_CSSLength(true);
$this->def = new HTMLPurifier_AttrDef_CSS_Length(true);
$this->assertDef('3cm');
$this->assertDef('-3mm', false);

View File

@ -1,14 +1,15 @@
<?php
require_once 'HTMLPurifier/AttrDefHarness.php';
require_once 'HTMLPurifier/AttrDef/ListStyle.php';
require_once 'HTMLPurifier/AttrDef/CSS/ListStyle.php';
class HTMLPurifier_AttrDef_ListStyleTest extends HTMLPurifier_AttrDefHarness
class HTMLPurifier_AttrDef_CSS_ListStyleTest extends HTMLPurifier_AttrDefHarness
{
function test() {
$this->def = new HTMLPurifier_AttrDef_ListStyle(HTMLPurifier_Config::createDefault());
$config = HTMLPurifier_Config::createDefault();
$this->def = new HTMLPurifier_AttrDef_CSS_ListStyle($config);
$this->assertDef('lower-alpha');
$this->assertDef('upper-roman inside');

View File

@ -1,16 +1,16 @@
<?php
require_once 'HTMLPurifier/AttrDef/Multiple.php';
require_once 'HTMLPurifier/AttrDef/CSS/Multiple.php';
require_once 'HTMLPurifier/AttrDefHarness.php';
// borrowed for the sakes of this test
require_once 'HTMLPurifier/AttrDef/Integer.php';
class HTMLPurifier_AttrDef_MultipleTest extends HTMLPurifier_AttrDefHarness
class HTMLPurifier_AttrDef_CSS_MultipleTest extends HTMLPurifier_AttrDefHarness
{
function test() {
$this->def = new HTMLPurifier_AttrDef_Multiple(
$this->def = new HTMLPurifier_AttrDef_CSS_Multiple(
new HTMLPurifier_AttrDef_Integer()
);

Some files were not shown because too many files have changed in this diff Show More