mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2024-12-22 16:31:53 +00:00
[2.1.2] Implement experimental HTML5 parsing using PH5P
- Fix debugger so that tokens can be printed without an index
- Fix some broken PEAR unit tests

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1383 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
parent
423afedbf4
commit
cb92a57e4e
8
NEWS
8
NEWS
@ -11,6 +11,12 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
|||||||
|
|
||||||
2.1.2, unknown release date
|
2.1.2, unknown release date
|
||||||
! Implemented Object module for trusted users
|
! Implemented Object module for trusted users
|
||||||
|
! Implemented experimental HTML5 parsing mode using PH5P. To use, add
|
||||||
|
this to your code:
|
||||||
|
require_once 'HTMLPurifier/Lexer/PH5P.php';
|
||||||
|
$config->set('Core', 'LexerImpl', 'PH5P');
|
||||||
|
Note that this Lexer introduces some classes not in the HTMLPurifier
|
||||||
|
namespace.
|
||||||
- Fix non-visible parsing error in DirectLex with empty tags that have
|
- Fix non-visible parsing error in DirectLex with empty tags that have
|
||||||
slashes inside attribute values.
|
slashes inside attribute values.
|
||||||
- Fix typo in CSS definition: border-collapse:seperate; was incorrectly
|
- Fix typo in CSS definition: border-collapse:seperate; was incorrectly
|
||||||
@ -21,6 +27,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
|||||||
. Unit test refactoring for one logical test per test function
|
. Unit test refactoring for one logical test per test function
|
||||||
. Config and context parameters in ComplexHarness deprecated: instead, edit
|
. Config and context parameters in ComplexHarness deprecated: instead, edit
|
||||||
the $config and $context member variables
|
the $config and $context member variables
|
||||||
|
. HTML wrapper in DOMLex now takes DTD identifiers into account; doesn't
|
||||||
|
really make a difference, but is good for completeness' sake
|
||||||
|
|
||||||
2.1.1, released 2007-08-04
|
2.1.1, released 2007-08-04
|
||||||
- Fix show-stopper bug in %URI.MakeAbsolute functionality
|
- Fix show-stopper bug in %URI.MakeAbsolute functionality
|
||||||
|
@ -189,6 +189,9 @@ class HTMLPurifier_Lexer
|
|||||||
return new HTMLPurifier_Lexer_DOMLex();
|
return new HTMLPurifier_Lexer_DOMLex();
|
||||||
case 'DirectLex':
|
case 'DirectLex':
|
||||||
return new HTMLPurifier_Lexer_DirectLex();
|
return new HTMLPurifier_Lexer_DirectLex();
|
||||||
|
case 'PH5P':
|
||||||
|
// experimental Lexer that must be manually included
|
||||||
|
return new HTMLPurifier_Lexer_PH5P();
|
||||||
default:
|
default:
|
||||||
trigger_error("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer), E_USER_ERROR);
|
trigger_error("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer), E_USER_ERROR);
|
||||||
}
|
}
|
||||||
|
@ -53,14 +53,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
|||||||
}
|
}
|
||||||
|
|
||||||
// preprocess html, essential for UTF-8
|
// preprocess html, essential for UTF-8
|
||||||
$html =
|
$html = $this->wrapHTML($html, $config, $context);
|
||||||
'<!DOCTYPE html '.
|
|
||||||
'PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"'.
|
|
||||||
'"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'.
|
|
||||||
'<html><head>'.
|
|
||||||
'<meta http-equiv="Content-Type" content="text/html;'.
|
|
||||||
' charset=utf-8" />'.
|
|
||||||
'</head><body><div>'.$html.'</div></body></html>';
|
|
||||||
|
|
||||||
$doc = new DOMDocument();
|
$doc = new DOMDocument();
|
||||||
$doc->encoding = 'UTF-8'; // theoretically, the above has this covered
|
$doc->encoding = 'UTF-8'; // theoretically, the above has this covered
|
||||||
@ -177,5 +170,25 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
|||||||
return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
|
return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Wraps an HTML fragment in the necessary HTML
|
||||||
|
*/
|
||||||
|
function wrapHTML($html, $config, &$context) {
|
||||||
|
$def = $config->getDefinition('HTML');
|
||||||
|
$ret = '';
|
||||||
|
|
||||||
|
if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
|
||||||
|
$ret .= '<!DOCTYPE html ';
|
||||||
|
if (!empty($def->doctype->dtdPublic)) $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
|
||||||
|
if (!empty($def->doctype->dtdSystem)) $ret .= '"' . $def->doctype->dtdSystem . '" ';
|
||||||
|
$ret .= '>';
|
||||||
|
}
|
||||||
|
|
||||||
|
$ret .= '<html><head>';
|
||||||
|
$ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
|
||||||
|
$ret .= '</head><body><div>'.$html.'</div></body></html>';
|
||||||
|
return $ret;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
3886
library/HTMLPurifier/Lexer/PH5P.php
Normal file
3886
library/HTMLPurifier/Lexer/PH5P.php
Normal file
File diff suppressed because it is too large
Load Diff
45
maintenance/PH5P.patch
Normal file
45
maintenance/PH5P.patch
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
--- old.php 2007-08-19 14:42:33.640625000 -0400
|
||||||
|
+++ new.php 2007-08-19 14:41:51.609375000 -0400
|
||||||
|
@@ -211,7 +211,10 @@
|
||||||
|
// If nothing is returned, emit a U+0026 AMPERSAND character token.
|
||||||
|
// Otherwise, emit the character token that was returned.
|
||||||
|
$char = (!$entity) ? '&' : $entity;
|
||||||
|
- $this->emitToken($char);
|
||||||
|
+ $this->emitToken(array(
|
||||||
|
+ 'type' => self::CHARACTR,
|
||||||
|
+ 'data' => $char
|
||||||
|
+ ));
|
||||||
|
|
||||||
|
// Finally, switch to the data state.
|
||||||
|
$this->state = 'data';
|
||||||
|
@@ -708,7 +711,7 @@
|
||||||
|
} elseif($char === '&') {
|
||||||
|
/* U+0026 AMPERSAND (&)
|
||||||
|
Switch to the entity in attribute value state. */
|
||||||
|
- $this->entityInAttributeValueState('non');
|
||||||
|
+ $this->entityInAttributeValueState();
|
||||||
|
|
||||||
|
} elseif($char === '>') {
|
||||||
|
/* U+003E GREATER-THAN SIGN (>)
|
||||||
|
@@ -738,7 +741,8 @@
|
||||||
|
? '&'
|
||||||
|
: $entity;
|
||||||
|
|
||||||
|
- $this->emitToken($char);
|
||||||
|
+ $last = count($this->token['attr']) - 1;
|
||||||
|
+ $this->token['attr'][$last]['value'] .= $char;
|
||||||
|
}
|
||||||
|
|
||||||
|
private function bogusCommentState() {
|
||||||
|
@@ -1066,6 +1070,11 @@
|
||||||
|
$this->char++;
|
||||||
|
|
||||||
|
if(in_array($id, $this->entities)) {
|
||||||
|
+ if ($e_name[$c-1] !== ';') {
|
||||||
|
+ if ($c < $len && $e_name[$c] == ';') {
|
||||||
|
+ $this->char++; // consume extra semicolon
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
$entity = $id;
|
||||||
|
break;
|
||||||
|
}
|
@ -54,14 +54,14 @@ function isInScopes($array = array()) {
|
|||||||
}
|
}
|
||||||
/**#@-*/
|
/**#@-*/
|
||||||
|
|
||||||
function printTokens($tokens, $index) {
|
function printTokens($tokens, $index = null) {
|
||||||
$string = '<pre>';
|
$string = '<pre>';
|
||||||
$generator = new HTMLPurifier_Generator();
|
$generator = new HTMLPurifier_Generator();
|
||||||
foreach ($tokens as $i => $token) {
|
foreach ($tokens as $i => $token) {
|
||||||
if ($index == $i) $string .= '[<strong>';
|
if ($index === $i) $string .= '[<strong>';
|
||||||
$string .= "<sup>$i</sup>";
|
$string .= "<sup>$i</sup>";
|
||||||
$string .= $generator->escape($generator->generateFromToken($token));
|
$string .= $generator->escape($generator->generateFromToken($token));
|
||||||
if ($index == $i) $string .= '</strong>]';
|
if ($index === $i) $string .= '</strong>]';
|
||||||
}
|
}
|
||||||
$string .= '</pre>';
|
$string .= '</pre>';
|
||||||
echo $string;
|
echo $string;
|
||||||
|
@ -18,6 +18,9 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
require_once 'HTMLPurifier/Lexer/PEARSax3.php';
|
require_once 'HTMLPurifier/Lexer/PEARSax3.php';
|
||||||
$this->_has_pear = true;
|
$this->_has_pear = true;
|
||||||
}
|
}
|
||||||
|
if ($GLOBALS['HTMLPurifierTest']['PH5P']) {
|
||||||
|
require_once 'HTMLPurifier/Lexer/PH5P.php';
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// HTMLPurifier_Lexer::create() --------------------------------------------
|
// HTMLPurifier_Lexer::create() --------------------------------------------
|
||||||
@ -139,14 +142,21 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
if ($this->_has_pear) $lexers['PEARSax3'] = new HTMLPurifier_Lexer_PEARSax3();
|
if ($this->_has_pear) $lexers['PEARSax3'] = new HTMLPurifier_Lexer_PEARSax3();
|
||||||
if (version_compare(PHP_VERSION, "5", ">=") && class_exists('DOMDocument')) {
|
if (version_compare(PHP_VERSION, "5", ">=") && class_exists('DOMDocument')) {
|
||||||
$lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
|
$lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
|
||||||
|
$lexers['PH5P'] = new HTMLPurifier_Lexer_PH5P();
|
||||||
}
|
}
|
||||||
foreach ($lexers as $name => $lexer) {
|
foreach ($lexers as $name => $lexer) {
|
||||||
$result = $lexer->tokenizeHTML($input, $this->config, $this->context);
|
$result = $lexer->tokenizeHTML($input, $this->config, $this->context);
|
||||||
if (isset($alt_expect[$name])) {
|
if (isset($alt_expect[$name])) {
|
||||||
if ($alt_expect[$name] === false) continue;
|
if ($alt_expect[$name] === false) continue;
|
||||||
$this->assertIdentical($result, $alt_expect[$name]);
|
$t_expect = $alt_expect[$name];
|
||||||
|
$this->assertIdentical($result, $alt_expect[$name], "$name: %s");
|
||||||
} else {
|
} else {
|
||||||
$this->assertIdentical($result, $expect);
|
$t_expect = $expect;
|
||||||
|
$this->assertIdentical($result, $expect, "$name: %s");
|
||||||
|
}
|
||||||
|
if ($t_expect != $result) {
|
||||||
|
printTokens($result);
|
||||||
|
//var_dump($result);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -206,8 +216,7 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
new HTMLPurifier_Token_End('ASDF'),
|
new HTMLPurifier_Token_End('ASDF'),
|
||||||
),
|
),
|
||||||
array(
|
array(
|
||||||
// DOMLex automatically closes invalid tags
|
'DOMLex' => $alt = array(
|
||||||
'DOMLex' => array(
|
|
||||||
new HTMLPurifier_Token_Empty('asdf'),
|
new HTMLPurifier_Token_Empty('asdf'),
|
||||||
new HTMLPurifier_Token_Empty('d'),
|
new HTMLPurifier_Token_Empty('d'),
|
||||||
new HTMLPurifier_Token_Start('pooloka'),
|
new HTMLPurifier_Token_Start('pooloka'),
|
||||||
@ -216,6 +225,7 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
new HTMLPurifier_Token_End('poolasdf'),
|
new HTMLPurifier_Token_End('poolasdf'),
|
||||||
new HTMLPurifier_Token_End('pooloka'),
|
new HTMLPurifier_Token_End('pooloka'),
|
||||||
),
|
),
|
||||||
|
'PH5P' => $alt,
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -244,7 +254,10 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
function test_tokenizeHTML_comment() {
|
function test_tokenizeHTML_comment() {
|
||||||
$this->assertTokenization(
|
$this->assertTokenization(
|
||||||
'<!-- Comment -->',
|
'<!-- Comment -->',
|
||||||
array( new HTMLPurifier_Token_Comment(' Comment ') )
|
array( new HTMLPurifier_Token_Comment(' Comment ') ),
|
||||||
|
array(
|
||||||
|
'PEARSax3' => array( new HTMLPurifier_Token_Comment('-- Comment --') ),
|
||||||
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -253,7 +266,7 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
'<!-- not so well formed --->',
|
'<!-- not so well formed --->',
|
||||||
array( new HTMLPurifier_Token_Comment(' not so well formed -') ),
|
array( new HTMLPurifier_Token_Comment(' not so well formed -') ),
|
||||||
array(
|
array(
|
||||||
'PEARSax3' => false, // behavior is undefined
|
'PEARSax3' => array( new HTMLPurifier_Token_Comment('-- not so well formed ---') ),
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -266,6 +279,7 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
// I like our behavior better, but it's non-standard
|
// I like our behavior better, but it's non-standard
|
||||||
'DOMLex' => array( new HTMLPurifier_Token_Empty('a', array('href'=>'')) ),
|
'DOMLex' => array( new HTMLPurifier_Token_Empty('a', array('href'=>'')) ),
|
||||||
'PEARSax3' => array( new HTMLPurifier_Token_Start('a', array('href'=>'')) ),
|
'PEARSax3' => array( new HTMLPurifier_Token_Start('a', array('href'=>'')) ),
|
||||||
|
'PH5P' => false, // total barfing, grabs scaffolding too
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -277,13 +291,13 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
new HTMLPurifier_Token_Text('<b>')
|
new HTMLPurifier_Token_Text('<b>')
|
||||||
),
|
),
|
||||||
array(
|
array(
|
||||||
// it is possible to configure PEARSax3 to clump nodes together,
|
// some parsers will separate entities out
|
||||||
// I just don't know how
|
'PEARSax3' => $split = array(
|
||||||
'PEARSax3' => array(
|
|
||||||
new HTMLPurifier_Token_Text('<'),
|
new HTMLPurifier_Token_Text('<'),
|
||||||
new HTMLPurifier_Token_Text('b'),
|
new HTMLPurifier_Token_Text('b'),
|
||||||
new HTMLPurifier_Token_Text('>'),
|
new HTMLPurifier_Token_Text('>'),
|
||||||
)
|
),
|
||||||
|
'PH5P' => $split,
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -298,6 +312,9 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
new HTMLPurifier_Token_Start('a', array('"' => ''))
|
new HTMLPurifier_Token_Start('a', array('"' => ''))
|
||||||
),
|
),
|
||||||
'PEARSax3' => $tokens,
|
'PEARSax3' => $tokens,
|
||||||
|
'PH5P' => array(
|
||||||
|
new HTMLPurifier_Token_Empty('a', array('"' => ''))
|
||||||
|
),
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -312,7 +329,10 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
function test_tokenizeHTML_escapedQuote() {
|
function test_tokenizeHTML_escapedQuote() {
|
||||||
$this->assertTokenization(
|
$this->assertTokenization(
|
||||||
'&quot;',
|
'&quot;',
|
||||||
array( new HTMLPurifier_Token_Text('"') )
|
array( new HTMLPurifier_Token_Text('"') ),
|
||||||
|
array(
|
||||||
|
'PEARSax3' => false, // PEAR barfs on this
|
||||||
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -322,7 +342,7 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
array( new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!') ),
|
array( new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!') ),
|
||||||
array(
|
array(
|
||||||
// PEAR splits up all of the CDATA
|
// PEAR splits up all of the CDATA
|
||||||
'PEARSax3' => array(
|
'PEARSax3' => $split = array(
|
||||||
new HTMLPurifier_Token_Text('You '),
|
new HTMLPurifier_Token_Text('You '),
|
||||||
new HTMLPurifier_Token_Text('<'),
|
new HTMLPurifier_Token_Text('<'),
|
||||||
new HTMLPurifier_Token_Text('b'),
|
new HTMLPurifier_Token_Text('b'),
|
||||||
@ -335,6 +355,7 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
new HTMLPurifier_Token_Text('>'),
|
new HTMLPurifier_Token_Text('>'),
|
||||||
new HTMLPurifier_Token_Text(' get me!'),
|
new HTMLPurifier_Token_Text(' get me!'),
|
||||||
),
|
),
|
||||||
|
'PH5P' => $split,
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -351,10 +372,11 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
'<![CDATA[&rarr;]]>',
|
'<![CDATA[&rarr;]]>',
|
||||||
array( new HTMLPurifier_Token_Text("&rarr;") ),
|
array( new HTMLPurifier_Token_Text("&rarr;") ),
|
||||||
array(
|
array(
|
||||||
'PEARSax3' => array(
|
'PEARSax3' => $split = array(
|
||||||
new HTMLPurifier_Token_Text('&'),
|
new HTMLPurifier_Token_Text('&'),
|
||||||
new HTMLPurifier_Token_Text('rarr;'),
|
new HTMLPurifier_Token_Text('rarr;'),
|
||||||
),
|
),
|
||||||
|
'PH5P' => $split,
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -403,6 +425,13 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
new HTMLPurifier_Token_End('b'),
|
new HTMLPurifier_Token_End('b'),
|
||||||
),
|
),
|
||||||
'PEARSax3' => false, // totally mangled
|
'PEARSax3' => false, // totally mangled
|
||||||
|
'PH5P' => array( // interesting grouping
|
||||||
|
new HTMLPurifier_Token_Start('b'),
|
||||||
|
new HTMLPurifier_Token_Text('Whoa! '),
|
||||||
|
new HTMLPurifier_Token_Text('<'),
|
||||||
|
new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
|
||||||
|
new HTMLPurifier_Token_End('b'),
|
||||||
|
),
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -426,7 +455,8 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
array( new HTMLPurifier_Token_Comment(' This >< comment') ),
|
array( new HTMLPurifier_Token_Comment(' This >< comment') ),
|
||||||
array(
|
array(
|
||||||
'DOMLex' => false,
|
'DOMLex' => false,
|
||||||
'PEARSax3' => false
|
'PEARSax3' => false,
|
||||||
|
'PH5P' => false,
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -434,14 +464,17 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|||||||
function test_tokenizeHTML_scriptCDATAContents() {
|
function test_tokenizeHTML_scriptCDATAContents() {
|
||||||
$this->config->set('HTML', 'Trusted', true);
|
$this->config->set('HTML', 'Trusted', true);
|
||||||
$this->assertTokenization(
|
$this->assertTokenization(
|
||||||
'<script>alert("<foo>");</script>',
|
'Foo: <script>alert("<foo>");</script>',
|
||||||
array(
|
array(
|
||||||
|
new HTMLPurifier_Token_Text('Foo: '),
|
||||||
new HTMLPurifier_Token_Start('script'),
|
new HTMLPurifier_Token_Start('script'),
|
||||||
new HTMLPurifier_Token_Text('alert("<foo>");'),
|
new HTMLPurifier_Token_Text('alert("<foo>");'),
|
||||||
new HTMLPurifier_Token_End('script'),
|
new HTMLPurifier_Token_End('script'),
|
||||||
),
|
),
|
||||||
array(
|
array(
|
||||||
'PEARSax3' => false,
|
'PEARSax3' => false,
|
||||||
|
// PH5P, for some reason, bubbles the script to <head>
|
||||||
|
'PH5P' => false,
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user