mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2024-12-22 16:31:53 +00:00
2f29c27a59
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1698 48356398-32a2-884e-a903-53898d9a118a
593 lines
20 KiB
PHP
593 lines
20 KiB
PHP
<?php
|
|
|
|
class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
|
|
{
|
|
|
|
protected $_has_pear = false;
|
|
|
|
/**
 * Loads the optional lexer implementations that are switched on in the
 * global test configuration ($GLOBALS['HTMLPurifierTest']).
 */
public function __construct() {
    parent::__construct();

    // E_STRICT = 2048; the literal is kept for PHP4 compatibility. The PEAR
    // lexer is not strict safe, so it is left out when strict reporting is on.
    $strict_reporting = (error_reporting() & 2048) == 2048;
    if ($GLOBALS['HTMLPurifierTest']['PEAR'] && !$strict_reporting) {
        require_once 'HTMLPurifier/Lexer/PEARSax3.php';
        $this->_has_pear = true;
    }

    if ($GLOBALS['HTMLPurifierTest']['PH5P']) {
        require_once 'HTMLPurifier/Lexer/PH5P.php';
    }
}
|
|
|
|
// HTMLPurifier_Lexer::create() --------------------------------------------
|
|
|
|
/**
 * With Core.MaintainLineNumbers enabled, create() must hand back DirectLex.
 */
function test_create() {
    $this->config->set('Core', 'MaintainLineNumbers', true);
    $this->assertIsA(
        HTMLPurifier_Lexer::create($this->config),
        'HTMLPurifier_Lexer_DirectLex'
    );
}
|
|
|
|
// HTMLPurifier_Lexer->parseData() -----------------------------------------
|
|
|
|
/**
 * Asserts that HTMLPurifier_Lexer::parseData() turns $input into $expect.
 *
 * @param string      $input  Text handed to parseData().
 * @param string|true $expect Expected result; true (the default) means the
 *                            input must come back unchanged.
 */
function assertParseData($input, $expect = true) {
    $lexer  = new HTMLPurifier_Lexer();
    $wanted = ($expect === true) ? $input : $expect;
    $this->assertIdentical($wanted, $lexer->parseData($input));
}
|
|
|
|
/**
 * Text without any entity must come back from parseData() unchanged.
 */
function test_parseData_plainText() {
    $text = 'asdf';
    $this->assertParseData($text);
}
|
|
|
|
/**
 * The named entity &amp; must be decoded to a bare ampersand.
 */
function test_parseData_ampersandEntity() {
    // The input has to be the *encoded* form; a literal '&' would make the
    // assertion pass without exercising any decoding.
    $this->assertParseData('&amp;', '&');
}
|
|
|
|
/**
 * The named entity &quot; must be decoded to a double quote.
 */
function test_parseData_quotEntity() {
    // Encoded input, decoded expectation.
    $this->assertParseData('&quot;', '"');
}
|
|
|
|
/**
 * The zero-padded numeric entity &#039; must be decoded to an apostrophe.
 */
function test_parseData_aposNumericEntity() {
    $this->assertParseData('&#039;', "'");
}
|
|
|
|
/**
 * The compact numeric entity &#39; (no leading zero) must be decoded to an
 * apostrophe as well.
 */
function test_parseData_aposCompactNumericEntity() {
    $this->assertParseData('&#39;', "'");
}
|
|
|
|
/**
 * Several &amp; entities in a row must each be decoded independently.
 */
function test_parseData_adjacentAmpersandEntities() {
    $this->assertParseData('&amp;&amp;&amp;', '&&&');
}
|
|
|
|
/**
 * An entity followed by a bare trailing ampersand: the entity is decoded
 * and the unescaped ampersand is kept as is.
 */
function test_parseData_trailingUnescapedAmpersand() {
    $this->assertParseData('&amp;&', '&&');
}
|
|
|
|
/**
 * A bare ampersand in the middle of ordinary text must be left alone.
 */
function test_parseData_internalUnescapedAmpersand() {
    $text = 'Procter & Gamble';
    $this->assertParseData($text);
}
|
|
|
|
/**
 * An entity that parseData() is not expected to handle must come back
 * unchanged rather than being mangled.
 */
function test_parseData_improperEntityFaultToleranceTest() {
    // NOTE(review): hexadecimal reference for '-'; the test only makes sense
    // with an encoded input, confirm this is the intended spelling.
    $this->assertParseData('&#x2D;');
}
|
|
|
|
// HTMLPurifier_Lexer->extractBody() ---------------------------------------
|
|
|
|
/**
 * Asserts that HTMLPurifier_Lexer::extractBody() reduces $text to $extract.
 *
 * @param string      $text    Document (or fragment) passed to extractBody().
 * @param string|true $extract Expected result; true (the default) means the
 *                             text must come back unchanged.
 */
function assertExtractBody($text, $extract = true) {
    $lexer  = new HTMLPurifier_Lexer();
    $actual = $lexer->extractBody($text);
    $wanted = ($extract === true) ? $text : $extract;
    $this->assertIdentical($wanted, $actual);
}
|
|
|
|
/**
 * A fragment without <body> tags must be returned unchanged.
 */
function test_extractBody_noBodyTags() {
    $fragment = '<b>Bold</b>';
    $this->assertExtractBody($fragment);
}
|
|
|
|
/**
 * Only the contents of a lowercase <body> element must be returned.
 */
function test_extractBody_lowercaseBodyTags() {
    $document = '<html><body><b>Bold</b></body></html>';
    $contents = '<b>Bold</b>';
    $this->assertExtractBody($document, $contents);
}
|
|
|
|
/**
 * Uppercase <BODY> tags must be recognised too; the contents are returned
 * with their original case.
 */
function test_extractBody_uppercaseBodyTags() {
    $document = '<HTML><BODY><B>Bold</B></BODY></HTML>';
    $contents = '<B>Bold</B>';
    $this->assertExtractBody($document, $contents);
}
|
|
|
|
/**
 * A full XHTML document: everything between <body> and </body>, including
 * the surrounding line breaks, must be returned.
 */
function test_extractBody_realisticUseCase() {
    // The fixture is assembled from explicit "\n" joins so it does not depend
    // on how this source file is indented or on its line endings.
    $form = implode("\n", array(
        '<form method="post" action="whatever1">',
        '<div>',
        '<input type="text" name="username" />',
        '<input type="text" name="password" />',
        '<input type="submit" />',
        '</div>',
        '</form>',
    ));

    $document = implode("\n", array(
        // The XML declaration is deliberately left as it appears in the fixture.
        '<?xml version="1.0"',
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"',
        '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">',
        '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">',
        '<head>',
        '<title>xyz</title>',
        '</head>',
        '<body>',
        $form,
        '</body>',
        '</html>',
    ));

    // The line breaks right after <body> and right before </body> belong to
    // the extracted contents.
    $this->assertExtractBody($document, "\n" . $form . "\n");
}
|
|
|
|
/**
 * Attributes on the <body> tag must not prevent extraction.
 */
function test_extractBody_bodyWithAttributes() {
    $document = '<html><body bgcolor="#F00"><b>Bold</b></body></html>';
    $contents = '<b>Bold</b>';
    $this->assertExtractBody($document, $contents);
}
|
|
|
|
/**
 * A <body> that is never closed is not accepted: the text stays unchanged.
 */
function test_extractBody_preserveUnclosedBody() {
    $unclosed = '<body>asdf';
    $this->assertExtractBody($unclosed);
}
|
|
|
|
// HTMLPurifier_Lexer->tokenizeHTML() --------------------------------------
|
|
|
|
/**
 * Runs $input through every available lexer and checks the token stream.
 *
 * @param string $input      HTML to tokenize.
 * @param array  $expect     Tokens expected from lexers without an override.
 * @param array  $alt_expect Per-lexer overrides keyed by lexer name
 *                           (DirectLex, PEARSax3, DOMLex, PH5P); a value of
 *                           false skips the assertion for that lexer.
 */
function assertTokenization($input, $expect, $alt_expect = array()) {
    $lexers = array('DirectLex' => new HTMLPurifier_Lexer_DirectLex());
    if ($this->_has_pear) {
        $lexers['PEARSax3'] = new HTMLPurifier_Lexer_PEARSax3();
    }
    if (class_exists('DOMDocument')) {
        $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
        $lexers['PH5P']   = new HTMLPurifier_Lexer_PH5P();
    }

    foreach ($lexers as $name => $lexer) {
        // Tokenization happens before the override lookup, so lexers whose
        // assertion is skipped are still run on the input.
        $result = $lexer->tokenizeHTML($input, $this->config, $this->context);

        $overridden = isset($alt_expect[$name]);
        if ($overridden && $alt_expect[$name] === false) {
            continue;
        }
        $t_expect = $overridden ? $alt_expect[$name] : $expect;

        $this->assertIdentical($result, $t_expect, "$name: %s");

        // Dump the actual tokens to make a mismatch easier to diagnose.
        if ($t_expect != $result) {
            printTokens($result);
        }
    }
}
|
|
|
|
/**
 * An empty string must produce no tokens at all.
 */
function test_tokenizeHTML_emptyInput() {
    $no_tokens = array();
    $this->assertTokenization('', $no_tokens);
}
|
|
|
|
/**
 * Markup-free text must become a single text token.
 */
function test_tokenizeHTML_plainText() {
    $text = 'This is regular text.';
    $expected = array(new HTMLPurifier_Token_Text($text));
    $this->assertTokenization($text, $expected);
}
|
|
|
|
/**
 * Text interleaved with a simple element: text, start, text, end, text.
 */
function test_tokenizeHTML_textAndTags() {
    $html = 'This is <b>bold</b> text';
    $expected = array(
        new HTMLPurifier_Token_Text('This is '),
        new HTMLPurifier_Token_Start('b', array()),
        new HTMLPurifier_Token_Text('bold'),
        new HTMLPurifier_Token_End('b'),
        new HTMLPurifier_Token_Text(' text'),
    );
    $this->assertTokenization($html, $expected);
}
|
|
|
|
/**
 * Mixed-case tag names (<DIV> ... </div>) must tokenize to a matching pair.
 *
 * NOTE(review): the expected tokens are built with the names as written in
 * the markup ('DIV' and 'div'); this presumably relies on the token classes
 * normalising case themselves - confirm.
 */
function test_tokenizeHTML_normalizeCase() {
    $html = '<DIV>Totally rad dude. <b>asdf</b></div>';
    $expected = array(
        new HTMLPurifier_Token_Start('DIV', array()),
        new HTMLPurifier_Token_Text('Totally rad dude. '),
        new HTMLPurifier_Token_Start('b', array()),
        new HTMLPurifier_Token_Text('asdf'),
        new HTMLPurifier_Token_End('b'),
        new HTMLPurifier_Token_End('div'),
    );
    $this->assertTokenization($html, $expected);
}
|
|
|
|
/**
 * Unknown and badly nested tags: by default they are tokenized as written,
 * while DOMLex and PH5P are expected to return a repaired, balanced stream.
 */
function test_tokenizeHTML_notWellFormed() {
    $html = '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>';

    $as_written = array(
        new HTMLPurifier_Token_Start('asdf'),
        new HTMLPurifier_Token_End('asdf'),
        new HTMLPurifier_Token_Start('d'),
        new HTMLPurifier_Token_End('d'),
        new HTMLPurifier_Token_Start('poOloka'),
        new HTMLPurifier_Token_Start('poolasdf'),
        new HTMLPurifier_Token_Start('ds'),
        new HTMLPurifier_Token_End('asdf'),
        new HTMLPurifier_Token_End('ASDF'),
    );

    // Same expectation for both lexers: empty elements collapsed, stray end
    // tags dropped, open elements closed.
    $repaired = array(
        new HTMLPurifier_Token_Empty('asdf'),
        new HTMLPurifier_Token_Empty('d'),
        new HTMLPurifier_Token_Start('pooloka'),
        new HTMLPurifier_Token_Start('poolasdf'),
        new HTMLPurifier_Token_Empty('ds'),
        new HTMLPurifier_Token_End('poolasdf'),
        new HTMLPurifier_Token_End('pooloka'),
    );

    $overrides = array(
        'DOMLex' => $repaired,
        'PH5P'   => $repaired,
    );
    $this->assertTokenization($html, $as_written, $overrides);
}
|
|
|
|
/**
 * A tab and a line break between attributes must be treated like ordinary
 * attribute separators.
 */
function test_tokenizeHTML_whitespaceInTag() {
    // Double-quoted so the tab and newline are spelled out as escapes.
    $html = "<a\thref=\"foobar.php\"\ntitle=\"foo!\">Link to <b id=\"asdf\">foobar</b></a>";
    $link_attr = array('href' => 'foobar.php', 'title' => 'foo!');
    $bold_attr = array('id' => 'asdf');
    $expected = array(
        new HTMLPurifier_Token_Start('a', $link_attr),
        new HTMLPurifier_Token_Text('Link to '),
        new HTMLPurifier_Token_Start('b', $bold_attr),
        new HTMLPurifier_Token_Text('foobar'),
        new HTMLPurifier_Token_End('b'),
        new HTMLPurifier_Token_End('a'),
    );
    $this->assertTokenization($html, $expected);
}
|
|
|
|
/**
 * A self-closing tag must become a single empty token.
 */
function test_tokenizeHTML_emptyTag() {
    $expected = array(new HTMLPurifier_Token_Empty('br'));
    $this->assertTokenization('<br />', $expected);
}
|
|
|
|
/**
 * A comment becomes a comment token holding the text between the
 * delimiters; PEARSax3 is expected to keep the double dashes.
 */
function test_tokenizeHTML_comment() {
    $html = '<!-- Comment -->';
    $expected = array(new HTMLPurifier_Token_Comment(' Comment '));
    $overrides = array(
        'PEARSax3' => array(new HTMLPurifier_Token_Comment('-- Comment --')),
    );
    $this->assertTokenization($html, $expected, $overrides);
}
|
|
|
|
/**
 * A comment closed with three dashes: the surplus dash stays part of the
 * comment text; PEARSax3 is expected to keep every dash.
 */
function test_tokenizeHTML_malformedComment() {
    $html = '<!-- not so well formed --->';
    $expected = array(new HTMLPurifier_Token_Comment(' not so well formed -'));
    $overrides = array(
        'PEARSax3' => array(new HTMLPurifier_Token_Comment('-- not so well formed ---')),
    );
    $this->assertTokenization($html, $expected, $overrides);
}
|
|
|
|
/**
 * A tag that is never closed with '>': by default it is kept as text.
 */
function test_tokenizeHTML_unterminatedTag() {
    $html = '<a href=""';
    $expected = array(new HTMLPurifier_Token_Text('<a href=""'));
    $empty_href = array('href' => '');
    $overrides = array(
        // I like our behavior better, but it's non-standard
        'DOMLex'   => array(new HTMLPurifier_Token_Empty('a', $empty_href)),
        'PEARSax3' => array(new HTMLPurifier_Token_Start('a', $empty_href)),
        'PH5P'     => false, // total barfing, grabs scaffolding too
    );
    $this->assertTokenization($html, $expected, $overrides);
}
|
|
|
|
/**
 * Escaped angle brackets must come out as text, never as a tag.
 */
function test_tokenizeHTML_specialEntities() {
    // The input has to be entity-encoded; a literal <b> would be a real tag.
    $html = '&lt;b&gt;';
    $expected = array(new HTMLPurifier_Token_Text('<b>'));

    // some parsers will separate entities out
    $split = array(
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('b'),
        new HTMLPurifier_Token_Text('>'),
    );
    $overrides = array(
        'PEARSax3' => $split,
        'PH5P'     => $split,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
|
|
|
|
/**
 * A stray quote where an attribute name should be. PH5P is checked in its
 * own test because its behavior varies.
 */
function test_tokenizeHTML_earlyQuote() {
    $html = '<a "=>';
    $expected = array(new HTMLPurifier_Token_Empty('a'));

    // we barf on this input
    $quote_as_attr = array(
        new HTMLPurifier_Token_Start('a', array('"' => ''))
    );
    $overrides = array(
        'DirectLex' => $quote_as_attr,
        'PEARSax3'  => $quote_as_attr,
        'PH5P'      => false, // behavior varies; handle this personally
    );
    $this->assertTokenization($html, $expected, $overrides);
}
|
|
|
|
/**
 * PH5P-specific variant of the early-quote test: the expected token type
 * depends on whether 'PH5PError' is present in the context afterwards.
 *
 * NOTE(review): presumably the PH5P lexer registers 'PH5PError' when its
 * primary parse fails - confirm against the lexer.
 */
function test_tokenizeHTML_earlyQuote_PH5P() {
    if (!class_exists('DOMDocument')) return;

    $lexer  = new HTMLPurifier_Lexer_PH5P();
    $result = $lexer->tokenizeHTML('<a "=>', $this->config, $this->context);

    $attr  = array('"' => '');
    $token = $this->context->get('PH5PError', true)
        ? new HTMLPurifier_Token_Start('a', $attr)
        : new HTMLPurifier_Token_Empty('a', $attr);

    $this->assertIdentical(array($token), $result);
}
|
|
|
|
/**
 * A bare double quote in text must be kept as is.
 */
function test_tokenizeHTML_unescapedQuote() {
    $quote = '"';
    $expected = array(new HTMLPurifier_Token_Text($quote));
    $this->assertTokenization($quote, $expected);
}
|
|
|
|
/**
 * An escaped double quote (&quot;) must be decoded in the text token.
 */
function test_tokenizeHTML_escapedQuote() {
    // Encoded input; with a literal quote this would merely repeat
    // test_tokenizeHTML_unescapedQuote.
    $html = '&quot;';
    $expected = array(new HTMLPurifier_Token_Text('"'));
    $overrides = array(
        'PEARSax3' => false, // PEAR barfs on this
    );
    $this->assertTokenization($html, $expected, $overrides);
}
|
|
|
|
/**
 * Markup and entities inside a CDATA section must come out verbatim as
 * text: no tags are created and &#39; is not decoded.
 */
function test_tokenizeHTML_cdata() {
    $html = '<![CDATA[You <b>can&#39;t</b> get me!]]>';
    $expected = array(
        new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!')
    );

    // PEAR splits up all of the CDATA
    $split = array(
        new HTMLPurifier_Token_Text('You '),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('b'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_Text('can'),
        new HTMLPurifier_Token_Text('&'),
        new HTMLPurifier_Token_Text('#39;t'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('/b'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_Text(' get me!'),
    );
    $overrides = array(
        'PEARSax3' => $split,
        'PH5P'     => $split,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
|
|
|
|
/**
 * A named character entity must be decoded to its UTF-8 byte sequence.
 */
function test_tokenizeHTML_characterEntity() {
    // &theta; is U+03B8, i.e. the bytes CE B8 in UTF-8. A raw theta as input
    // would only repeat test_tokenizeHTML_preserveUTF8.
    $html = '&theta;';
    $expected = array(new HTMLPurifier_Token_Text("\xCE\xB8"));
    $this->assertTokenization($html, $expected);
}
|
|
|
|
/**
 * A character entity inside CDATA must NOT be decoded: the text token
 * carries the literal characters "&rarr;".
 */
function test_tokenizeHTML_characterEntityInCDATA() {
    $html = '<![CDATA[&rarr;]]>';
    $expected = array(new HTMLPurifier_Token_Text("&rarr;"));

    // These lexers are expected to break the text at the ampersand.
    $split = array(
        new HTMLPurifier_Token_Text('&'),
        new HTMLPurifier_Token_Text('rarr;'),
    );
    $overrides = array(
        'PEARSax3' => $split,
        'PH5P'     => $split,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
|
|
|
|
/**
 * An &amp; inside an attribute value must be decoded to a bare ampersand.
 */
function test_tokenizeHTML_entityInAttribute() {
    $html = '<a href="index.php?title=foo&amp;id=bar">Link</a>';
    $expected = array(
        new HTMLPurifier_Token_Start('a', array('href' => 'index.php?title=foo&id=bar')),
        new HTMLPurifier_Token_Text('Link'),
        new HTMLPurifier_Token_End('a'),
    );
    $this->assertTokenization($html, $expected);
}
|
|
|
|
/**
 * Raw multi-byte UTF-8 input must pass through byte for byte.
 */
function test_tokenizeHTML_preserveUTF8() {
    $theta = "\xCE\xB8";
    $expected = array(new HTMLPurifier_Token_Text($theta));
    $this->assertTokenization($theta, $expected);
}
|
|
|
|
/**
 * A special entity (&lt;) inside an attribute value must be decoded.
 */
function test_tokenizeHTML_specialEntityInAttribute() {
    $html = '<br test="x &lt; 6" />';
    $expected = array(
        new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6'))
    );
    $this->assertTokenization($html, $expected);
}
|
|
|
|
/**
 * With Core.AggressivelyFixLt enabled, emoticon-like text such as "<3" and
 * ">.>" must survive as text; how it is grouped into tokens varies by lexer.
 */
function test_tokenizeHTML_emoticonProtection() {
    $this->config->set('Core', 'AggressivelyFixLt', true);

    $html = '<b>Whoa! <3 That\'s not good >.></b>';

    $expected = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('Whoa! '),
        new HTMLPurifier_Token_Text('<3 That\'s not good >'),
        new HTMLPurifier_Token_Text('.>'),
        new HTMLPurifier_Token_End('b')
    );

    // text is absorbed together
    $dom_tokens = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
        new HTMLPurifier_Token_End('b'),
    );

    // interesting grouping
    $ph5p_tokens = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('Whoa! '),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
        new HTMLPurifier_Token_End('b'),
    );

    $overrides = array(
        'DOMLex'   => $dom_tokens,
        'PEARSax3' => false, // totally mangled
        'PH5P'     => $ph5p_tokens,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
|
|
|
|
/**
 * Angle brackets inside a comment must not end it or start a tag.
 */
function test_tokenizeHTML_commentWithFunkyChars() {
    $html = '<!-- This >< comment --><br />';
    $expected = array(
        new HTMLPurifier_Token_Comment(' This >< comment '),
        new HTMLPurifier_Token_Empty('br'),
    );
    // PEARSax3 is not checked on this input.
    $overrides = array('PEARSax3' => false);
    $this->assertTokenization($html, $expected, $overrides);
}
|
|
|
|
/**
 * A comment that is never closed: the rest of the input becomes the comment
 * text. Only the default expectation is checked; every other lexer is
 * skipped.
 */
function test_tokenizeHTML_unterminatedComment() {
    $html = '<!-- This >< comment';
    $expected = array(new HTMLPurifier_Token_Comment(' This >< comment'));
    $overrides = array(
        'DOMLex'   => false,
        'PEARSax3' => false,
        'PH5P'     => false,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
|
|
|
|
/**
 * With HTML.Trusted enabled, the contents of <script> must be a single text
 * token even when they contain tag-like text.
 */
function test_tokenizeHTML_scriptCDATAContents() {
    $this->config->set('HTML', 'Trusted', true);

    $html = 'Foo: <script>alert("<foo>");</script>';
    $expected = array(
        new HTMLPurifier_Token_Text('Foo: '),
        new HTMLPurifier_Token_Start('script'),
        new HTMLPurifier_Token_Text('alert("<foo>");'),
        new HTMLPurifier_Token_End('script'),
    );
    $overrides = array(
        'PEARSax3' => false,
        // PH5P, for some reason, bubbles the script to <head>
        'PH5P'     => false,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
|
|
|
|
/**
 * With Core.AggressivelyFixLt enabled, a comment containing a bare '<', an
 * &lt; entity and a bare '&' must keep its text exactly as written.
 */
function test_tokenizeHTML_entitiesInComment() {
    $this->config->set('Core', 'AggressivelyFixLt', true);

    // NOTE(review): the second '<' is written as an entity so the test
    // actually covers an entity inside a comment - confirm the fixture.
    $html = '<!-- This comment < &lt; & -->';
    $expected = array(
        new HTMLPurifier_Token_Comment(' This comment < &lt; & ')
    );
    $overrides = array(
        'PEARSax3' => false
    );
    $this->assertTokenization($html, $expected, $overrides);
}
|
|
|
|
/**
 * Angle brackets inside a quoted attribute value. By default they belong to
 * the value; DirectLex is expected to end the tag at the first '>'.
 */
function test_tokenizeHTML_attributeWithSpecialCharacters() {
    $html = '<a href="><>">';
    $expected = array(
        new HTMLPurifier_Token_Empty('a', array('href' => '><>'))
    );
    $directlex_tokens = array(
        new HTMLPurifier_Token_Start('a', array('href' => '')),
        new HTMLPurifier_Token_Text('<">'),
    );
    $overrides = array(
        'DirectLex' => $directlex_tokens,
        'PEARSax3'  => false,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
|
|
|
|
/**
 * Slashes inside an attribute value must not be mistaken for the
 * self-closing marker of the tag.
 */
function test_tokenizeHTML_emptyTagWithSlashInAttribute() {
    $html = '<param name="src" value="http://example.com/video.wmv" />';
    $attr = array('name' => 'src', 'value' => 'http://example.com/video.wmv');
    $expected = array(new HTMLPurifier_Token_Empty('param', $attr));
    $this->assertTokenization($html, $expected);
}
|
|
|
|
/**
 * A <style> element whose CSS is wrapped in an HTML comment: by default the
 * CSS is expected as text, while some lexers report a comment token.
 */
function test_tokenizeHTML_style() {
    $style_attr = array('type' => 'text/css');

    // The CSS reported as a comment instead of text.
    $as_comment = array(
        new HTMLPurifier_Token_Start('style', $style_attr),
        new HTMLPurifier_Token_Comment("\ndiv {}\n"),
        new HTMLPurifier_Token_End('style'),
    );

    $extra = array(
        // PH5P doesn't seem to like style tags
        'PH5P' => false,
        // DirectLex defers to RemoveForeignElements for textification
        'DirectLex' => $as_comment,
    );

    if (!defined('LIBXML_VERSION') || LIBXML_VERSION < 20628) {
        // libxml's behavior is wrong prior to this version, so make
        // appropriate accommodations.
        // :NOTE: LIBXML_VERSION is missing in early versions of PHP
        // prior to 1.30 of php-src/ext/libxml/libxml.c (version-wise,
        // this translates to 5.0.x). In such cases, we assume that an old
        // version of libxml is being used, although that *might* not
        // be the case (it's very unlikely though).
        $extra['DOMLex'] = $as_comment;
    }

    // Line breaks are written as escapes so the fixture always matches the
    // "\n" in the expected tokens, whatever this file's line endings are.
    $html = "<style type=\"text/css\"><!--\ndiv {}\n--></style>";

    $expected = array(
        new HTMLPurifier_Token_Start('style', $style_attr),
        new HTMLPurifier_Token_Text("\ndiv {}\n"),
        new HTMLPurifier_Token_End('style'),
    );

    $this->assertTokenization($html, $expected, $extra);
}
|
|
|
|
/**
 * A tag name followed by an invalid character ('@') and an extra '>'.
 */
function test_tokenizeHTML_() {
    $html = '<a@>>';

    $expected = array(
        new HTMLPurifier_Token_Start('a'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_End('a'),
    );

    // Technically this is invalid, but it won't be a
    // problem with invalid element removal; also, this
    // mimics Mozilla's parsing of the tag.
    $directlex_tokens = array(
        new HTMLPurifier_Token_Start('a@'),
        new HTMLPurifier_Token_Text('>'),
    );

    $overrides = array('DirectLex' => $directlex_tokens);
    $this->assertTokenization($html, $expected, $overrides);
}
|
|
|
|
/*
|
|
|
|
function test_tokenizeHTML_() {
|
|
$this->assertTokenization(
|
|
,
|
|
array(
|
|
|
|
)
|
|
);
|
|
}
|
|
*/
|
|
|
|
}
|
|
|