0
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-01-05 14:11:52 +00:00
htmlpurifier/tests/HTMLPurifier/LexerTest.php
Edward Z. Yang ac18672aba Fix extant broken PEARSax3 parsing patterns.
Signed-off-by: Edward Z. Yang <ezyang@mit.edu>
2010-02-26 21:14:52 -05:00

716 lines
24 KiB
PHP

<?php
class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
{
protected $_has_pear = false;
public function __construct() {
parent::__construct();
if ($GLOBALS['HTMLPurifierTest']['PEAR']) {
require_once 'HTMLPurifier/Lexer/PEARSax3.php';
$this->_has_pear = true;
}
if ($GLOBALS['HTMLPurifierTest']['PH5P']) {
require_once 'HTMLPurifier/Lexer/PH5P.php';
}
}
// HTMLPurifier_Lexer::create() --------------------------------------------
function test_create() {
$this->config->set('Core.MaintainLineNumbers', true);
$lexer = HTMLPurifier_Lexer::create($this->config);
$this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
}
function test_create_objectLexerImpl() {
$this->config->set('Core.LexerImpl', new HTMLPurifier_Lexer_DirectLex());
$lexer = HTMLPurifier_Lexer::create($this->config);
$this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
}
function test_create_unknownLexer() {
$this->config->set('Core.LexerImpl', 'AsdfAsdf');
$this->expectException(new HTMLPurifier_Exception('Cannot instantiate unrecognized Lexer type AsdfAsdf'));
HTMLPurifier_Lexer::create($this->config);
}
function test_create_incompatibleLexer() {
$this->config->set('Core.LexerImpl', 'DOMLex');
$this->config->set('Core.MaintainLineNumbers', true);
$this->expectException(new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'));
HTMLPurifier_Lexer::create($this->config);
}
// HTMLPurifier_Lexer->parseData() -----------------------------------------
function assertParseData($input, $expect = true) {
if ($expect === true) $expect = $input;
$lexer = new HTMLPurifier_Lexer();
$this->assertIdentical($expect, $lexer->parseData($input));
}
function test_parseData_plainText() {
$this->assertParseData('asdf');
}
function test_parseData_ampersandEntity() {
$this->assertParseData('&amp;', '&');
}
function test_parseData_quotEntity() {
$this->assertParseData('&quot;', '"');
}
function test_parseData_aposNumericEntity() {
$this->assertParseData('&#039;', "'");
}
function test_parseData_aposCompactNumericEntity() {
$this->assertParseData('&#39;', "'");
}
function test_parseData_adjacentAmpersandEntities() {
$this->assertParseData('&amp;&amp;&amp;', '&&&');
}
function test_parseData_trailingUnescapedAmpersand() {
$this->assertParseData('&amp;&', '&&');
}
function test_parseData_internalUnescapedAmpersand() {
$this->assertParseData('Procter & Gamble');
}
function test_parseData_improperEntityFaultToleranceTest() {
$this->assertParseData('&#x2D;');
}
// HTMLPurifier_Lexer->extractBody() ---------------------------------------
function assertExtractBody($text, $extract = true) {
$lexer = new HTMLPurifier_Lexer();
$result = $lexer->extractBody($text);
if ($extract === true) $extract = $text;
$this->assertIdentical($extract, $result);
}
function test_extractBody_noBodyTags() {
$this->assertExtractBody('<b>Bold</b>');
}
function test_extractBody_lowercaseBodyTags() {
$this->assertExtractBody('<html><body><b>Bold</b></body></html>', '<b>Bold</b>');
}
function test_extractBody_uppercaseBodyTags() {
$this->assertExtractBody('<HTML><BODY><B>Bold</B></BODY></HTML>', '<B>Bold</B>');
}
function test_extractBody_realisticUseCase() {
$this->assertExtractBody(
'<?xml version="1.0"
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>xyz</title>
</head>
<body>
<form method="post" action="whatever1">
<div>
<input type="text" name="username" />
<input type="text" name="password" />
<input type="submit" />
</div>
</form>
</body>
</html>',
'
<form method="post" action="whatever1">
<div>
<input type="text" name="username" />
<input type="text" name="password" />
<input type="submit" />
</div>
</form>
');
}
function test_extractBody_bodyWithAttributes() {
$this->assertExtractBody('<html><body bgcolor="#F00"><b>Bold</b></body></html>', '<b>Bold</b>');
}
function test_extractBody_preserveUnclosedBody() {
$this->assertExtractBody('<body>asdf'); // not closed, don't accept
}
function test_extractBody_useLastBody() {
$this->assertExtractBody('<body>foo</body>bar</body>', 'foo</body>bar');
}
// HTMLPurifier_Lexer->tokenizeHTML() --------------------------------------
function assertTokenization($input, $expect, $alt_expect = array()) {
$lexers = array();
$lexers['DirectLex'] = new HTMLPurifier_Lexer_DirectLex();
if ($this->_has_pear) $lexers['PEARSax3'] = new HTMLPurifier_Lexer_PEARSax3();
if (class_exists('DOMDocument')) {
$lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
$lexers['PH5P'] = new HTMLPurifier_Lexer_PH5P();
}
foreach ($lexers as $name => $lexer) {
$result = $lexer->tokenizeHTML($input, $this->config, $this->context);
if (isset($alt_expect[$name])) {
if ($alt_expect[$name] === false) continue;
$t_expect = $alt_expect[$name];
$this->assertIdentical($result, $alt_expect[$name], "$name: %s");
} else {
$t_expect = $expect;
$this->assertIdentical($result, $expect, "$name: %s");
}
if ($t_expect != $result) {
printTokens($result);
}
}
}
function test_tokenizeHTML_emptyInput() {
$this->assertTokenization('', array());
}
function test_tokenizeHTML_plainText() {
$this->assertTokenization(
'This is regular text.',
array(
new HTMLPurifier_Token_Text('This is regular text.')
)
);
}
function test_tokenizeHTML_textAndTags() {
$this->assertTokenization(
'This is <b>bold</b> text',
array(
new HTMLPurifier_Token_Text('This is '),
new HTMLPurifier_Token_Start('b', array()),
new HTMLPurifier_Token_Text('bold'),
new HTMLPurifier_Token_End('b'),
new HTMLPurifier_Token_Text(' text'),
)
);
}
function test_tokenizeHTML_normalizeCase() {
$this->assertTokenization(
'<DIV>Totally rad dude. <b>asdf</b></div>',
array(
new HTMLPurifier_Token_Start('DIV', array()),
new HTMLPurifier_Token_Text('Totally rad dude. '),
new HTMLPurifier_Token_Start('b', array()),
new HTMLPurifier_Token_Text('asdf'),
new HTMLPurifier_Token_End('b'),
new HTMLPurifier_Token_End('div'),
)
);
}
function test_tokenizeHTML_notWellFormed() {
$this->assertTokenization(
'<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>',
array(
new HTMLPurifier_Token_Start('asdf'),
new HTMLPurifier_Token_End('asdf'),
new HTMLPurifier_Token_Start('d'),
new HTMLPurifier_Token_End('d'),
new HTMLPurifier_Token_Start('poOloka'),
new HTMLPurifier_Token_Start('poolasdf'),
new HTMLPurifier_Token_Start('ds'),
new HTMLPurifier_Token_End('asdf'),
new HTMLPurifier_Token_End('ASDF'),
),
array(
'DOMLex' => $alt = array(
new HTMLPurifier_Token_Empty('asdf'),
new HTMLPurifier_Token_Empty('d'),
new HTMLPurifier_Token_Start('pooloka'),
new HTMLPurifier_Token_Start('poolasdf'),
new HTMLPurifier_Token_Empty('ds'),
new HTMLPurifier_Token_End('poolasdf'),
new HTMLPurifier_Token_End('pooloka'),
),
'PH5P' => $alt,
)
);
}
function test_tokenizeHTML_whitespaceInTag() {
$this->assertTokenization(
'<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>',
array(
new HTMLPurifier_Token_Start('a',array('href'=>'foobar.php','title'=>'foo!')),
new HTMLPurifier_Token_Text('Link to '),
new HTMLPurifier_Token_Start('b',array('id'=>'asdf')),
new HTMLPurifier_Token_Text('foobar'),
new HTMLPurifier_Token_End('b'),
new HTMLPurifier_Token_End('a'),
)
);
}
function test_tokenizeHTML_emptyTag() {
$this->assertTokenization(
'<br />',
array( new HTMLPurifier_Token_Empty('br') )
);
}
function test_tokenizeHTML_comment() {
$this->assertTokenization(
'<!-- Comment -->',
array( new HTMLPurifier_Token_Comment(' Comment ') )
);
}
function test_tokenizeHTML_malformedComment() {
$this->assertTokenization(
'<!-- not so well formed --->',
array( new HTMLPurifier_Token_Comment(' not so well formed -') )
);
}
function test_tokenizeHTML_unterminatedTag() {
$this->assertTokenization(
'<a href=""',
array( new HTMLPurifier_Token_Text('<a href=""') ),
array(
// I like our behavior better, but it's non-standard
'DOMLex' => array( new HTMLPurifier_Token_Empty('a', array('href'=>'')) ),
'PEARSax3' => array( new HTMLPurifier_Token_Start('a', array('href'=>'')) ),
'PH5P' => false, // total barfing, grabs scaffolding too
)
);
}
function test_tokenizeHTML_specialEntities() {
$this->assertTokenization(
'&lt;b&gt;',
array(
new HTMLPurifier_Token_Text('<b>')
),
array(
// some parsers will separate entities out
'PEARSax3' => $split = array(
new HTMLPurifier_Token_Text('<'),
new HTMLPurifier_Token_Text('b'),
new HTMLPurifier_Token_Text('>'),
),
'PH5P' => $split,
)
);
}
function test_tokenizeHTML_earlyQuote() {
$this->assertTokenization(
'<a "=>',
array( new HTMLPurifier_Token_Empty('a') ),
array(
// we barf on this input
'DirectLex' => $tokens = array(
new HTMLPurifier_Token_Start('a', array('"' => ''))
),
'PEARSax3' => $tokens,
'PH5P' => false, // behavior varies; handle this personally
)
);
}
function test_tokenizeHTML_earlyQuote_PH5P() {
if (!class_exists('DOMDocument')) return;
$lexer = new HTMLPurifier_Lexer_PH5P();
$result = $lexer->tokenizeHTML('<a "=>', $this->config, $this->context);
if ($this->context->get('PH5PError', true)) {
$this->assertIdentical(array(
new HTMLPurifier_Token_Start('a', array('"' => ''))
), $result);
} else {
$this->assertIdentical(array(
new HTMLPurifier_Token_Empty('a', array('"' => ''))
), $result);
}
}
function test_tokenizeHTML_unescapedQuote() {
$this->assertTokenization(
'"',
array( new HTMLPurifier_Token_Text('"') )
);
}
function test_tokenizeHTML_escapedQuote() {
$this->assertTokenization(
'&quot;',
array( new HTMLPurifier_Token_Text('"') ),
array(
'PEARSax3' => false, // PEAR barfs on this
)
);
}
function test_tokenizeHTML_cdata() {
$this->assertTokenization(
'<![CDATA[You <b>can&#39;t</b> get me!]]>',
array( new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!') ),
array(
// PEAR splits up all of the CDATA
'PEARSax3' => $split = array(
new HTMLPurifier_Token_Text('You '),
new HTMLPurifier_Token_Text('<'),
new HTMLPurifier_Token_Text('b'),
new HTMLPurifier_Token_Text('>'),
new HTMLPurifier_Token_Text('can'),
new HTMLPurifier_Token_Text('&'),
new HTMLPurifier_Token_Text('#39;t'),
new HTMLPurifier_Token_Text('<'),
new HTMLPurifier_Token_Text('/b'),
new HTMLPurifier_Token_Text('>'),
new HTMLPurifier_Token_Text(' get me!'),
),
'PH5P' => $split,
)
);
}
function test_tokenizeHTML_characterEntity() {
$this->assertTokenization(
'&theta;',
array( new HTMLPurifier_Token_Text("\xCE\xB8") )
);
}
function test_tokenizeHTML_characterEntityInCDATA() {
$this->assertTokenization(
'<![CDATA[&rarr;]]>',
array( new HTMLPurifier_Token_Text("&rarr;") ),
array(
'PEARSax3' => $split = array(
new HTMLPurifier_Token_Text('&'),
new HTMLPurifier_Token_Text('rarr;'),
),
'PH5P' => $split,
)
);
}
function test_tokenizeHTML_entityInAttribute() {
$this->assertTokenization(
'<a href="index.php?title=foo&amp;id=bar">Link</a>',
array(
new HTMLPurifier_Token_Start('a',array('href' => 'index.php?title=foo&id=bar')),
new HTMLPurifier_Token_Text('Link'),
new HTMLPurifier_Token_End('a'),
)
);
}
function test_tokenizeHTML_preserveUTF8() {
$this->assertTokenization(
"\xCE\xB8",
array( new HTMLPurifier_Token_Text("\xCE\xB8") )
);
}
function test_tokenizeHTML_specialEntityInAttribute() {
$this->assertTokenization(
'<br test="x &lt; 6" />',
array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) )
);
}
function test_tokenizeHTML_emoticonProtection() {
$this->assertTokenization(
'<b>Whoa! <3 That\'s not good >.></b>',
array(
new HTMLPurifier_Token_Start('b'),
new HTMLPurifier_Token_Text('Whoa! '),
new HTMLPurifier_Token_Text('<'),
new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
new HTMLPurifier_Token_End('b')
),
array(
// text is absorbed together
'DOMLex' => array(
new HTMLPurifier_Token_Start('b'),
new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
new HTMLPurifier_Token_End('b'),
),
'PEARSax3' => false, // totally mangled
'PH5P' => array( // interesting grouping
new HTMLPurifier_Token_Start('b'),
new HTMLPurifier_Token_Text('Whoa! '),
new HTMLPurifier_Token_Text('<'),
new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
new HTMLPurifier_Token_End('b'),
),
)
);
}
function test_tokenizeHTML_commentWithFunkyChars() {
$this->assertTokenization(
'<!-- This >< comment --><br />',
array(
new HTMLPurifier_Token_Comment(' This >< comment '),
new HTMLPurifier_Token_Empty('br'),
),
array(
'PEARSax3' => false,
)
);
}
function test_tokenizeHTML_unterminatedComment() {
$this->assertTokenization(
'<!-- This >< comment',
array( new HTMLPurifier_Token_Comment(' This >< comment') ),
array(
'DOMLex' => false,
'PEARSax3' => false,
'PH5P' => false,
)
);
}
function test_tokenizeHTML_scriptCDATAContents() {
$this->config->set('HTML.Trusted', true);
$this->assertTokenization(
'Foo: <script>alert("<foo>");</script>',
array(
new HTMLPurifier_Token_Text('Foo: '),
new HTMLPurifier_Token_Start('script'),
new HTMLPurifier_Token_Text('alert("<foo>");'),
new HTMLPurifier_Token_End('script'),
),
array(
'PEARSax3' => false,
// PH5P, for some reason, bubbles the script to <head>
'PH5P' => false,
)
);
}
function test_tokenizeHTML_entitiesInComment() {
$this->assertTokenization(
'<!-- This comment < &lt; & -->',
array( new HTMLPurifier_Token_Comment(' This comment < &lt; & ') ),
array(
'PEARSax3' => false
)
);
}
function test_tokenizeHTML_attributeWithSpecialCharacters() {
$this->assertTokenization(
'<a href="><>">',
array( new HTMLPurifier_Token_Empty('a', array('href' => '><>')) ),
array(
'DirectLex' => array(
new HTMLPurifier_Token_Start('a', array('href' => '')),
new HTMLPurifier_Token_Text('<'),
new HTMLPurifier_Token_Text('">'),
),
'PEARSax3' => false,
)
);
}
function test_tokenizeHTML_emptyTagWithSlashInAttribute() {
$this->assertTokenization(
'<param name="src" value="http://example.com/video.wmv" />',
array( new HTMLPurifier_Token_Empty('param', array('name' => 'src', 'value' => 'http://example.com/video.wmv')) )
);
}
function test_tokenizeHTML_style() {
$extra = array(
// PH5P doesn't seem to like style tags
'PH5P' => false,
// DirectLex defers to RemoveForeignElements for textification
'DirectLex' => array(
new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
new HTMLPurifier_Token_Comment("\ndiv {}\n"),
new HTMLPurifier_Token_End('style'),
),
);
if (!defined('LIBXML_VERSION')) {
// LIBXML_VERSION is missing in early versions of PHP
// prior to 1.30 of php-src/ext/libxml/libxml.c (version-wise,
// this translates to 5.0.x. In such cases, punt the test entirely.
return;
} elseif (LIBXML_VERSION < 20628) {
// libxml's behavior is wrong prior to this version, so make
// appropriate accomodations
$extra['DOMLex'] = $extra['DirectLex'];
}
$this->assertTokenization(
'<style type="text/css"><!--
div {}
--></style>',
array(
new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
new HTMLPurifier_Token_Text("\ndiv {}\n"),
new HTMLPurifier_Token_End('style'),
),
$extra
);
}
function test_tokenizeHTML_tagWithAtSignAndExtraGt() {
$alt_expect = array(
// Technically this is invalid, but it won't be a
// problem with invalid element removal; also, this
// mimics Mozilla's parsing of the tag.
new HTMLPurifier_Token_Start('a@'),
new HTMLPurifier_Token_Text('>'),
);
$this->assertTokenization(
'<a@>>',
array(
new HTMLPurifier_Token_Start('a'),
new HTMLPurifier_Token_Text('>'),
new HTMLPurifier_Token_End('a'),
),
array(
'DirectLex' => $alt_expect,
'PEARSax3' => $alt_expect,
)
);
}
function test_tokenizeHTML_emoticonHeart() {
$this->assertTokenization(
'<br /><3<br />',
array(
new HTMLPurifier_Token_Empty('br'),
new HTMLPurifier_Token_Text('<'),
new HTMLPurifier_Token_Text('3'),
new HTMLPurifier_Token_Empty('br'),
),
array(
'DOMLex' => array(
new HTMLPurifier_Token_Empty('br'),
new HTMLPurifier_Token_Text('<3'),
new HTMLPurifier_Token_Empty('br'),
),
'PEARSax3' => array(
// bah too lazy to fix this
new HTMLPurifier_Token_Empty('br'),
new HTMLPurifier_Token_Empty('3<br'),
),
)
);
}
function test_tokenizeHTML_emoticonShiftyEyes() {
$this->assertTokenization(
'<b><<</b>',
array(
new HTMLPurifier_Token_Start('b'),
new HTMLPurifier_Token_Text('<'),
new HTMLPurifier_Token_Text('<'),
new HTMLPurifier_Token_End('b'),
),
array(
'DOMLex' => array(
new HTMLPurifier_Token_Start('b'),
new HTMLPurifier_Token_Text('<<'),
new HTMLPurifier_Token_End('b'),
),
'PEARSax3' => array(
// also too lazy to fix
new HTMLPurifier_Token_Start('b'),
new HTMLPurifier_Token_Empty('<<'),
new HTMLPurifier_Token_Text('b>'),
),
)
);
}
function test_tokenizeHTML_eon1996() {
$this->assertTokenization(
'< <b>test</b>',
array(
new HTMLPurifier_Token_Text('<'),
new HTMLPurifier_Token_Text(' '),
new HTMLPurifier_Token_Start('b'),
new HTMLPurifier_Token_Text('test'),
new HTMLPurifier_Token_End('b'),
),
array(
'DOMLex' => array(
new HTMLPurifier_Token_Text('< '),
new HTMLPurifier_Token_Start('b'),
new HTMLPurifier_Token_Text('test'),
new HTMLPurifier_Token_End('b'),
),
'PEARSax3' => array(
// totally doing the wrong thing here
new HTMLPurifier_Token_Text(' '),
new HTMLPurifier_Token_Start('b'),
new HTMLPurifier_Token_Text('test'),
new HTMLPurifier_Token_End('b'),
),
)
);
}
function test_tokenizeHTML_bodyInCDATA() {
$alt_tokens = array(
new HTMLPurifier_Token_Text('<'),
new HTMLPurifier_Token_Text('body'),
new HTMLPurifier_Token_Text('>'),
new HTMLPurifier_Token_Text('Foo'),
new HTMLPurifier_Token_Text('<'),
new HTMLPurifier_Token_Text('/body'),
new HTMLPurifier_Token_Text('>'),
);
$this->assertTokenization(
'<![CDATA[<body>Foo</body>]]>',
array(
new HTMLPurifier_Token_Text('<body>Foo</body>'),
),
array(
'PH5P' => $alt_tokens,
'PEARSax3' => $alt_tokens,
)
);
}
function test_tokenizeHTML_() {
$this->assertTokenization(
'<a><img /></a>',
array(
new HTMLPurifier_Token_Start('a'),
new HTMLPurifier_Token_Empty('img'),
new HTMLPurifier_Token_End('a'),
)
);
}
/*
function test_tokenizeHTML_() {
$this->assertTokenization(
,
array(
)
);
}
*/
}
// vim: et sw=4 sts=4