2006-07-23 00:11:03 +00:00
< ? php
2007-08-01 14:06:59 +00:00
class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
2006-07-23 00:11:03 +00:00
{
2008-12-06 07:28:20 +00:00
2007-11-25 02:24:39 +00:00
// Set to true by the constructor once the PEARSax3 lexer file has been loaded.
protected $_has_pear = false ;
2008-12-06 07:28:20 +00:00
2008-04-21 15:24:18 +00:00
/**
 * Loads the optional lexer implementations enabled for this test run.
 *
 * Reads the 'PEAR' and 'PH5P' flags from $GLOBALS['HTMLPurifierTest'];
 * a missing flag is treated the same as a false one.
 */
public function __construct() {
    parent::__construct();
    // PEARSax3 is not maintained and throws loads of DEPRECATED
    // errors in PHP 5.3
    if (!empty($GLOBALS['HTMLPurifierTest']['PEAR']) &&
        version_compare(PHP_VERSION, '5.3', '<')) {
        require_once 'HTMLPurifier/Lexer/PEARSax3.php';
        $this->_has_pear = true;
    }
    if (!empty($GLOBALS['HTMLPurifierTest']['PH5P'])) {
        require_once 'HTMLPurifier/Lexer/PH5P.php';
    }
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
// HTMLPurifier_Lexer::create() --------------------------------------------
2008-12-06 07:28:20 +00:00
2007-06-18 02:01:01 +00:00
/**
 * With Core.MaintainLineNumbers enabled, create() must yield a DirectLex instance.
 */
public function test_create() {
    $this->config->set('Core.MaintainLineNumbers', true);
    $lexer = HTMLPurifier_Lexer::create($this->config);
    $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
}
2008-12-06 07:28:20 +00:00
2008-09-05 18:04:23 +00:00
/**
 * Core.LexerImpl may be given as a lexer object; create() must yield a DirectLex.
 */
public function test_create_objectLexerImpl() {
    $this->config->set('Core.LexerImpl', new HTMLPurifier_Lexer_DirectLex());
    $lexer = HTMLPurifier_Lexer::create($this->config);
    $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
}
2008-12-06 07:28:20 +00:00
2008-09-05 18:04:23 +00:00
/**
 * An unrecognized Core.LexerImpl name must make create() throw.
 */
public function test_create_unknownLexer() {
    $this->config->set('Core.LexerImpl', 'AsdfAsdf');
    // The expectation has to be registered before the call that throws.
    $this->expectException(new HTMLPurifier_Exception('Cannot instantiate unrecognized Lexer type AsdfAsdf'));
    HTMLPurifier_Lexer::create($this->config);
}
2008-12-06 07:28:20 +00:00
2008-09-05 18:04:23 +00:00
/**
 * Requesting DOMLex together with Core.MaintainLineNumbers must make create() throw.
 */
public function test_create_incompatibleLexer() {
    $this->config->set('Core.LexerImpl', 'DOMLex');
    $this->config->set('Core.MaintainLineNumbers', true);
    // The expectation has to be registered before the call that throws.
    $this->expectException(new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'));
    HTMLPurifier_Lexer::create($this->config);
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
// HTMLPurifier_Lexer->parseData() -----------------------------------------
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * Asserts that HTMLPurifier_Lexer::parseData() turns $input into $expect.
 *
 * @param string      $input  Raw character data handed to parseData().
 * @param string|bool $expect Expected result; true (the default) means
 *                            "identical to $input".
 */
public function assertParseData($input, $expect = true) {
    if ($expect === true) {
        $expect = $input;
    }
    $lexer = new HTMLPurifier_Lexer();
    $this->assertIdentical($expect, $lexer->parseData($input));
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * Text without entities must come back from parseData() unchanged.
 */
public function test_parseData_plainText() {
    $this->assertParseData('asdf');
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * The named ampersand entity must decode to a bare ampersand.
 */
public function test_parseData_ampersandEntity() {
    // NOTE(review): input restored to the entity form implied by the test
    // name; confirm against upstream.
    $this->assertParseData('&amp;', '&');
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * The named quote entity must decode to a double quote.
 */
public function test_parseData_quotEntity() {
    // NOTE(review): input restored to the entity form implied by the test
    // name; confirm against upstream.
    $this->assertParseData('&quot;', '"');
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * A zero-padded numeric apostrophe entity must decode to an apostrophe.
 */
public function test_parseData_aposNumericEntity() {
    // NOTE(review): the literal arrived as an unterminated ''' ; restored to
    // the zero-padded numeric form implied by the test name - confirm.
    $this->assertParseData('&#039;', "'");
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * The compact (unpadded) numeric apostrophe entity must decode to an apostrophe.
 */
public function test_parseData_aposCompactNumericEntity() {
    // NOTE(review): the literal arrived as an unterminated ''' ; restored to
    // the compact numeric form implied by the test name - confirm.
    $this->assertParseData('&#39;', "'");
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * Three ampersand entities in a row must decode to three ampersands.
 */
public function test_parseData_adjacentAmpersandEntities() {
    // NOTE(review): input restored to entity form per the test name; confirm.
    $this->assertParseData('&amp;&amp;&amp;', '&&&');
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * An ampersand entity followed by a bare trailing ampersand must yield '&&'.
 */
public function test_parseData_trailingUnescapedAmpersand() {
    // NOTE(review): input restored as "entity + bare ampersand" per the test
    // name; confirm against upstream.
    $this->assertParseData('&amp;&', '&&');
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * A bare ampersand inside running text must be left untouched.
 */
public function test_parseData_internalUnescapedAmpersand() {
    $this->assertParseData('Procter & Gamble');
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * Asserts that parseData() returns this input unchanged.
 */
public function test_parseData_improperEntityFaultToleranceTest() {
    // NOTE(review): given the test name, this literal was presumably a
    // malformed entity before the file was extracted; the original spelling
    // cannot be recovered from here - verify against upstream.
    $this->assertParseData('-');
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
// HTMLPurifier_Lexer->extractBody() ---------------------------------------
2008-12-06 07:28:20 +00:00
2006-08-15 00:53:24 +00:00
/**
 * Asserts that HTMLPurifier_Lexer::extractBody() reduces $text to $extract.
 *
 * @param string      $text    Document handed to extractBody().
 * @param string|bool $extract Expected result; true (the default) means
 *                             "identical to $text".
 */
public function assertExtractBody($text, $extract = true) {
    if ($extract === true) {
        $extract = $text;
    }
    $lexer = new HTMLPurifier_Lexer();
    $result = $lexer->extractBody($text);
    $this->assertIdentical($extract, $result);
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * Input without body tags must be returned unchanged.
 */
public function test_extractBody_noBodyTags() {
    $this->assertExtractBody('<b>Bold</b>');
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * Content between lowercase body tags must be extracted.
 */
public function test_extractBody_lowercaseBodyTags() {
    $this->assertExtractBody('<html><body><b>Bold</b></body></html>', '<b>Bold</b>');
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * Content between uppercase BODY tags must be extracted with its case intact.
 */
public function test_extractBody_uppercaseBodyTags() {
    $this->assertExtractBody('<HTML><BODY><B>Bold</B></BODY></HTML>', '<B>Bold</B>');
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * A full XHTML document must be reduced to exactly what sits between its body tags.
 */
public function test_extractBody_realisticUseCase() {
    // NOTE(review): the document literal arrived with padding spaces between
    // every markup token and its original indentation lost; it is rebuilt
    // here line for line without indentation. The expected value is exactly
    // the text between the body tags of this document, including the
    // leading and trailing newline. Confirm against upstream.
    $this->assertExtractBody(
'<?xml version="1.0"
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>xyz</title>
</head>
<body>
<form method="post" action="whatever1">
<div>
<input type="text" name="username" />
<input type="text" name="password" />
<input type="submit" />
</div>
</form>
</body>
</html>',
'
<form method="post" action="whatever1">
<div>
<input type="text" name="username" />
<input type="text" name="password" />
<input type="submit" />
</div>
</form>
');
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * Attributes on the opening body tag must not prevent extraction.
 */
public function test_extractBody_bodyWithAttributes() {
    $this->assertExtractBody('<html><body bgcolor="#F00"><b>Bold</b></body></html>', '<b>Bold</b>');
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * An opening body tag without a closing one must leave the input untouched.
 */
public function test_extractBody_preserveUnclosedBody() {
    $this->assertExtractBody('<body>asdf'); // not closed, don't accept
}
2008-12-06 07:28:20 +00:00
2009-07-08 02:19:04 +00:00
/**
 * With two closing body tags, extraction must run up to the last one.
 */
public function test_extractBody_useLastBody() {
    $this->assertExtractBody('<body>foo</body>bar</body>', 'foo</body>bar');
}
2007-08-16 06:48:24 +00:00
// HTMLPurifier_Lexer->tokenizeHTML() --------------------------------------
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * Tokenizes $input with every available lexer and checks each result.
 *
 * DirectLex always runs; PEARSax3 runs when the constructor loaded it;
 * DOMLex and PH5P run when the DOMDocument class exists.
 *
 * @param string $input      HTML handed to tokenizeHTML().
 * @param array  $expect     Tokens expected from any lexer without an override.
 * @param array  $alt_expect Overrides keyed by lexer name; a value of false
 *                           skips the comparison for that lexer.
 */
public function assertTokenization($input, $expect, $alt_expect = array()) {
    $lexers = array();
    $lexers['DirectLex'] = new HTMLPurifier_Lexer_DirectLex();
    if ($this->_has_pear) {
        $lexers['PEARSax3'] = new HTMLPurifier_Lexer_PEARSax3();
    }
    if (class_exists('DOMDocument')) {
        $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
        $lexers['PH5P'] = new HTMLPurifier_Lexer_PH5P();
    }
    foreach ($lexers as $name => $lexer) {
        // Tokenize before the skip check, so a skipped lexer is still run.
        $result = $lexer->tokenizeHTML($input, $this->config, $this->context);
        if (isset($alt_expect[$name])) {
            if ($alt_expect[$name] === false) {
                continue;
            }
            $t_expect = $alt_expect[$name];
        } else {
            $t_expect = $expect;
        }
        $this->assertIdentical($result, $t_expect, "$name: %s");
        // Dump the actual tokens on a mismatch to ease debugging.
        if ($t_expect != $result) {
            printTokens($result);
        }
    }
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * An empty string must tokenize to an empty token list.
 */
public function test_tokenizeHTML_emptyInput() {
    $this->assertTokenization('', array());
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * Plain text must tokenize to a single text token.
 */
public function test_tokenizeHTML_plainText() {
    $this->assertTokenization(
        'This is regular text.',
        array(
            new HTMLPurifier_Token_Text('This is regular text.')
        )
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * Text mixed with a simple element must split into text, start and end tokens.
 */
public function test_tokenizeHTML_textAndTags() {
    $this->assertTokenization(
        'This is <b>bold</b> text',
        array(
            new HTMLPurifier_Token_Text('This is '),
            new HTMLPurifier_Token_Start('b', array()),
            new HTMLPurifier_Token_Text('bold'),
            new HTMLPurifier_Token_End('b'),
            new HTMLPurifier_Token_Text(' text'),
        )
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * Tokenizes markup whose opening and closing tag names differ in case.
 */
public function test_tokenizeHTML_normalizeCase() {
    $this->assertTokenization(
        '<DIV>Totally rad dude. <b>asdf</b></div>',
        array(
            new HTMLPurifier_Token_Start('DIV', array()),
            new HTMLPurifier_Token_Text('Totally rad dude. '),
            new HTMLPurifier_Token_Start('b', array()),
            new HTMLPurifier_Token_Text('asdf'),
            new HTMLPurifier_Token_End('b'),
            new HTMLPurifier_Token_End('div'),
        )
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * Badly nested, unknown tags: the default expectation keeps them as written,
 * while DOMLex and PH5P are expected to return a repaired tree.
 */
public function test_tokenizeHTML_notWellFormed() {
    // Shared expectation for DOMLex and PH5P.
    $alt = array(
        new HTMLPurifier_Token_Empty('asdf'),
        new HTMLPurifier_Token_Empty('d'),
        new HTMLPurifier_Token_Start('pooloka'),
        new HTMLPurifier_Token_Start('poolasdf'),
        new HTMLPurifier_Token_Empty('ds'),
        new HTMLPurifier_Token_End('poolasdf'),
        new HTMLPurifier_Token_End('pooloka'),
    );
    $this->assertTokenization(
        '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>',
        array(
            new HTMLPurifier_Token_Start('asdf'),
            new HTMLPurifier_Token_End('asdf'),
            new HTMLPurifier_Token_Start('d'),
            new HTMLPurifier_Token_End('d'),
            new HTMLPurifier_Token_Start('poOloka'),
            new HTMLPurifier_Token_Start('poolasdf'),
            new HTMLPurifier_Token_Start('ds'),
            new HTMLPurifier_Token_End('asdf'),
            new HTMLPurifier_Token_End('ASDF'),
        ),
        array(
            'DOMLex' => $alt,
            'PH5P' => $alt,
        )
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * A tab and a newline between attributes must be accepted as separators.
 */
public function test_tokenizeHTML_whitespaceInTag() {
    // NOTE(review): the escape-sequence literals arrived padded with spaces;
    // the padding is removed here - confirm against upstream.
    $this->assertTokenization(
        '<a' . "\t" . 'href="foobar.php"' . "\n" . 'title="foo!">Link to <b id="asdf">foobar</b></a>',
        array(
            new HTMLPurifier_Token_Start('a', array('href' => 'foobar.php', 'title' => 'foo!')),
            new HTMLPurifier_Token_Text('Link to '),
            new HTMLPurifier_Token_Start('b', array('id' => 'asdf')),
            new HTMLPurifier_Token_Text('foobar'),
            new HTMLPurifier_Token_End('b'),
            new HTMLPurifier_Token_End('a'),
        )
    );
}
2008-12-06 07:28:20 +00:00
2010-06-01 02:44:18 +00:00
/**
 * An empty tag with a single attribute must keep that attribute's value.
 */
public function test_tokenizeHTML_singleAttribute() {
    // NOTE(review): the attribute value may originally have been an entity
    // rather than a bare ampersand; kept as found - verify against upstream.
    $this->assertTokenization(
        '<br style="&" />',
        array(
            new HTMLPurifier_Token_Empty('br', array('style' => '&'))
        )
    );
}
2007-08-16 06:48:24 +00:00
/**
 * A self-closing tag must tokenize to a single empty-tag token.
 */
public function test_tokenizeHTML_emptyTag() {
    $this->assertTokenization(
        '<br />',
        array(new HTMLPurifier_Token_Empty('br'))
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * A comment must tokenize to a comment token holding its inner text.
 */
public function test_tokenizeHTML_comment() {
    $this->assertTokenization(
        '<!-- Comment -->',
        array(new HTMLPurifier_Token_Comment(' Comment '))
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * A comment closed with three dashes must keep the extra dash in its text.
 */
public function test_tokenizeHTML_malformedComment() {
    $this->assertTokenization(
        '<!-- not so well formed --->',
        array(new HTMLPurifier_Token_Comment(' not so well formed -'))
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * A tag that is never closed: by default it is expected back as plain text,
 * with per-lexer overrides for DOMLex and PEARSax3; PH5P is skipped.
 */
public function test_tokenizeHTML_unterminatedTag() {
    $this->assertTokenization(
        '<a href=""',
        array(new HTMLPurifier_Token_Text('<a href=""')),
        array(
            // I like our behavior better, but it's non-standard
            'DOMLex' => array(new HTMLPurifier_Token_Empty('a', array('href' => ''))),
            'PEARSax3' => array(new HTMLPurifier_Token_Start('a', array('href' => ''))),
            'PH5P' => false, // total barfing, grabs scaffolding too
        )
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * Escaped angle brackets must come back as text, never as a tag.
 */
public function test_tokenizeHTML_specialEntities() {
    // NOTE(review): the input arrived as a literal <b> tag, which contradicts
    // the text-token expectations below; restored to its escaped form -
    // confirm against upstream.
    // some parsers will separate entities out
    $split = array(
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('b'),
        new HTMLPurifier_Token_Text('>'),
    );
    $this->assertTokenization(
        '&lt;b&gt;',
        array(
            new HTMLPurifier_Token_Text('<b>')
        ),
        array(
            'PEARSax3' => $split,
            'PH5P' => $split,
        )
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * A stray quote where an attribute name belongs; PH5P is covered by its own test.
 */
public function test_tokenizeHTML_earlyQuote() {
    // we barf on this input
    $tokens = array(
        new HTMLPurifier_Token_Start('a', array('"' => ''))
    );
    $this->assertTokenization(
        '<a "=>',
        array(new HTMLPurifier_Token_Empty('a')),
        array(
            'DirectLex' => $tokens,
            'PEARSax3' => $tokens,
            'PH5P' => false, // behavior varies; handle this personally
        )
    );
}
2008-12-06 07:28:20 +00:00
2008-04-26 19:47:22 +00:00
/**
 * PH5P-specific check of the early-quote input.
 *
 * The expected token type depends on whether the context holds a truthy
 * 'PH5PError' entry after tokenizing. Skipped when DOMDocument is missing.
 */
public function test_tokenizeHTML_earlyQuote_PH5P() {
    if (!class_exists('DOMDocument')) {
        return;
    }
    $lexer = new HTMLPurifier_Lexer_PH5P();
    $result = $lexer->tokenizeHTML('<a "=>', $this->config, $this->context);
    if ($this->context->get('PH5PError', true)) {
        $expected = array(
            new HTMLPurifier_Token_Start('a', array('"' => ''))
        );
    } else {
        $expected = array(
            new HTMLPurifier_Token_Empty('a', array('"' => ''))
        );
    }
    $this->assertIdentical($expected, $result);
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * A bare double quote must tokenize to a text token holding that quote.
 */
public function test_tokenizeHTML_unescapedQuote() {
    $this->assertTokenization(
        '"',
        array(new HTMLPurifier_Token_Text('"'))
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * An escaped double quote must tokenize to a text token holding a bare quote.
 */
public function test_tokenizeHTML_escapedQuote() {
    // NOTE(review): the input arrived as a bare quote, identical to the
    // unescaped-quote test; restored to entity form per the test name -
    // confirm against upstream.
    $this->assertTokenization(
        '&quot;',
        array(new HTMLPurifier_Token_Text('"')),
        array(
            'PEARSax3' => false, // PEAR barfs on this
        )
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * CDATA content must come back verbatim as text, markup and entity included.
 */
public function test_tokenizeHTML_cdata() {
    // NOTE(review): the apostrophe inside the literals arrived as a bare
    // quote that terminated the string; restored to the numeric entity that
    // the split tokens below ('&' followed by '#39;t') spell out.
    // PEAR splits up all of the CDATA
    $split = array(
        new HTMLPurifier_Token_Text('You '),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('b'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_Text('can'),
        new HTMLPurifier_Token_Text('&'),
        new HTMLPurifier_Token_Text('#39;t'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('/b'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_Text(' get me!'),
    );
    $this->assertTokenization(
        '<![CDATA[You <b>can&#39;t</b> get me!]]>',
        array(new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!')),
        array(
            'PEARSax3' => $split,
            'PH5P' => $split,
        )
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * A named character entity must decode to its UTF-8 byte sequence.
 */
public function test_tokenizeHTML_characterEntity() {
    // NOTE(review): the input arrived already decoded and the expected bytes
    // arrived space-padded; both restored per the test name - confirm.
    $this->assertTokenization(
        '&theta;',
        array(new HTMLPurifier_Token_Text("\xCE\xB8"))
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * An entity inside CDATA must stay undecoded in the resulting text.
 */
public function test_tokenizeHTML_characterEntityInCDATA() {
    // NOTE(review): the entity arrived already decoded; restored to the
    // spelling that the split tokens below ('&' followed by 'rarr;') give.
    $split = array(
        new HTMLPurifier_Token_Text('&'),
        new HTMLPurifier_Token_Text('rarr;'),
    );
    $this->assertTokenization(
        '<![CDATA[&rarr;]]>',
        array(new HTMLPurifier_Token_Text("&rarr;")),
        array(
            'PEARSax3' => $split,
            'PH5P' => $split,
        )
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * An ampersand in an href value must reach the attribute array as a bare ampersand.
 */
public function test_tokenizeHTML_entityInAttribute() {
    // NOTE(review): given the test name, the ampersand in the input was
    // presumably written as an entity; kept as found - verify upstream.
    $this->assertTokenization(
        '<a href="index.php?title=foo&id=bar">Link</a>',
        array(
            new HTMLPurifier_Token_Start('a', array('href' => 'index.php?title=foo&id=bar')),
            new HTMLPurifier_Token_Text('Link'),
            new HTMLPurifier_Token_End('a'),
        )
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * A raw UTF-8 byte sequence must pass through tokenization unchanged.
 */
public function test_tokenizeHTML_preserveUTF8() {
    // NOTE(review): both literals arrived with spaces padded around each
    // escape; the padding is removed here - confirm against upstream.
    $this->assertTokenization(
        "\xCE\xB8",
        array(new HTMLPurifier_Token_Text("\xCE\xB8"))
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * A less-than sign inside an attribute value must survive in the parsed value.
 */
public function test_tokenizeHTML_specialEntityInAttribute() {
    // NOTE(review): given the test name, the less-than sign in the input was
    // presumably written as an entity; kept as found - verify upstream.
    $this->assertTokenization(
        '<br test="x < 6" />',
        array(new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')))
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * Emoticon-style angle brackets inside text must not be taken for tags.
 */
public function test_tokenizeHTML_emoticonProtection() {
    $this->assertTokenization(
        '<b>Whoa! <3 That\'s not good >.></b>',
        array(
            new HTMLPurifier_Token_Start('b'),
            new HTMLPurifier_Token_Text('Whoa! '),
            new HTMLPurifier_Token_Text('<'),
            new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
            new HTMLPurifier_Token_End('b')
        ),
        array(
            // text is absorbed together
            'DOMLex' => array(
                new HTMLPurifier_Token_Start('b'),
                new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
                new HTMLPurifier_Token_End('b'),
            ),
            'PEARSax3' => false, // totally mangled
            'PH5P' => array( // interesting grouping
                new HTMLPurifier_Token_Start('b'),
                new HTMLPurifier_Token_Text('Whoa! '),
                new HTMLPurifier_Token_Text('<'),
                new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
                new HTMLPurifier_Token_End('b'),
            ),
        )
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * Angle brackets inside a comment must not end it early; PEARSax3 is skipped.
 */
public function test_tokenizeHTML_commentWithFunkyChars() {
    $this->assertTokenization(
        '<!-- This >< comment --><br />',
        array(
            new HTMLPurifier_Token_Comment(' This >< comment '),
            new HTMLPurifier_Token_Empty('br'),
        ),
        array(
            'PEARSax3' => false,
        )
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * A comment that never closes must still become a comment token.
 * Only DirectLex is compared; the other lexers are skipped.
 */
public function test_tokenizeHTML_unterminatedComment() {
    $this->assertTokenization(
        '<!-- This >< comment',
        array(new HTMLPurifier_Token_Comment(' This >< comment')),
        array(
            'DOMLex' => false,
            'PEARSax3' => false,
            'PH5P' => false,
        )
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * With HTML.Trusted on, script contents must come back as one text token
 * even when they contain tag-like text.
 */
public function test_tokenizeHTML_scriptCDATAContents() {
    $this->config->set('HTML.Trusted', true);
    $this->assertTokenization(
        'Foo: <script>alert("<foo>");</script>',
        array(
            new HTMLPurifier_Token_Text('Foo: '),
            new HTMLPurifier_Token_Start('script'),
            new HTMLPurifier_Token_Text('alert("<foo>");'),
            new HTMLPurifier_Token_End('script'),
        ),
        array(
            'PEARSax3' => false,
            // PH5P, for some reason, bubbles the script to <head>
            'PH5P' => false,
        )
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * Comment text must be carried into the comment token verbatim.
 */
public function test_tokenizeHTML_entitiesInComment() {
    // NOTE(review): given the test name, one or more of these characters was
    // presumably written as an entity; the original spelling cannot be
    // recovered from here, so both literals are kept as found.
    $this->assertTokenization(
        '<!-- This comment < < & -->',
        array(new HTMLPurifier_Token_Comment(' This comment < < & ')),
        array(
            'PEARSax3' => false
        )
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * Angle brackets inside a quoted attribute value; DirectLex has its own
 * expectation and PEARSax3 is skipped.
 */
public function test_tokenizeHTML_attributeWithSpecialCharacters() {
    $this->assertTokenization(
        '<a href="><>">',
        array(new HTMLPurifier_Token_Empty('a', array('href' => '><>'))),
        array(
            'DirectLex' => array(
                new HTMLPurifier_Token_Start('a', array('href' => '')),
                new HTMLPurifier_Token_Text('<'),
                new HTMLPurifier_Token_Text('">'),
            ),
            'PEARSax3' => false,
        )
    );
}
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
/**
 * Slashes inside an attribute value must not be taken for the tag's closing slash.
 */
public function test_tokenizeHTML_emptyTagWithSlashInAttribute() {
    $this->assertTokenization(
        '<param name="src" value="http://example.com/video.wmv" />',
        array(new HTMLPurifier_Token_Empty('param', array('name' => 'src', 'value' => 'http://example.com/video.wmv')))
    );
}
2008-12-06 07:28:20 +00:00
2008-04-05 04:28:37 +00:00
/**
 * Comment-wrapped CSS inside a style element.
 *
 * The default expectation is a text token for the CSS; DirectLex is
 * expected to emit a comment token instead, and PH5P is skipped. The whole
 * test is skipped when LIBXML_VERSION is undefined.
 */
public function test_tokenizeHTML_style() {
    // NOTE(review): the CSS literals arrived padded with spaces; they are
    // restored here to match the "div {}" line of the input - confirm.
    $extra = array(
        // PH5P doesn't seem to like style tags
        'PH5P' => false,
        // DirectLex defers to RemoveForeignElements for textification
        'DirectLex' => array(
            new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
            new HTMLPurifier_Token_Comment("\ndiv {}\n"),
            new HTMLPurifier_Token_End('style'),
        ),
    );
    if (!defined('LIBXML_VERSION')) {
        // LIBXML_VERSION is missing in early versions of PHP
        // prior to 1.30 of php-src/ext/libxml/libxml.c (version-wise,
        // this translates to 5.0.x). In such cases, punt the test entirely.
        return;
    } elseif (LIBXML_VERSION < 20628) {
        // libxml's behavior is wrong prior to this version, so make
        // appropriate accommodations
        $extra['DOMLex'] = $extra['DirectLex'];
    }
    $this->assertTokenization(
'<style type="text/css"><!--
div {}
--></style>',
        array(
            new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
            new HTMLPurifier_Token_Text("\ndiv {}\n"),
            new HTMLPurifier_Token_End('style'),
        ),
        $extra
    );
}
2008-12-06 07:28:20 +00:00
2008-06-28 04:43:02 +00:00
/**
 * A tag name containing an at sign, followed by a surplus '>'.
 */
public function test_tokenizeHTML_tagWithAtSignAndExtraGt() {
    $alt_expect = array(
        // Technically this is invalid, but it won't be a
        // problem with invalid element removal; also, this
        // mimics Mozilla's parsing of the tag.
        new HTMLPurifier_Token_Start('a@'),
        new HTMLPurifier_Token_Text('>'),
    );
    $this->assertTokenization(
        '<a@>>',
        array(
            new HTMLPurifier_Token_Start('a'),
            new HTMLPurifier_Token_Text('>'),
            new HTMLPurifier_Token_End('a'),
        ),
        array(
            'DirectLex' => $alt_expect,
            'PEARSax3' => $alt_expect,
        )
    );
}
2008-12-06 07:28:20 +00:00
2008-06-28 04:43:02 +00:00
/**
 * A "less-than three" heart between two br tags must stay text.
 */
public function test_tokenizeHTML_emoticonHeart() {
    $this->assertTokenization(
        '<br /><3<br />',
        array(
            new HTMLPurifier_Token_Empty('br'),
            new HTMLPurifier_Token_Text('<'),
            new HTMLPurifier_Token_Text('3'),
            new HTMLPurifier_Token_Empty('br'),
        ),
        array(
            'DOMLex' => array(
                new HTMLPurifier_Token_Empty('br'),
                new HTMLPurifier_Token_Text('<3'),
                new HTMLPurifier_Token_Empty('br'),
            ),
            'PEARSax3' => array(
                // bah too lazy to fix this
                new HTMLPurifier_Token_Empty('br'),
                new HTMLPurifier_Token_Empty('3<br'),
            ),
        )
    );
}
2008-12-06 07:28:20 +00:00
2008-06-28 04:43:02 +00:00
/**
 * Two consecutive less-than signs inside an element must stay text.
 */
public function test_tokenizeHTML_emoticonShiftyEyes() {
    $this->assertTokenization(
        '<b><<</b>',
        array(
            new HTMLPurifier_Token_Start('b'),
            new HTMLPurifier_Token_Text('<'),
            new HTMLPurifier_Token_Text('<'),
            new HTMLPurifier_Token_End('b'),
        ),
        array(
            'DOMLex' => array(
                new HTMLPurifier_Token_Start('b'),
                new HTMLPurifier_Token_Text('<<'),
                new HTMLPurifier_Token_End('b'),
            ),
            'PEARSax3' => array(
                // also too lazy to fix
                new HTMLPurifier_Token_Start('b'),
                new HTMLPurifier_Token_Empty('<<'),
                new HTMLPurifier_Token_Text('b>'),
            ),
        )
    );
}
2008-12-06 07:28:20 +00:00
2008-06-28 04:43:02 +00:00
/**
 * A less-than sign followed by a space must stay text ahead of a real tag.
 */
public function test_tokenizeHTML_eon1996() {
    $this->assertTokenization(
        '< <b>test</b>',
        array(
            new HTMLPurifier_Token_Text('<'),
            new HTMLPurifier_Token_Text(' '),
            new HTMLPurifier_Token_Start('b'),
            new HTMLPurifier_Token_Text('test'),
            new HTMLPurifier_Token_End('b'),
        ),
        array(
            'DOMLex' => array(
                new HTMLPurifier_Token_Text('< '),
                new HTMLPurifier_Token_Start('b'),
                new HTMLPurifier_Token_Text('test'),
                new HTMLPurifier_Token_End('b'),
            ),
            'PEARSax3' => array(
                // totally doing the wrong thing here
                new HTMLPurifier_Token_Text(' '),
                new HTMLPurifier_Token_Start('b'),
                new HTMLPurifier_Token_Text('test'),
                new HTMLPurifier_Token_End('b'),
            ),
        )
    );
}
2008-12-06 07:28:20 +00:00
2008-08-01 23:06:28 +00:00
/**
 * Body tags inside CDATA must come back as text rather than be extracted.
 */
public function test_tokenizeHTML_bodyInCDATA() {
    // Expectation shared by the lexers that split the CDATA into pieces.
    $alt_tokens = array(
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('body'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_Text('Foo'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('/body'),
        new HTMLPurifier_Token_Text('>'),
    );
    $this->assertTokenization(
        '<![CDATA[<body>Foo</body>]]>',
        array(
            new HTMLPurifier_Token_Text('<body>Foo</body>'),
        ),
        array(
            'PH5P' => $alt_tokens,
            'PEARSax3' => $alt_tokens,
        )
    );
}
2008-12-06 07:28:20 +00:00
2010-02-27 01:42:42 +00:00
/**
 * A self-closing img nested in an anchor must yield start, empty and end tokens.
 *
 * NOTE(review): the method name still ends in a bare underscore, like the
 * commented-out template at the bottom of the class; it is kept unchanged
 * here, but consider giving it a descriptive suffix.
 */
public function test_tokenizeHTML_() {
    $this->assertTokenization(
        '<a><img /></a>',
        array(
            new HTMLPurifier_Token_Start('a'),
            new HTMLPurifier_Token_Empty('img'),
            new HTMLPurifier_Token_End('a'),
        )
    );
}
2010-06-18 13:08:54 +00:00
/**
 * An IE conditional comment and everything inside it must yield no tokens.
 */
public function test_tokenizeHTML_ignoreIECondComment() {
    $this->assertTokenization(
        '<!--[if IE]>foo<a>bar<!-- baz --><![endif]-->',
        array()
    );
}
2010-06-21 01:26:44 +00:00
/**
 * With Core.RemoveProcessingInstructions on, a processing instruction yields no tokens.
 */
public function test_tokenizeHTML_removeProcessingInstruction() {
    $this->config->set('Core.RemoveProcessingInstructions', true);
    $this->assertTokenization(
        '<?xml blah blah ?>',
        array()
    );
}
2010-09-10 20:51:55 +00:00
/**
 * With Core.NormalizeNewlines on, CR and CRLF must both become LF.
 */
public function test_tokenizeHTML_removeNewline() {
    // NOTE(review): both literals arrived with spaces padded around each
    // escape; the padding is removed here - confirm against upstream.
    $this->config->set('Core.NormalizeNewlines', true);
    $this->assertTokenization(
        "plain\rtext\r\n",
        array(
            new HTMLPurifier_Token_Text("plain\ntext\n")
        )
    );
}
/**
 * With Core.NormalizeNewlines off, CR and CRLF must be left as they are.
 */
public function test_tokenizeHTML_noRemoveNewline() {
    // NOTE(review): both literals arrived with spaces padded around each
    // escape; the padding is removed here - confirm against upstream.
    $this->config->set('Core.NormalizeNewlines', false);
    $this->assertTokenization(
        "plain\rtext\r\n",
        array(
            new HTMLPurifier_Token_Text("plain\rtext\r\n")
        )
    );
}
2010-09-28 14:22:38 +00:00
/**
 * Two conditional comments must be stripped separately, keeping the text between them.
 */
public function test_tokenizeHTML_conditionalCommentUngreedy() {
    // NOTE(review): the expected literal arrived padded with spaces, which
    // the input between the two comments does not contain; padding removed.
    $this->assertTokenization(
        '<!--[if gte mso 9]>a<![endif]-->b<!--[if gte mso 9]>c<![endif]-->',
        array(
            new HTMLPurifier_Token_Text("b")
        )
    );
}
2010-10-28 16:24:07 +00:00
/**
 * An img tag without a closing slash: an empty-tag token by default, a
 * start token for DirectLex and PEARSax3.
 */
public function test_tokenizeHTML_imgTag() {
    // Attributes expected on the token, whichever token type a lexer emits.
    $attr = array(
        'src' => 'img_11775.jpg',
        'alt' => '[Img #11775]',
        'id' => 'EMBEDDED_IMG_11775',
    );
    $start = array(
        new HTMLPurifier_Token_Start('img', $attr)
    );
    $this->assertTokenization(
        '<img src="img_11775.jpg" alt="[Img #11775]" id="EMBEDDED_IMG_11775" >',
        array(
            new HTMLPurifier_Token_Empty('img', $attr)
        ),
        array(
            'DirectLex' => $start,
            'PEARSax3' => $start,
        )
    );
}
2010-09-10 20:51:55 +00:00
2007-08-16 06:48:24 +00:00
/*
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
function test_tokenizeHTML_ () {
$this -> assertTokenization (
,
array (
2008-12-06 07:28:20 +00:00
2007-08-16 06:48:24 +00:00
)
2007-08-08 05:05:30 +00:00
);
2006-07-23 00:11:03 +00:00
}
2007-08-16 06:48:24 +00:00
*/
2008-12-06 07:28:20 +00:00
2006-07-23 00:11:03 +00:00
}
2008-12-06 09:24:59 +00:00
// vim: et sw=4 sts=4