2006-07-23 00:11:03 +00:00
< ? php
2007-08-01 14:06:59 +00:00
class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
2006-07-23 00:11:03 +00:00
{
2008-12-06 02:28:20 -05:00
2007-11-25 02:24:39 +00:00
protected $_has_pear = false ;
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * Runs the parent harness constructor and loads the PH5P lexer source
 * file when the PH5P flag in the global test settings is truthy.
 */
public function __construct()
{
    parent::__construct();
    // !empty() keeps the original truthiness check but does not raise an
    // undefined-index notice when the flag (or the whole HTMLPurifierTest
    // entry) has not been set.
    if (!empty($GLOBALS['HTMLPurifierTest']['PH5P'])) {
        require_once 'HTMLPurifier/Lexer/PH5P.php';
    }
}
2008-12-06 02:28:20 -05:00
2007-08-16 06:48:24 +00:00
// HTMLPurifier_Lexer::create() --------------------------------------------
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * With Core.MaintainLineNumbers switched on, the lexer returned by the
 * factory is asserted to be a DirectLex.
 */
public function test_create()
{
    $config = $this->config;
    $config->set('Core.MaintainLineNumbers', true);
    $created = HTMLPurifier_Lexer::create($config);
    $this->assertIsA($created, 'HTMLPurifier_Lexer_DirectLex');
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * Core.LexerImpl is set to a lexer object rather than a name; the factory
 * result is asserted to be a DirectLex.
 */
public function test_create_objectLexerImpl()
{
    $impl = new HTMLPurifier_Lexer_DirectLex();
    $this->config->set('Core.LexerImpl', $impl);
    $created = HTMLPurifier_Lexer::create($this->config);
    $this->assertIsA($created, 'HTMLPurifier_Lexer_DirectLex');
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * An unrecognised Core.LexerImpl name is expected to make the factory
 * throw an HTMLPurifier_Exception with the message below.
 */
public function test_create_unknownLexer()
{
    $this->config->set('Core.LexerImpl', 'AsdfAsdf');
    $message = 'Cannot instantiate unrecognized Lexer type AsdfAsdf';
    $this->expectException(new HTMLPurifier_Exception($message));
    HTMLPurifier_Lexer::create($this->config);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * Requesting DOMLex together with Core.MaintainLineNumbers is expected to
 * make the factory throw an HTMLPurifier_Exception with the message below.
 */
public function test_create_incompatibleLexer()
{
    // Applied in the same order as before: lexer choice first, then the
    // line-number requirement.
    $settings = array(
        'Core.LexerImpl' => 'DOMLex',
        'Core.MaintainLineNumbers' => true,
    );
    foreach ($settings as $directive => $value) {
        $this->config->set($directive, $value);
    }
    $message = 'Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)';
    $this->expectException(new HTMLPurifier_Exception($message));
    HTMLPurifier_Lexer::create($this->config);
}
2008-12-06 02:28:20 -05:00
2007-08-16 06:48:24 +00:00
// HTMLPurifier_Lexer->parseData() -----------------------------------------
2008-12-06 02:28:20 -05:00
2017-03-07 13:34:55 -08:00
/**
 * Asserts that HTMLPurifier_Lexer::parseData() turns $input into $expect.
 *
 * @param string      $input   Raw data handed to parseData().
 * @param string|bool $expect  Expected result; literal true means "same as
 *                             $input".
 * @param bool        $is_attr Forwarded unchanged as parseData()'s second
 *                             argument.
 */
public function assertParseData($input, $expect = true, $is_attr = false)
{
    $expected = ($expect === true) ? $input : $expect;
    $lexer = new HTMLPurifier_Lexer();
    $actual = $lexer->parseData($input, $is_attr, $this->config);
    $this->assertIdentical($expected, $actual);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * Text without any ampersand is expected to come back unchanged.
 */
public function test_parseData_plainText()
{
    $plain = 'asdf';
    $this->assertParseData($plain);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * parseData() on a lone ampersand is expected to yield an ampersand.
 */
public function test_parseData_ampersandEntity()
{
    // NOTE(review): input and expectation are the same literal; judging by
    // the test name the input was presumably an encoded entity (&amp;)
    // before this file was extracted — confirm against version control.
    $input = '&';
    $expected = '&';
    $this->assertParseData($input, $expected);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * parseData() on a double quote is expected to yield a double quote.
 */
public function test_parseData_quotEntity()
{
    // NOTE(review): input and expectation are the same literal; the input
    // was presumably &quot; before extraction — confirm.
    $input = '"';
    $expected = '"';
    $this->assertParseData($input, $expected);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * A zero-padded decimal character reference for the apostrophe is
 * expected to decode to a literal apostrophe.
 */
public function test_parseData_aposNumericEntity()
{
    // The input had been reduced to a bare apostrophe inside single
    // quotes (a parse error) and the expectation had gained padding
    // spaces; both are restored to the entity form the test name describes.
    $this->assertParseData('&#039;', "'");
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * The decimal character reference for the apostrophe written without
 * leading zeros is expected to decode to a literal apostrophe.
 */
public function test_parseData_aposCompactNumericEntity()
{
    // The input had been reduced to a bare apostrophe inside single
    // quotes (a parse error) and the expectation had gained padding
    // spaces; both are restored to the compact entity form.
    $this->assertParseData('&#39;', "'");
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * Three ampersands in a row are expected to yield three ampersands.
 */
public function test_parseData_adjacentAmpersandEntities()
{
    // NOTE(review): input equals expectation; presumably the input was
    // three adjacent &amp; entities before extraction — confirm.
    $input = '&&&';
    $expected = '&&&';
    $this->assertParseData($input, $expected);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * Two ampersands are expected to yield two ampersands.
 */
public function test_parseData_trailingUnescapedAmpersand()
{
    // NOTE(review): input equals expectation; presumably the input was an
    // &amp; entity followed by a bare ampersand before extraction — confirm.
    $input = '&&';
    $expected = '&&';
    $this->assertParseData($input, $expected);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * A bare ampersand surrounded by spaces is expected to be left alone.
 */
public function test_parseData_internalUnescapedAmpersand()
{
    $text = 'Procter & Gamble';
    $this->assertParseData($text);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * parseData() on a hyphen is expected to yield a hyphen.
 */
public function test_parseData_improperEntityFaultToleranceTest()
{
    // NOTE(review): input equals expectation; presumably the input was a
    // character reference for "-" before extraction — confirm.
    $input = '-';
    $expected = '-';
    $this->assertParseData($input, $expected);
}
/**
 * "&A" in text context is expected to come back as "&A".
 */
public function test_parseData_noTrailingSemi()
{
    // NOTE(review): the test name suggests the input was an entity written
    // without its trailing semicolon; the literal here may have been
    // decoded during extraction — confirm.
    $input = '&A';
    $expected = '&A';
    $this->assertParseData($input, $expected);
}
/**
 * "&A" in attribute context ($is_attr = true) is expected to come back
 * as "&A".
 */
public function test_parseData_noTrailingSemiAttr()
{
    // NOTE(review): literal may have been entity-decoded during
    // extraction — confirm.
    $input = '&A';
    $expected = '&A';
    $this->assertParseData($input, $expected, true);
}
/**
 * Attribute-context check for "&A" (named T119 in the original file).
 */
public function test_parseData_T119()
{
    // NOTE(review): as written this repeats
    // test_parseData_noTrailingSemiAttr exactly — confirm the intended
    // literals.
    $input = '&A';
    $expected = '&A';
    $this->assertParseData($input, $expected, true);
}
/**
 * "&trade=" in attribute context is expected to be returned unchanged.
 */
public function test_parseData_T119b()
{
    $input = '&trade=';
    // Second argument true: the expectation is the input itself.
    $this->assertParseData($input, true, true);
}
/**
 * Semicolon-less "amp"/"lt" references with Core.LegacyEntityDecoder
 * enabled, checked in both text and attribute context.
 *
 * The inputs had been entity-decoded and the double-quoted expectations
 * padded with spaces, so an input such as '&=' could never equal the
 * expectation " &= ". The entity spellings are restored here.
 * NOTE(review): restored from the test's intent; confirm the exact
 * literals against version control.
 */
public function test_parseData_legacy1()
{
    $this->config->set('Core.LegacyEntityDecoder', true);
    // Reference name runs into a following letter: returned unchanged.
    $this->assertParseData('&ampa', true);
    // Reference followed by "=": decoded.
    $this->assertParseData('&amp=', "&=");
    $this->assertParseData('&ampa', true, true);
    $this->assertParseData('&amp=', "&=", true);
    $this->assertParseData('&lta', true);
    $this->assertParseData('&lt=', "<=");
    $this->assertParseData('&lta', true, true);
    $this->assertParseData('&lt=', "<=", true);
}
/**
 * Semicolon-less "amp"/"lt" references with the default (non-legacy)
 * decoder: decoded in text context, left untouched in attribute context.
 *
 * The inputs had been entity-decoded and the double-quoted expectations
 * padded with spaces, so an input such as '&a' could never equal the
 * expectation " &a ". The entity spellings are restored here.
 * NOTE(review): restored from the test's intent; confirm the exact
 * literals against version control.
 */
public function test_parseData_nonlegacy1()
{
    $this->assertParseData('&ampa', "&a");
    $this->assertParseData('&amp=', "&=");
    $this->assertParseData('&ampa', true, true);
    $this->assertParseData('&amp=', true, true);
    $this->assertParseData('&lta', "<a");
    $this->assertParseData('&lt=', "<=");
    $this->assertParseData('&lta', true, true);
    $this->assertParseData('&lt=', true, true);
    $this->assertParseData('&lta;', "<a;");
}
/**
 * "&imath" (no trailing semicolon) is expected to be returned unchanged.
 */
public function test_parseData_noTrailingSemiNever()
{
    $input = '&imath';
    $this->assertParseData($input);
}
2008-12-06 02:28:20 -05:00
2007-08-16 06:48:24 +00:00
// HTMLPurifier_Lexer->extractBody() ---------------------------------------
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * Asserts that HTMLPurifier_Lexer::extractBody() reduces $text to $extract.
 *
 * @param string      $text    Markup handed to extractBody().
 * @param string|bool $extract Expected result; literal true means the text
 *                             is expected back unchanged.
 */
public function assertExtractBody($text, $extract = true)
{
    $lexer = new HTMLPurifier_Lexer();
    $actual = $lexer->extractBody($text);
    $expected = ($extract === true) ? $text : $extract;
    $this->assertIdentical($expected, $actual);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * Markup without a body element is expected to be returned unchanged.
 */
public function test_extractBody_noBodyTags()
{
    $fragment = '<b>Bold</b>';
    $this->assertExtractBody($fragment);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * The content between lowercase body tags is expected to be extracted.
 */
public function test_extractBody_lowercaseBodyTags()
{
    $document = '<html><body><b>Bold</b></body></html>';
    $inner = '<b>Bold</b>';
    $this->assertExtractBody($document, $inner);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * The content between uppercase BODY tags is expected to be extracted.
 */
public function test_extractBody_uppercaseBodyTags()
{
    $document = '<HTML><BODY><B>Bold</B></BODY></HTML>';
    $inner = '<B>Bold</B>';
    $this->assertExtractBody($document, $inner);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * A complete XHTML document is reduced to the markup between its body
 * tags, including the newline after <body> and the one before </body>.
 *
 * The document literal had been corrupted: a blame timestamp sat inside
 * the string and every tag had been spaced out ("< body >"), which cannot
 * be recognised as markup. The fixture is rebuilt line by line so that
 * the expectation is exactly the text between the body tags.
 * NOTE(review): any per-line indentation of the original fixture is not
 * recoverable from this file and is not reproduced.
 */
public function test_extractBody_realisticUseCase()
{
    // Expected extraction: the form markup wrapped in the two newlines
    // that separate it from the body tags.
    $body = "\n" . implode("\n", array(
        '<form method="post" action="whatever1">',
        '<div>',
        '<input type="text" name="username" />',
        '<input type="text" name="password" />',
        '<input type="submit" />',
        '</div>',
        '</form>',
    )) . "\n";

    $head = implode("\n", array(
        '<?xml version="1.0"',
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"',
        '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">',
        '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">',
        '<head>',
        '<title>xyz</title>',
        '</head>',
        '<body>',
    ));

    $document = $head . $body . '</body>' . "\n" . '</html>';

    $this->assertExtractBody($document, $body);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * A body tag carrying attributes is still expected to be recognised.
 */
public function test_extractBody_bodyWithAttributes()
{
    $document = '<html><body bgcolor="#F00"><b>Bold</b></body></html>';
    $inner = '<b>Bold</b>';
    $this->assertExtractBody($document, $inner);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * An opening body tag with no closing tag: the text is expected back
 * unchanged.
 */
public function test_extractBody_preserveUnclosedBody()
{
    // not closed, don't accept
    $unclosed = '<body>asdf';
    $this->assertExtractBody($unclosed);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * With two closing body tags, extraction is expected to run up to the
 * last one.
 */
public function test_extractBody_useLastBody()
{
    $markup = '<body>foo</body>bar</body>';
    $inner = 'foo</body>bar';
    $this->assertExtractBody($markup, $inner);
}
2016-03-27 15:19:32 -07:00
/**
 * Body tags that only occur inside a comment: the text is expected back
 * unchanged.
 */
public function test_extractBody_ignoreCommented()
{
    $markup = '$<!-- <body>foo</body> -->^';
    $this->assertExtractBody($markup);
}
/**
 * A comment placed before a real body element does not prevent
 * extraction of the body content.
 */
public function test_extractBody_butCanStillWork()
{
    $markup = '<!-- b --><body>a</body>';
    $inner = 'a';
    $this->assertExtractBody($markup, $inner);
}
2007-08-16 06:48:24 +00:00
// HTMLPurifier_Lexer->tokenizeHTML() --------------------------------------
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * Tokenizes $input with every available lexer and checks the result.
 *
 * @param string $input      HTML passed to each lexer's tokenizeHTML().
 * @param array  $expect     Tokens expected from any lexer that has no
 *                           entry in $alt_expect.
 * @param array  $alt_expect Per-lexer expectations keyed by lexer name
 *                           ('DirectLex', 'DOMLex', 'PH5P'); a value of
 *                           false skips the assertion for that lexer.
 */
public function assertTokenization($input, $expect, $alt_expect = array())
{
    $lexers = array();
    $lexers['DirectLex'] = new HTMLPurifier_Lexer_DirectLex();
    if (class_exists('DOMDocument')) {
        $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
        // The constructor only requires PH5P.php when the PH5P test flag
        // is set, so make sure the class can actually be resolved before
        // instantiating it; previously an unavailable class was a fatal
        // error for every tokenization test.
        if (class_exists('HTMLPurifier_Lexer_PH5P')) {
            $lexers['PH5P'] = new HTMLPurifier_Lexer_PH5P();
        }
    }
    foreach ($lexers as $name => $lexer) {
        // Tokenize before the skip check, as before, so a lexer is still
        // run on inputs whose output is not asserted.
        $result = $lexer->tokenizeHTML($input, $this->config, $this->context);
        $has_override = isset($alt_expect[$name]);
        if ($has_override && $alt_expect[$name] === false) {
            continue;
        }
        $t_expect = $has_override ? $alt_expect[$name] : $expect;
        $this->assertIdentical($result, $t_expect, " $name : %s ");
        // Loose comparison on purpose: strict comparison of arrays holding
        // distinct token objects would report a difference every time.
        if ($t_expect != $result) {
            printTokens($result);
        }
    }
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * An empty string is expected to produce no tokens.
 */
public function test_tokenizeHTML_emptyInput()
{
    $html = '';
    $tokens = array();
    $this->assertTokenization($html, $tokens);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * Text without markup is expected to become a single text token.
 */
public function test_tokenizeHTML_plainText()
{
    $html = 'This is regular text.';
    $tokens = array(
        new HTMLPurifier_Token_Text('This is regular text.')
    );
    $this->assertTokenization($html, $tokens);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * Text interleaved with a start/end tag pair is split into text, start
 * and end tokens in document order.
 */
public function test_tokenizeHTML_textAndTags()
{
    $html = 'This is <b>bold</b> text';
    $tokens = array(
        new HTMLPurifier_Token_Text('This is '),
        new HTMLPurifier_Token_Start('b', array()),
        new HTMLPurifier_Token_Text('bold'),
        new HTMLPurifier_Token_End('b'),
        new HTMLPurifier_Token_Text(' text'),
    );
    $this->assertTokenization($html, $tokens);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * An uppercase opening tag closed by a lowercase tag: the expected tokens
 * are constructed with 'DIV' and 'div' respectively.
 */
public function test_tokenizeHTML_normalizeCase()
{
    $html = '<DIV>Totally rad dude. <b>asdf</b></div>';
    $tokens = array(
        new HTMLPurifier_Token_Start('DIV', array()),
        new HTMLPurifier_Token_Text('Totally rad dude. '),
        new HTMLPurifier_Token_Start('b', array()),
        new HTMLPurifier_Token_Text('asdf'),
        new HTMLPurifier_Token_End('b'),
        new HTMLPurifier_Token_End('div'),
    );
    $this->assertTokenization($html, $tokens);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * Badly nested unknown tags: the default expectation mirrors the input
 * tag by tag, while DOMLex and PH5P have their own expected output.
 */
public function test_tokenizeHTML_notWellFormed()
{
    $html = '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>';

    $default = array(
        new HTMLPurifier_Token_Start('asdf'),
        new HTMLPurifier_Token_End('asdf'),
        new HTMLPurifier_Token_Start('d'),
        new HTMLPurifier_Token_End('d'),
        new HTMLPurifier_Token_Start('poOloka'),
        new HTMLPurifier_Token_Start('poolasdf'),
        new HTMLPurifier_Token_Start('ds'),
        new HTMLPurifier_Token_End('asdf'),
        new HTMLPurifier_Token_End('ASDF'),
    );

    $domlex = array(
        new HTMLPurifier_Token_Empty('asdf'),
        new HTMLPurifier_Token_Empty('d'),
        new HTMLPurifier_Token_Start('pooloka'),
        new HTMLPurifier_Token_Start('poolasdf'),
        new HTMLPurifier_Token_Empty('ds'),
        new HTMLPurifier_Token_End('poolasdf'),
        new HTMLPurifier_Token_End('pooloka'),
    );

    // 20140831: Weird, but whatever...
    $ph5p = array(new HTMLPurifier_Token_Empty('asdf'));

    $overrides = array(
        'DOMLex' => $domlex,
        'PH5P' => $ph5p,
    );
    $this->assertTokenization($html, $default, $overrides);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * Attributes separated by tab and newline characters inside a tag are
 * expected to be parsed like space-separated attributes.
 */
public function test_tokenizeHTML_whitespaceInTag()
{
    // NOTE(review): the separators carry padding spaces around \t and \n
    // that look like an extraction artefact; kept as found.
    $html = '<a' . " \t " . 'href="foobar.php"' . " \n " . 'title="foo!">Link to <b id="asdf">foobar</b></a>';
    $tokens = array(
        new HTMLPurifier_Token_Start('a', array('href' => 'foobar.php', 'title' => 'foo!')),
        new HTMLPurifier_Token_Text('Link to '),
        new HTMLPurifier_Token_Start('b', array('id' => 'asdf')),
        new HTMLPurifier_Token_Text('foobar'),
        new HTMLPurifier_Token_End('b'),
        new HTMLPurifier_Token_End('a'),
    );
    $this->assertTokenization($html, $tokens);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * A self-closing tag with one attribute whose value is an ampersand.
 */
public function test_tokenizeHTML_singleAttribute()
{
    // NOTE(review): the attribute value may have been an encoded entity
    // before extraction — confirm.
    $html = '<br style="&" />';
    $tokens = array(
        new HTMLPurifier_Token_Empty('br', array('style' => '&'))
    );
    $this->assertTokenization($html, $tokens);
}
2013-07-16 13:56:14 +02:00
/**
 * A self-closing tag is expected to become a single empty token.
 */
public function test_tokenizeHTML_emptyTag()
{
    $html = '<br />';
    $tokens = array(new HTMLPurifier_Token_Empty('br'));
    $this->assertTokenization($html, $tokens);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * A comment is expected to become a comment token holding the text
 * between the delimiters, surrounding spaces included.
 */
public function test_tokenizeHTML_comment()
{
    $html = '<!-- Comment -->';
    $tokens = array(new HTMLPurifier_Token_Comment(' Comment '));
    $this->assertTokenization($html, $tokens);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * A comment closed with three dashes: the extra dash is expected to end
 * up inside the comment text.
 */
public function test_tokenizeHTML_malformedComment()
{
    $html = '<!-- not so well formed --->';
    $tokens = array(new HTMLPurifier_Token_Comment(' not so well formed -'));
    $this->assertTokenization($html, $tokens);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * A tag that never closes: by default expected as literal text, as an
 * empty element for DOMLex, and not asserted for PH5P.
 */
public function test_tokenizeHTML_unterminatedTag()
{
    $html = '<a href=""';
    $default = array(new HTMLPurifier_Token_Text('<a href=""'));
    $overrides = array(
        // I like our behavior better, but it's non-standard
        'DOMLex' => array(new HTMLPurifier_Token_Empty('a', array('href' => ''))),
        // total barfing, grabs scaffolding too
        'PH5P' => false,
    );
    $this->assertTokenization($html, $default, $overrides);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * Encoded angle brackets are expected to decode into text, not a tag.
 *
 * The input had been entity-decoded into a literal "<b>", which the
 * other tests in this class expect to tokenize as a start tag and which
 * contradicts the per-character PH5P expectation below; the encoded form
 * is restored.
 */
public function test_tokenizeHTML_specialEntities()
{
    $this->assertTokenization(
        '&lt;b&gt;',
        array(
            new HTMLPurifier_Token_Text('<b>')
        ),
        array(
            // some parsers will separate entities out
            'PH5P' => array(
                new HTMLPurifier_Token_Text('<'),
                new HTMLPurifier_Token_Text('b'),
                new HTMLPurifier_Token_Text('>'),
            ),
        )
    );
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * A stray quote where an attribute name belongs: an empty "a" token by
 * default, a start token with a '"' attribute for DirectLex, and no
 * assertion for PH5P.
 */
public function test_tokenizeHTML_earlyQuote()
{
    $html = '<a "=>';
    $default = array(new HTMLPurifier_Token_Empty('a'));
    $overrides = array(
        // we barf on this input
        'DirectLex' => array(
            new HTMLPurifier_Token_Start('a', array('"' => ''))
        ),
        // behavior varies; handle this personally
        'PH5P' => false,
    );
    $this->assertTokenization($html, $default, $overrides);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * PH5P-specific variant of the early-quote test: the expected token type
 * depends on the 'PH5PError' value stored in the context.
 */
public function test_tokenizeHTML_earlyQuote_PH5P()
{
    if (!class_exists('DOMDocument')) {
        return;
    }
    // The constructor only requires PH5P.php when the PH5P test flag is
    // set; skip instead of failing with a fatal error when the class
    // cannot be resolved.
    if (!class_exists('HTMLPurifier_Lexer_PH5P')) {
        return;
    }
    $lexer = new HTMLPurifier_Lexer_PH5P();
    $result = $lexer->tokenizeHTML('<a "=>', $this->config, $this->context);
    if ($this->context->get('PH5PError', true)) {
        $this->assertIdentical(array(
            new HTMLPurifier_Token_Start('a', array('"' => ''))
        ), $result);
    } else {
        $this->assertIdentical(array(
            new HTMLPurifier_Token_Empty('a', array('"' => ''))
        ), $result);
    }
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * A bare double quote is expected to become a text token holding it.
 */
public function test_tokenizeHTML_unescapedQuote()
{
    $html = '"';
    $tokens = array(new HTMLPurifier_Token_Text('"'));
    $this->assertTokenization($html, $tokens);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * A double quote is expected to become a text token holding it.
 */
public function test_tokenizeHTML_escapedQuote()
{
    // NOTE(review): as written the input is identical to the one in
    // test_tokenizeHTML_unescapedQuote; it was presumably the &quot;
    // entity before extraction — confirm.
    $html = '"';
    $tokens = array(new HTMLPurifier_Token_Text('"'));
    $this->assertTokenization($html, $tokens);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * Markup and entities inside a CDATA section are expected to come out as
 * literal text; PH5P splits that text into several tokens.
 *
 * The numeric apostrophe entity inside the CDATA literal had been decoded
 * to a bare apostrophe, which terminates the single-quoted PHP string (a
 * parse error) and contradicts the PH5P tokens '&' and '#39;t' below.
 * The entity is restored in both the input and the default expectation.
 */
public function test_tokenizeHTML_cdata()
{
    $this->assertTokenization(
        '<![CDATA[You <b>can&#39;t</b> get me!]]>',
        array(new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!')),
        array(
            'PH5P' => array(
                new HTMLPurifier_Token_Text('You '),
                new HTMLPurifier_Token_Text('<'),
                new HTMLPurifier_Token_Text('b'),
                new HTMLPurifier_Token_Text('>'),
                new HTMLPurifier_Token_Text('can'),
                new HTMLPurifier_Token_Text('&'),
                new HTMLPurifier_Token_Text('#39;t'),
                new HTMLPurifier_Token_Text('<'),
                new HTMLPurifier_Token_Text('/b'),
                new HTMLPurifier_Token_Text('>'),
                new HTMLPurifier_Token_Text(' get me!'),
            ),
        )
    );
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * A named character entity is expected to decode to its UTF-8 bytes.
 *
 * The input had been decoded to the raw character and the expected byte
 * string had gained padding spaces (splitting the two-byte sequence), so
 * the assertion could not hold. \xCE\xB8 is the UTF-8 encoding of the
 * Greek small letter theta.
 */
public function test_tokenizeHTML_characterEntity()
{
    $this->assertTokenization(
        '&theta;',
        array(new HTMLPurifier_Token_Text("\xCE\xB8"))
    );
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * An entity inside a CDATA section is expected to stay undecoded.
 *
 * The input had been decoded to the arrow character and the expectation
 * padded with spaces, which contradicts the PH5P tokens '&' and 'rarr;'
 * below; the entity text is restored in both places.
 */
public function test_tokenizeHTML_characterEntityInCDATA()
{
    $this->assertTokenization(
        '<![CDATA[&rarr;]]>',
        array(new HTMLPurifier_Token_Text("&rarr;")),
        array(
            'PH5P' => array(
                new HTMLPurifier_Token_Text('&'),
                new HTMLPurifier_Token_Text('rarr;'),
            ),
        )
    );
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * A link whose href contains an ampersand-separated query string.
 */
public function test_tokenizeHTML_entityInAttribute()
{
    // NOTE(review): the ampersand in the href may have been an encoded
    // entity before extraction — confirm.
    $html = '<a href="index.php?title=foo&id=bar">Link</a>';
    $tokens = array(
        new HTMLPurifier_Token_Start('a', array('href' => 'index.php?title=foo&id=bar')),
        new HTMLPurifier_Token_Text('Link'),
        new HTMLPurifier_Token_End('a'),
    );
    $this->assertTokenization($html, $tokens);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * Raw UTF-8 input is expected to pass through as the same bytes.
 *
 * Padding spaces had been inserted between the \xCE and \xB8 escapes,
 * turning the two-byte sequence into invalid UTF-8; the contiguous
 * sequence is restored.
 */
public function test_tokenizeHTML_preserveUTF8()
{
    $this->assertTokenization(
        "\xCE\xB8",
        array(new HTMLPurifier_Token_Text("\xCE\xB8"))
    );
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * A self-closing tag whose attribute value contains a less-than sign.
 */
public function test_tokenizeHTML_specialEntityInAttribute()
{
    // NOTE(review): the "<" in the attribute value may have been an
    // encoded entity before extraction — confirm.
    $html = '<br test="x < 6" />';
    $tokens = array(new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')));
    $this->assertTokenization($html, $tokens);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * Stray angle brackets used as emoticons inside a bold element: expected
 * as text, with lexer-specific splitting of the text tokens.
 */
public function test_tokenizeHTML_emoticonProtection()
{
    $html = '<b>Whoa! <3 That\'s not good >.></b>';

    $default = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('Whoa! '),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
        new HTMLPurifier_Token_End('b')
    );

    // text is absorbed together
    $domlex = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
        new HTMLPurifier_Token_End('b'),
    );

    // interesting grouping
    $ph5p = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('Whoa! '),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
        new HTMLPurifier_Token_End('b'),
    );

    $this->assertTokenization($html, $default, array(
        'DOMLex' => $domlex,
        'PH5P' => $ph5p,
    ));
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * Angle brackets inside a comment are expected to stay part of the
 * comment; the following tag is tokenized normally.
 */
public function test_tokenizeHTML_commentWithFunkyChars()
{
    $html = '<!-- This >< comment --><br />';
    $tokens = array(
        new HTMLPurifier_Token_Comment(' This >< comment '),
        new HTMLPurifier_Token_Empty('br'),
    );
    $this->assertTokenization($html, $tokens);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * A comment that never closes is expected to run to the end of input;
 * DOMLex and PH5P are not asserted.
 */
public function test_tokenizeHTML_unterminatedComment()
{
    $html = '<!-- This >< comment';
    $default = array(new HTMLPurifier_Token_Comment(' This >< comment'));
    $skipped = array(
        'DOMLex' => false,
        'PH5P' => false,
    );
    $this->assertTokenization($html, $default, $skipped);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * With HTML.Trusted enabled, the content of a script element is expected
 * as one text token even though it contains tag-like text.
 */
public function test_tokenizeHTML_scriptCDATAContents()
{
    $this->config->set('HTML.Trusted', true);

    $html = 'Foo: <script>alert("<foo>");</script>';
    $default = array(
        new HTMLPurifier_Token_Text('Foo: '),
        new HTMLPurifier_Token_Start('script'),
        new HTMLPurifier_Token_Text('alert("<foo>");'),
        new HTMLPurifier_Token_End('script'),
    );
    $overrides = array(
        // PH5P, for some reason, bubbles the script to <head>
        'PH5P' => false,
    );
    $this->assertTokenization($html, $default, $overrides);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * Comment content is expected to be carried into the comment token
 * verbatim.
 */
public function test_tokenizeHTML_entitiesInComment()
{
    // NOTE(review): the test name mentions entities, but none remain in
    // the literal; one of the "<" characters was presumably an encoded
    // entity before extraction — confirm.
    $html = '<!-- This comment < < & -->';
    $tokens = array(new HTMLPurifier_Token_Comment(' This comment < < & '));
    $this->assertTokenization($html, $tokens);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * Angle brackets inside a quoted attribute value: kept in the value by
 * default, while DirectLex is expected to end the tag at the first ">".
 */
public function test_tokenizeHTML_attributeWithSpecialCharacters()
{
    $html = '<a href="><>">';
    $default = array(new HTMLPurifier_Token_Empty('a', array('href' => '><>')));
    $directlex = array(
        new HTMLPurifier_Token_Start('a', array('href' => '')),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('">'),
    );
    $this->assertTokenization($html, $default, array('DirectLex' => $directlex));
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * Slashes inside an attribute value must not be confused with the
 * self-closing slash of the tag.
 */
public function test_tokenizeHTML_emptyTagWithSlashInAttribute()
{
    $html = '<param name="src" value="http://example.com/video.wmv" />';
    $attr = array('name' => 'src', 'value' => 'http://example.com/video.wmv');
    $tokens = array(new HTMLPurifier_Token_Empty('param', $attr));
    $this->assertTokenization($html, $tokens);
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * A style element whose CSS is wrapped in an HTML comment: expected as a
 * text token by default and as a comment token for DirectLex (and for
 * DOMLex on libxml older than 2.6.28); PH5P is not asserted.
 *
 * The markup literal had been spaced out token by token
 * ('< style type = " text/css " >'), which is not a style tag and cannot
 * produce the 'text/css' attribute expected below, and the expected CSS
 * strings had gained padding spaces. The literals are restored so that
 * the expectations match the text between the comment delimiters.
 */
public function test_tokenizeHTML_style()
{
    $extra = array(
        // PH5P doesn't seem to like style tags
        'PH5P' => false,
        // DirectLex defers to RemoveForeignElements for textification
        'DirectLex' => array(
            new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
            new HTMLPurifier_Token_Comment("\ndiv {}\n"),
            new HTMLPurifier_Token_End('style'),
        ),
    );
    if (!defined('LIBXML_VERSION')) {
        // LIBXML_VERSION is missing in early versions of PHP
        // prior to 1.30 of php-src/ext/libxml/libxml.c (version-wise,
        // this translates to 5.0.x). In such cases, punt the test entirely.
        return;
    } elseif (LIBXML_VERSION < 20628) {
        // libxml's behavior is wrong prior to this version, so make
        // appropriate accommodations
        $extra['DOMLex'] = $extra['DirectLex'];
    }
    $this->assertTokenization(
        "<style type=\"text/css\"><!--\ndiv {}\n--></style>",
        array(
            new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
            new HTMLPurifier_Token_Text("\ndiv {}\n"),
            new HTMLPurifier_Token_End('style'),
        ),
        $extra
    );
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * A tag name followed by "@" and a doubled ">": an "a" element wrapping
 * ">" by default, an unclosed "a@" start tag for DirectLex.
 */
public function test_tokenizeHTML_tagWithAtSignAndExtraGt()
{
    $html = '<a@>>';

    $default = array(
        new HTMLPurifier_Token_Start('a'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_End('a'),
    );

    // Technically this is invalid, but it won't be a
    // problem with invalid element removal; also, this
    // mimics Mozilla's parsing of the tag.
    $directlex = array(
        new HTMLPurifier_Token_Start('a@'),
        new HTMLPurifier_Token_Text('>'),
    );

    $this->assertTokenization($html, $default, array('DirectLex' => $directlex));
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * "<3" between two line breaks: split into "<" and "3" text tokens by
 * default, one "<3" text token for DOMLex.
 */
public function test_tokenizeHTML_emoticonHeart()
{
    $html = '<br /><3<br />';
    $default = array(
        new HTMLPurifier_Token_Empty('br'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('3'),
        new HTMLPurifier_Token_Empty('br'),
    );
    $domlex = array(
        new HTMLPurifier_Token_Empty('br'),
        new HTMLPurifier_Token_Text('<3'),
        new HTMLPurifier_Token_Empty('br'),
    );
    $this->assertTokenization($html, $default, array('DOMLex' => $domlex));
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * "<<" inside a bold element: two "<" text tokens by default, one "<<"
 * text token for DOMLex.
 */
public function test_tokenizeHTML_emoticonShiftyEyes()
{
    $html = '<b><<</b>';
    $default = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_End('b'),
    );
    $domlex = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('<<'),
        new HTMLPurifier_Token_End('b'),
    );
    $this->assertTokenization($html, $default, array('DOMLex' => $domlex));
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * A "<" followed by a space ahead of a real tag: "<" and " " as separate
 * text tokens by default, one "< " text token for DOMLex.
 */
public function test_tokenizeHTML_eon1996()
{
    $html = '< <b>test</b>';
    $default = array(
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text(' '),
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('test'),
        new HTMLPurifier_Token_End('b'),
    );
    $domlex = array(
        new HTMLPurifier_Token_Text('< '),
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('test'),
        new HTMLPurifier_Token_End('b'),
    );
    $this->assertTokenization($html, $default, array('DOMLex' => $domlex));
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * Body tags inside a CDATA section are expected as literal text; PH5P
 * splits that text into several tokens.
 */
public function test_tokenizeHTML_bodyInCDATA()
{
    $html = '<![CDATA[<body>Foo</body>]]>';

    $default = array(
        new HTMLPurifier_Token_Text('<body>Foo</body>'),
    );

    $ph5p = array(
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('body'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_Text('Foo'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('/body'),
        new HTMLPurifier_Token_Text('>'),
    );

    $this->assertTokenization($html, $default, array('PH5P' => $ph5p));
}
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
/**
 * A self-closing img nested in a link is expected as start, empty and
 * end tokens.
 *
 * NOTE(review): the method name is the bare template prefix; a
 * descriptive suffix would help, but renaming is left to the maintainers.
 */
public function test_tokenizeHTML_()
{
    $html = '<a><img /></a>';
    $tokens = array(
        new HTMLPurifier_Token_Start('a'),
        new HTMLPurifier_Token_Empty('img'),
        new HTMLPurifier_Token_End('a'),
    );
    $this->assertTokenization($html, $tokens);
}
2013-07-16 13:56:14 +02:00
/**
 * An IE conditional comment, including the markup nested inside it, is
 * expected to produce no tokens.
 */
public function test_tokenizeHTML_ignoreIECondComment()
{
    $html = '<!--[if IE]>foo<a>bar<!-- baz --><![endif]-->';
    $tokens = array();
    $this->assertTokenization($html, $tokens);
}
2013-07-16 13:56:14 +02:00
/**
 * With Core.RemoveProcessingInstructions enabled, a processing
 * instruction is expected to produce no tokens.
 */
public function test_tokenizeHTML_removeProcessingInstruction()
{
    $this->config->set('Core.RemoveProcessingInstructions', true);
    $html = '<?xml blah blah ?>';
    $tokens = array();
    $this->assertTokenization($html, $tokens);
}
2013-07-16 13:56:14 +02:00
/**
 * With Core.NormalizeNewlines enabled, a lone CR and a CRLF pair are
 * each expected to become a single LF.
 *
 * Padding spaces had been inserted around every escape sequence, which
 * split the CRLF pair and left the padded expectation one newline short
 * of what the padded input would normalise to; the unpadded literals are
 * restored.
 */
public function test_tokenizeHTML_removeNewline()
{
    $this->config->set('Core.NormalizeNewlines', true);
    $this->assertTokenization(
        "plain\rtext\r\n",
        array(
            new HTMLPurifier_Token_Text("plain\ntext\n")
        )
    );
}
2013-07-16 13:56:14 +02:00
/**
 * With Core.NormalizeNewlines disabled, CR and CRLF are expected to pass
 * through untouched.
 *
 * Padding spaces had been inserted around every escape sequence, so the
 * input no longer contained a CRLF pair; the unpadded literals are
 * restored.
 */
public function test_tokenizeHTML_noRemoveNewline()
{
    $this->config->set('Core.NormalizeNewlines', false);
    $this->assertTokenization(
        "plain\rtext\r\n",
        array(
            new HTMLPurifier_Token_Text("plain\rtext\r\n")
        )
    );
}
2013-07-16 13:56:14 +02:00
/**
 * Two conditional comments with text between them: only that text is
 * expected to survive, i.e. comment matching stops at the first
 * terminator.
 *
 * The expected text had gained padding spaces (" b "), although the input
 * has no whitespace around the "b"; the unpadded literal is restored.
 */
public function test_tokenizeHTML_conditionalCommentUngreedy()
{
    $this->assertTokenization(
        '<!--[if gte mso 9]>a<![endif]-->b<!--[if gte mso 9]>c<![endif]-->',
        array(
            new HTMLPurifier_Token_Text("b")
        )
    );
}
2013-07-16 13:56:14 +02:00
/**
 * An img tag without a self-closing slash: an empty token by default, a
 * start token for DirectLex; both carry the same three attributes.
 */
public function test_tokenizeHTML_imgTag()
{
    $html = '<img src="img_11775.jpg" alt="[Img #11775]" id="EMBEDDED_IMG_11775" >';

    $attr = array(
        'src' => 'img_11775.jpg',
        'alt' => '[Img #11775]',
        'id' => 'EMBEDDED_IMG_11775',
    );

    $directlex = array(new HTMLPurifier_Token_Start('img', $attr));
    $default = array(new HTMLPurifier_Token_Empty('img', $attr));

    $this->assertTokenization($html, $default, array('DirectLex' => $directlex));
}
2014-08-31 08:50:33 +01:00
/**
 * A closing div with no matching opener: kept as an end token by
 * default, absent from the DOMLex and PH5P expectations.
 */
public function test_tokenizeHTML_prematureDivClose()
{
    $html = '</div>dont<b>die</b>';

    $default = array(
        new HTMLPurifier_Token_End('div'),
        new HTMLPurifier_Token_Text('dont'),
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('die'),
        new HTMLPurifier_Token_End('b'),
    );

    // DOMLex and PH5P share one expectation.
    $without_div = array(
        new HTMLPurifier_Token_Text('dont'),
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('die'),
        new HTMLPurifier_Token_End('b')
    );

    $this->assertTokenization($html, $default, array(
        'DOMLex' => $without_div,
        'PH5P' => $without_div,
    ));
}
2010-09-10 21:51:55 +01:00
2007-08-16 06:48:24 +00:00
/*
2008-12-06 02:28:20 -05:00
2013-07-16 13:56:14 +02:00
public function test_tokenizeHTML_ ()
{
2007-08-16 06:48:24 +00:00
$this -> assertTokenization (
,
array (
2008-12-06 02:28:20 -05:00
2007-08-16 06:48:24 +00:00
)
2007-08-08 05:05:30 +00:00
);
2006-07-23 00:11:03 +00:00
}
2007-08-16 06:48:24 +00:00
*/
2008-12-06 02:28:20 -05:00
2006-07-23 00:11:03 +00:00
}
2008-12-06 04:24:59 -05:00
// vim: et sw=4 sts=4