From 6a5ff6d2a9f55ddd412101188f66b73fcb6dba03 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Sun, 23 Jul 2006 00:01:11 +0000 Subject: [PATCH] Branch the LexerTest in order to partition DirectLex specific unit tests. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@94 48356398-32a2-884e-a903-53898d9a118a --- tests/HTMLPurifier/Lexer/DirectLexTest.php | 264 +++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100644 tests/HTMLPurifier/Lexer/DirectLexTest.php diff --git a/tests/HTMLPurifier/Lexer/DirectLexTest.php b/tests/HTMLPurifier/Lexer/DirectLexTest.php new file mode 100644 index 00000000..fc9c769b --- /dev/null +++ b/tests/HTMLPurifier/Lexer/DirectLexTest.php @@ -0,0 +1,264 @@ +DirectLex = new HTMLPurifier_Lexer_DirectLex(); + $this->PEARSax3 = new HTMLPurifier_Lexer_PEARSax3(); + + $this->_has_dom = version_compare(PHP_VERSION, '5', '>='); + + if ($this->_has_dom) { + require_once 'HTMLPurifier/Lexer/DOMLex.php'; + $this->DOMLex = new HTMLPurifier_Lexer_DOMLex(); + } + + } + + function test_nextWhiteSpace() { + $HP =& $this->DirectLex; + $this->assertIdentical(false, $HP->nextWhiteSpace('asdf')); + $this->assertIdentical(0, $HP->nextWhiteSpace(' asdf')); + $this->assertIdentical(0, $HP->nextWhiteSpace("\nasdf")); + $this->assertIdentical(1, $HP->nextWhiteSpace("a\tsdf")); + $this->assertIdentical(4, $HP->nextWhiteSpace("asdf\r")); + $this->assertIdentical(2, $HP->nextWhiteSpace("as\t\r\nasdf as")); + $this->assertIdentical(3, $HP->nextWhiteSpace('a a ', 2)); + } + + function test_parseData() { + $HP =& $this->DirectLex; + $this->assertIdentical('asdf', $HP->parseData('asdf')); + $this->assertIdentical('&', $HP->parseData('&')); + $this->assertIdentical('"', $HP->parseData('"')); + $this->assertIdentical("'", $HP->parseData(''')); + $this->assertIdentical('-', $HP->parseData('-')); + // UTF-8 needed!!! + } + + function test_tokenizeHTML() { + + $input = array(); + $expect = array(); + $sax_expect = array(); + + $input[0] = ''; + $expect[0] = array(); + + $input[1] = 'This is regular text.'; + $expect[1] = array( + new HTMLPurifier_Token_Text('This is regular text.') + ); + + $input[2] = 'This is bold text'; + $expect[2] = array( + new HTMLPurifier_Token_Text('This is ') + ,new HTMLPurifier_Token_Start('b', array()) + ,new HTMLPurifier_Token_Text('bold') + ,new HTMLPurifier_Token_End('b') + ,new HTMLPurifier_Token_Text(' text') + ); + + $input[3] = '
Totally rad dude. asdf
'; + $expect[3] = array( + new HTMLPurifier_Token_Start('DIV', array()) + ,new HTMLPurifier_Token_Text('Totally rad dude. ') + ,new HTMLPurifier_Token_Start('b', array()) + ,new HTMLPurifier_Token_Text('asdf') + ,new HTMLPurifier_Token_End('b') + ,new HTMLPurifier_Token_End('div') + ); + + // [XML-INVALID] + $input[4] = ''; + $expect[4] = array( + new HTMLPurifier_Token_Start('asdf') + ,new HTMLPurifier_Token_End('asdf') + ,new HTMLPurifier_Token_Start('d') + ,new HTMLPurifier_Token_End('d') + ,new HTMLPurifier_Token_Start('poOloka') + ,new HTMLPurifier_Token_Start('poolasdf') + ,new HTMLPurifier_Token_Start('ds') + ,new HTMLPurifier_Token_End('asdf') + ,new HTMLPurifier_Token_End('ASDF') + ); + // DOM is different because it condenses empty tags into REAL empty ones + // as well as makes it well-formed + $dom_expect[4] = array( + new HTMLPurifier_Token_Empty('asdf') + ,new HTMLPurifier_Token_Empty('d') + ,new HTMLPurifier_Token_Start('pooloka') + ,new HTMLPurifier_Token_Start('poolasdf') + ,new HTMLPurifier_Token_Empty('ds') + ,new HTMLPurifier_Token_End('poolasdf') + ,new HTMLPurifier_Token_End('pooloka') + ); + + $input[5] = 'Link to foobar'; + $expect[5] = array( + new HTMLPurifier_Token_Start('a',array('href'=>'foobar.php','title'=>'foo!')) + ,new HTMLPurifier_Token_Text('Link to ') + ,new HTMLPurifier_Token_Start('b',array('id'=>'asdf')) + ,new HTMLPurifier_Token_Text('foobar') + ,new HTMLPurifier_Token_End('b') + ,new HTMLPurifier_Token_End('a') + ); + + $input[6] = '
'; + $expect[6] = array( + new HTMLPurifier_Token_Empty('br') + ); + + // [SGML-INVALID] [RECOVERABLE] + $input[7] = ' '; + $expect[7] = array( + new HTMLPurifier_Token_Comment(' Comment ') + ,new HTMLPurifier_Token_Text(' ') + ,new HTMLPurifier_Token_Comment(' not so well formed -') + ); + $sax_expect[7] = false; // we need to figure out proper comment output + + // [SGML-INVALID] + $input[8] = ''')) + ); + // DOM parses it into an empty tag + $dom_expect[8] = array( + new HTMLPurifier_Token_Empty('a', array('href'=>'')) + ); + + $input[9] = '<b>'; + $expect[9] = array( + new HTMLPurifier_Token_Text('') + ); + $sax_expect[9] = array( + new HTMLPurifier_Token_Text('<') + ,new HTMLPurifier_Token_Text('b') + ,new HTMLPurifier_Token_Text('>') + ); + // note that SAX can clump text nodes together. We won't be + // too picky though + + // [SGML-INVALID] + $input[10] = ''; + // We barf on this, aim for no attributes + $expect[10] = array( + new HTMLPurifier_Token_Start('a', array('"' => '')) + ); + // DOM correctly has no attributes, but also closes the tag + $dom_expect[10] = array( + new HTMLPurifier_Token_Empty('a') + ); + // SAX barfs on this + $sax_expect[10] = array( + new HTMLPurifier_Token_Start('a', array('"' => '')) + ); + + // [INVALID] [RECOVERABLE] + $input[11] = '"'; + $expect[11] = array( new HTMLPurifier_Token_Text('"') ); + + // compare with this valid one: + $input[12] = '"'; + $expect[12] = array( new HTMLPurifier_Token_Text('"') ); + $sax_expect[12] = false; + // SAX chokes on this? We do have entity parsing on, so it should work! + + foreach($input as $i => $discard) { + $result = $this->DirectLex->tokenizeHTML($input[$i]); + $this->assertEqual($expect[$i], $result, 'Test '.$i.': %s'); + paintIf($result, $expect[$i] != $result); + + // assert unless I say otherwise + $sax_result = $this->PEARSax3->tokenizeHTML($input[$i]); + if (!isset($sax_expect[$i])) { + // by default, assert with normal result + $this->assertEqual($expect[$i], $sax_result, 'Test '.$i.': %s'); + paintIf($sax_result, $expect[$i] != $sax_result); + } elseif ($sax_expect[$i] === false) { + // assertions were turned off, optionally dump + // paintIf($sax_expect, $i == NUMBER); + } else { + // match with a custom SAX result array + $this->assertEqual($sax_expect[$i], $sax_result, 'Test '.$i.': %s'); + paintIf($sax_result, $sax_expect[$i] != $sax_result); + } + if ($this->_has_dom) { + $dom_result = $this->DOMLex->tokenizeHTML($input[$i]); + // same structure as SAX + if (!isset($dom_expect[$i])) { + $this->assertEqual($expect[$i], $dom_result, 'Test '.$i.': %s'); + paintIf($dom_result, $expect[$i] != $dom_result); + } elseif ($dom_expect[$i] === false) { + // paintIf($dom_result, $i == NUMBER); + } else { + $this->assertEqual($dom_expect[$i], $dom_result, 'Test '.$i.': %s'); + paintIf($dom_result, $dom_expect[$i] != $dom_result); + } + } + + } + + } + + // internals testing + function test_tokenizeAttributeString() { + + $input[0] = 'href="asdf" boom="assdf"'; + $expect[0] = array('href'=>'asdf', 'boom'=>'assdf'); + + $input[1] = "href='r'"; + $expect[1] = array('href'=>'r'); + + $input[2] = 'onclick="javascript:alert(\'asdf\');"'; + $expect[2] = array('onclick' => "javascript:alert('asdf');"); + + $input[3] = 'selected'; + $expect[3] = array('selected'=>'selected'); + + $input[4] = '="asdf"'; + $expect[4] = array(); + + $input[5] = 'missile=launch'; + $expect[5] = array('missile' => 'launch'); + + $input[6] = 'href="foo'; + $expect[6] = array('href' => 'foo'); + + $input[7] = '"='; + $expect[7] = array('"' => ''); + // 0123456789012345678901234567890123 + $input[8] = 'href ="about:blank"rel ="nofollow"'; + $expect[8] = array('href' => 'about:blank', 'rel' => 'nofollow'); + + $input[9] = 'foo bar'; + $expect[9] = array('foo' => 'foo', 'bar' => 'bar'); + + $input[10] = 'foo="bar" blue'; + $expect[10] = array('foo' => 'bar', 'blue' => 'blue'); + + $size = count($input); + for($i = 0; $i < $size; $i++) { + $result = $this->DirectLex->tokenizeAttributeString($input[$i]); + $this->assertEqual($expect[$i], $result, 'Test ' . $i . ': %s'); + paintIf($result, $expect[$i] != $result); + } + + } + + +} + +?> \ No newline at end of file