From 4bf3305dffd68db38ad5d73e364fdc6d410c6443 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Sat, 22 Jul 2006 13:50:05 +0000 Subject: [PATCH] Build another lexer from PHP5's DOM library. Extremely fast! git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@80 48356398-32a2-884e-a903-53898d9a118a --- Lexer/DOMLex.php | 77 ++++++++++++++++++++++++++++++++++++++++++++++++ Token.php | 4 +++ tests/Lexer.php | 59 ++++++++++++++++++++++++++++++------- 3 files changed, 130 insertions(+), 10 deletions(-) create mode 100644 Lexer/DOMLex.php diff --git a/Lexer/DOMLex.php b/Lexer/DOMLex.php new file mode 100644 index 00000000..8b72aa24 --- /dev/null +++ b/Lexer/DOMLex.php @@ -0,0 +1,77 @@ +
'.$string.'
'; + @$doc->loadHTML($string); // mute all errors, handle it transparently + return $this->tokenizeDOM( + $doc->childNodes->item(1)-> // html + childNodes->item(0)-> // body + childNodes->item(0) // div + ); + } + + protected function tokenizeDOM($node, $tokens = array(), $collect = false) { + // recursive goodness! + + // intercept non element nodes + + if ( !($node instanceof DOMElement) ) { + if ($node instanceof DOMComment) { + $tokens[] = new HTMLPurifier_Token_Comment($node->data); + } elseif ($node instanceof DOMText) { + $tokens[] = new HTMLPurifier_Token_Text($node->data); + } + // quite possibly, the object wasn't handled, that's fine + return $tokens; + } + + // We still have to make sure that the element actually IS empty + if (!$node->hasChildNodes()) { + if ($collect) { + $tokens[] = new HTMLPurifier_Token_Empty( + $node->tagName, + $this->transformAttrToAssoc($node->attributes) + ); + } + } else { + if ($collect) { // don't wrap on first iteration + $tokens[] = new HTMLPurifier_Token_Start( + $tag_name = $node->tagName, // somehow, it get's dropped + $this->transformAttrToAssoc($node->attributes) + ); + } + foreach ($node->childNodes as $node) { + // remember, it's an accumulator. Otherwise, we'd have + // to use array_merge + $tokens = $this->tokenizeDOM($node, $tokens, true); + } + if ($collect) { + $tokens[] = new HTMLPurifier_Token_End($tag_name); + } + } + + return $tokens; + + } + + protected function transformAttrToAssoc($attribute_list) { + $attribute_array = array(); + // undocumented behavior + foreach ($attribute_list as $key => $attr) { + $attribute_array[$key] = $attr->value; + } + return $attribute_array; + } + +} + +?> \ No newline at end of file diff --git a/Token.php b/Token.php index 1ca8a1db..90b0e3e4 100644 --- a/Token.php +++ b/Token.php @@ -9,6 +9,8 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract var $is_tag = true; var $name; function HTMLPurifier_Token_Tag($name) { + // watch out, actually XML is case-sensitive, while HTML + // is case insensitive, which means we can't use this for XML $this->name = strtolower($name); // for some reason, the SAX parser // uses uppercase. Investigate? } @@ -24,6 +26,8 @@ class HTMLPurifier_Token_RichTag extends HTMLPurifier_Token_Tag // abstract } } +// start CONCRETE ones + class HTMLPurifier_Token_Start extends HTMLPurifier_Token_RichTag { var $type = 'start'; diff --git a/tests/Lexer.php b/tests/Lexer.php index 18c7ca18..8c632745 100644 --- a/tests/Lexer.php +++ b/tests/Lexer.php @@ -2,16 +2,20 @@ require_once 'HTMLPurifier/Lexer/DirectLex.php'; require_once 'HTMLPurifier/Lexer/PEARSax3.php'; +require_once 'HTMLPurifier/Lexer/DOMLex.php'; class Test_HTMLPurifier_Lexer extends UnitTestCase { - var $DirectLex; - var $PEARSax3; + var $DirectLex, $PEARSax3, $DOMLex; + var $_has_dom; function setUp() { - $this->DirectLex =& new HTMLPurifier_Lexer_DirectLex(); - $this->PEARSax3 =& new HTMLPurifier_Lexer_PEARSax3(); + $this->DirectLex = new HTMLPurifier_Lexer_DirectLex(); + $this->PEARSax3 = new HTMLPurifier_Lexer_PEARSax3(); + $this->DOMLex = new HTMLPurifier_Lexer_DOMLex(); + + $this->_has_dom = version_compare(PHP_VERSION, '5', '>='); } function test_nextWhiteSpace() { @@ -67,6 +71,7 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase ,new HTMLPurifier_Token_End('div') ); + // [XML-INVALID] $input[4] = ''; $expect[4] = array( new HTMLPurifier_Token_Start('asdf') @@ -79,6 +84,17 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase ,new HTMLPurifier_Token_End('asdf') ,new HTMLPurifier_Token_End('ASDF') ); + // DOM is different because it condenses empty tags into REAL empty ones + // as well as makes it well-formed + $dom_expect[4] = array( + new HTMLPurifier_Token_Empty('asdf') + ,new HTMLPurifier_Token_Empty('d') + ,new HTMLPurifier_Token_Start('pooloka') + ,new HTMLPurifier_Token_Start('poolasdf') + ,new HTMLPurifier_Token_Empty('ds') + ,new HTMLPurifier_Token_End('poolasdf') + ,new HTMLPurifier_Token_End('pooloka') + ); $input[5] = 'Link to foobar'; $expect[5] = array( @@ -95,7 +111,7 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase new HTMLPurifier_Token_Empty('br') ); - // [INVALID] [RECOVERABLE] + // [SGML-INVALID] [RECOVERABLE] $input[7] = ' '; $expect[7] = array( new HTMLPurifier_Token_Comment(' Comment ') @@ -104,7 +120,7 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase ); $sax_expect[7] = false; // we need to figure out proper comment output - // [INVALID] + // [SGML-INVALID] $input[8] = ''')) ); + // DOM parses it into an empty tag + $dom_expect[8] = array( + new HTMLPurifier_Token_Empty('a', array('href'=>'')) + ); $input[9] = '<b>'; $expect[9] = array( @@ -126,11 +146,15 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase // note that SAX can clump text nodes together. We won't be // too picky though - // [INVALID] + // [SGML-INVALID] $input[10] = ''; $expect[10] = array( new HTMLPurifier_Token_Start('a', array('"' => '')) ); + // DOM doesn't register an invalid attribute + $dom_expect[10] = array( + new HTMLPurifier_Token_Empty('a') + ); // [INVALID] [RECOVERABLE] $input[11] = '"'; @@ -144,27 +168,42 @@ class Test_HTMLPurifier_Lexer extends UnitTestCase foreach($input as $i => $discard) { $result = $this->DirectLex->tokenizeHTML($input[$i]); - $this->assertEqual($expect[$i], $result); + $this->assertEqual($expect[$i], $result, 'Test '.$i.': %s'); paintIf($result, $expect[$i] != $result); // assert unless I say otherwise $sax_result = $this->PEARSax3->tokenizeHTML($input[$i]); if (!isset($sax_expect[$i])) { // by default, assert with normal result - $this->assertEqual($expect[$i], $sax_result); + $this->assertEqual($expect[$i], $sax_result, 'Test '.$i.': %s'); paintIf($sax_result, $expect[$i] != $sax_result); } elseif ($sax_expect[$i] === false) { // assertions were turned off, optionally dump // paintIf($sax_expect, $i == NUMBER); } else { // match with a custom SAX result array - $this->assertEqual($sax_expect[$i], $sax_result); + $this->assertEqual($sax_expect[$i], $sax_result, 'Test '.$i.': %s'); paintIf($sax_result, $sax_expect[$i] != $sax_result); } + if ($this->_has_dom) { + $dom_result = $this->DOMLex->tokenizeHTML($input[$i]); + // same structure as SAX + if (!isset($dom_expect[$i])) { + $this->assertEqual($expect[$i], $dom_result, 'Test '.$i.': %s'); + paintIf($dom_result, $expect[$i] != $dom_result); + } elseif ($dom_expect[$i] === false) { + // paintIf($dom_result, $i == NUMBER); + } else { + $this->assertEqual($dom_expect[$i], $dom_result, 'Test '.$i.': %s'); + paintIf($dom_result, $dom_expect[$i] != $dom_result); + } + } + } } + // internals testing function test_tokenizeAttributeString() { $input[] = 'href="asdf" boom="assdf"';