From 9a35dfa6b9d2aa77d7cc2ee66b6c400aab3d29fb Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Tue, 15 Aug 2006 00:53:24 +0000 Subject: [PATCH] Add support for full document parsing, aka discard everything that's not in-between body if applicable. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@258 48356398-32a2-884e-a903-53898d9a118a --- library/HTMLPurifier/Lexer.php | 12 ++++++-- library/HTMLPurifier/Lexer/DOMLex.php | 20 ++++++++----- library/HTMLPurifier/Lexer/DirectLex.php | 4 +++ library/HTMLPurifier/Lexer/PEARSax3.php | 3 ++ tests/HTMLPurifier/LexerTest.php | 37 ++++++++++++++++++++++++ 5 files changed, 66 insertions(+), 10 deletions(-) diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php index e8fbf1a2..ae012ed1 100644 --- a/library/HTMLPurifier/Lexer.php +++ b/library/HTMLPurifier/Lexer.php @@ -239,9 +239,15 @@ class HTMLPurifier_Lexer /** * Takes a string of HTML (fragment or document) and returns the content */ - function extractBody($html) { - if (strpos($html, ']*>(.+?)!is', $html, $matches); + if ($return_bool) return $result; + if ($result) { + return $matches[1]; + } else { + return $html; + } } } diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php index 3018423b..0df13ae5 100644 --- a/library/HTMLPurifier/Lexer/DOMLex.php +++ b/library/HTMLPurifier/Lexer/DOMLex.php @@ -28,25 +28,31 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer public function tokenizeHTML($string, $config = null) { if (!$config) $config = HTMLPurifier_Config::createDefault(); + if ($config->get('Core', 'AcceptFullDocuments')) { + $is_full = $this->extractBody($string, true); + } + $doc = new DOMDocument(); - $doc->encoding = 'UTF-8'; // technically does nothing, but comprehensive + $doc->encoding = 'UTF-8'; // technically does nothing, but whatever // replace and escape the CDATA sections, since parsing under HTML // mode won't get 'em. $string = $this->escapeCDATA($string); + if (!$is_full) { // preprocess string, essential for UTF-8 $string = - ''. - ''. - '
'.$string.'
'; + ''. + ''. + ''.$string.''; + } @$doc->loadHTML($string); // mute all errors, handle it transparently + return $this->tokenizeDOM( $doc->childNodes->item(1)-> // html - childNodes->item(1)-> // body - childNodes->item(0) // div + getElementsByTagName('body')->item(0) // body ); } diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php index 29634b69..535b3866 100644 --- a/library/HTMLPurifier/Lexer/DirectLex.php +++ b/library/HTMLPurifier/Lexer/DirectLex.php @@ -114,6 +114,10 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $string = @ (string) $string; if ($string == '') return array(); + if ($config->get('Core', 'AcceptFullDocuments')) { + $string = $this->extractBody($string); + } + $cursor = 0; // our location in the text $inside_tag = false; // whether or not we're parsing the inside of a tag $array = array(); // result array diff --git a/library/HTMLPurifier/Lexer/PEARSax3.php b/library/HTMLPurifier/Lexer/PEARSax3.php index da3843b0..02b3a484 100644 --- a/library/HTMLPurifier/Lexer/PEARSax3.php +++ b/library/HTMLPurifier/Lexer/PEARSax3.php @@ -32,6 +32,9 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer function tokenizeHTML($html, $config = null) { if (!$config) $config = HTMLPurifier_Config::createDefault(); $html = $this->escapeCDATA($html); + if ($config->get('Core', 'AcceptFullDocuments')) { + $html = $this->extractBody($html); + } $html = $this->substituteNonSpecialEntities($html); $parser=& new XML_HTMLSax3(); $parser->set_object($this); diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php index 9f71a838..b42b5edb 100644 --- a/tests/HTMLPurifier/LexerTest.php +++ b/tests/HTMLPurifier/LexerTest.php @@ -40,7 +40,44 @@ class HTMLPurifier_LexerTest extends UnitTestCase $this->Lexer->substituteNonSpecialEntities('"') ); } + function assertExtractBody($text, $extract = true) { + $result = $this->Lexer->extractBody($text); + if ($extract === true) $extract = $text; + $this->assertIdentical($extract, $result); + } + function test_extractBody() { + $this->assertExtractBody('Bold'); + $this->assertExtractBody('Bold', 'Bold'); + $this->assertExtractBody( +' + + + + xyz + + +
+
+ + + +
+
+ +', + ' +
+
+ + + +
+
+ '); + $this->assertExtractBody('Bold', 'Bold'); + $this->assertExtractBody('asdf'); // not closed, don't accept }