From 299236f69562be5680b8c787ffd7bc24505c0033 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Mon, 14 Aug 2006 13:27:18 +0000 Subject: [PATCH] Fix DOM bug where default encoding for HTML docs is not UTF-8. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@252 48356398-32a2-884e-a903-53898d9a118a --- library/HTMLPurifier/Lexer/DOMLex.php | 13 +++++++---- smoketests/utf8.php | 31 +++++++++++++++++++++++++++ tests/HTMLPurifier/LexerTest.php | 5 +++++ 3 files changed, 45 insertions(+), 4 deletions(-) create mode 100644 smoketests/utf8.php diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php index 69e08098..2320f9c2 100644 --- a/library/HTMLPurifier/Lexer/DOMLex.php +++ b/library/HTMLPurifier/Lexer/DOMLex.php @@ -27,18 +27,23 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer public function tokenizeHTML($string) { $doc = new DOMDocument(); - - // preprocess string - $string = '
<div>'.$string.'</div>
'; + $doc->encoding = 'UTF-8'; // technically does nothing, but comprehensive // replace and escape the CDATA sections, since parsing under HTML // mode won't get 'em. $string = $this->escapeCDATA($string); + // preprocess string, essential for UTF-8 + $string = + '<html><head>'. + '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'. + '</head><body><div>
'.$string.'</div></body></html>
'; + @$doc->loadHTML($string); // mute all errors, handle it transparently return $this->tokenizeDOM( $doc->childNodes->item(1)-> // html - childNodes->item(0)-> // body + childNodes->item(1)-> // body childNodes->item(0) // div ); } diff --git a/smoketests/utf8.php b/smoketests/utf8.php new file mode 100644 index 00000000..1b43f368 --- /dev/null +++ b/smoketests/utf8.php @@ -0,0 +1,31 @@ + + + +HTMLPurifier UTF-8 Smoketest + + + +

HTMLPurifier UTF-8 Smoketest

+ +
  • Chinese - 太極拳
  • +
  • Russian - ЊЎЖ
  • +
  • Arabic - لمنس
  • + +'; + +?> +

    Raw

    + +

    Purified

    +purify($string); ?> + + \ No newline at end of file diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php index 15869944..dfc0a4d3 100644 --- a/tests/HTMLPurifier/LexerTest.php +++ b/tests/HTMLPurifier/LexerTest.php @@ -216,6 +216,11 @@ class HTMLPurifier_LexerTest extends UnitTestCase ); $sax_expect[16] = false; // PEARSax doesn't support it! + // test that UTF-8 is preserved + $char_hearts = $this->_entity_lookup->table['hearts']; + $input[17] = $char_hearts; + $expect[17] = array( new HTMLPurifier_Token_Text($char_hearts) ); + foreach($input as $i => $discard) { $result = $this->DirectLex->tokenizeHTML($input[$i]); $this->assertEqual($expect[$i], $result, 'DirectLexTest '.$i.': %s');