From 299236f69562be5680b8c787ffd7bc24505c0033 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <edwardzyang@thewritingpot.com>
Date: Mon, 14 Aug 2006 13:27:18 +0000
Subject: [PATCH] Fix DOM bug where default encoding for HTML docs is not
 UTF-8.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@252 48356398-32a2-884e-a903-53898d9a118a
---
 library/HTMLPurifier/Lexer/DOMLex.php | 13 +++++++----
 smoketests/utf8.php                   | 31 +++++++++++++++++++++++++++
 tests/HTMLPurifier/LexerTest.php      |  5 +++++
 3 files changed, 45 insertions(+), 4 deletions(-)
 create mode 100644 smoketests/utf8.php
diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php
index 69e08098..2320f9c2 100644
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@@ -27,18 +27,23 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
     
     public function tokenizeHTML($string) {
         $doc = new DOMDocument();
-        
-        // preprocess string
-        $string = '<html><body><div>'.$string.'</div></body></html>';
+        $doc->encoding = 'UTF-8'; // NOTE: believed to have no effect on loadHTML(); kept only to state the intended encoding
         
         // replace and escape the CDATA sections, since parsing under HTML
         // mode won't get 'em.
         $string = $this->escapeCDATA($string);
         
+        // wrap the fragment in a full document whose meta tag declares UTF-8; the parser will not assume UTF-8 without it
+        $string =
+        '<html><head>'.
+        '<meta http-equiv="Content-Type" content="text/html;'.
+            ' charset=utf-8" />'.
+        '</head><body><div>'.$string.'</div></body></html>';
+        
         @$doc->loadHTML($string); // mute all errors, handle it transparently
         return $this->tokenizeDOM(
             $doc->childNodes->item(1)-> // html
-                  childNodes->item(0)-> // body
+                  childNodes->item(1)-> // body (item 0 is now the head element)
                   childNodes->item(0)   // div
             );
     }
diff --git a/smoketests/utf8.php b/smoketests/utf8.php
new file mode 100644
index 00000000..1b43f368
--- /dev/null
+++ b/smoketests/utf8.php
@@ -0,0 +1,31 @@
+<!DOCTYPE html 
+     PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html>
+<head>
+<title>HTMLPurifier UTF-8 Smoketest</title>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+</head>
+<body>
+<h1>HTMLPurifier UTF-8 Smoketest</h1>
+<?php // Smoke test: renders a multilingual UTF-8 sample both raw and after purification.
+
+set_include_path('../library' . PATH_SEPARATOR . get_include_path()); // prepend ../library so the require below resolves
+require_once 'HTMLPurifier.php';
+
+$purifier = new HTMLPurifier();
+$string = '
+<ul>
+    <li><b>Chinese</b> - 太極拳</li>
+    <li><b>Russian</b> - ЊЎЖ</li>
+    <li><b>Arabic</b> - لمنس</li>
+</ul>
+';
+
+?>
+<h2>Raw</h2>
+<?php echo $string; ?>
+<h2>Purified</h2>
+<?php echo $purifier->purify($string); ?>
+</body>
+</html>
\ No newline at end of file
diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php
index 15869944..dfc0a4d3 100644
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@@ -216,6 +216,11 @@ class HTMLPurifier_LexerTest extends UnitTestCase
             );
         $sax_expect[16] = false; // PEARSax doesn't support it!
         
+        // test that UTF-8 is preserved
+        $char_hearts = $this->_entity_lookup->table['hearts'];
+        $input[17] = $char_hearts;
+        $expect[17] = array( new HTMLPurifier_Token_Text($char_hearts) );
+        
         foreach($input as $i => $discard) {
             $result = $this->DirectLex->tokenizeHTML($input[$i]);
             $this->assertEqual($expect[$i], $result, 'DirectLexTest '.$i.': %s');