diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php
index 69e08098..2320f9c2 100644
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@@ -27,18 +27,23 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
public function tokenizeHTML($string) {
$doc = new DOMDocument();
-
- // preprocess string
- $string = '<html><body><div>'.$string.'</div></body></html>';
+ $doc->encoding = 'UTF-8'; // technically has no effect here, but set for completeness
// replace and escape the CDATA sections, since parsing under HTML
// mode won't get 'em.
$string = $this->escapeCDATA($string);
+ // preprocess string, essential for UTF-8
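+ $string =
+ '<html><head>'.
+ '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'.
+ '</head><body><div>'.$string.'</div></body></html>';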
+
@$doc->loadHTML($string); // mute all errors, handle it transparently
return $this->tokenizeDOM(
$doc->childNodes->item(1)-> // html
- childNodes->item(0)-> // body
+ childNodes->item(1)-> // body
childNodes->item(0) // div
);
}
diff --git a/smoketests/utf8.php b/smoketests/utf8.php
new file mode 100644
index 00000000..1b43f368
--- /dev/null
+++ b/smoketests/utf8.php
@@ -0,0 +1,31 @@
+
+
+
+HTMLPurifier UTF-8 Smoketest
+
+
+
+HTMLPurifier UTF-8 Smoketest
+
+ Chinese - 太極拳
+ Russian - ЊЎЖ
+ Arabic - لمنس
+
+';
+
+?>
+Raw
+
+Purified
+purify($string); ?>
+
+
\ No newline at end of file
diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php
index 15869944..dfc0a4d3 100644
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@@ -216,6 +216,11 @@ class HTMLPurifier_LexerTest extends UnitTestCase
);
$sax_expect[16] = false; // PEARSax doesn't support it!
+ // test that UTF-8 is preserved
+ $char_hearts = $this->_entity_lookup->table['hearts'];
+ $input[17] = $char_hearts;
+ $expect[17] = array( new HTMLPurifier_Token_Text($char_hearts) );
+
foreach($input as $i => $discard) {
$result = $this->DirectLex->tokenizeHTML($input[$i]);
$this->assertEqual($expect[$i], $result, 'DirectLexTest '.$i.': %s');