Fix DOM bug where default encoding for HTML docs is not UTF-8.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@252 48356398-32a2-884e-a903-53898d9a118a
2025-03-15 19:07:05 +00:00 · 2006-08-14 13:27:18 +00:00 · 2006-08-14 13:27:18 +00:00 · 299236f695
commit 299236f695
parent ebf0da9b78
3 changed files with 45 additions and 4 deletions
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@@ -27,18 +27,23 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
    
    public function tokenizeHTML($string) {
        $doc = new DOMDocument();
-        
-        // preprocess string
-        $string = '<html><body><div>'.$string.'</div></body></html>';
+        $doc->encoding = 'UTF-8'; // technically has no effect here, but set for completeness
        
        // replace and escape the CDATA sections, since parsing under HTML
        // mode won't get 'em.
        $string = $this->escapeCDATA($string);
        
+        // preprocess string: wrap it in a document whose meta tag declares UTF-8 (essential for UTF-8 input)
+        $string =
+        '<html><head>'.
+        '<meta http-equiv="Content-Type" content="text/html;'.
+            ' charset=utf-8" />'.
+        '</head><body><div>'.$string.'</div></body></html>';
+        
        @$doc->loadHTML($string); // mute all errors, handle it transparently
        return $this->tokenizeDOM(
            $doc->childNodes->item(1)-> // html
-                  childNodes->item(0)-> // body
+                  childNodes->item(1)-> // body (index 1: the injected head is item 0)
                  childNodes->item(0)   // div
            );
    }
--- a/smoketests/utf8.php
+++ b/smoketests/utf8.php
@@ -0,0 +1,31 @@
+<!DOCTYPE html 
+     PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html>
+<head>
+<title>HTMLPurifier UTF-8 Smoketest</title>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+</head>
+<body>
+<h1>HTMLPurifier UTF-8 Smoketest</h1>
+<?php
+
+set_include_path('../library' . PATH_SEPARATOR . get_include_path());
+require_once 'HTMLPurifier.php';
+
+$purifier = new HTMLPurifier();
+$string = '
+<ul>
+    <li><b>Chinese</b> - 太極拳</li>
+    <li><b>Russian</b> - ЊЎЖ</li>
+    <li><b>Arabic</b> - لمنس</li>
+</ul>
+';
+
+?>
+<h2>Raw</h2>
+<?php echo $string; ?>
+<h2>Purified</h2>
+<?php echo $purifier->purify($string); ?>
+</body>
+</html>
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@@ -216,6 +216,11 @@ class HTMLPurifier_LexerTest extends UnitTestCase
            );
        $sax_expect[16] = false; // PEARSax doesn't support it!
        
+        // test that UTF-8 is preserved
+        $char_hearts = $this->_entity_lookup->table['hearts'];
+        $input[17] = $char_hearts;
+        $expect[17] = array( new HTMLPurifier_Token_Text($char_hearts) );
+        
        foreach($input as $i => $discard) {
            $result = $this->DirectLex->tokenizeHTML($input[$i]);
            $this->assertEqual($expect[$i], $result, 'DirectLexTest '.$i.': %s');